In [168]:
import sys
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

import requests
from bs4 import BeautifulSoup

import nltk

In [71]:
!pip install rake-nltk



In [72]:
from rake_nltk import Rake

In [3]:
# Local CSV paths, relative to the notebook's working directory.
# NOTE(review): neither constant is referenced by the cells visible in this
# notebook — confirm they are still needed before keeping them.
MOVIE_FILE = '../Downloads/ml-25m/xsmall_movies.csv'
LINKS_FILE = '../Downloads/ml-25m/links.csv'

In [64]:
# The first results page uses the plain listing URL; the nine follow-up pages
# share one URL pattern and differ only in their `start` offset (101, 201, ... 901).
web_address_0 = 'https://www.imdb.com/search/title/?count=100&groups=top_1000&sort=user_rating'
(
    web_address_1, web_address_2, web_address_3,
    web_address_4, web_address_5, web_address_6,
    web_address_7, web_address_8, web_address_9,
) = [
    'https://www.imdb.com/search/title/?groups=top_1000&sort=user_rating,desc'
    f'&count=100&start={offset}&ref_=adv_nxt'
    for offset in range(101, 1000, 100)
]

# All ten pages, in ranking order.
web_addresses = [
    web_address_0, web_address_1, web_address_2, web_address_3, web_address_4,
    web_address_5, web_address_6, web_address_7, web_address_8, web_address_9,
]

In [62]:
def get_100(web_address):
    """Scrape one IMDb search-results page into parallel lists of movie fields.

    Parameters
    ----------
    web_address : str
        URL of the results page to download.

    Returns
    -------
    tuple of five lists
        ``(titles, genres, descriptions, directors, casts)``. Entry ``i`` of
        every list describes the same movie. All five lists are empty when the
        request fails or the server answers with a status other than 200, so
        callers can always unpack the result.

    Raises
    ------
    AttributeError, IndexError, ValueError
        If a listing item does not have the expected markup (for example a
        missing genre span, or a credits paragraph without exactly one ``|``).
    """
    titles, genres, descriptions, directors, casts = [], [], [], [], []

    try:
        # A timeout keeps a stalled connection from hanging the notebook forever.
        res = requests.get(
            web_address,
            headers={'User-Agent': 'Mozilla/5.0'},
            timeout=30,
        )
    except requests.exceptions.RequestException as err:
        print("requests exception found", err)
        # Without a response there is nothing to parse; hand back empty lists.
        return titles, genres, descriptions, directors, casts

    if res.status_code != 200:
        print("unexpected status code", res.status_code, "for", web_address)
        return titles, genres, descriptions, directors, casts

    soup = BeautifulSoup(res.text, 'html.parser')
    for item in soup.find_all("div", {"class": "lister-item-content"}):
        # The heading text is split on newlines and its first piece discarded
        # (presumably the rank number — confirm against the page markup).
        heading_parts = item.h3.getText().strip().split('\n')
        titles.append(' '.join(heading_parts[1:]))

        genre = item.find("span", {"class": "genre"}).getText().strip().replace('\n', '')
        genres.append(genre)

        # The second <p> of the item is used as the description.
        descriptions.append(item.find_all("p")[1].getText().replace('\n', ''))

        # The class-less <p> is expected to read "<label>: directors | <label>: cast".
        credits = item.find("p", {"class": ""}).getText()
        director, cast = credits.split('|')
        directors.append(director.split(':')[1].replace('\n', ''))
        casts.append(cast.split(':')[1].replace('\n', ''))

    return titles, genres, descriptions, directors, casts

In [66]:
def build_df(web_addresses):
    """Scrape every page in ``web_addresses`` and stack the results into one frame.

    Parameters
    ----------
    web_addresses : sequence of str
        URLs to pass to ``get_100``, in the order the rows should appear.
        Any number of pages is accepted.

    Returns
    -------
    pandas.DataFrame
        Columns ``Title``, ``Genre``, ``Description``, ``Director`` and
        ``Cast`` with a fresh 0..n-1 index. Empty (but with those columns)
        when no page yields any data.
    """
    columns = ['Title', 'Genre', 'Description', 'Director', 'Cast']
    pages = []

    # Iterate over the addresses themselves rather than a fixed range(10), so
    # shorter lists do not raise IndexError and longer ones are not truncated.
    for address in web_addresses:
        scraped = get_100(address)
        if scraped is None:
            # The scraper produced nothing for this page; skip it instead of
            # failing on tuple unpacking.
            continue
        titles, genres, descriptions, directors, casts = scraped
        pages.append(pd.DataFrame({
            'Title': titles,
            'Genre': genres,
            'Description': descriptions,
            'Director': directors,
            'Cast': casts,
        }))

    if not pages:
        # pd.concat refuses an empty list; return a correctly-shaped empty frame.
        return pd.DataFrame(columns=columns)

    return pd.concat(pages, ignore_index=True)
             

In [160]:
# Scrape the listed result pages into one DataFrame (this performs live HTTP requests).
df = build_df(web_addresses)


In [135]:
df.head()

Unnamed: 0,Title,Genre,Description,Director,Cast
0,The Shawshank Redemption (1994),Drama,"Over the course of several years, two convicts...",Frank Darabont,"Tim Robbins, Morgan Freeman, Bob Gunton, Willi..."
1,The Godfather (1972),"Crime, Drama","Don Vito Corleone, head of a mafia family, dec...",Francis Ford Coppola,"Marlon Brando, Al Pacino, James Caan, Diane Ke..."
2,The Dark Knight (2008),"Action, Crime, Drama",When the menace known as the Joker wreaks havo...,Christopher Nolan,"Christian Bale, Heath Ledger, Aaron Eckhart, M..."
3,Schindler's List (1993),"Biography, Drama, History","In German-occupied Poland during World War II,...",Steven Spielberg,"Liam Neeson, Ralph Fiennes, Ben Kingsley, Caro..."
4,The Godfather Part II (1974),"Crime, Drama",The early life and career of Vito Corleone in ...,Francis Ford Coppola,"Al Pacino, Robert De Niro, Robert Duvall, Dian..."


In [83]:
def extract_keywords(text):
    """Return the keywords RAKE finds in ``text`` as a list of words.

    A fresh ``Rake`` instance with its default settings is created for every
    call. Only the words are returned; their degree scores are discarded.
    """
    rake = Rake()
    rake.extract_keywords_from_text(text)

    # get_word_degrees() yields a dictionary with the key words as keys and
    # their scores as values; iterating it gives just the words.
    return list(rake.get_word_degrees())

In [161]:
def transform_df(df, keyword_extractor=None):
    """Collapse the scraped movie columns into a single bag-of-words string.

    The input frame is left untouched, so the function can be called again on
    the same ``df`` and always yields the same result.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the string columns ``Genre``, ``Director``, ``Cast`` and
        ``Description``.
    keyword_extractor : callable, optional
        Maps one description string to a list of keyword strings. Defaults to
        the notebook's ``extract_keywords``.

    Returns
    -------
    pandas.DataFrame
        ``df`` without ``Genre``, ``Description``, ``Director``, ``Cast`` (and
        ``Keywords`` if present), plus a ``Bagofwords`` column holding the
        space-joined tokens of genre, director, cast and keywords, in that order.
    """
    if keyword_extractor is None:
        # Looked up at call time so the default follows the current definition.
        keyword_extractor = extract_keywords

    def _name_tokens(value):
        # "Al Pacino, Robert De Niro" -> ['alpacino', 'robertdeniro']: removing
        # the spaces first makes each full name a single token.
        return value.replace(' ', '').lower().split(',')

    genre_tokens = df['Genre'].map(_name_tokens)
    director_tokens = df['Director'].map(_name_tokens)
    cast_tokens = df['Cast'].map(_name_tokens)
    keyword_tokens = df['Description'].map(keyword_extractor)

    bags = [
        ' '.join(genre + director + cast + keywords)
        for genre, director, cast, keywords in zip(
            genre_tokens, director_tokens, cast_tokens, keyword_tokens
        )
    ]

    # drop() returns a new frame, so adding the column below cannot leak into df.
    # 'Keywords' is only present if the caller's frame already carried it.
    transformed_df = df.drop(
        columns=['Genre', 'Description', 'Director', 'Cast', 'Keywords'],
        errors='ignore',
    )
    transformed_df['Bagofwords'] = bags

    return transformed_df
    

In [162]:
# Reduce the scraped frame to two columns: Title and the combined Bagofwords string.
transformed_df = transform_df(df)

In [163]:
transformed_df.head()

Unnamed: 0,Title,Bagofwords
0,The Shawshank Redemption (1994),drama frankdarabont timrobbins morganfreeman b...
1,The Godfather (1972),crime drama francisfordcoppola marlonbrando al...
2,The Dark Knight (2008),action crime drama christophernolan christianb...
3,Schindler's List (1993),biography drama history stevenspielberg liamne...
4,The Godfather Part II (1974),crime drama francisfordcoppola alpacino robert...


In [177]:
# Learn a vocabulary from every movie's bag of words and count token occurrences
# per movie (sparse matrix, one row per movie, one column per token).
vectorizer = CountVectorizer()
count_matrix = vectorizer.fit_transform(transformed_df['Bagofwords'])
# Dense copy of the counts; (1000, 9133) in the recorded run.
count_array = count_matrix.toarray()
# Same counts as a DataFrame whose columns are labelled with the vocabulary tokens.
count_df = pd.DataFrame(data=count_array,columns = vectorizer.get_feature_names_out())

In [178]:
count_df.head()

Unnamed: 0,00,000,007,10,100,100th,11,1183,1190s,11th,...,zulu,álvaroguerrero,çaganirmak,çetintekindor,émilevallée,érictoledano,ömerfaruksorak,ömervargi,özgeözberk,özgürerenkoç
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [169]:
count_array.shape

(1000, 9133)

In [179]:
# Put the movie titles in front of the token-count columns. The guard makes the
# cell safe to re-run: DataFrame.insert raises ValueError if the column exists.
if 'Title' not in count_df.columns:
    count_df.insert(0, 'Title', transformed_df['Title'])

In [176]:
count_df.head()

Unnamed: 0,Title,00,000,007,10,100,100th,11,1183,1190s,...,zulu,álvaroguerrero,çaganirmak,çetintekindor,émilevallée,érictoledano,ömerfaruksorak,ömervargi,özgeözberk,özgürerenkoç
0,The Shawshank Redemption (1994),0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,The Godfather (1972),0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,The Dark Knight (2008),0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Schindler's List (1993),0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,The Godfather Part II (1974),0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [213]:
'Toy Story (1995)' in transformed_df['Title'].values

True

At first I didn't realize that 'Toy Story' was already in the dataframe (I had used the wrong command to check).

In [180]:
# Hand-written query in the same layout as the 'Bagofwords' column: genres,
# space-stripped director and cast names, then description keywords.
# NOTE(review): the cast tokens 'tomhanks timallen donrickles' appear twice, so
# they are counted twice in the query vector — confirm this is intentional.
toy_story_bag = 'adventure animation children comedy fantasy johnlasseter tomhanks timallen donrickles tomhanks timallen donrickles cowboy doll profoundly threatened jealous new spaceman action figure supplants top toy boy bedroom '

In [182]:
# Vectorize the query with the already-fitted vocabulary (transform, not
# fit_transform) so its columns line up with those of count_df.
v = vectorizer.transform([toy_story_bag])

In [184]:
v

<1x9133 sparse matrix of type '<class 'numpy.int64'>'
	with 23 stored elements in Compressed Sparse Row format>

In [193]:
# Cosine similarity between each movie's count vector and the query vector.
# iloc[:, 1:] skips column 0, which holds the 'Title' strings rather than counts.
cos_sim = cosine_similarity(count_df.iloc[:,1:],v)

In [196]:
cos_sim.shape

(1000, 1)

In [197]:
# Collapse the (n_movies, 1) similarity column into a 1-D array, one score per movie.
cos_sim  = cos_sim.flatten()

In [214]:
# Rank the movies by similarity to the query, best match first, and keep the top 10.
# The intermediate array is deliberately not named `sorted`: that would shadow the
# builtin sorted() for every cell executed afterwards in this kernel.
ascending_indices = np.argsort(cos_sim)
reverse_sorted = ascending_indices[::-1]
rec_indices = reverse_sorted[:10]
rec_indices

array([105, 811, 512, 112, 652, 259, 973, 655, 975, 378])

In [216]:
# Look up the titles (column 0 of count_df) at the top-ranked row positions.
recommendations = count_df.iloc[rec_indices,0]
recommendations

105          Toy Story (1995)
811        Toy Story 4 (2019)
512        Toy Story 2 (1999)
112        Toy Story 3 (2010)
652       Isle of Dogs (2018)
259     The Iron Giant (1999)
973    The Jungle Book (1967)
655       Mary Poppins (1964)
975              Ponyo (2008)
378               Soul (2020)
Name: Title, dtype: object