In [16]:
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
import textblob
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [5]:
df = pd.read_csv("../data/imdb_top_1000.csv")

In [6]:
df.head(2)

Unnamed: 0,Poster_Link,Series_Title,Released_Year,Certificate,Runtime,Genre,IMDB_Rating,Overview,Meta_score,Director,Star1,Star2,Star3,Star4,No_of_Votes,Gross
0,https://m.media-amazon.com/images/M/MV5BMDFkYT...,The Shawshank Redemption,1994,A,142 min,Drama,9.3,Two imprisoned men bond over a number of years...,80.0,Frank Darabont,Tim Robbins,Morgan Freeman,Bob Gunton,William Sadler,2343110,28341469
1,https://m.media-amazon.com/images/M/MV5BM2MyNj...,The Godfather,1972,A,175 min,"Crime, Drama",9.2,An organized crime dynasty's aging patriarch t...,100.0,Francis Ford Coppola,Marlon Brando,Al Pacino,James Caan,Diane Keaton,1620367,134966411


In [7]:
df['data'] = df['Series_Title'] + ' ' + df['Director'] + ' ' + df['Genre']

In [8]:
df['data']

0          The Shawshank Redemption Frank Darabont Drama
1        The Godfather Francis Ford Coppola Crime, Drama
2      The Dark Knight Christopher Nolan Action, Crim...
3      The Godfather: Part II Francis Ford Coppola Cr...
4                 12 Angry Men Sidney Lumet Crime, Drama
                             ...                        
995    Breakfast at Tiffany's Blake Edwards Comedy, D...
996                  Giant George Stevens Drama, Western
997    From Here to Eternity Fred Zinnemann Drama, Ro...
998                 Lifeboat Alfred Hitchcock Drama, War
999    The 39 Steps Alfred Hitchcock Crime, Mystery, ...
Name: data, Length: 1000, dtype: object

In [9]:
# remove Punctuation - anything that is not a word or a space
df['data'] = df['data'].str.replace('[^\w\s]', '')
# lowercase
df['data'] = df['data'].str.lower()


In [10]:
df['data']

0          the shawshank redemption frank darabont drama
1        the godfather francis ford coppola crime, drama
2      the dark knight christopher nolan action, crim...
3      the godfather: part ii francis ford coppola cr...
4                 12 angry men sidney lumet crime, drama
                             ...                        
995    breakfast at tiffany's blake edwards comedy, d...
996                  giant george stevens drama, western
997    from here to eternity fred zinnemann drama, ro...
998                 lifeboat alfred hitchcock drama, war
999    the 39 steps alfred hitchcock crime, mystery, ...
Name: data, Length: 1000, dtype: object

```python -m nltk.downloader stopwords```

In [12]:
def remove_stopwords(text):
    words = text.split()
    return ' '.join([word for word in words if word not in stopwords.words('english')])

df['data'] = df['data'].apply(remove_stopwords)
df['data']

0              shawshank redemption frank darabont drama
1            godfather francis ford coppola crime, drama
2      dark knight christopher nolan action, crime, d...
3      godfather: part ii francis ford coppola crime,...
4                 12 angry men sidney lumet crime, drama
                             ...                        
995    breakfast tiffany's blake edwards comedy, dram...
996                  giant george stevens drama, western
997          eternity fred zinnemann drama, romance, war
998                 lifeboat alfred hitchcock drama, war
999    39 steps alfred hitchcock crime, mystery, thri...
Name: data, Length: 1000, dtype: object

In [20]:
# create a count vectorizer object
vectorizer = CountVectorizer()

# this steps generates word counts for the words in your docs
X = vectorizer.fit_transform(df['data']).toarray()
print(X.shape)
X

(1000, 2444)


array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [25]:
# Similarity Matrix
similarity = cosine_similarity(X,X)
print(similarity)
similarity.shape

[[1.         0.18257419 0.16903085 ... 0.18257419 0.2        0.        ]
 [0.18257419 1.         0.3086067  ... 0.16666667 0.18257419 0.15430335]
 [0.16903085 0.3086067  1.         ... 0.15430335 0.16903085 0.14285714]
 ...
 [0.18257419 0.16666667 0.15430335 ... 1.         0.36514837 0.        ]
 [0.2        0.18257419 0.16903085 ... 0.36514837 1.         0.3380617 ]
 [0.         0.15430335 0.14285714 ... 0.         0.3380617  1.        ]]


(1000, 1000)

In [28]:
# get the index of the movie
def get_index_from_title(title):
    try:
        return df[df.Series_Title == title].index[0]
    except:
        return None
def recommend_movie(title, limit = 10):
    index = get_index_from_title(title)
    if index is None:
        return None
    else:
        movie_scores = []
        for i in range(similarity.shape[0]):
            movie_scores.append((df['Series_Title'][i], similarity[index][i]))
        movie_scores.sort(key = lambda x: x[1], reverse = True)
        return movie_scores[1:limit+1]

In [37]:
recommend_movie('The Dark Knight', 2)

[('The Dark Knight Rises', 0.7142857142857141),
 ('Dunkirk', 0.6172133998483676)]