In [70]:
file_path = "../../datasets/Model_data.csv"

In [71]:
import pandas as pd

In [72]:
df = pd.read_csv(file_path)

In [73]:
df.iloc[0]

movie_id                                                     1
title                                                Toy Story
genres             Comedy|Adventure|Animation|Children|Fantasy
directors                                        John Lasseter
actors       Tom Hanks|Tim Allen|Jim Varney|Don Rickles|Wal...
overview     Led by Woody, Andy's toys live happily in his ...
Name: 0, dtype: object

In [74]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9551 entries, 0 to 9550
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   movie_id   9551 non-null   int64 
 1   title      9551 non-null   object
 2   genres     9551 non-null   object
 3   directors  9551 non-null   object
 4   actors     9551 non-null   object
 5   overview   9551 non-null   object
dtypes: int64(1), object(5)
memory usage: 447.8+ KB


In [75]:
df.isnull().sum()

movie_id     0
title        0
genres       0
directors    0
actors       0
overview     0
dtype: int64

In [76]:
import nltk
from nltk.stem.porter import PorterStemmer

In [77]:
ps = PorterStemmer()

In [78]:
def stem(text):
  y = []
  for i in text.split():
    y.append(ps.stem(i))
  return " ".join(y)

In [79]:
df['genres'] = df['genres'].apply(lambda x: x.split('|'))
df['actors'] = df['actors'].apply(lambda x: x.split('|'))
df['directors'] = df['directors'].apply(lambda x: x.split('|'))

In [80]:
df['genres'] = df['genres'].apply(lambda x: [i.replace(" ", "") for i in x])
df['actors'] = df['actors'].apply(lambda x: [i.replace(" ", "") for i in x])
df['directors'] = df['directors'].apply(lambda x: [i.replace(" ", "") for i in x])
# df['overview'] = df['overview'].apply(stem)
df['overview'] = df['overview'].apply(lambda x: [x])

In [81]:
df['tags'] = df['genres'] + df['directors'] + df['actors'] + df['overview']

In [82]:
new_df = df[['movie_id', 'title','tags']].copy()

In [83]:
new_df.iloc[0]

movie_id                                                    1
title                                               Toy Story
tags        [Comedy, Adventure, Animation, Children, Fanta...
Name: 0, dtype: object

In [84]:
new_df['tags'] = new_df['tags'].apply(lambda x: " ".join(x))

In [85]:
new_df['tags'][0]

"Comedy Adventure Animation Children Fantasy JohnLasseter TomHanks TimAllen JimVarney DonRickles WallaceShawn Led by Woody, Andy's toys live happily in his room until Andy's birthday brings Buzz Lightyear onto the scene. Afraid of losing his place in Andy's heart, Woody plots against Buzz. But when circumstances separate Buzz and Woody from their owner, the duo eventually learns to put aside their differences."

In [86]:
new_df['tags'] = new_df['tags'].apply(lambda x: x.lower())

In [87]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
cv = CountVectorizer(max_features=5000, stop_words='english')

In [88]:
vectors = cv.fit_transform(new_df['tags']).toarray()

In [89]:
print(cv.get_feature_names_out())

['000' '10' '100' ... 'zone' 'zoo' 'zooeydeschanel']


In [90]:
similarity = cosine_similarity(vectors)

In [91]:
similarity.shape

(9551, 9551)

In [92]:
# similarity

In [93]:
def recommend(movie):
  movie_index = new_df[new_df["movie_id"] == movie].index[0]
  print(movie_index)
  distances = similarity[movie_index]
  movie_list = sorted(list(enumerate(distances)), reverse=True, key=lambda x: x[1])[:11]
  print(movie_list)
  movie_indexes = [i[0] for i in movie_list]
  print(movie_indexes)
  movies_ides = [[new_df.iloc[i].movie_id, new_df.iloc[i].title] for i in movie_indexes]
  return movies_ides

In [94]:
print(recommend(1))

0
[(0, np.float64(0.9999999999999998)), (7285, np.float64(0.5298129428260177)), (2350, np.float64(0.4954403570783588)), (5932, np.float64(0.30117993528988257)), (8601, np.float64(0.20951312035156963)), (1509, np.float64(0.19364916731037085)), (6150, np.float64(0.18681617943926832)), (5649, np.float64(0.181848241863327)), (922, np.float64(0.1756820922315766)), (9062, np.float64(0.1756820922315766)), (9409, np.float64(0.17213259316477408))]
[0, 7285, 2350, 5932, 8601, 1509, 6150, 5649, 922, 9062, 9409]
[[np.int64(1), 'Toy Story'], [np.int64(78499), 'Toy Story 3'], [np.int64(3114), 'Toy Story 2'], [np.int64(35836), 'The 40 Year Old Virgin'], [np.int64(125970), 'Halloweentown'], [np.int64(2041), 'Condorman'], [np.int64(45074), 'The Wild'], [np.int64(27731), 'The Cat Returns'], [np.int64(1223), 'A Grand Day Out'], [np.int64(151653), 'Welcome to Happiness'], [np.int64(175397), 'In the Blue Sea, in the White Foam...']]


In [95]:
new_df[new_df['movie_id'] == 78499]

Unnamed: 0,movie_id,title,tags
7285,78499,Toy Story 3,comedy adventure animation children fantasy le...


In [96]:
new_df.iloc[7285]

movie_id                                                78499
title                                             Toy Story 3
tags        comedy adventure animation children fantasy le...
Name: 7285, dtype: object

In [104]:
print(recommend(122912))

8581
[(8581, np.float64(1.0)), (8575, np.float64(0.3407771005482389)), (8582, np.float64(0.29329423004270666)), (7618, np.float64(0.2813874297663251)), (6458, np.float64(0.26681493838602804)), (5001, np.float64(0.25923792368260634)), (7552, np.float64(0.25883878884437284)), (8839, np.float64(0.25558282541117916)), (8584, np.float64(0.2514474228374849)), (8832, np.float64(0.24906774069335896)), (8586, np.float64(0.24722569302909875))]
[8581, 8575, 8582, 7618, 6458, 5001, 7552, 8839, 8584, 8832, 8586]
[[np.int64(122912), 'Avengers: Infinity War'], [np.int64(122892), 'Avengers: Age of Ultron'], [np.int64(122916), 'Thor: Ragnarok'], [np.int64(89745), 'The Avengers'], [np.int64(53464), 'Fantastic Four: Rise of the Silver Surfer'], [np.int64(7810), 'Babylon 5: A Call to Arms'], [np.int64(87430), 'Green Lantern'], [np.int64(136864), 'Batman v Superman: Dawn of Justice'], [np.int64(122920), 'Captain America: Civil War'], [np.int64(136800), 'Robot Overlords'], [np.int64(122924), 'X-Men: Apocaly

In [105]:
print(recommend(89745))

7618
[(7618, np.float64(1.0)), (8575, np.float64(0.38533731779422625)), (8582, np.float64(0.28426762180748066)), (8581, np.float64(0.2813874297663251)), (2921, np.float64(0.27524094128159016)), (4935, np.float64(0.27272727272727276)), (8584, np.float64(0.24370871833797703)), (2947, np.float64(0.24140227479263382)), (8586, np.float64(0.23961691914926153)), (5236, np.float64(0.23450882356048286)), (9541, np.float64(0.23262105259961777))]
[7618, 8575, 8582, 8581, 2921, 4935, 8584, 2947, 8586, 5236, 9541]
[[np.int64(89745), 'The Avengers'], [np.int64(122892), 'Avengers: Age of Ultron'], [np.int64(122916), 'Thor: Ragnarok'], [np.int64(122912), 'Avengers: Infinity War'], [np.int64(3926), 'Voyage to the Bottom of the Sea'], [np.int64(7481), 'Enemy Mine'], [np.int64(122920), 'Captain America: Civil War'], [np.int64(3959), 'The Time Machine'], [np.int64(122924), 'X-Men: Apocalypse'], [np.int64(8633), 'The Last Starfighter'], [np.int64(191005), 'Gintama']]


In [98]:
new_df[new_df['movie_id'] == 940]

Unnamed: 0,movie_id,title,tags
719,940,The Adventures of Robin Hood,adventure action romance michaelcurtiz william...


In [99]:
new_df.iloc[1608]

movie_id                                                 2153
title                                            The Avengers
tags        adventure action jeremiahs.chechik seanconnery...
Name: 1608, dtype: object

In [107]:
import pickle
pickle.dump(new_df.to_dict(), open('../models/movie_dict.pkl', 'wb'))
pickle.dump(similarity, open('../models/similarity.pkl', 'wb'))