In [1]:
# Dependencies.
# Data.
import pandas as pd
import numpy as np

# Model.
from sklearn.neighbors import NearestNeighbors
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Misc.
from difflib import *
from api_keys import ACCESS_TOKEN
from time import sleep, time

# Get rid of warnings.
import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv("resources/ml_movie.csv", index_col = False)

display(df.info())
display(df.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3067 entries, 0 to 3066
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   title    3067 non-null   object 
 1   rating   3067 non-null   float64
 2   feature  3067 non-null   object 
dtypes: float64(1), object(2)
memory usage: 72.0+ KB


None

Unnamed: 0,title,rating,feature
0,The Shawshank Redemption,9.3,two imprisoned men bond number year find solac...
1,The Godfather,9.2,organize crime dynasty age patriarch transfer ...
2,The Dark Knight,9.0,menace know joker wreak havoc chaos people got...
3,The Godfather: Part II,9.0,early life career vito corleone 1920s new york...
4,12 Angry Men,9.0,jury holdout attempt prevent miscarriage justi...


In [3]:
v = TfidfVectorizer()
cv = CountVectorizer()

In [23]:
tfidf_matrix = v.fit_transform(df.feature)
cos_sim = cosine_similarity(tfidf_matrix)
title = "The Avengers"

movie_id = df.loc[df['title'] == title, :].index[0]
score = list(enumerate(cos_sim[movie_id]))
sorted_score = sorted(score, key = lambda x: x[1], reverse = True)

print("TFIDF")

i = 0
for item in sorted_score:
    movie_title = df.loc[item[0], 'title']
    print(i+1,movie_title)
    i = i+1
    if i > 10:
        break

TFIDF
1 The Avengers
2 Avengers: Age of Ultron
3 Captain America: Civil War
4 Captain America: The Winter Soldier
5 Serenity
6 Avengers: Infinity War
7 Man of Steel
8 Arrival
9 The Nanny Diaries
10 Avengers: Endgame
11 Iron Man


In [5]:
cv_matrix = cv.fit_transform(df.feature)
cos_sim = cosine_similarity(cv_matrix)
title = "Jaws"

movie_id = df.loc[df['title'] == title, :].index[0]
score = list(enumerate(cos_sim[movie_id]))
sorted_score = sorted(score, key = lambda x: x[1], reverse = True)

print("COUNT VECTOR")

i = 0
for item in sorted_score[1:11]:
    movie_title = df.loc[item[0], 'title']
    print(i+1,movie_title)
    i += 1
    if i > 10:
        break

COUNT VECTOR
1 Jaws
2 Close Encounters of the Third Kind
3 Jaws: The Revenge
4 The Sting
5 Dead Shot
6 Rambo: First Blood Part II
7 Jurassic Park
8 The Beach
9 Out of Darkness
10 War of the Worlds
11 Cast Away


In [12]:
df.loc[11, 'feature']

'presidency kennedy johnson event vietnam watergate historical event unfold perspective alabama man iq 75 whose desire reunite childhood sweetheart drama romance robert zemeckis tom hanks robin wright gary sinise sally field'

In [6]:
# Can use USER INPUT to find an 'undefined movie'?
df2 = df.copy()

In [13]:
user_title = "USER INPUT"
user_desc = "man fight war get medal find love have child"

user_input = {
    "title": user_title,
    "feature": user_desc
}

df_add = pd.DataFrame(user_input, columns = ['title', 'feature'], index = [0])

display(df_add.info())
display(df_add.head())

<class 'pandas.core.frame.DataFrame'>
Index: 1 entries, 0 to 0
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   title    1 non-null      object
 1   feature  1 non-null      object
dtypes: object(2)
memory usage: 24.0+ bytes


None

Unnamed: 0,title,feature
0,USER INPUT,man fight war get medal find love have child


In [14]:
df_con = pd.concat([df2, df_add]).reset_index(drop = True)

display(df_con.info())
display(df_con.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3068 entries, 0 to 3067
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   title    3068 non-null   object 
 1   rating   3067 non-null   float64
 2   feature  3068 non-null   object 
dtypes: float64(1), object(2)
memory usage: 72.0+ KB


None

Unnamed: 0,title,rating,feature
0,The Shawshank Redemption,9.3,two imprisoned men bond number year find solac...
1,The Godfather,9.2,organize crime dynasty age patriarch transfer ...
2,The Dark Knight,9.0,menace know joker wreak havoc chaos people got...
3,The Godfather: Part II,9.0,early life career vito corleone 1920s new york...
4,12 Angry Men,9.0,jury holdout attempt prevent miscarriage justi...


In [27]:
tfidf_matrix = v.fit_transform(df_con.feature)
cos_sim = cosine_similarity(tfidf_matrix)
title = "USER INPUT"

movie_id = df_con.loc[df_con['title'] == title, :].index[0]
score = list(enumerate(cos_sim[movie_id]))
sorted_score = sorted(score, key = lambda x: x[1], reverse = True)

print("TFIDF")

i = 0
for item in sorted_score[0:12]:
    movie_title = df_con.loc[item[0], 'title']
    print(i+1,movie_title)
    i = i+1
    if i > 10:
        break

TFIDF
1 USER INPUT
2 Hacksaw Ridge
3 Monsters, Inc.
4 Beasts of No Nation
5 Bedknobs and Broomsticks
6 If Only
7 Undisputed II: Last Man Standing
8 Jack the Giant Slayer
9 The Deliverance
10 Bad Lieutenant: Port of Call - New Orleans
11 Platoon
