<a href="https://colab.research.google.com/github/Avin-Kolahdooz/Movie-Recommender/blob/main/Netflix_Movies_and_TV_Shows.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd
import plotly.express as px
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
from google.colab import files
import io
uploaded=files.upload()

Saving top10K-TMDB-movies.csv to top10K-TMDB-movies.csv


In [3]:
films = pd.read_csv(io.BytesIO(uploaded['top10K-TMDB-movies.csv']))

In [4]:
films.head()

Unnamed: 0,id,title,genre,original_language,overview,popularity,release_date,vote_average,vote_count
0,278,The Shawshank Redemption,"Drama,Crime",en,Framed in the 1940s for the double murder of h...,94.075,1994-09-23,8.7,21862
1,19404,Dilwale Dulhania Le Jayenge,"Comedy,Drama,Romance",hi,"Raj is a rich, carefree, happy-go-lucky second...",25.408,1995-10-19,8.7,3731
2,238,The Godfather,"Drama,Crime",en,"Spanning the years 1945 to 1955, a chronicle o...",90.585,1972-03-14,8.7,16280
3,424,Schindler's List,"Drama,History,War",en,The true story of how businessman Oskar Schind...,44.761,1993-12-15,8.6,12959
4,240,The Godfather: Part II,"Drama,Crime",en,In the continuing saga of the Corleone crime f...,57.749,1974-12-20,8.6,9811


In [5]:
films.describe()

Unnamed: 0,id,popularity,vote_average,vote_count
count,10000.0,10000.0,10000.0,10000.0
mean,161243.505,34.697267,6.62115,1547.3094
std,211422.046043,211.684175,0.766231,2648.295789
min,5.0,0.6,4.6,200.0
25%,10127.75,9.15475,6.1,315.0
50%,30002.5,13.6375,6.6,583.5
75%,310133.5,25.65125,7.2,1460.0
max,934761.0,10436.917,8.7,31917.0


In [6]:
films.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 10000 non-null  int64  
 1   title              10000 non-null  object 
 2   genre              9997 non-null   object 
 3   original_language  10000 non-null  object 
 4   overview           9987 non-null   object 
 5   popularity         10000 non-null  float64
 6   release_date       10000 non-null  object 
 7   vote_average       10000 non-null  float64
 8   vote_count         10000 non-null  int64  
dtypes: float64(2), int64(2), object(5)
memory usage: 703.2+ KB


In [7]:
films.isnull().sum()

id                    0
title                 0
genre                 3
original_language     0
overview             13
popularity            0
release_date          0
vote_average          0
vote_count            0
dtype: int64

Feature Selection

In [8]:
# In this part we need to use features that are important to our porpuse
films.columns

Index(['id', 'title', 'genre', 'original_language', 'overview', 'popularity',
       'release_date', 'vote_average', 'vote_count'],
      dtype='object')

In [9]:
#So we want to choose features that could be determinative in a movie Recommendation System
films = films[['id', 'title', 'genre' , 'overview']]

In [10]:
# We want to combine two features to make it easier ("genre" and "overview")
films['bags'] = films['genre'] + films['overview']

In [11]:
new_films = films.drop(columns =['genre' , 'overview'])

In [12]:
new_films.head()

Unnamed: 0,id,title,bags
0,278,The Shawshank Redemption,"Drama,CrimeFramed in the 1940s for the double ..."
1,19404,Dilwale Dulhania Le Jayenge,"Comedy,Drama,RomanceRaj is a rich, carefree, h..."
2,238,The Godfather,"Drama,CrimeSpanning the years 1945 to 1955, a ..."
3,424,Schindler's List,"Drama,History,WarThe true story of how busines..."
4,240,The Godfather: Part II,"Drama,CrimeIn the continuing saga of the Corle..."


In [20]:
#CountVectorizer :
# max_features specifies the maximum number of features (words or tokens) to consider.
cv = CountVectorizer(max_features= 1000 )

In [21]:
vectors = cv.fit_transform(new_films['bags'].values.astype('U')).toarray()

In [22]:
vectors

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

##Creating Recommender System
Cosin Similarity

In [16]:
#Cosin Similarity: method for measuring similarity between two vectors. This method is commonly used to compare two text documents or two feature vectors.
sm = cosine_similarity(vectors)
sm

array([[1.        , 0.46730017, 0.4410391 , ..., 0.3828847 , 0.45732956,
        0.36820406],
       [0.46730017, 1.        , 0.40759382, ..., 0.39674028, 0.31098316,
        0.29481739],
       [0.4410391 , 0.40759382, 1.        , ..., 0.37242047, 0.38227643,
        0.18314031],
       ...,
       [0.3828847 , 0.39674028, 0.37242047, ..., 1.        , 0.36279462,
        0.21243078],
       [0.45732956, 0.31098316, 0.38227643, ..., 0.36279462, 1.        ,
        0.26837252],
       [0.36820406, 0.29481739, 0.18314031, ..., 0.21243078, 0.26837252,
        1.        ]])

In [23]:
# For example here we can see the similarity between the movie with index= 2 with other films
sm[2]

array([0.4410391 , 0.40759382, 1.        , ..., 0.37242047, 0.38227643,
       0.18314031])

In [28]:
# In this part we sort the films by the more closest to the less closest to the movie number 2
dist = sorted(list(enumerate(sm[2])), reverse = True, key= lambda vector: vector[1])

In [29]:
# In this part we want to see what are the titles of the most similar movies
for i in dist[0:5]:
  print(new_films.iloc[i[0]].title)

The Godfather
The Replacement Killers
The Tree of Life
Largo Winch II
The Legend of Hercules


In [38]:
# Now we are difining a function to get the name of the film and tell the most movies similar to that

def recommand(movies):
    index=new_films[new_films['title']==movies].index[0]
    distance = sorted(list(enumerate(sm[index])), reverse=True, key=lambda vector:vector[1])
    for i in distance[0:5]:
        print(new_films.iloc[i[0]].title)

In [40]:
recommand("Iron Man")

Iron Man
Dead Rising: Watchtower
District B13
Dawn of the Planet of the Apes
Spawn
