# Recommandation

The file `top10K-TMDB-movies.csv` contains information about Netflix content.

## Load, examine, clean, prepare

In [1]:
# Read and parse the csv file.

import pandas as pd
data = pd.read_csv("../raw_data/recommender_system.csv")

# We can take a look at the dataset to see what it contains 
data.head()

Unnamed: 0,id,title,genre,original_language,overview,popularity,release_date,vote_average,vote_count
0,278,The Shawshank Redemption,"Drama,Crime",en,Framed in the 1940s for the double murder of h...,94.075,1994-09-23,8.7,21862
1,19404,Dilwale Dulhania Le Jayenge,"Comedy,Drama,Romance",hi,"Raj is a rich, carefree, happy-go-lucky second...",25.408,1995-10-19,8.7,3731
2,238,The Godfather,"Drama,Crime",en,"Spanning the years 1945 to 1955, a chronicle o...",90.585,1972-03-14,8.7,16280
3,424,Schindler's List,"Drama,History,War",en,The true story of how businessman Oskar Schind...,44.761,1993-12-15,8.6,12959
4,240,The Godfather: Part II,"Drama,Crime",en,In the continuing saga of the Corleone crime f...,57.749,1974-12-20,8.6,9811


In [2]:
# Rows and columns in the dataset

n_rows, n_cols = data.shape
print("The dataset has {} rows and {} columns.".format(n_rows, n_cols))

The dataset has 10000 rows and 9 columns.


## Feature Selection

In [3]:
# Columns of this dataset.

print("The columns of this dataset are :")
for col_name in data.columns:
    print("  -", col_name)

The columns of this dataset are :
  - id
  - title
  - genre
  - original_language
  - overview
  - popularity
  - release_date
  - vote_average
  - vote_count


In [4]:
# Keeping important columns
data = data[['id', 'title', 'overview', 'genre']]
data

Unnamed: 0,id,title,overview,genre
0,278,The Shawshank Redemption,Framed in the 1940s for the double murder of h...,"Drama,Crime"
1,19404,Dilwale Dulhania Le Jayenge,"Raj is a rich, carefree, happy-go-lucky second...","Comedy,Drama,Romance"
2,238,The Godfather,"Spanning the years 1945 to 1955, a chronicle o...","Drama,Crime"
3,424,Schindler's List,The true story of how businessman Oskar Schind...,"Drama,History,War"
4,240,The Godfather: Part II,In the continuing saga of the Corleone crime f...,"Drama,Crime"
...,...,...,...,...
9995,10196,The Last Airbender,"The story follows the adventures of Aang, a yo...","Action,Adventure,Fantasy"
9996,331446,Sharknado 3: Oh Hell No!,The sharks take bite out of the East Coast whe...,"Action,TV Movie,Science Fiction,Comedy,Adventure"
9997,13995,Captain America,"During World War II, a brave, patriotic Americ...","Action,Science Fiction,War"
9998,2312,In the Name of the King: A Dungeon Siege Tale,A man named Farmer sets out to rescue his kidn...,"Adventure,Fantasy,Action,Drama"


In [5]:
# Grouping genre and overview
data['tags']=data['overview']+data['genre']

# Droping genre and overview
data = data.drop(columns=['overview', 'genre'])
data

Unnamed: 0,id,title,tags
0,278,The Shawshank Redemption,Framed in the 1940s for the double murder of h...
1,19404,Dilwale Dulhania Le Jayenge,"Raj is a rich, carefree, happy-go-lucky second..."
2,238,The Godfather,"Spanning the years 1945 to 1955, a chronicle o..."
3,424,Schindler's List,The true story of how businessman Oskar Schind...
4,240,The Godfather: Part II,In the continuing saga of the Corleone crime f...
...,...,...,...
9995,10196,The Last Airbender,"The story follows the adventures of Aang, a yo..."
9996,331446,Sharknado 3: Oh Hell No!,The sharks take bite out of the East Coast whe...
9997,13995,Captain America,"During World War II, a brave, patriotic Americ..."
9998,2312,In the Name of the King: A Dungeon Siege Tale,A man named Farmer sets out to rescue his kidn...


In [6]:
# How many missing values ?

print(data.isna().sum())
print("There are {} missing values in the dataframe.".format(data.isna().sum().sum()))

id        0
title     0
tags     15
dtype: int64
There are 15 missing values in the dataframe.


In [7]:
# Remove the rows which have at least 1 missing value.
data = data.dropna(axis='index')
#verification
rows_no_missing_values = data.shape[0]
print("Number of rows removed having at least 1 missing value:", n_rows - rows_no_missing_values)

Number of rows removed having at least 1 missing value: 15


In [8]:
# How many duplicate values ?
print("There are {} duplicate values in the dataframe.".
     format(data.duplicated().sum()))

There are 0 duplicate values in the dataframe.


In [9]:
data

Unnamed: 0,id,title,tags
0,278,The Shawshank Redemption,Framed in the 1940s for the double murder of h...
1,19404,Dilwale Dulhania Le Jayenge,"Raj is a rich, carefree, happy-go-lucky second..."
2,238,The Godfather,"Spanning the years 1945 to 1955, a chronicle o..."
3,424,Schindler's List,The true story of how businessman Oskar Schind...
4,240,The Godfather: Part II,In the continuing saga of the Corleone crime f...
...,...,...,...
9995,10196,The Last Airbender,"The story follows the adventures of Aang, a yo..."
9996,331446,Sharknado 3: Oh Hell No!,The sharks take bite out of the East Coast whe...
9997,13995,Captain America,"During World War II, a brave, patriotic Americ..."
9998,2312,In the Name of the King: A Dungeon Siege Tale,A man named Farmer sets out to rescue his kidn...


## Building the Recommender Model 

In [10]:
from sklearn.feature_extraction.text import CountVectorizer

#Creating the object CountVectorizer
cv=CountVectorizer(max_features=9985, stop_words='english')
# Transforming to counting matrix
vector=cv.fit_transform(data['tags'].values.astype('U')).toarray()
vector.shape

(9985, 9985)

In [11]:
from sklearn.metrics.pairwise import cosine_similarity
similarity=cosine_similarity(vector)
similarity

array([[1.        , 0.05634362, 0.13041013, ..., 0.07559289, 0.11065667,
        0.06900656],
       [0.05634362, 1.        , 0.07715167, ..., 0.        , 0.03636965,
        0.        ],
       [0.13041013, 0.07715167, 1.        , ..., 0.02300219, 0.0673435 ,
        0.09449112],
       ...,
       [0.07559289, 0.        , 0.02300219, ..., 1.        , 0.03253   ,
        0.03042903],
       [0.11065667, 0.03636965, 0.0673435 , ..., 0.03253   , 1.        ,
        0.04454354],
       [0.06900656, 0.        , 0.09449112, ..., 0.03042903, 0.04454354,
        1.        ]])

In [12]:
#  Printing the 5 most similar movies
def recommand(movies):
    index=data[data['title']==movies].index[0]
    distance = sorted(list(enumerate(similarity[index])), reverse=True, key=lambda vector:vector[1])
    for i in distance[1:11]:
        print(data.iloc[i[0]].title)

In [13]:
# Testing 
recommand("The Godfather") 

The Godfather: Part II
Blood Ties
Joker
Bomb City
Gotti
Felon
Rope
Batman: The Killing Joke
The Big Heat
The Outsider


In [14]:
#  Saving data 
import numpy as np

data.to_csv('../suitable_data/clean_recomendation.csv',index=False)