# Recomender System

### This research aims at building a recomender system for movies.

In [1]:
# Importing the necessary libraries
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel


In [2]:
# Loading the dataset (we will work on the movies dataset first)
movie_df=pd.read_csv("recom_set\movies_metadata.csv",low_memory=False)
movie_df.head()

FileNotFoundError: [Errno 2] No such file or directory: 'recom_set\\movies_metadata.csv'

In [None]:
# Dimesions of the dataset
movie_df.shape
movie_df.columns

Index(['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'video',
       'vote_average', 'vote_count'],
      dtype='object')

In [None]:
# Sampling  a section of the dataset for analysis
sample_df=movie_df.sample(frac=0.7,axis=0,ignore_index=True)
sample_df.shape

(31826, 24)

In [None]:
# Assessing the plot overviews. This is because the recomender is content based
sample_df['overview'].head()

0    Janne, a 60 year old party promoter is arrangi...
1                                  Bigfoot documentary
2    Five disparate kids snowed in at the airport o...
3    Carl Foster takes off on a well-deserved weeke...
4    Belgium, of all places. What's Belgium got bes...
Name: overview, dtype: object

In [None]:
# We will use a scikit learn library to extract the specific objects needed for recomendation as our recomender is content based.
# The vectorizer removes all the english words eg 'a','the' that arent necessary for modelling.
tfid_vector=TfidfVectorizer(stop_words='english')
# We will remove all the empty spaces and fill it with an empty string to avoid error
sample_df['overview']=sample_df['overview'].fillna('')
# # Constructing a required feature/matrix by fitting and transforming the data
tfid_matrix=tfid_vector.fit_transform(sample_df['overview'])

In [None]:
# Obtaining a similar matrix
sim_mat=linear_kernel(tfid_matrix,tfid_matrix)

In [None]:
# Forming a pandas series where the title is the index
indices=pd.Series(movie_df.index,index=movie_df['title']).drop_duplicates()
indices[:10]

title
Toy Story                      0
Jumanji                        1
Grumpier Old Men               2
Waiting to Exhale              3
Father of the Bride Part II    4
Heat                           5
Sabrina                        6
Tom and Huck                   7
Sudden Death                   8
GoldenEye                      9
dtype: int64

## Building the recomendation function

In [None]:
# Building a function to recomend movies. I want to pick a title get the movies that have common content in the matrix formed from overview
# Getting a content matrix that matches the golden eye movie
def recommender(title):
    idx=indices[title] 
    # Listing the movie index and the matrices matched
    score=list(enumerate(sim_mat[idx]))
    score=sorted(score,key=lambda x: x[1],reverse=True)
    # Getting the topten indexes with the highest matrix
    matrix_best=score[1:10]
    matrix_best
    # The movie indexes
    movie_ind=[i[0] for i in matrix_best]
    # # Getting the titles of the movies with the highest index
    return movie_df['title'].iloc[movie_ind]

In [None]:
# Testing the algorithm
recommender('Jumanji')

18389                All Night Long
13530                         Chaos
30590                     Le Cactus
20408                 Starring Maja
29037                     Innocence
5871         The City of Lost Souls
30423                  Mission Blue
7586     Support Your Local Sheriff
23715                Billy Two Hats
Name: title, dtype: object