# Import libraries

In [None]:
import pandas as pd 
import numpy as np 
from collections import deque 
import plotly.graph_objs as go
import plotly.offline as py


# Load Data :

In [None]:
# TMDB 5000 dataset: df1 is read from the credits file, df2 from the movies file.
df1 = pd.read_csv('../input/tmdb-movie-metadata/tmdb_5000_credits.csv')
df2 = pd.read_csv('../input/tmdb-movie-metadata/tmdb_5000_movies.csv')

# Relabel the credits columns by position so the join key is called 'id',
# then attach the credits columns to the movies frame.
df1.columns = ['id', 'Name', 'cast', 'crew']
df2 = pd.merge(df2, df1, on='id')

# Sanity check: shapes of both frames and the first merged rows.
print(df1.shape, df2.shape, sep='\n')
print(df2.head(3))

In [None]:
# Netflix Prize movie titles: the file has no header row, so the three
# columns are named here and the frame is indexed by the 'Id' column.
movie_titles = pd.read_csv(
    '../input/netflix-prize-data/movie_titles.csv',
    encoding='ISO-8859-1',
    header=None,
    names=['Id', 'Year', 'Name'],
)
movie_titles = movie_titles.set_index('Id')

print('Shape Movie-Titles:\t{}'.format(movie_titles.shape))
print(movie_titles.head(5))

In [None]:
# Netflix Prize ratings, part 1 only. The first three fields of every line are
# read as 'User', 'Rating' and 'Date'; the preprocessing cell further down
# treats rows with a missing 'Rating' as per-movie header rows.
df_raw1 = pd.read_csv(
    '../input/netflix-prize-data/combined_data_1.txt',
    header=None,
    names=['User', 'Rating', 'Date'],
    usecols=[0, 1, 2],
)

# Parts 2-4 are deliberately not loaded. To include them, read
# combined_data_2.txt ... combined_data_4.txt with the same arguments into
# df_raw2 ... df_raw4 and stack the frames, e.g.
#   df_raw = pd.concat([df_raw1, df_raw2, df_raw3, df_raw4])
df_raw = df_raw1

# Relabel the rows 0 .. len-1 (for part 1 alone the stop value was 24058263).
df_raw.index = np.arange(len(df_raw))

df_raw['Rating'] = df_raw['Rating'].astype(float)

print('Dataset 1 shape: {}'.format(df_raw.shape))
print('-Dataset examples-')
# Every 5,000,000th row as a cheap spot check.
print(df_raw.iloc[::5000000])

In [None]:
# Movies Dataset metadata: keep title, overview and vote count, index the
# frame by title, and drop rows with a missing value in a kept column.
movie_metadata = pd.read_csv('../input/the-movies-dataset/movies_metadata.csv', low_memory=False)
movie_metadata = movie_metadata[['original_title', 'overview', 'vote_count']]
movie_metadata = movie_metadata.set_index('original_title').dropna()

# Remove the long tail of rarely rated movies (10 votes or fewer); the vote
# count itself is not needed afterwards.
movie_metadata = movie_metadata.loc[movie_metadata['vote_count'] > 10]
movie_metadata = movie_metadata.drop(columns='vote_count')

print('Shape Movie-Metadata:\t{}'.format(movie_metadata.shape))
movie_metadata.sample(5)

# Preprocessing

Add a movie ID column to the ratings DataFrame (`df`).

In [None]:
# Rows without a rating are the per-movie header rows. For each of them keep
# the row label and the movie id (the 'User' text minus its last character).
header_rows = df_raw.loc[df_raw['Rating'].isna(), 'User'].reset_index()
movie_starts = [[row_label, int(header[:-1])] for row_label, header in header_rows.values]

# Pair every header with the one that follows it. Rotating by one position
# makes the last header wrap around and pair with the first one.
next_starts = deque(movie_starts)
next_starts.rotate(-1)

# One frame of rating rows per movie
rating_frames = []

for (start, movie_id), (stop, next_movie_id) in zip(movie_starts, next_starts):

    if start < stop:
        # Label-based slice, inclusive on both ends: the rows strictly
        # between this header and the next one.
        ratings = df_raw.loc[start + 1:stop - 1]
    else:
        # Wrapped around, so this is the last movie: take everything that
        # follows its header.
        ratings = df_raw.loc[start + 1:]

    # assign() returns a new frame carrying the movie id on every row
    rating_frames.append(ratings.assign(Movie=movie_id))

# Stack the per-movie frames and release everything that was only needed here
df = pd.concat(rating_frames)
del rating_frames, df_raw, header_rows, ratings, next_starts, movie_starts, start, movie_id, stop, next_movie_id
print('Shape User-Ratings:\t{}'.format(df.shape))
df.sample(5)

In [None]:
# Per-movie vote count and vote average.
# Column 0 is the number of ratings and column 1 the mean rating, in that
# order, because later cells read them positionally with iloc.
# The columns get flat names: aggregating with a {'Rating': [...]} dict
# produces two-level column labels, and merging such a frame with a
# single-level frame (as the following cells do) is rejected by recent pandas.
tmp = df.groupby('Movie')['Rating'].agg(['count', 'mean'])
tmp.columns = ['vote_count', 'vote_average']
tmp.head()

In [None]:
# Add each movie's vote count and vote average to every rating row.
# The first two columns of tmp (count, mean) are copied and given flat names
# before merging: if tmp still carries two-level column labels, merging it
# directly with the single-level df mixes column levels, which pandas warns
# about or refuses depending on the version.
_vote_stats = tmp.iloc[:, :2].copy()
_vote_stats.columns = ['vote_count', 'vote_average']

# 'Movie' is a column of df and the index name of the statistics frame.
# The merged columns are User, Rating, Date, Movie, vote_count, vote_average,
# so no positional renaming is needed afterwards.
df_n = df.merge(_vote_stats, on='Movie')
del _vote_stats

print(df_n.head())
print(df_n.shape)

In [None]:
# Attach the Netflix title and year to every rating row.
# The title frame's index is renamed to 'Movie' so it can serve as the join key.
movie_titles.index.name = 'Movie'
df_n = pd.merge(df_n, movie_titles, on='Movie')

print(df_n.sample(5))
print(df_n.shape)

In [None]:
# Join the ratings with the Movies Dataset metadata by title.
# The title moves from the index back into a column and is renamed to 'Name'
# so that it matches the title column of df_n.
movie_metadata = movie_metadata.reset_index().rename(columns={"original_title": "Name"})

# Inner join on the exact title string.
# NOTE(review): a title that occurs more than once in movie_metadata repeats
# the matching rating rows - confirm this is intended.
df_n = df_n.merge(movie_metadata, on='Name')

print(df_n.sample(5))
print(df_n.shape)

In [None]:
# Compare the size of the raw ratings frame (df) with the joined frame (df_n)
print(df.shape)
df_n.shape

# Visualization

In [None]:
# Number of titles per release year, ordered by year
data = movie_titles['Year'].value_counts().sort_index()

# Line trace: year on the x axis, number of titles on the y axis
trace = go.Scatter(x=data.index, y=data.values, marker={'color': '#db0000'})

# Figure title and axis labels
layout = {
    'title': '{} Movies Grouped By Year Of Release'.format(movie_titles.shape[0]),
    'xaxis': {'title': 'Release Year'},
    'yaxis': {'title': 'Movies'},
}

# Render the figure
fig = go.Figure(data=[trace], layout=layout)
py.iplot(fig)

In [None]:
# Number of ratings per rating value, highest rating value first
data = df['Rating'].value_counts().sort_index(ascending=False)

# Bar trace; each bar is labelled with its share of all ratings in percent
trace = go.Bar(
    x=data.index,
    y=data.values,
    text=['{:.1f} %'.format(val) for val in (data.values / df.shape[0] * 100)],
    textposition='auto',
    textfont={'color': '#000000'},
    marker={'color': '#db0000'},
)

# Figure title and axis labels
layout = {
    'title': 'Distribution Of {} Netflix-Ratings'.format(df.shape[0]),
    'xaxis': {'title': 'Rating'},
    'yaxis': {'title': 'Count'},
}

# Render the figure
fig = go.Figure(data=[trace], layout=layout)
py.iplot(fig)

In [None]:
# Number of ratings per date; the date labels are parsed into timestamps so
# the series can be ordered chronologically
data = df['Date'].value_counts()
data.index = pd.to_datetime(data.index)
data = data.sort_index()

# Line trace: date on the x axis, number of ratings on the y axis
trace = go.Scatter(x=data.index, y=data.values, marker={'color': '#db0000'})

# Figure title and axis labels
layout = {
    'title': '{} Movie-Ratings Grouped By Day'.format(df.shape[0]),
    'xaxis': {'title': 'Date'},
    'yaxis': {'title': 'Ratings'},
}

# Render the figure
fig = go.Figure(data=[trace], layout=layout)
py.iplot(fig)

In [None]:
##### Ratings Per Movie #####
# Number of ratings each movie received. The counts are NOT clipped: the
# .clip(upper=99999) variant is disabled, although the title below still
# mentions it.
data = df.groupby('Movie')['Rating'].count()#.clip(upper=99999)

# Abandoned seaborn experiment, kept for reference:
#import seaborn as sns
#sns.set()#_theme(style="whitegrid")
#tips = sns.load_dataset("tips")
#ax = sns.scatterplot(x=data.values)

# Histogram of the per-movie counts in bins of width 100
trace = go.Histogram(
    x=data.values,
    name='Ratings',
    xbins={'start': 0, 'end': 1000000, 'size': 100},
    marker={'color': '#db0000'},
)
layout = go.Layout(
    title='Distribution Of Ratings Per Movie #(Clipped at 99999)',
    xaxis={'title': 'Ratings Per Movie'},
    yaxis={'title': 'Count'},
    bargap=0.2,
)

# This figure is currently not rendered; only the per-user figure below is.
#fig = go.Figure(data=[trace], layout=layout)
#py.iplot(fig)


##### Ratings Per User #####
# Number of ratings each user gave, capped at 1999 so the heaviest raters
# share the last bin
data = df.groupby('User')['Rating'].count().clip(upper=1999)

# Histogram of the per-user counts in bins of width 2
trace = go.Histogram(
    x=data.values,
    name='Ratings',
    xbins={'start': 0, 'end': 2000, 'size': 2},
    marker={'color': '#db0000'},
)
layout = go.Layout(
    title='Distribution Of Ratings Per User (Clipped at 1999)',
    xaxis={'title': 'Ratings Per User'},
    yaxis={'title': 'Count'},
    bargap=0.2,
)

# Render the per-user figure
fig = go.Figure(data=[trace], layout=layout)
py.iplot(fig)

# Weighted Rating

**Demographic filtering** is used for users who are new or about whom we have little data.


We could use a movie's average rating as its score, but that would not be fair: a movie with an average rating of 8.9 from only 3 votes cannot be considered better than a movie with an average rating of 7.8 from 40 votes. So I'll be using IMDB's weighted rating (wr), which is given as:

 ![](https://image.ibb.co/jYWZp9/wr.png)

where,
* v is the number of votes for the movie;
* m is the minimum votes required to be listed in the chart;
* R is the average rating of the movie; And
* C is the mean vote across the whole report

We already have v (**vote_count**) and R (**vote_average**); C can be calculated as follows:

In [None]:
# Calculation based on the IMDB formula :
#weighted rating= (v/(v+m) * R) + (m/(m+v) * C)

# Alternative that was tried: the mean over all individual ratings
#C=df['Rating'].mean()   #3.5996343025565563
#C

# C is the mean of column 1 of tmp, i.e. the average of the per-movie mean
# ratings (every movie counts once, regardless of how many ratings it has).
C=tmp.iloc[:,1].mean()
C

In [None]:
# Earlier variant computing the same quantile directly from df:
#m=df.groupby('Movie')['Rating'].count()
#m=m.quantile(0.9)
#m
# m is the 90th percentile of column 0 of tmp (the per-movie rating counts);
# it plays the role of the minimum-votes term in the weighted-rating formula.
m=tmp.iloc[:, 0].quantile(0.9) 
m

In [None]:
# IMDB weighted rating: (v/(v+m) * R) + (m/(m+v) * C)
#   v = number of ratings of the movie (column 0 of tmp)
#   R = mean rating of the movie (column 1 of tmp)
#   m, C = the global constants computed in the previous cells
v, R = tmp.iloc[:, 0], tmp.iloc[:, 1]
tmp['score'] = (v / (v + m) * R) + (m / (m + v) * C)

# Best weighted score first
tmp = tmp.sort_values(by='score', ascending=False)
tmp.head()

In [None]:
# Attach title and year to the scored movies.
# If tmp still carries two-level column labels (from a dict-style groupby
# aggregation), collapse them to single strings first: merging frames whose
# columns have a different number of levels is warned about or refused by
# pandas, depending on the version. Empty label parts are skipped, so
# ('Rating', 'count') becomes 'Rating_count' and ('score', '') stays 'score'.
if isinstance(tmp.columns, pd.MultiIndex):
    tmp.columns = ['_'.join(str(part) for part in col if part != '') for col in tmp.columns]

# 'Movie' is the index name on both sides of the join
tmp = tmp.merge(movie_titles, on='Movie')
tmp.head()

# Content based filtering

**Content-based filtering**: a movie's content (overview, cast, crew, keywords, tagline, etc.) is used to find its similarity to other movies.

In [None]:
# Preview the metadata frame whose 'overview' text is vectorised below
movie_metadata.head(5)

In [None]:
# TF-IDF vectoriser from scikit-learn
from sklearn.feature_extraction.text import TfidfVectorizer

# A missing overview becomes an empty string so every row can be vectorised
movie_metadata['overview'] = movie_metadata['overview'].fillna('')

# Vectoriser that ignores English stop words such as 'the' and 'a'
tfidf = TfidfVectorizer(stop_words='english')

# Learn the vocabulary and build the TF-IDF matrix in one step; row i of the
# matrix belongs to row i of movie_metadata
tfidf_matrix = tfidf.fit_transform(movie_metadata['overview'])

# (number of movies, vocabulary size)
tfidf_matrix.shape

In [None]:
# Display the TF-IDF matrix object itself
tfidf_matrix

In [None]:
# Pairwise dot-product kernel from scikit-learn
from sklearn.metrics.pairwise import linear_kernel

# Similarity of every movie with every other movie: entry [i, j] is the dot
# product of TF-IDF rows i and j.
# NOTE(review): treating this as cosine similarity presumably relies on the
# vectoriser's default L2 row normalisation - confirm.
cosine_sim = linear_kernel(X=tfidf_matrix, Y=tfidf_matrix)

In [None]:
# Reverse lookup: movie title -> row label in movie_metadata.
indices = pd.Series(movie_metadata.index, index=movie_metadata['Name'])

# Keep only the first row of every title. Series.drop_duplicates() would
# compare the VALUES (the row labels, which are all distinct) and therefore
# remove nothing; the duplicates that matter are repeated titles in the
# index, where a lookup would return several rows instead of one.
indices = indices[~indices.index.duplicated(keep='first')]

In [None]:
# Inspect the title -> row lookup series
indices

In [None]:
# Function that takes in movie title as input and outputs most similar movies
def get_recommendations(title, cosine_sim=cosine_sim, top_n=10):
    """Return the titles of the movies most similar to ``title``.

    Parameters
    ----------
    title : str
        Movie title to look up in the module-level ``indices`` series.
    cosine_sim : array-like, optional
        Similarity scores, indexable by row position so that
        ``cosine_sim[i]`` holds the scores of movie ``i`` against all movies.
        Defaults to the module-level ``cosine_sim`` as it existed when this
        function was defined.
    top_n : int, optional
        Number of recommendations to return (default 10).

    Returns
    -------
    pandas.Series
        Entries of ``movie_metadata['Name']`` for the ``top_n`` best-scoring
        movies, best match first. The queried movie itself is never included.

    Raises
    ------
    KeyError
        If ``title`` is not a label of ``indices``.
    """
    # Get the index of the movie that matches the title
    idx = indices[title]

    # A title that occurs more than once in ``indices`` yields a Series of
    # positions rather than a scalar; use its first occurrence.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]
    idx = int(idx)

    # Similarity scores of all movies with that movie, as a flat array
    scores = np.asarray(cosine_sim[idx]).ravel()

    # Rank all movies by descending score. The sort is stable, so movies with
    # equal scores keep their original row order.
    ranked = np.argsort(-scores, kind='stable')

    # Remove the query movie by position instead of assuming it ranks first:
    # with tied scores (e.g. identical or empty overviews) another movie can
    # sort ahead of it.
    ranked = ranked[ranked != idx][:top_n]

    # Return the titles of the most similar movies
    return movie_metadata['Name'].iloc[ranked]

In [None]:
# Look up the row position(s) stored for one title and display the result
idx = indices['The Avengers']
idx

In [None]:
    # Scratch cell: replays the first steps of get_recommendations for one
    # title and prints the raw similarity row; the remaining steps are
    # disabled.
    # NOTE(review): the whole cell is indented; as a plain .py file this is an
    # IndentationError, so it presumably relies on the notebook front-end
    # stripping the common leading whitespace - confirm.
    idx = indices['The Avengers']

    # Get the pairwise similarity scores of all movies with that movie
    sim_scores = list(enumerate(cosine_sim[idx]))
    print(cosine_sim[idx])
    #print((sim_scores[1:11]))
    # Sort the movies based on the similarity scores
    #sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    #print((sim_scores[1:11]))
    # Get the scores of the 10 most similar movies
    #sim_scores = sim_scores[1:11]

    # Get the movie indices
    #movie_indices = [i[0] for i in sim_scores]
    #print(movie_metadata['Name'].iloc[movie_indices],sim_scores)

In [None]:
# Example query: movies whose overview is most similar to this title's
get_recommendations('The Dark Knight')

In [None]:
# Example query for a second title
get_recommendations('The Avengers')