In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity

# Recommender with pandas, using rating correlations

In [None]:
# Load the user ratings (tab-separated; column names are supplied here) and the
# movie-title table, then join the two on item_id.
movie_statistics = pd.read_csv(
    'files/file.tsv',
    sep='\t',
    names=['user_id', 'item_id', 'rating', 'timestamp'],
)
movie_titles = pd.read_csv('files/Movie_Id_Titles.csv')
data = movie_statistics.merge(movie_titles, on='item_id')

In [None]:
# Per-title summary: mean rating and number of ratings
ratings = data.groupby('title')['rating'].mean().to_frame()
ratings['num of ratings'] = data.groupby('title')['rating'].count()

# User x title pivot table of ratings (pivot_table averages duplicate entries by default)
movie_matrix = data.pivot_table(values='rating', index='user_id', columns='title')
movie_matrix.head()

In [None]:
# Correlate every movie's user ratings with those given to 'Star Wars (1977)'
starwars_user_ratings = movie_matrix['Star Wars (1977)']
similar_to_starwars = movie_matrix.corrwith(starwars_user_ratings)

# Tabulate the correlations, drop the undefined ones, and attach each title's
# rating count. Built as one chain, so nothing is mutated in place and no
# intermediate result is computed and then thrown away.
corr_starwars = (
    pd.DataFrame(similar_to_starwars, columns=['Correlation'])
    .dropna()
    .join(ratings['num of ratings'])
)

# Keep only titles with more than 100 ratings (filters out sparsely rated
# titles), then take the five most strongly correlated ones.
recommended_movies = (
    corr_starwars[corr_starwars['num of ratings'] > 100]
    .sort_values('Correlation', ascending=False)
    .head()
)

In [None]:
# Display the movies most correlated with 'Star Wars (1977)'
recommended_movies

# Content-Based Recommender

In [None]:
# Read the movie metadata and the links table
meta_data = pd.read_csv('files/movieliens/movies_metadata.csv')
links_small = pd.read_csv('files/movieliens/links_small.csv')
# Reduce the links table to its non-null tmdbId values, cast to integers
links_small = links_small['tmdbId'].dropna().astype('int')
meta_data.head()

In [None]:
# Drop the three rows the dataset author suggested removing.
# errors='ignore' keeps this cell re-runnable: once the labels are gone, a plain
# drop would raise KeyError on a second execution.
meta_data = meta_data.drop([19730, 29503, 35587], errors='ignore')
# Cast the ids to integers so they can be matched against links_small
meta_data['id'] = meta_data['id'].astype('int')

In [None]:
# Keep only the movies whose id appears in links_small. The explicit copy makes
# smd an independent frame, so the column assignments below do not raise
# SettingWithCopyWarning or risk writing to a temporary slice of meta_data.
smd = meta_data[meta_data['id'].isin(links_small)].copy()
smd['tagline'] = smd['tagline'].fillna('')
# Text to vectorize: overview followed by tagline. A missing overview makes the
# concatenation NaN, so those rows end up with an empty description.
smd['description'] = (smd['overview'] + smd['tagline']).fillna('')
smd.head()

In [None]:
# Vectorize the descriptions with scikit-learn: TF-IDF over unigrams and bigrams,
# with English stop words removed.
# min_df=1 keeps every term that occurs at all; an integer min_df of 0 is
# rejected by the parameter validation in recent scikit-learn releases.
tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), min_df=1, stop_words='english')
tfidf_matrix = tf.fit_transform(smd['description'])
tfidf_matrix.shape

In [None]:
# Renumber the rows 0..n-1 so that row positions in smd match the rows of the
# similarity matrix (the TF-IDF rows are in the same order as smd).
smd = smd.reset_index()
titles = smd['title']
# Lookup from title to row position
indices = pd.Series(smd.index, index=titles)

# cosine(x, y) = x·yᵀ / (‖x‖·‖y‖). TfidfVectorizer L2-normalises each row by
# default, so the plain dot product (linear_kernel) already is the cosine similarity.
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [None]:
def get_recommendations(title, top_n=30):
    """Return the titles whose descriptions are most similar to ``title``.

    Parameters
    ----------
    title : str
        Movie title to look up in the module-level ``indices`` Series.
    top_n : int, default 30
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.Series
        Up to ``top_n`` entries of ``titles``, ordered from most to least
        similar according to ``cosine_sim``. The queried movie itself is
        never included.

    Raises
    ------
    KeyError
        If ``title`` is not present in ``indices``.
    """
    idx = indices[title]
    # Several movies can share one title; the lookup then yields a Series of
    # positions instead of a scalar. Use the first match.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]

    scores = cosine_sim[idx]
    # Stable sort on the negated scores: descending similarity, with ties kept
    # in their original row order.
    ranked = (-scores).argsort(kind='stable')
    # Remove the queried movie explicitly instead of assuming it ranks first:
    # an empty description has self-similarity 0, and another movie with an
    # identical description ties at 1.
    ranked = ranked[ranked != idx][:top_n]
    return titles.iloc[ranked]

In [None]:
# Show the first ten recommendations for 'The Godfather'
get_recommendations('The Godfather').head(10)

In [None]:
# Show the first ten recommendations for 'The Dark Knight'
get_recommendations('The Dark Knight').head(10)