In [50]:
import pandas as pd
import numpy as np
from math import sqrt
import matplotlib.pyplot as plt
from sklearn import linear_model
from sklearn.metrics import r2_score
from random import randint, choice, choices
from sklearn.metrics.pairwise import cosine_similarity

In [51]:
movies_df = pd.read_csv("movies.csv")
ratings_df = pd.read_csv("ratings.csv")
ratings_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,169,2.5,1204927694
1,1,2471,3.0,1204927438
2,1,48516,5.0,1204927435
3,2,2571,3.5,1436165433
4,2,109487,4.0,1436165496


In [52]:
movies_df["year"] = movies_df.title.str.extract(pat='(\d{4})', expand=False)
movies_df['title'] = movies_df['title'].str.replace(r'\W{1}\d{4}\W{1}', '  ', regex=True)
movies_df["title"] = movies_df["title"].apply(lambda x: x.rstrip(" "))
movies_df.drop(["year"], axis=1, inplace=True)
ratings_df.drop(["timestamp"], axis=1, inplace=True)

In [53]:
user_input_df = pd.DataFrame([
    {"title": "Toy Story", "rating": 4},
    {"title": "Jumanji", "rating": 5},
    {"title": "Father of the Bride Part II", "rating": 5},
    {"title": "Heat", "rating": 5},
    {"title": "Space Jam", "rating": 5}
])
user_input_df

Unnamed: 0,title,rating
0,Toy Story,4
1,Jumanji,5
2,Father of the Bride Part II,5
3,Heat,5
4,Space Jam,5


In [54]:
input_movies = movies_df.merge(user_input_df, on='title')
input_movies.drop(4, inplace=True)
input_movies.drop(5, inplace=True)
input_movies.reset_index(drop=True, inplace=True)
input_movies

Unnamed: 0,movieId,title,genres,rating
0,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,4
1,2,Jumanji,Adventure|Children|Fantasy,5
2,5,Father of the Bride Part II,Comedy,5
3,6,Heat,Action|Crime|Thriller,5
4,673,Space Jam,Adventure|Animation|Children|Comedy|Fantasy|Sc...,5


In [55]:
user_subset_df = ratings_df.merge(input_movies, on='movieId')
user_subset_df.drop(["title", "genres", "rating_y"], axis=1 ,inplace=True)
user_subset_df.rename(columns={"rating_x": "rating"}, inplace=True)
user_subset_df

Unnamed: 0,userId,movieId,rating
0,13,2,2.0
1,17,2,3.0
2,34,2,2.5
3,40,2,5.0
4,75,2,3.5
...,...,...,...
137264,247728,1,4.0
137265,247732,1,3.5
137266,247735,1,4.5
137267,247751,1,4.0


In [57]:
user_subset_groups = user_subset_df.groupby(["userId"])
sorted_groups = sorted(user_subset_groups, key=lambda x: len(x[1]), reverse=True)

In [58]:
sorted_groups[0][1].sort_values(by='movieId', inplace=True)
input_movies.sort_values(by='movieId', inplace=True)
temp_df = input_movies[input_movies['movieId'].isin(sorted_groups[0][1]['movieId'].tolist())]
# user_list = np.reshape(sorted_groups[0][1]['rating'].tolist(), (1, -1))
# input_list = np.reshape(temp_df["rating"].tolist(), (1, -1))
corr = np.corrcoef(temp_df["rating"], sorted_groups[0][1]["rating"])
corr[1][0]

-0.6123724356957947

In [59]:
user_similarities= []
for userId, group in sorted_groups:
    group.sort_values(by='movieId', inplace=True)
    input_movies.sort_values(by='movieId', inplace=True)
    temp_df = input_movies[input_movies['movieId'].isin(group['movieId'].tolist())]
    corr = np.corrcoef(temp_df["rating"], group["rating"])
    temp_list = [userId, corr[1][0]]
    user_similarities.append(temp_list)
user_similarities_df = pd.DataFrame(user_similarities, columns=["userID", "similarity"])
user_similarities_df

  c /= stddev[:, None]
  c /= stddev[None, :]
  c = cov(x, y, rowvar, dtype=dtype)
  c *= np.true_divide(1, fact)
  c *= np.true_divide(1, fact)


Unnamed: 0,userID,similarity
0,"(277,)",-0.612372
1,"(341,)",-0.534522
2,"(815,)",-0.968246
3,"(1174,)",-0.875000
4,"(1204,)",-0.912871
...,...,...
84113,"(247746,)",
84114,"(247748,)",
84115,"(247751,)",
84116,"(247752,)",


In [60]:
user_similarities_df["userID"] = user_similarities_df["userID"].apply(lambda x: x[0])

In [64]:
user_similarities_df.sort_values(by="similarity", ascending=False, inplace=True)
user_similarities_df

Unnamed: 0,userID,similarity
28673,172598,1.0
32233,214471,1.0
19167,62788,1.0
15133,17105,1.0
33053,223411,1.0
...,...,...
84113,247746,
84114,247748,
84115,247751,
84116,247752,


In [62]:
similar_user_ratings = pd.merge(user_similarities_df, ratings_df, right_on="userId", left_on="userID", how="inner")
similar_user_ratings.dropna(inplace=True)
similar_user_ratings["weighted rating"] = similar_user_ratings["similarity"] * similar_user_ratings["rating"]

In [63]:
temp_df = similar_user_ratings.groupby("movieId").sum()[["weighted rating", "similarity"]]
temp_df.sort_values(by="weighted rating", ascending=False)
recommend_df = pd.DataFrame()
recommend_df["Weighted Average Recommendation Score"] = temp_df["weighted rating"] / temp_df["similarity"]
recommend_df.dropna()
recommend_df.sort_values(by="Weighted Average Recommendation Score", ascending=False, inplace=True)
recommend_df.head(50)

Unnamed: 0_level_0,Weighted Average Recommendation Score
movieId,Unnamed: 1_level_1
27328,inf
59669,inf
114617,inf
111343,inf
140012,inf
109010,inf
108873,inf
69766,inf
80866,inf
105772,inf
