In [61]:
# General setup imports
import pandas as pd
import numpy as np
import sklearn_pandas
from cold_start import get_cold_start_rating

# Modeling
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

In [2]:
import pyspark
spark = pyspark.sql.SparkSession.builder.getOrCreate()
sc = spark.sparkContext

In [3]:
ratings_df = spark.read.json('data/ratings.json').toPandas()
metadata = pd.read_csv('data/movies_metadata.csv')
request_df = spark.read.json('data/requests.json').toPandas()

  interactivity=interactivity, compiler=compiler, result=result)


In [4]:
ratings_df['user_id'].nunique()

5400

In [5]:
ratings_df['rating'].value_counts()

4    250180
3    186675
5    166246
2     76138
1     40710
Name: rating, dtype: int64

In [6]:
ratings_df.isna().sum()

movie_id     0
rating       0
timestamp    0
user_id      0
dtype: int64

In [7]:
len(metadata), metadata['tagline'].isna().sum()

(45466, 25054)

In [8]:
metadata.loc[0]['genres']

"[{'id': 16, 'name': 'Animation'}, {'id': 35, 'name': 'Comedy'}, {'id': 10751, 'name': 'Family'}]"

In [10]:
len(request_df)

280260

In [36]:
request_df.head()

Unnamed: 0,movie_id,rating,timestamp,user_id
0,2019,,956678777.0,6040
1,759,,956679248.0,6040
2,2858,,956679275.0,6040
3,246,,956679413.0,6040
4,1617,,956679473.0,6040


In [20]:
user_df = pd.read_csv('data/users.dat', sep='::', header=None, names=['id', 'gender', 'age', 'occupation', 'zip'])
movie_info_df = pd.read_csv('data/movies.dat', sep='::', header=None, names=['id', 'name', 'genres'])

  """Entry point for launching an IPython kernel.
  


In [21]:
movie_info_df.head()

Unnamed: 0,id,name,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [22]:
movie_info_df['genres'] = movie_info_df['genres'].apply(lambda x: x.split('|'))

In [23]:
movie_info_df.head()

Unnamed: 0,id,name,genres
0,1,Toy Story (1995),"[Animation, Children's, Comedy]"
1,2,Jumanji (1995),"[Adventure, Children's, Fantasy]"
2,3,Grumpier Old Men (1995),"[Comedy, Romance]"
3,4,Waiting to Exhale (1995),"[Comedy, Drama]"
4,5,Father of the Bride Part II (1995),[Comedy]


In [24]:
all_genres = set([item for movie in movie_info_df['genres'] for item in movie])

In [25]:
all_genres

{'Action',
 'Adventure',
 'Animation',
 "Children's",
 'Comedy',
 'Crime',
 'Documentary',
 'Drama',
 'Fantasy',
 'Film-Noir',
 'Horror',
 'Musical',
 'Mystery',
 'Romance',
 'Sci-Fi',
 'Thriller',
 'War',
 'Western'}

In [26]:
user_df = user_df.drop('zip', axis=1)

In [27]:
user_df.head()

Unnamed: 0,id,gender,age,occupation
0,1,F,1,10
1,2,M,56,16
2,3,M,25,15
3,4,M,45,7
4,5,M,25,20


In [43]:
# OHE the user cols
my_cols = ['gender', 'age', 'occupation']

ohe_multi = OneHotEncoder(categories='auto')
ohe_multi.fit(user_df[my_cols])
cols = ohe_multi.get_feature_names(my_cols)
ohe_mat = ohe_multi.transform(user_df[my_cols])

In [51]:
mapper = sklearn_pandas.DataFrameMapper([(['id'], None),
                                        (['gender', 'age','occupation'], OneHotEncoder())],
                                        df_out=True)

In [52]:
user_cat_df = mapper.fit_transform(user_df)

In [54]:
one_hot_df = ratings_df.merge(user_cat_df, how='left', left_on='user_id', right_on='id')

In [55]:
one_hot_df.head()

Unnamed: 0,movie_id,rating,timestamp,user_id,cluster,id,gender_age_occupation_x0_F,gender_age_occupation_x0_M,gender_age_occupation_x1_1,gender_age_occupation_x1_18,...,gender_age_occupation_x2_11,gender_age_occupation_x2_12,gender_age_occupation_x2_13,gender_age_occupation_x2_14,gender_age_occupation_x2_15,gender_age_occupation_x2_16,gender_age_occupation_x2_17,gender_age_occupation_x2_18,gender_age_occupation_x2_19,gender_age_occupation_x2_20
0,858,4,956678732.0,6040,2,6040,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2384,4,956678754.0,6040,2,6040,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,593,5,956678754.0,6040,2,6040,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1961,4,956678777.0,6040,2,6040,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1419,3,956678856.0,6040,2,6040,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [58]:
# Random Forest
X = one_hot_df.drop(['rating', 'timestamp','user_id', 'cluster', 'id'], axis=1)
y = one_hot_df['rating']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [59]:
rand_forest = RandomForestClassifier()

rand_forest.fit(X_train, y_train)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [60]:
rand_forest.score(X_test, y_test)

0.33817810076227306

In [63]:
k_neighbors = KNeighborsClassifier(n_neighbors=5)
k_neighbors.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform')

In [64]:
k_neighbors.score(X_test, y_test)

0.33268328999711094

In [68]:
# Then KMeans cluster
k_clusters = KMeans(n_clusters=8, random_state=42)
k_clusters.fit(X_train)

k_clusters.score(X_test)

-3405142657.8751163

In [30]:
user_df['cluster'] = preds

In [31]:
user_df[user_df['id'] == 6040]

Unnamed: 0,id,gender,age,occupation,cluster
6039,6040,M,25,6,2


In [32]:
cluster_dict = {}
for k, v in zip(user_df['id'].tolist(), user_df['cluster'].tolist()):
    cluster_dict[k] = v

In [33]:
ratings_df['cluster'] = ratings_df['user_id'].apply(lambda x: cluster_dict[x])

In [34]:
ratings_df.head()

Unnamed: 0,movie_id,rating,timestamp,user_id,cluster
0,858,4,956678732.0,6040,2
1,2384,4,956678754.0,6040,2
2,593,5,956678754.0,6040,2
3,1961,4,956678777.0,6040,2
4,1419,3,956678856.0,6040,2


In [39]:
movie_by_cluster = all_df.groupby(by=['cluster', 'movie_id']).agg({'rating': 'mean'}).reset_index()

In [39]:
len(new_user), len(new_movie)

(0, 0)

In [43]:
movie_by_cluster.head()

Unnamed: 0,cluster,movie_id,rating
0,0,1,3.942857
1,0,2,3.265306
2,0,3,2.756757
3,0,4,3.0
4,0,5,2.571429


In [299]:
ratings_df.head()

Unnamed: 0,movie_id,rating,timestamp,user_id,cluster
0,858,4,956678732.0,6040,2
1,2384,4,956678754.0,6040,2
2,593,5,956678754.0,6040,2
3,1961,4,956678777.0,6040,2
4,1419,3,956678856.0,6040,2


In [300]:
request_df.head()

Unnamed: 0,movie_id,rating,timestamp,user_id
0,2019,,956678777.0,6040
1,759,,956679248.0,6040
2,2858,,956679275.0,6040
3,246,,956679413.0,6040
4,1617,,956679473.0,6040


In [301]:
def cluster_rating(df, movie_id, cluster):
    cluster_rating = df[(df['movie_id'] == movie_id) & (df['cluster'] == cluster)]
    return cluster_rating['rating'].mean()

def user_bias(df, user_id):
    return  df.loc[df['user_id'] == user_id, 'rating'].mean() - df['rating'].mean()

def item_bias(df, movie_id):
    return  df.loc[df['movie_id'] == movie_id, 'rating'].mean() - df['rating'].mean()

In [281]:
avg = cluster_rating(df=ratings_df, movie_id=1617, cluster=1)

4.056818181818182

In [282]:
u = user_bias(ratings_df, 6040)

-0.0743489043735579

In [283]:
i = item_bias(ratings_df, 2019)

0.9880166870242912

In [279]:
avg + u + i

4.970485964468915

In [280]:
movie_info_df[movie_info_df['id'] == 1617]

Unnamed: 0,id,name,genres
1575,1617,L.A. Confidential (1997),Crime|Film-Noir|Mystery|Thriller


In [2]:
blah = get_cold_start_rating(user_id=53, movie_id=9999)

In [3]:
blah

3.2056819189106247

In [6]:
df = pd.read_csv('data/user_cluster.csv', index_col=0)

In [3]:
ratings_df = pd.read_csv('data/movie_cluster_avg.csv', index_col=0)
ratings_df.head()

Unnamed: 0,cluster,movie_id,rating
0,0,1,3.942857
1,0,2,3.265306
2,0,3,2.756757
3,0,4,3.0
4,0,5,2.571429
