### Data Preprocessing

In [143]:
import pandas as pd
import numpy as np
import itertools
from collections import Counter

from MovieClass import MovieClass, MultiMovieClass
from sklearn.model_selection import train_test_split

In [144]:
#data selection
df = pd.read_csv('data_raw/ratings_small.csv')
print(df)
df = df.loc[df.rating>=4.0]
#get number of movies watched by each userId
df_n = df['userId'].value_counts().to_frame()
df_n.columns = ['ntotal']
df_n['userId'] = df_n.index
# print(df_n) eliminate the data <21
df = df.merge(df_n, on = 'userId')
df = df.loc[df.ntotal>=30]

#sort by rating and timestap
df = df.sort_values(['userId','rating'],ascending=[1,0])
df = df.sort_values(['userId','timestamp'],ascending=[1,1])

df = df[['userId', 'movieId']]
df['count'] = df.groupby('userId').cumcount()
# df

        userId  movieId  rating   timestamp
0            1       31     2.5  1260759144
1            1     1029     3.0  1260759179
2            1     1061     3.0  1260759182
3            1     1129     2.0  1260759185
4            1     1172     4.0  1260759205
...        ...      ...     ...         ...
99999      671     6268     2.5  1065579370
100000     671     6269     4.0  1065149201
100001     671     6365     4.0  1070940363
100002     671     6385     2.5  1070979663
100003     671     6565     3.5  1074784724

[100004 rows x 4 columns]


In [131]:
#split and save inputs and truth ground 
df_X = df[df['count']<20]
df_Y = df[df['count']>=20]
df_X = df_X[['userId', 'movieId']]
df_Y = df_Y[['userId', 'movieId']]

df_X.to_csv('data/users_history.csv')
df_Y.to_csv('data/ground_truth.csv')

# Step-1: recall 

### "Movie2Movie" Co-visitation Matrix - Rating Weighted

In [13]:
df = pd.read_csv('data_raw/ratings_small.csv').drop('timestamp', axis=1)
df = pd.merge(df, df, on ='userId')
df = df.loc[df.movieId_x != df.movieId_y]
df['wgt'] = df.rating_y / 5
df = df[['movieId_x','movieId_y','wgt']]
df = df.groupby(['movieId_x','movieId_y']).wgt.sum()
df = df.reset_index()
df = df.sort_values(['movieId_x','wgt'], ascending=[True, False])
df = df.reset_index(drop = True)
df['n'] = df.groupby('movieId_x').movieId_y.cumcount()
df = df.loc[df.n <20].drop('n', axis=1)
df.to_csv('data/Movie2Movie.csv')

### Most Popular Movies

In [117]:
#from full dataset
df = pd.read_csv('data_raw/ratings.csv')
top_view_movies = df.movieId.value_counts().index.values[:20]

In [118]:
top_view_movies

array([ 356,  318,  296,  593, 2571,  260,  480,  527,  110,    1, 1210,
        589, 1196, 2959, 1198,   50, 2858,  150,  780,  858])

### Same Director

# Step-2: Rerank

In [132]:
def df_covisitation_to_dict(df):
    return df.groupby('movieId_x').movieId_y.apply(list).to_dict()

def df_user_to_dict(df):
    return df.groupby('userId').movieId.apply(list).to_dict()

#read
df_cov = pd.read_csv('data/Movie2Movie.csv')
df_X = pd.read_csv('data/users_history.csv')
df_y = pd.read_csv('data/ground_truth.csv')

top_20_movies = df_covisitation_to_dict(df_cov)
X_dic = df_user_to_dict(df_X)
gt_dic = df_user_to_dict(df_y)

In [133]:
print(df_X.head())
print(df_y.head())
print(df_cov.head())

   Unnamed: 0  userId  movieId
0           9       2      150
1          33       2      590
2          34       2      592
3          17       2      296
4          10       2      153
   Unnamed: 0  userId  movieId
0          26       2      509
1          24       2      500
2          28       2      527
3          30       2      551
4          16       2      273
   Unnamed: 0  movieId_x  movieId_y    wgt
0           0          1        356  139.6
1           1          1        260  137.9
2           2          1        318  132.3
3           3          1        296  131.7
4           4          1        593  119.1


In [140]:
def suggest_movie(df):
    movies = df.movieId.tolist()
    unique_movie = list(dict.fromkeys(movies))
    #gererate movie recommendation
    movies_2 = list(itertools.chain(*[top_20_movies[id] for id in unique_movie if id in top_20_movies]))
    #rerankg
    top_movies_2 = [id for id, cnt in Counter(movies_2).most_common(100) if id not in unique_movie]
    return top_movies_2 + list(top_view_movies) 

movie_rc = suggest_movie(df_X.loc[df_X.userId == 2])

In [141]:
df_Y.loc[df_Y.userId == 2]

Unnamed: 0,userId,movieId
26,2,509
24,2,500
28,2,527
30,2,551
16,2,273
22,2,468
31,2,585
27,2,515
11,2,222
25,2,508


In [142]:
count = 0
for id in movie_rc:
    if id in gt_dic[2]:
        print(id)
        count +=1
print(count)

527
500
509
527
4


In [137]:
movie_rc

[356,
 318,
 593,
 457,
 527,
 260,
 588,
 380,
 377,
 364,
 595,
 608,
 1,
 1196,
 2571,
 1270,
 500,
 357,
 2858,
 165,
 1210,
 1198,
 32,
 597,
 34,
 2959,
 858,
 539,
 2028,
 587,
 780,
 509,
 1240,
 356,
 318,
 296,
 593,
 2571,
 260,
 480,
 527,
 110,
 1,
 1210,
 589,
 1196,
 2959,
 1198,
 50,
 2858,
 150,
 780,
 858]

In [76]:
gt_dic[2]

[266,
 50,
 39,
 454,
 480,
 17,
 261,
 265,
 589,
 350,
 509,
 500,
 527,
 551,
 273,
 468,
 585,
 515,
 222,
 508,
 720,
 314,
 661,
 537]