### Data Preprocessing

In [61]:
import ast
import itertools
from collections import Counter

import gensim
import numpy as np
import pandas as pd
from gensim.models import Word2Vec
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split

from MovieClass import MovieClass, MultiMovieClass

In [62]:
# Data selection: keep only positive interactions (rating >= 4.0).
df = pd.read_csv('data_raw/ratings_small.csv')
df = df.loc[df.rating >= 4.0]

# Number of positively rated movies per user as a flat (userId, ntotal) frame.
# The names are pinned explicitly: the earlier to_frame() followed by
# "df_n['userId'] = df_n.index" leaves 'userId' as both the index name and a
# column on pandas >= 2.0, and merge(on='userId') then raises an ambiguity
# ValueError.
df_n = (
    df['userId']
    .value_counts()
    .rename('ntotal')
    .rename_axis('userId')
    .reset_index()
)

# Drop users with fewer than 30 positive ratings. The split cell uses the
# first 20 interactions of each user as history and the rest as ground truth.
df = df.merge(df_n, on='userId')
df = df.loc[df.ntotal >= 30]

# Chronological order per user; interactions sharing a timestamp are ordered
# by descending rating. This is what the former pair of sorts produced, since
# the second (userId, timestamp) sort only kept the first one as a tie-breaker.
df = df.sort_values(
    ['userId', 'timestamp', 'rating'],
    ascending=[True, True, False],
)

# 'count' is the 0-based position of each interaction in the user's timeline.
# .copy() avoids assigning a new column onto a slice of the merged frame.
df = df[['userId', 'movieId']].copy()
df['count'] = df.groupby('userId').cumcount()

Unnamed: 0,userId,movieId,count
9,2,150,0
33,2,590,1
34,2,592,2
17,2,296,3
10,2,153,4
...,...,...,...
51501,671,1035,79
51565,671,5995,80
51541,671,3271,81
51567,671,6365,82


In [63]:
# Split every user's timeline on the running interaction index: positions
# 0-19 are the model input (history), positions 20 and up are the held-out
# ground truth. Both parts are saved to CSV.
df_X = df.loc[df['count'] < 20, ['userId', 'movieId']]
df_Y = df.loc[df['count'] >= 20, ['userId', 'movieId']]

df_X.to_csv('data/users_history.csv')
df_Y.to_csv('data/ground_truth.csv')

### "Movie2Movie" Co-visitation Matrix - Rating Weighted

In [13]:
# Rating-weighted co-visitation: for every ordered pair of distinct movies
# rated by the same user, add rating_y / 5 to the pair's weight, then keep the
# 20 heaviest partners of each movieId_x.
df = (
    pd.read_csv('data_raw/ratings_small.csv')
    .drop('timestamp', axis=1)
    # Self-join on userId enumerates all (movieId_x, movieId_y) pairs per user.
    .pipe(lambda ratings: ratings.merge(ratings, on='userId'))
    .loc[lambda pairs: pairs['movieId_x'] != pairs['movieId_y']]
    .assign(wgt=lambda pairs: pairs['rating_y'] / 5)
    .groupby(['movieId_x', 'movieId_y'], as_index=False)['wgt']
    .sum()
    .sort_values(['movieId_x', 'wgt'], ascending=[True, False])
    .reset_index(drop=True)
    # Rows are already ordered by descending weight within each movieId_x,
    # so the first 20 rows of each group are its top-20 partners.
    .groupby('movieId_x')
    .head(20)
)
df.to_csv('data/Movie2Movie.csv')

### Most Popular Movies

In [63]:
# The 50 movies with the most ratings, counted over the full ratings file.
df = pd.read_csv('data_raw/ratings.csv')
top_view_movies = df['movieId'].value_counts().head(50).index.values
top_view_movies

array([  356,   318,   296,   593,  2571,   260,   480,   527,   110,
           1,  1210,   589,  1196,  2959,  1198,    50,  2858,   150,
         780,   858,  4993,   457,  1270,   592,    32,    47,   608,
        5952,  7153,   590,  2028,   588,   380,  2762,   377,  3578,
         364,   344,   648,  4306,  1721,  1580,   595,  1197,  4226,
         165,  1240,  1193,  1291, 58559])

In [64]:
# Ratings per movie, most-rated first, as a two-column frame.
df = df['movieId'].value_counts().reset_index()
df.columns = ['movieId', 'ratingNum']

# Popularity bucket = number of ratings divided by 9000, truncated to an int.
df['populairty'] = (df['ratingNum'] / 9000).astype('int64')

df_pop = df.loc[:, ['movieId', 'populairty']]
df_pop.to_csv('data/popularity.csv')
df_pop.head()

Unnamed: 0,movieId,populairty
0,356,10
1,318,10
2,296,9
3,593,9
4,2571,8


### word2vec

In [142]:
# Build the word2vec training corpus from the keywords file: one "sentence"
# (list of keyword names) per row that has at least one keyword.
df1 = pd.read_csv('data_raw/keywords.csv')

wv_train_data = []
# The second column holds the keywords serialized as a Python literal: a list
# of dicts, each with a 'name' entry.
for raw_keywords in df1.iloc[:, 1]:
    # Skip missing / non-text cells instead of failing on them.
    if not isinstance(raw_keywords, str):
        continue
    # ast.literal_eval only accepts literals, so text read from the file can
    # never execute code (the previous eval() would run arbitrary expressions).
    key_list = [key['name'] for key in ast.literal_eval(raw_keywords)]
    if key_list:
        wv_train_data.append(key_list)

# NOTE(review): no seed is set and workers=5, so the trained vectors are
# presumably not reproducible run-to-run -- confirm whether that matters here.
model = Word2Vec(sentences=wv_train_data, vector_size=32, min_count=0, workers=5, window=5, epochs=30)
model.save("word2vec.model")

In [90]:
# Spot check of the trained embedding: cosine similarity between the vectors
# of two keywords. Each vector is lifted to a single-row matrix, as
# cosine_similarity expects 2-D inputs.
cosine_similarity(model.wv['man'][np.newaxis, :], model.wv['vampire'][np.newaxis, :])

array([[0.3885254]], dtype=float32)

### Load Data

In [49]:
def df_covisitation_to_dict(df):
    """Map each ``movieId_x`` to the list of its ``movieId_y`` values.

    The values of every list keep the row order of ``df``.
    """
    partners_by_movie = df.groupby('movieId_x')['movieId_y'].agg(list)
    return partners_by_movie.to_dict()

def df_user_to_dict(df):
    """Map each ``userId`` to the list of its ``movieId`` values.

    The values of every list keep the row order of ``df``.
    """
    movies_by_user = df.groupby('userId')['movieId'].agg(list)
    return movies_by_user.to_dict()


# Read the prepared CSV files in one pass; the order of the targets matches
# the order of the paths.
df_cov, df_X, df_y, df_dc, df_pop = map(pd.read_csv, (
    'data/Movie2Movie.csv',
    'data/users_history.csv',
    'data/ground_truth.csv',
    'data/movie_director_casts.csv',
    'data/popularity.csv',
))

# Lookup tables: movie -> co-visited movies, user -> history, user -> truth.
top_20_movies = df_covisitation_to_dict(df_cov)
X_dic = df_user_to_dict(df_X)
gt_dic = df_user_to_dict(df_y)

# NOTE(review): the word2vec cell of this notebook saves "word2vec.model",
# while this loads "word2vec_movie.model" -- confirm that file is produced
# elsewhere and is the intended model.
model_wv = Word2Vec.load("word2vec_movie.model")

In [50]:
def get_label(df_recall, df_y):
    """Build one-hot hit/miss labels for recalled (user, movie) candidates.

    Parameters
    ----------
    df_recall : pandas.DataFrame
        Candidate rows; the ``userId`` and ``movieId`` columns are read.
    df_y : pandas.DataFrame
        Ground truth rows with ``userId`` and ``movieId`` columns.

    Returns
    -------
    numpy.ndarray
        Integer array of shape ``(len(df_recall), 2)``. A row is ``[0, 1]``
        when the candidate movie is in that user's ground truth and ``[1, 0]``
        otherwise. Empty input gives shape ``(0, 2)``, so a later
        ``argmax(axis=1)`` still works.

    Notes
    -----
    A user with no ground-truth rows cannot have a hit, so its candidates are
    labelled ``[1, 0]`` instead of raising ``KeyError``.
    """
    # Sets give constant-time membership tests (the lists were scanned per row).
    groundtruth = df_y.groupby('userId').movieId.apply(set).to_dict()

    # Iterate over the two columns directly rather than materialising a row
    # Series per candidate with .iloc. int() mirrors the original casts.
    candidates = zip(df_recall['userId'], df_recall['movieId'])
    hits = np.fromiter(
        (int(movie_id) in groundtruth.get(int(user_id), ())
         for user_id, movie_id in candidates),
        dtype=bool,
        count=len(df_recall),
    )

    # Column 0 flags a miss, column 1 flags a hit.
    return np.column_stack((~hits, hits)).astype(int)


# Candidate/feature table; the first two columns are left out of df_x.
df_recall = pd.read_csv('data/dataset_feature.csv')
df_x = df_recall.iloc[:, 2:]
label = get_label(df_recall, df_y)

# Positional hold-out: rows before N_TRAIN are used for training, the
# remaining rows for testing. The leading column of df_x is excluded from the
# model inputs.
N_TRAIN = 27447
features = df_x.values[:, 1:]
x_train, x_test = features[:N_TRAIN], features[N_TRAIN:]
y_train, y_test = label[:N_TRAIN], label[N_TRAIN:]

### SVC

In [65]:
from sklearn.svm import SVC

# Fit an SVM on the training rows. The one-hot labels are collapsed to a
# class index (0 = miss, 1 = hit) because SVC takes 1-D targets.
model = SVC(C=0.1, gamma=0.001).fit(x_train, np.asarray(y_train).argmax(axis=1))

# NOTE(review): predictions are made on the training rows, not on x_test --
# confirm this is intended.
preditction = model.predict(x_train)