In [1]:
import pandas as pd
import numpy as np

from tqdm import tqdm_notebook

from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.neighbors import NearestNeighbors

%matplotlib inline

In [2]:
links = pd.read_csv('../lecture-1/links.csv')
movies = pd.read_csv('../lecture-1/movies.csv')
ratings = pd.read_csv('../lecture-1/ratings.csv')
tags = pd.read_csv('../lecture-1/tags.csv')

In [3]:
rm = ratings.merge(movies,on='movieId',how='left')
rm.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,1,3,4.0,964981247,Grumpier Old Men (1995),Comedy|Romance
2,1,6,4.0,964982224,Heat (1995),Action|Crime|Thriller
3,1,47,5.0,964983815,Seven (a.k.a. Se7en) (1995),Mystery|Thriller
4,1,50,5.0,964982931,"Usual Suspects, The (1995)",Crime|Mystery|Thriller


In [4]:
max_rat_users = int(rm.groupby('userId')['rating'].count().to_frame().reset_index().sort_values('rating',ascending=False).head(1)['userId'])
userfilms = rm[rm['userId']== max_rat_users][['movieId','rating']]
userfilms.head()

Unnamed: 0,movieId,rating
62294,1,4.0
62295,2,3.0
62296,3,4.0
62297,5,2.0
62298,6,3.0


In [5]:
tagsfilm = movies.merge(tags,on='movieId',how='left').groupby(['movieId'])['tag'].apply(list).to_frame().reset_index(level=0)
tagsfilm['tag'] = tagsfilm['tag'].apply(lambda x: str(x).replace('[','').replace(']','')
                                        .replace(',',' ').replace("'",''))

tagsfilm = tagsfilm.merge(movies,on='movieId',how='left')[['movieId','tag','genres']]

tagsfilm['genres']= tagsfilm['genres'].apply(lambda x: ' '.join(str(x).replace(' ', '').replace('-', '').split('|')))
tagsfilm.head()

Unnamed: 0,movieId,tag,genres
0,1,pixar pixar fun,Adventure Animation Children Comedy Fantasy
1,2,fantasy magic board game Robin Williams game,Adventure Children Fantasy
2,3,moldy old,Comedy Romance
3,4,,Comedy Drama Romance
4,5,pregnancy remake,Comedy


In [6]:
df1 = tagsfilm.merge(userfilms,on='movieId',how='left')
df1['rating'].fillna('Не смотрел',inplace=True)
df1.head()

Unnamed: 0,movieId,tag,genres,rating
0,1,pixar pixar fun,Adventure Animation Children Comedy Fantasy,4
1,2,fantasy magic board game Robin Williams game,Adventure Children Fantasy,3
2,3,moldy old,Comedy Romance,4
3,4,,Comedy Drama Romance,Не смотрел
4,5,pregnancy remake,Comedy,2


In [7]:
from sklearn_pandas import DataFrameMapper
from sklearn.feature_extraction.text import TfidfVectorizer

mapper = DataFrameMapper([
     ('tag', TfidfVectorizer()),
     ('genres', TfidfVectorizer())
 ])

In [8]:
features = mapper.fit_transform(df1)
features = pd.DataFrame(features)
features['movieId'] = df1['movieId']
features['rating'] = df1['rating']
Train = features[features['rating'] != 'Не смотрел']
Val = features[features['rating'] == 'Не смотрел']

In [9]:
from sklearn.linear_model import LinearRegression
reg = LinearRegression()
reg.fit(Train[Train.columns.difference(['movieId','rating'])], Train['rating'])

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [10]:
reg.score(Train[Train.columns.difference(['movieId','rating'])], Train['rating'])

0.3313493209266569

In [11]:
predictions = reg.predict(Val[Val.columns.difference(['movieId','rating'])])

In [12]:
Val['predictions'] = predictions
Val['predictions'] = Val['predictions'].astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [22]:
Val[Val['predictions']==5].merge(movies,on='movieId',how='left')[['title','predictions']].sort_values(by='predictions', ascending=False)


Unnamed: 0,title,predictions
0,Restoration (1995),5
12,Ghosts of the Abyss (2003),5
21,"Invincible Iron Man, The (2007)",5
20,Tekkonkinkreet (Tekkon kinkurîto) (2006),5
19,Kiki's Delivery Service (Majo no takkyûbin) (1...,5


In [33]:
best_films1 = list(Val[Val['predictions']==5].merge(movies,on='movieId',how='left')[['title','predictions']].sort_values(by='predictions', ascending=False)['title'])


Теперь прогоним эту подборку через SVD

In [17]:
from surprise import SVD
from surprise import Dataset
from surprise import accuracy
from surprise import Reader
from surprise.model_selection import train_test_split


movies_with_ratings = movies.join(ratings.set_index('movieId'), on='movieId').reset_index(drop=True)
movies_with_ratings.dropna(inplace=True)

In [18]:
dataset = pd.DataFrame({
    'uid': movies_with_ratings.userId,
    'iid': movies_with_ratings.title,
    'rating': movies_with_ratings.rating
})

In [30]:
reader = Reader(rating_scale=(0.5, 5.0))
data = Dataset.load_from_df(dataset, reader)
trainset, testset = train_test_split(data, test_size=0.05, random_state=42)
algo = SVD(n_factors=20, n_epochs=20)
algo.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x1a19252d30>

In [43]:
best_films1_rat=[]
for e in best_films1:
    best_films1_rat.append(algo.predict(uid=max_rat_users, iid='{}'.format(e))[3])

In [49]:
df_best_films = pd.DataFrame()
df_best_films['title'] = best_films1
df_best_films['SVD_Rat'] = best_films1_rat
df_best_films.sort_values(by='SVD_Rat', ascending=False).head()

Unnamed: 0,title,SVD_Rat
12,Once Upon a Time in the West (C'era una volta ...,4.119429
10,Laputa: Castle in the Sky (Tenkû no shiro Rapy...,4.116292
4,Kiki's Delivery Service (Majo no takkyûbin) (1...,4.106781
16,Blood Simple (1984),4.090274
9,Hannah and Her Sisters (1986),3.936526
