In [1]:
import pandas as pd

In [2]:
movies = pd.read_csv('movies.csv')
ratings = pd.read_csv('ratings.csv')
tags = pd.read_csv('tags.csv')

In [3]:
tags.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200


## TFIDF на тегах

In [4]:
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer

In [549]:
cnt_vec_t = CountVectorizer( min_df=4, max_df=10)

In [550]:
processed_t = cnt_vec_t.fit_transform(tags.tag)

In [551]:
processed_t.shape

(3683, 256)

In [552]:
tfidf_t = TfidfTransformer()

In [553]:
tfidf_dense_t = tfidf_t.fit_transform(processed_t).todense()

In [554]:
tfidf_dense_t.shape

(3683, 256)

In [555]:
# cnt_vec_t.vocabulary_

In [556]:
columns_t = [None for i in range(len(cnt_vec_t.vocabulary_))]
for k in cnt_vec_t.vocabulary_:
    columns_t[cnt_vec_t.vocabulary_[k]] = k+'_t'

In [557]:
df_tfidf_t = pd.DataFrame(tfidf_dense_t, columns=columns_t)

In [558]:
df_tfidf_t.head()

Unnamed: 0,aardman_t,abuse_t,acting_t,actress_t,adam_t,affleck_t,africa_t,al_t,alcoholism_t,alfred_t,...,wedding_t,weird_t,western_t,white_t,williams_t,witty_t,wizards_t,writing_t,york_t,zombies_t
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## TFIDF на жанрах

In [559]:
movies.head()

Unnamed: 0,movieId,title,genres,genres_space,genres_split
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Adventure Animation Children Comedy Fantasy,"[Adventure, Animation, Children, Comedy, Fantasy]"
1,2,Jumanji (1995),Adventure|Children|Fantasy,Adventure Children Fantasy,"[Adventure, Children, Fantasy]"
2,3,Grumpier Old Men (1995),Comedy|Romance,Comedy Romance,"[Comedy, Romance]"
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,Comedy Drama Romance,"[Comedy, Drama, Romance]"
4,5,Father of the Bride Part II (1995),Comedy,Comedy,[Comedy]


In [560]:
movies['genres_split'] = movies.genres.str.split('|')

In [561]:
movies['genres_space'] = movies.apply(lambda r: ' '.join(r['genres_split']), axis=1)

In [562]:
cnt_vec_g = CountVectorizer(min_df = 2)

In [563]:
processed_g = cnt_vec_g.fit_transform(movies.genres_space)

In [564]:
processed_g.shape

(9742, 24)

In [565]:
tfidf_g = TfidfTransformer()

In [566]:
tfidf_dense_g = tfidf_g.fit_transform(processed_g).todense()

In [567]:
tfidf_dense_g.shape

(9742, 24)

In [568]:
columns_g = [None for i in range(len(cnt_vec_g.vocabulary_))]
for k in cnt_vec_g.vocabulary_:
    columns_g[cnt_vec_g.vocabulary_[k]] = k+'_g'

In [569]:
df_tfidf_g = pd.DataFrame(tfidf_dense_g, columns=columns_g)

In [570]:
df_tfidf_g.head()

Unnamed: 0,action_g,adventure_g,animation_g,children_g,comedy_g,crime_g,documentary_g,drama_g,fantasy_g,fi_g,...,listed_g,musical_g,mystery_g,no_g,noir_g,romance_g,sci_g,thriller_g,war_g,western_g
0,0.0,0.416846,0.516225,0.504845,0.267586,0.0,0.0,0.0,0.48299,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.512361,0.0,0.620525,0.0,0.0,0.0,0.0,0.593662,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.570915,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.821009,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.505015,0.0,0.0,0.466405,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.726241,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Объединяем tfidf жанр и тег

In [571]:
movies_with_tfidf = pd.concat((movies, df_tfidf_g), axis=1)
movies_with_tfidf = pd.concat((movies_with_tfidf, df_tfidf_t), axis=1)

In [572]:
movies_with_tfidf.shape

(9742, 285)

In [573]:
# movies_with_tfidf = movies_with_tfidf.dropna()
# movies_with_tfidf.shape

In [574]:
movies_with_tfidf = movies_with_tfidf.fillna(0)

In [575]:
movies_with_tfidf.tail()

Unnamed: 0,movieId,title,genres,genres_space,genres_split,action_g,adventure_g,animation_g,children_g,comedy_g,...,wedding_t,weird_t,western_t,white_t,williams_t,witty_t,wizards_t,writing_t,york_t,zombies_t
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy,Action Animation Comedy Fantasy,"[Action, Animation, Comedy, Fantasy]",0.43601,0.0,0.614603,0.0,0.318581,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy,Animation Comedy Fantasy,"[Animation, Comedy, Fantasy]",0.0,0.0,0.682937,0.0,0.354002,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9739,193585,Flint (2017),Drama,Drama,[Drama],0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation,Action Animation,"[Action, Animation]",0.578606,0.0,0.815607,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9741,193609,Andrew Dice Clay: Dice Rules (1991),Comedy,Comedy,[Comedy],0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [576]:
list(movies_with_tfidf.columns[0:2])

['movieId', 'title']

In [577]:
movies_with_tfidf.columns[5:]

Index(['action_g', 'adventure_g', 'animation_g', 'children_g', 'comedy_g',
       'crime_g', 'documentary_g', 'drama_g', 'fantasy_g', 'fi_g',
       ...
       'wedding_t', 'weird_t', 'western_t', 'white_t', 'williams_t', 'witty_t',
       'wizards_t', 'writing_t', 'york_t', 'zombies_t'],
      dtype='object', length=280)

In [578]:
fc = list(movies_with_tfidf.columns[0:2]) + list(movies_with_tfidf.columns[5:])

In [579]:
movies_for_ds = movies_with_tfidf[fc]

In [580]:
movies_for_ds.head()

Unnamed: 0,movieId,title,action_g,adventure_g,animation_g,children_g,comedy_g,crime_g,documentary_g,drama_g,...,wedding_t,weird_t,western_t,white_t,williams_t,witty_t,wizards_t,writing_t,york_t,zombies_t
0,1,Toy Story (1995),0.0,0.416846,0.516225,0.504845,0.267586,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,Jumanji (1995),0.0,0.512361,0.0,0.620525,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,Grumpier Old Men (1995),0.0,0.0,0.0,0.0,0.570915,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4,Waiting to Exhale (1995),0.0,0.0,0.0,0.0,0.505015,0.0,0.0,0.466405,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5,Father of the Bride Part II (1995),0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Средние оценки (+ median, variance, etc.) пользователя и фильма

In [581]:
movies_with_ratings = pd.merge(ratings, movies_for_ds, on = 'movieId')

In [582]:
movies_with_ratings_up = movies_with_ratings.groupby(movies_with_ratings['movieId']).rating.agg(['mean','median','std']).reset_index()

In [583]:
movies_with_ratings_up['variance'] = movies_with_ratings_up['std']**2

In [584]:
movies_with_ratings = pd.merge(movies_with_ratings, movies_with_ratings_up, on='movieId')

## CBRS For User

In [585]:
movies_with_ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,action_g,adventure_g,animation_g,children_g,comedy_g,...,williams_t,witty_t,wizards_t,writing_t,york_t,zombies_t,mean,median,std,variance
0,1,1,4.0,964982703,Toy Story (1995),0.0,0.416846,0.516225,0.504845,0.267586,...,0.0,0.0,0.0,0.0,0.0,0.0,3.92093,4.0,0.834859,0.69699
1,5,1,4.0,847434962,Toy Story (1995),0.0,0.416846,0.516225,0.504845,0.267586,...,0.0,0.0,0.0,0.0,0.0,0.0,3.92093,4.0,0.834859,0.69699
2,7,1,4.5,1106635946,Toy Story (1995),0.0,0.416846,0.516225,0.504845,0.267586,...,0.0,0.0,0.0,0.0,0.0,0.0,3.92093,4.0,0.834859,0.69699
3,15,1,2.5,1510577970,Toy Story (1995),0.0,0.416846,0.516225,0.504845,0.267586,...,0.0,0.0,0.0,0.0,0.0,0.0,3.92093,4.0,0.834859,0.69699
4,17,1,4.5,1305696483,Toy Story (1995),0.0,0.416846,0.516225,0.504845,0.267586,...,0.0,0.0,0.0,0.0,0.0,0.0,3.92093,4.0,0.834859,0.69699


In [586]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_absolute_error

In [587]:
TARGET_USER = 7 #56

In [588]:
df_for_user = movies_with_ratings[movies_with_ratings['userId'] == TARGET_USER]

In [589]:
df_for_user.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,action_g,adventure_g,animation_g,children_g,comedy_g,...,williams_t,witty_t,wizards_t,writing_t,york_t,zombies_t,mean,median,std,variance
2,7,1,4.5,1106635946,Toy Story (1995),0.0,0.416846,0.516225,0.504845,0.267586,...,0.0,0.0,0.0,0.0,0.0,0.0,3.92093,4.0,0.834859,0.69699
575,7,50,4.5,1106635993,"Usual Suspects, The (1995)",0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,4.237745,4.5,0.800921,0.641475
1570,7,260,5.0,1106635933,Star Wars: Episode IV - A New Hope (1977),0.432883,0.492725,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,4.231076,4.5,0.872004,0.76039
2428,7,356,5.0,1106635915,Forrest Gump (1994),0.0,0.0,0.0,0.0,0.340477,...,0.0,0.0,0.0,0.0,0.0,0.0,4.164134,4.0,0.831244,0.690966
3190,7,480,5.0,1106635917,Jurassic Park (1993),0.398088,0.45312,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,3.75,4.0,0.855301,0.73154


In [590]:
df_for_user.shape

(152, 289)

In [591]:
# df_for_user.columns.tolist()

In [592]:
X, y = df_for_user[list(movies_with_tfidf.columns[5:])], df_for_user['rating']

In [593]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

In [594]:
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [595]:
models = [LinearRegression, Lasso, Ridge, SVR, RandomForestRegressor]

In [596]:
for m in models:
    model = m()
    model.fit(X_train,y_train)
    print("{}. r2_train: {:.4f}, r2_test: {:.4f}, mae_train: {:.4f}, mae_test: {:.4f}".format(
        m.__name__, model.score(X_train,y_train), model.score(X_test,y_test),
        mean_absolute_error(model.predict(X_train),y_train),
        mean_absolute_error(model.predict(X_test),y_test)
    ))
    model.predict(X_test)

LinearRegression. r2_train: 0.2953, r2_test: -592852516419048703000576.0000, mae_train: 0.8512, mae_test: 198392071013.6648
Lasso. r2_train: 0.0000, r2_test: -0.0112, mae_train: 1.1034, mae_test: 1.2335
Ridge. r2_train: 0.3908, r2_test: -0.2600, mae_train: 0.7265, mae_test: 1.3323
SVR. r2_train: 0.1391, r2_test: -0.0412, mae_train: 0.8966, mae_test: 1.2181
RandomForestRegressor. r2_train: 0.5732, r2_test: -0.0395, mae_train: 0.5962, mae_test: 1.1892




Модель сильно переобучилась, полагаю если обработать теги ( убрать множественное число, артилки, предлоги и т.д.) то должно быть получше. Мало данных и много фич.

## Item 2 item

In [488]:
from sklearn.neighbors import NearestNeighbors

In [489]:
cl = movies_for_ds.columns[2:].tolist()

In [490]:
X_unsup = movies_for_ds[cl]

In [491]:
nn = NearestNeighbors(n_neighbors=10, metric= 'minkowski', p=4)

In [492]:
nn.fit(X_unsup)

NearestNeighbors(algorithm='auto', leaf_size=30, metric='minkowski',
         metric_params=None, n_jobs=None, n_neighbors=10, p=4, radius=1.0)

In [493]:
movies_for_ds[movies_for_ds['movieId'] == 188751]

Unnamed: 0,movieId,title,action_g,adventure_g,animation_g,children_g,comedy_g,crime_g,documentary_g,drama_g,...,western_t,white_t,will_t,williams_t,witty_t,wizards_t,world_t,writing_t,york_t,zombies_t
9715,188751,Mamma Mia: Here We Go Again! (2018),0.0,0.0,0.0,0.0,0.570915,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [494]:
neighbors = nn.kneighbors(movies_for_ds[movies_for_ds['movieId'] == 188751][cl])[1]

In [495]:
neighbors.reshape(-1)

array([203, 386, 250, 248, 577, 297, 482, 216, 103, 315], dtype=int64)

In [496]:
movies_for_ds.iloc[neighbors.reshape(-1)]

Unnamed: 0,movieId,title,action_g,adventure_g,animation_g,children_g,comedy_g,crime_g,documentary_g,drama_g,...,western_t,white_t,will_t,williams_t,witty_t,wizards_t,world_t,writing_t,york_t,zombies_t
203,237,Forget Paris (1995),0.0,0.0,0.0,0.0,0.570915,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
386,444,Even Cowgirls Get the Blues (1993),0.0,0.0,0.0,0.0,0.570915,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
250,289,Only You (1994),0.0,0.0,0.0,0.0,0.570915,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
248,287,Nina Takes a Lover (1994),0.0,0.0,0.0,0.0,0.570915,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
577,708,"Truth About Cats & Dogs, The (1996)",0.0,0.0,0.0,0.0,0.570915,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
297,339,While You Were Sleeping (1995),0.0,0.0,0.0,0.0,0.570915,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
482,550,Threesome (1994),0.0,0.0,0.0,0.0,0.570915,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
216,252,I.Q. (1994),0.0,0.0,0.0,0.0,0.570915,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
103,118,If Lucy Fell (1996),0.0,0.0,0.0,0.0,0.570915,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
315,357,Four Weddings and a Funeral (1994),0.0,0.0,0.0,0.0,0.570915,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
