In [1]:
import pandas as pd
import numpy as np

from tqdm import tqdm_notebook

from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.neighbors import NearestNeighbors

%matplotlib inline

In [2]:
links = pd.read_csv('links.csv')
movies = pd.read_csv('movies.csv')
ratings = pd.read_csv('ratings.csv')
tags = pd.read_csv('tags.csv')

In [3]:
movies.head(10)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
5,6,Heat (1995),Action|Crime|Thriller
6,7,Sabrina (1995),Comedy|Romance
7,8,Tom and Huck (1995),Adventure|Children
8,9,Sudden Death (1995),Action
9,10,GoldenEye (1995),Action|Adventure|Thriller


In [4]:
ratings.head(10)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205
5,1,1263,2.0,1260759151
6,1,1287,2.0,1260759187
7,1,1293,2.0,1260759148
8,1,1339,3.5,1260759125
9,1,1343,2.0,1260759131


In [5]:
tags.head(10)

Unnamed: 0,userId,movieId,tag,timestamp
0,15,339,sandra 'boring' bullock,1138537770
1,15,1955,dentist,1193435061
2,15,7478,Cambodia,1170560997
3,15,32892,Russian,1170626366
4,15,34162,forgettable,1141391765
5,15,35957,short,1141391873
6,15,37729,dull story,1141391806
7,15,45950,powerpoint,1169616291
8,15,100365,activist,1425876220
9,15,100365,documentary,1425876220


In [6]:
movies_with_tags = movies.join(tags.set_index('movieId'), on='movieId')

In [7]:
movies_with_tags.head()

Unnamed: 0,movieId,title,genres,userId,tag,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,501.0,Pixar,1292956000.0
1,2,Jumanji (1995),Adventure|Children|Fantasy,,,
2,3,Grumpier Old Men (1995),Comedy|Romance,,,
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,,,
4,5,Father of the Bride Part II (1995),Comedy,431.0,steve martin,1140455000.0


In [8]:
def change_string(s):
    return ' '.join(s.replace(' ', '').replace('-', '').split('|'))

In [9]:
movie_genres = [change_string(g) for g in movies.genres.values]

In [10]:
movie_genres[:10]

['Adventure Animation Children Comedy Fantasy',
 'Adventure Children Fantasy',
 'Comedy Romance',
 'Comedy Drama Romance',
 'Comedy',
 'Action Crime Thriller',
 'Comedy Romance',
 'Adventure Children',
 'Action',
 'Action Adventure Thriller']

In [11]:
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(movie_genres)

In [12]:
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

In [13]:
neigh = NearestNeighbors(n_neighbors=7, n_jobs=-1, metric='euclidean') 
neigh.fit(X_train_tfidf)

NearestNeighbors(algorithm='auto', leaf_size=30, metric='euclidean',
         metric_params=None, n_jobs=-1, n_neighbors=7, p=2, radius=1.0)

In [14]:
test = change_string("Comedy|Romance")

predict = count_vect.transform([test])
X_tfidf2 = tfidf_transformer.transform(predict)

res = neigh.kneighbors(X_tfidf2, return_distance=True)

In [15]:
res

(array([[0., 0., 0., 0., 0., 0., 0.]]),
 array([[4921, 3304,  985, 6656, 8214, 7595, 5800]]))

In [16]:
movies.iloc[res[1][0]]

Unnamed: 0,movieId,title,genres
4921,6999,Housesitter (1992),Comedy|Romance
3304,4141,Head Over Heels (2001),Comedy|Romance
985,1230,Annie Hall (1977),Comedy|Romance
6656,51705,Priceless (Hors de prix) (2006),Comedy|Romance
8214,98908,How to Make Love to a Woman (2010),Comedy|Romance
7595,79590,"Rebound, The (2009)",Comedy|Romance
5800,26435,Starting Over (1979),Comedy|Romance


In [17]:
movies_with_tags.dropna(inplace=True)

In [18]:
movies_with_tags.title.unique().shape

(689,)

In [19]:
tag_strings = []
movies = []

for movie, group in tqdm_notebook(movies_with_tags.groupby('title')):
    tag_strings.append(' '.join([str(s).replace(' ', '').replace('-', '') for s in group.tag.values]))
    movies.append(movie)

HBox(children=(IntProgress(value=0, max=689), HTML(value='')))




In [20]:
tag_strings[:5]

['cute', 'toplist10', 'getdvd', 'getdvd', 'toplist13']

In [21]:
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(tag_strings)

In [22]:
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

In [23]:
neigh = NearestNeighbors(n_neighbors=10, n_jobs=-1, metric='manhattan') 
neigh.fit(X_train_tfidf)

NearestNeighbors(algorithm='auto', leaf_size=30, metric='manhattan',
         metric_params=None, n_jobs=-1, n_neighbors=10, p=2, radius=1.0)

In [24]:
for i in range(len(movies)):
    if 'Magnolia (1999)' == movies[i]:
        print(i)

366


In [25]:
tag_strings[122]

'melgibson'

In [26]:
test = change_string('pixar pixar fun')

predict = count_vect.transform([test])
X_tfidf2 = tfidf_transformer.transform(predict)

res = neigh.kneighbors(X_tfidf2, return_distance=True)

In [27]:
res

(array([[1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]]),
 array([[415, 417, 407, 408, 410, 413, 414, 403, 416, 420]]))

In [28]:
for i in res[1][0]:
    print(movies[i])

Nebraska (2013)
Night and the City (1950)
Murmur of the Heart (Le souffle au coeur) (1971)
Musa the Warrior (Musa) (2001)
My Fair Lady (1964)
My Winnipeg (2007)
National Treasure (2004)
Mr. Smith Goes to Washington (1939)
Nerve (2016)
Nine Months (1995)


In [30]:
#TODO сделать матрицу юзеров фильмов и оценок

In [35]:
links2 = pd.read_csv('links.csv')
movies2 = pd.read_csv('movies.csv')
ratings2 = pd.read_csv('ratings.csv')
tags2 = pd.read_csv('tags.csv')

In [36]:
movies_with_ratings = movies2.join(ratings2.set_index('movieId'), on='movieId').reset_index(drop=True)
movies_with_ratings.dropna(inplace=True)

In [44]:
dataset = pd.DataFrame({
    'uid': movies_with_ratings.userId,
    'iid': movies_with_ratings.movieId,
    'rating': movies_with_ratings.rating
})

In [45]:
dataset.head()

Unnamed: 0,uid,iid,rating
0,7.0,1,3.0
1,9.0,1,4.0
2,13.0,1,5.0
3,15.0,1,2.0
4,19.0,1,3.0


In [40]:
Ratings_M = ratings2.pivot(index = 'userId', columns ='movieId', values = 'rating').fillna(0)
Ratings_M.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,161084,161155,161594,161830,161918,161944,162376,162542,162672,163949
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [41]:
Ratings_pre = Ratings_M.as_matrix()
user_ratings_mean = np.mean(Ratings_pre, axis = 1)
Ratings_demeaned = Ratings_pre - user_ratings_mean.reshape(-1, 1)

  """Entry point for launching an IPython kernel.


In [None]:
#TODO предсказать оценку

In [42]:
ratings2.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [46]:
feature_cols = ["uid", "iid"]
X = dataset[feature_cols]
y = dataset.rating

In [43]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

In [47]:
XTrain, XTest, ytrain, ytest = train_test_split(X, y, test_size = 0.20, random_state = 20)

In [48]:
lr = LinearRegression()
lr.fit(XTrain, ytrain)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [50]:
ypred= lr.predict(XTest)

In [52]:
ypred

array([3.53924004, 3.5551029 , 3.53323277, ..., 3.50289684, 3.54667768,
       3.4920975 ])

In [54]:
ytest

38375    0.5
62802    3.0
64584    2.0
73881    3.0
62068    2.0
32218    4.0
67703    3.0
27021    3.0
29345    4.0
76323    3.5
1910     3.0
67957    3.0
27089    4.0
33084    4.0
34526    3.5
80962    4.0
86527    4.0
75391    3.0
61943    3.0
46775    4.0
48208    2.0
80026    4.0
11297    5.0
52929    1.0
69714    4.0
87094    3.0
57258    3.0
21094    1.0
23244    4.0
85023    4.0
        ... 
72823    5.0
68002    1.5
70475    2.0
16902    4.0
84554    2.5
45086    4.0
49792    5.0
15526    3.5
78252    3.5
35568    4.0
51805    3.0
80760    5.0
90061    3.0
13628    4.0
12706    2.0
22092    3.0
56481    1.0
34062    3.5
39489    4.0
53111    5.0
30822    4.0
76093    4.0
47030    4.5
18548    3.0
24170    3.0
95774    3.5
71102    2.5
93235    3.5
34922    4.0
94537    4.0
Name: rating, Length: 20001, dtype: float64