In [67]:
import pandas as pd
ratings_df = pd.read_csv('../data/the-movies-dataset/ratings_small.csv')

In [25]:
ratings_df.shape

(100004, 4)

In [33]:
ratings_df.dtypes

userId         int64
movieId        int64
rating       float64
timestamp      int64
dtype: object

In [39]:
ratings_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [55]:
ratings_df['movieId'].unique

<bound method Series.unique of 0           31
1         1029
2         1061
3         1129
4         1172
5         1263
6         1287
7         1293
8         1339
9         1343
10        1371
11        1405
12        1953
13        2105
14        2150
15        2193
16        2294
17        2455
18        2968
19        3671
20          10
21          17
22          39
23          47
24          50
25          52
26          62
27         110
28         144
29         150
          ... 
99974     4034
99975     4306
99976     4308
99977     4880
99978     4886
99979     4896
99980     4963
99981     4973
99982     4993
99983     4995
99984     5010
99985     5218
99986     5299
99987     5349
99988     5377
99989     5445
99990     5464
99991     5669
99992     5816
99993     5902
99994     5952
99995     5989
99996     5991
99997     5995
99998     6212
99999     6268
100000    6269
100001    6365
100002    6385
100003    6565
Name: movieId, Length: 100004, dtype: int64>

In [40]:
ratings_df.columns

Index(['userId', 'movieId', 'rating', 'timestamp'], dtype='object')

In [61]:
pivoted_ratings = ratings_df[['userId', 'movieId', 'rating']].pivot(index='userId', columns='movieId', values='rating')

In [62]:
pivoted_ratings.shape

(671, 9066)

In [63]:
pivoted_ratings.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,161084,161155,161594,161830,161918,161944,162376,162542,162672,163949
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,4.0,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,4.0,...,,,,,,,,,,
5,,,4.0,,,,,,,,...,,,,,,,,,,


In [35]:
ratings_df = pivoted_ratings.fillna(.5)

In [52]:
from surprise import SVD
from surprise.model_selection import cross_validate, train_test_split
from surprise import Dataset
from surprise import Reader
from surprise.prediction_algorithms.knns import KNNWithZScore, KNNBaseline
from surprise.prediction_algorithms.matrix_factorization import NMF

# We'll use the famous SVD algorithm.
algo = SVD(verbose=True)
algo2 = KNNWithZScore()
algo3 = KNNBaseline(bsl_options={'method': 'sgd'})
algo4 = NMF()

reader = Reader(rating_scale=(0, 5))
data = Dataset.load_from_df(ratings_df[['userId', 'movieId', 'rating']], reader)


cross_validate(algo4, data, measures=['RMSE', 'MAE'], cv=5, n_jobs=-1, verbose=True)

Evaluating RMSE, MAE of algorithm NMF on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9667  0.9774  0.9859  0.9695  0.9782  0.9755  0.0068  
MAE (testset)     0.7520  0.7658  0.7721  0.7585  0.7646  0.7626  0.0069  
Fit time          3.83    3.86    3.85    3.81    3.74    3.82    0.04    
Test time         0.10    0.09    0.09    0.09    0.09    0.09    0.00    


{'test_rmse': array([0.96665739, 0.97743277, 0.98590718, 0.96950362, 0.97819713]),
 'test_mae': array([0.75196954, 0.76583209, 0.77212042, 0.75847526, 0.76464282]),
 'fit_time': (3.833962917327881,
  3.8574700355529785,
  3.8462321758270264,
  3.8093271255493164,
  3.737318992614746),
 'test_time': (0.0984346866607666,
  0.09462404251098633,
  0.0902249813079834,
  0.09289193153381348,
  0.09151220321655273)}

In [64]:
small_df = ratings_df.sample(n=10)

In [65]:
small_df

Unnamed: 0,userId,movieId,rating,timestamp
1819,15,4451,2.5,1075143282
1308,15,1584,4.0,1052896685
96287,641,1407,5.0,856747910
41332,295,8957,4.0,1112543772
14855,96,3739,5.0,1223256464
30529,217,110,4.0,1108160687
39915,292,1722,4.0,1140050529
62386,452,4292,4.0,989110975
58420,424,2770,4.0,1088826594
35469,254,253,3.0,845157293


In [135]:
movies_df = movies[['original_title', 'movieId']]

In [141]:
movies_df.dtypes

original_title    object
movieId           object
dtype: object

In [147]:
movies_df.movieId = pd.to_numeric(movies_df.movieId, errors='coerce').fillna(-1).astype('int64')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value


In [149]:
movies_df.shape

(45466, 2)

In [151]:
movies.shape

(45466, 25)

In [172]:
mask = movies_df['movieId'] == -1
movies_df.head()

Unnamed: 0,userId,movieId,rating,timestamp,original_title
0,15,1584,4.0,1052896685,School of Rock
1,641,1407,5.0,856747910,La Môme
2,217,110,4.0,1108160687,Trois couleurs : Rouge
3,292,1722,4.0,1140050529,Captain Corelli's Mandolin
4,424,2770,4.0,1088826594,American Pie 2


In [173]:
movies_df = small_df.merge(movies_df, on='movieId')

In [176]:
movies_df.head()

Unnamed: 0,userId_x,movieId,rating_x,timestamp_x,userId_y,rating_y,timestamp_y,original_title
0,15,1584,4.0,1052896685,15,4.0,1052896685,School of Rock
1,641,1407,5.0,856747910,641,5.0,856747910,La Môme
2,217,110,4.0,1108160687,217,4.0,1108160687,Trois couleurs : Rouge
3,292,1722,4.0,1140050529,292,4.0,1140050529,Captain Corelli's Mandolin
4,424,2770,4.0,1088826594,424,4.0,1088826594,American Pie 2


In [177]:
movies_pivoted = movies_df.pivot(index='userId_x', columns='movieId', values='rating_x')

In [165]:
movie_df_pivoted = movies_df.pivot(index='userId', columns='movieId', values='rating')

In [66]:
small_df.pivot(index='userId', columns='movieId', values='rating')

movieId,110,253,1407,1584,1722,2770,3739,4292,4451,8957
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
15,,,,4.0,,,,,2.5,
96,,,,,,,5.0,,,
217,4.0,,,,,,,,,
254,,3.0,,,,,,,,
292,,,,,4.0,,,,,
295,,,,,,,,,,4.0
424,,,,,,4.0,,,,
452,,,,,,,,4.0,,
641,,,5.0,,,,,,,


In [68]:
pivoted = ratings_df.pivot(index='userId', columns='movieId', values='rating')

In [90]:
len(ratings_df['movieId'].unique())

9066

In [69]:
pivoted.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,161084,161155,161594,161830,161918,161944,162376,162542,162672,163949
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,4.0,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,4.0,...,,,,,,,,,,
5,,,4.0,,,,,,,,...,,,,,,,,,,


In [92]:
pivoted_filled = pivoted.fillna(.5)

In [180]:
movie_df_pivoted = movies_pivoted.fillna(0)

In [170]:
movie_df_pivoted.head()

movieId,110,253,1407,1584,1722,2770
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
15,0.0,0.0,0.0,4.0,0.0,0.0
217,4.0,0.0,0.0,0.0,0.0,0.0
254,0.0,3.0,0.0,0.0,0.0,0.0
292,0.0,0.0,0.0,0.0,4.0,0.0
424,0.0,0.0,0.0,0.0,0.0,4.0


In [93]:
from sklearn.decomposition import TruncatedSVD

svd = TruncatedSVD(n_components=10, n_iter=7, random_state=42)
svd.fit(pivoted_filled)

TruncatedSVD(algorithm='randomized', n_components=10, n_iter=7,
       random_state=42, tol=0.0)

In [94]:
print(svd.explained_variance_ratio_)

[0.05550726 0.0653082  0.04663567 0.03124022 0.01909455 0.0172729
 0.01504083 0.01447535 0.01214544 0.01103283]


In [95]:
print(svd.singular_values_)

[1406.38361097  250.00241714  208.95216658  171.02156865  133.70712276
  127.16359922  118.84360005  116.4122836   106.63228386  101.63180084]


In [167]:
from scipy.linalg import svd as scipy_svd
U, Sigma, VT = scipy_svd(pivoted_filled)

In [212]:
U_test, Sigma_test, VT_test = scipy_svd(pivoted_filled)

In [188]:
pivoted_filled.shape

(671, 9066)

In [213]:
Vmatx = pd.DataFrame(VT_test)

In [214]:
Umatx = pd.DataFrame(U_test)

In [215]:
Vmatx.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,9056,9057,9058,9059,9060,9061,9062,9063,9064,9065
0,-0.033558,-0.018355,-0.01356,-0.009798,-0.013705,-0.01978,-0.013539,-0.009602,-0.010529,-0.019651,...,-0.009215,-0.009088,-0.009222,-0.009112,-0.009137,-0.009226,-0.009303,-0.009197,-0.009148,-0.009375
1,0.04123,0.01004,-0.006467,-0.006485,-0.000343,0.023492,6.9e-05,-0.006373,-0.010216,0.010753,...,-0.006727,-0.007646,-0.006357,-0.007461,-0.007276,-0.007746,-0.005583,-0.008275,-0.007996,-0.005579
2,-0.030658,-0.010223,0.009034,0.003434,0.005847,0.001952,0.020867,0.003408,0.006597,-0.011131,...,0.001644,0.000421,-0.001884,0.000191,-3.8e-05,-0.001016,-0.003267,0.000372,0.000394,0.003171
3,0.042481,0.069175,0.027087,0.006006,0.029629,0.019549,0.024422,0.005033,0.01447,0.081076,...,-0.00349,0.000652,0.000729,0.000738,0.000823,0.000881,0.000775,0.000466,0.000549,-0.008668
4,-0.001214,-0.027473,0.001386,-0.002194,-0.003741,-0.063788,0.001004,-0.001986,-0.001654,-0.059125,...,-0.003813,0.000977,0.001279,0.001367,0.001757,0.003703,0.00146,0.00145,0.00124,-0.0098


In [216]:
Umatx.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,661,662,663,664,665,666,667,668,669,670
0,-0.03331,-0.036091,0.013554,-0.01237,0.019126,-0.00066,0.008808,-0.012245,0.006421,-0.017007,...,0.036896,-0.008904,0.001622,-0.013954,0.001225,-0.004141,0.005255,-0.017511,0.007219,-0.00646
1,-0.036076,-0.023756,0.021776,0.050135,-0.070503,0.007255,0.013427,0.022132,-0.021953,-0.00019,...,0.022619,0.003883,0.041512,-0.017571,0.038864,-0.055351,0.018636,0.010548,0.01271,0.048683
2,-0.035302,-0.022657,-0.003482,-0.002326,-0.003092,0.002335,-0.016754,-0.015031,-0.002619,0.015388,...,0.01804,-0.000326,-0.006742,-0.000505,0.004824,-0.005375,0.013257,-0.006829,-0.002142,-0.001992
3,-0.042439,0.051439,0.033056,0.050554,0.101593,0.028359,0.012887,0.062993,-0.032854,-0.062601,...,0.001138,-0.003094,0.001171,-0.004623,0.002833,2.5e-05,-0.004361,-0.001094,0.007521,0.001787
4,-0.037268,-0.001454,-0.012799,0.012769,-0.008617,-0.063712,0.002828,-0.047961,-0.039844,-0.018956,...,-0.009826,0.008793,0.005874,-0.000663,-0.007912,0.000645,-0.009598,-0.004837,0.004441,-0.004756


In [228]:
def cosine_distance(u, v):
    """
    Returns the cosine of the angle between vectors v and u. This is equal to
    u.v / |u||v|.
    """
    return np.dot(u, v) / (math.sqrt(np.dot(u, u)) * math.sqrt(np.dot(v, v)))

In [217]:
user1 = Umatx.iloc[0]

In [218]:
Umatx.shape

(671, 671)

In [219]:
Vmatx.shape

(9066, 9066)

In [220]:
Sigma_test.shape

(671,)

In [221]:
pivoted_filled.shape

(671, 9066)

In [223]:
Umatx.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,661,662,663,664,665,666,667,668,669,670
0,-0.03331,-0.036091,0.013554,-0.01237,0.019126,-0.00066,0.008808,-0.012245,0.006421,-0.017007,...,0.036896,-0.008904,0.001622,-0.013954,0.001225,-0.004141,0.005255,-0.017511,0.007219,-0.00646
1,-0.036076,-0.023756,0.021776,0.050135,-0.070503,0.007255,0.013427,0.022132,-0.021953,-0.00019,...,0.022619,0.003883,0.041512,-0.017571,0.038864,-0.055351,0.018636,0.010548,0.01271,0.048683
2,-0.035302,-0.022657,-0.003482,-0.002326,-0.003092,0.002335,-0.016754,-0.015031,-0.002619,0.015388,...,0.01804,-0.000326,-0.006742,-0.000505,0.004824,-0.005375,0.013257,-0.006829,-0.002142,-0.001992
3,-0.042439,0.051439,0.033056,0.050554,0.101593,0.028359,0.012887,0.062993,-0.032854,-0.062601,...,0.001138,-0.003094,0.001171,-0.004623,0.002833,2.5e-05,-0.004361,-0.001094,0.007521,0.001787
4,-0.037268,-0.001454,-0.012799,0.012769,-0.008617,-0.063712,0.002828,-0.047961,-0.039844,-0.018956,...,-0.009826,0.008793,0.005874,-0.000663,-0.007912,0.000645,-0.009598,-0.004837,0.004441,-0.004756


In [236]:
Umatx.apply(cosine_distance, raw=True, args=(user1), axis=1)

ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

In [233]:
import numpy as np
import math
type(cosine_distance(user1, user1))

numpy.float64

### Cosine Similarity

In [237]:
from sklearn.metrics.pairwise import cosine_similarity

In [250]:
cosine_similarity(user1.reshape(1,-1) ,user1.reshape(1,-1))

array([[1.]])

In [251]:
user1 = U_test[0]

In [265]:
from heapq import heappush, heappop, nlargest, nsmallest

user_v = user1.reshape(1,-1)
heap = []
for i, row in enumerate(U_test):
    v_row = row.reshape(1,-1)
    heappush(heap, (cosine_similarity(user_v, v_row)[0][0], i))

In [264]:
nsmallest(10, heap)

[-5.48172618408671e-16,
 -3.2959746043559335e-16,
 -3.200564813177209e-16,
 -2.983724378680108e-16,
 -2.671474153004283e-16,
 -2.5673907444456745e-16,
 -2.5326962749261384e-16,
 -2.42861286636753e-16,
 -2.42861286636753e-16,
 -2.3592239273284576e-16]

In [266]:
nsmallest(10, heap)

[(-5.48172618408671e-16, 324),
 (-3.2959746043559335e-16, 632),
 (-3.200564813177209e-16, 468),
 (-2.983724378680108e-16, 384),
 (-2.671474153004283e-16, 551),
 (-2.5673907444456745e-16, 8),
 (-2.5326962749261384e-16, 414),
 (-2.42861286636753e-16, 325),
 (-2.42861286636753e-16, 633),
 (-2.3592239273284576e-16, 171)]

In [40]:
ratings_df_pivoted.shape

(671, 9066)

In [55]:
U.shape

(671, 671)

In [53]:
VT.shape

(671, 9066)

In [54]:
Sigma.shape

(671,)

In [41]:
ratings_df_pivoted

movieId,1,2,3,4,5,6,7,8,9,10,...,161084,161155,161594,161830,161918,161944,162376,162542,162672,163949
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Create SVD Matrix

In [68]:
import pandas as pd
from scipy.linalg import svd as scipy_svd
import numpy as np
import math
from sklearn.metrics.pairwise import cosine_similarity
from heapq import heappush, heappop, nlargest, nsmallest

ratings_df = pd.read_csv('../data/the-movies-dataset/ratings_small.csv')
movies = pd.read_csv('../data/the-movies-dataset/movies_metadata.csv')

In [88]:
movies_df = movies[['original_title', 'id']]

movies_df = movies_df.assign(movieId=pd.to_numeric(movies_df.id, errors='coerce').fillna(-1).astype('int64'))


movies_df = ratings_df.merge(movies_df, on='movieId')

In [89]:
movies_df.head()

Unnamed: 0,userId,movieId,rating,timestamp,original_title,id
0,1,1371,2.5,1260759135,Rocky III,1371
1,4,1371,4.0,949810302,Rocky III,1371
2,7,1371,3.0,851869160,Rocky III,1371
3,19,1371,4.0,855193404,Rocky III,1371
4,21,1371,3.0,853852263,Rocky III,1371


In [74]:
ratings_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [92]:
movies_df.groupby(['userId', 'movieId']).agg()

Unnamed: 0_level_0,Unnamed: 1_level_0,rating,timestamp,original_title,id
userId,movieId,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,1371,2.5,1260759135,Rocky III,1371
1,1405,1.0,1260759203,Greed,1405
1,2105,4.0,1260759139,American Pie,2105
1,2193,2.0,1260759198,My Tutor,2193
1,2294,2.0,1260759108,Jay and Silent Bob Strike Back,2294
1,2455,2.5,1260759113,Vivement dimanche!,2455
2,17,5.0,835355681,The Dark,17
2,62,3.0,835355749,2001: A Space Odyssey,62
2,110,4.0,835355532,Trois couleurs : Rouge,110
2,144,3.0,835356016,Der Himmel über Berlin,144


In [85]:
#Commented movies pivoted, because could not get column names as movie names
#movies_df.reset_index(inplace=True)
movies_pivoted = movies_df[['userId', 'movieId', 'rating']].pivot(index='userId', columns='movieId', values='rating')
#ratings_pivoted = ratings_df.pivot(index='userId', columns='movieId', values='rating')

#movie_df_pivoted = movies_pivoted.fillna(0)
#ratings_df_pivoted = ratings_pivoted.fillna(0)

ValueError: Index contains duplicate entries, cannot reshape

In [82]:
svd_input = ratings_df.merge(movies_df, how='left', on='movieId', suffixes=('', '1'))

In [83]:
svd_input.head()

Unnamed: 0,userId,movieId,rating,timestamp,level_0,index,userId1,rating1,timestamp1,original_title,id
0,1,31,2.5,1260759144,,,,,,,
1,1,1029,3.0,1260759179,,,,,,,
2,1,1061,3.0,1260759182,,,,,,,
3,1,1129,2.0,1260759185,,,,,,,
4,1,1172,4.0,1260759205,,,,,,,


In [84]:
svd_input.isnull().sum()

userId                0
movieId               0
rating                0
timestamp             0
level_0           55015
index             55015
userId1           55015
rating1           55015
timestamp1        55015
original_title    55015
id                55015
dtype: int64

In [None]:
U, Sigma, VT = scipy_svd(movie_df_pivoted, full_matrices=False)
#U, Sigma, VT = scipy_svd(ratings_df_pivoted.values, full_matrices=False)

In [66]:
user1 = U[1]

user_v = user1.reshape(1,-1)
heap = []
for i, row in enumerate(U):
    v_row = row.reshape(1,-1)
    heappush(heap, (cosine_similarity(user_v, v_row)[0][0], i))

print(nsmallest(10, heap))

  interactivity=interactivity, compiler=compiler, result=result)


ValueError: Index contains duplicate entries, cannot reshape

In [63]:
ratings_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [64]:
movies_df.head()

Unnamed: 0,userId,movieId,rating,timestamp,original_title,id
0,1,1371,2.5,1260759135,Rocky III,1371
1,4,1371,4.0,949810302,Rocky III,1371
2,7,1371,3.0,851869160,Rocky III,1371
3,19,1371,4.0,855193404,Rocky III,1371
4,21,1371,3.0,853852263,Rocky III,1371


In [58]:
pd.DataFrame(VT.T, index=ratings_df_pivoted.columns.values)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,661,662,663,664,665,666,667,668,669,670
1,-0.078961,-0.035243,-0.050907,0.028616,0.008136,0.028946,0.065364,0.063612,-0.025274,0.018813,...,0.016361,0.034945,-0.024131,0.018539,-0.035395,-0.016775,0.029596,0.027444,-0.047852,0.063567
2,-0.032187,-0.015264,-0.064308,-0.008655,-0.041373,0.001270,0.025483,0.038160,0.007073,0.008542,...,-0.010672,-0.041870,-0.055457,0.013986,0.009900,-0.065183,0.005995,-0.021970,0.055425,-0.053545
3,-0.012846,0.005636,-0.028440,-0.009420,-0.015818,-0.004685,0.002659,0.006111,0.007165,0.003801,...,0.053952,-0.057408,-0.049232,-0.010528,-0.004044,-0.018703,0.017239,-0.013817,0.039810,-0.031211
4,-0.002416,0.002665,-0.006711,0.001049,-0.002712,0.000192,0.000195,0.005580,0.006754,0.003960,...,0.009880,0.028922,-0.004554,-0.031128,-0.013353,-0.007604,0.009789,-0.022174,-0.021233,-0.022152
5,-0.015225,0.004057,-0.026970,-0.018439,-0.022584,0.021009,-0.015852,0.018845,0.006176,0.023055,...,0.097576,0.056898,-0.102505,0.162508,-0.007992,0.024982,0.008476,0.055147,-0.025590,-0.004687
6,-0.038012,0.002343,-0.024066,0.039259,-0.034002,-0.044785,-0.031435,-0.055307,0.008385,-0.003077,...,0.037797,-0.095084,-0.023369,0.040705,-0.002454,-0.002454,0.057405,0.044920,-0.059877,0.042752
7,-0.014711,0.019143,-0.024539,-0.011992,-0.020762,0.026267,0.015658,0.022708,-0.003303,0.010887,...,-0.017005,0.047406,-0.025227,0.020172,-0.005416,-0.000028,0.010551,0.022638,0.015305,-0.002332
8,-0.001741,0.002836,-0.003775,-0.001847,-0.004573,0.005814,-0.003753,-0.006035,-0.007372,0.010121,...,-0.046261,0.005367,0.023683,-0.032498,-0.013234,-0.007618,0.007346,-0.051247,-0.015020,-0.022472
9,-0.003077,0.003698,-0.015768,-0.001582,-0.008990,-0.006361,0.003709,0.000525,0.001794,0.000892,...,0.020094,-0.025564,0.025169,0.043742,0.020029,-0.009992,-0.050885,-0.005196,0.057813,-0.024755
10,-0.035408,-0.017961,-0.080703,0.017175,-0.045783,-0.016481,-0.000728,-0.022538,0.031905,-0.047215,...,-0.033326,-0.035164,-0.025602,0.043602,-0.067955,-0.000174,-0.028093,0.002812,0.064930,-0.050621


In [3]:
closest_users = nlargest(10, heap)

In [4]:
[_[1] for _ in closest_users]

[1, 207, 182, 214, 565, 526, 665, 270, 572, 612]

### Getting Movies Based on Closest Users

In [5]:
user_id = 1
user_seen_movies_df = ratings_df.groupby('userId').get_group(user_id)[['movieId']][0:10]
user_seen_movies = user_seen_movies_df['movieId']

In [6]:
[x for x in user_seen_movies]

[31, 1029, 1061, 1129, 1172, 1263, 1287, 1293, 1339, 1343]

In [29]:
for m in user_seen_movies:
    print(m)

31
1029
1061
1129
1172
1263
1287
1293
1339
1343


0                                Toy Story
1                                  Jumanji
2                         Grumpier Old Men
3                        Waiting to Exhale
4              Father of the Bride Part II
5                                     Heat
6                                  Sabrina
7                             Tom and Huck
8                             Sudden Death
9                                GoldenEye
10                  The American President
11             Dracula: Dead and Loving It
12                                   Balto
13                                   Nixon
14                        Cutthroat Island
15                                  Casino
16                   Sense and Sensibility
17                              Four Rooms
18          Ace Ventura: When Nature Calls
19                             Money Train
20                              Get Shorty
21                                 Copycat
22                               Assassins
23         

In [9]:
movies_df[movies_df['movieId'] == 273]['original_title']

2077    Das weisse Rauschen
2078    Das weisse Rauschen
2079    Das weisse Rauschen
2080    Das weisse Rauschen
2081    Das weisse Rauschen
2082    Das weisse Rauschen
2083    Das weisse Rauschen
2084    Das weisse Rauschen
2085    Das weisse Rauschen
2086    Das weisse Rauschen
2087    Das weisse Rauschen
2088    Das weisse Rauschen
2089    Das weisse Rauschen
2090    Das weisse Rauschen
2091    Das weisse Rauschen
2092    Das weisse Rauschen
2093    Das weisse Rauschen
2094    Das weisse Rauschen
2095    Das weisse Rauschen
2096    Das weisse Rauschen
2097    Das weisse Rauschen
2098    Das weisse Rauschen
2099    Das weisse Rauschen
2100    Das weisse Rauschen
2101    Das weisse Rauschen
2102    Das weisse Rauschen
2103    Das weisse Rauschen
Name: original_title, dtype: object

In [30]:
for mov_id in user_seen_movies:
    try:
        print(set(movies_df[movies_df['movieId'] == mov_id]['original_title']).pop())
    except:
        print('empty')

empty
empty
empty
empty
empty
empty
empty
empty
empty
empty


In [36]:
set(movies_df[movies_df['movieId'] == 273]['original_title']).pop() #273

'Das weisse Rauschen'

In [59]:
movies_df[movies_df['movieId'] == 162376]

Unnamed: 0,userId,movieId,rating,timestamp,original_title,id


In [62]:
movies[movies.id == 273]

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
