In [1]:
import pandas as pd
import numpy as np

from sklearn.decomposition import TruncatedSVD

In [2]:
frame = pd.read_csv('u.data', names = ["user_id", "item_id","rating","timestamp"], header=None, sep='\t')

In [3]:
frame.head()

Unnamed: 0,user_id,item_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [4]:
len(frame)

100000

In [5]:
columns = ["item_id", "movie title", "release date", "video release date", "IMDb URL", 'unknown', "Action",
          "Adventure", "Animation", "Childrens", "Comedy", "Crime", "Documentary", "Drama", "Fantasy", "Film-Noir",
          "Horror", "Musical","Mystery","Romance","Sci-Fi", "Thirller", "War", "Western"]

movies = pd.read_csv('ml-100k/u.item', sep='|', names = columns, encoding="Latin-1")

In [6]:
movies.head()

Unnamed: 0,item_id,movie title,release date,video release date,IMDb URL,unknown,Action,Adventure,Animation,Childrens,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thirller,War,Western
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [7]:
movie_names = pd.DataFrame( movies[['item_id', 'movie title']])

In [8]:
combined_movies_data = pd.merge(frame, movie_names, on='item_id')
combined_movies_data.head()

Unnamed: 0,user_id,item_id,rating,timestamp,movie title
0,196,242,3,881250949,Kolya (1996)
1,63,242,3,875747190,Kolya (1996)
2,226,242,5,883888671,Kolya (1996)
3,154,242,3,879138235,Kolya (1996)
4,306,242,5,876503793,Kolya (1996)


In [9]:
# Find most reviewed and rated movies
combined_movies_data.groupby('item_id')['rating'].count().sort_values(ascending=False).head()

item_id
50     583
258    509
100    508
181    507
294    485
Name: rating, dtype: int64

In [10]:
Filter = combined_movies_data['item_id'] == 50
combined_movies_data[Filter]['movie title'].unique()

array(['Star Wars (1977)'], dtype=object)

In [11]:
# Building User-Item Matrix
ratings_crosstab = combined_movies_data.pivot_table(values='rating', index='user_id', columns='movie title', fill_value=0)
ratings_crosstab.head()

movie title,'Til There Was You (1997),1-900 (1994),101 Dalmatians (1996),12 Angry Men (1957),187 (1997),2 Days in the Valley (1996),"20,000 Leagues Under the Sea (1954)",2001: A Space Odyssey (1968),3 Ninjas: High Noon At Mega Mountain (1998),"39 Steps, The (1935)",...,Yankee Zulu (1994),Year of the Horse (1997),You So Crazy (1994),Young Frankenstein (1974),Young Guns (1988),Young Guns II (1990),"Young Poisoner's Handbook, The (1995)",Zeus and Roxanne (1997),unknown,Á köldum klaka (Cold Fever) (1994)
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,0,2,5,0,0,3,4,0,0,...,0,0,0,5,3,0,0,0,4,0
2,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,2,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,2,0,0,0,0,4,0,0,...,0,0,0,4,0,0,0,0,4,0


In [12]:
# Transposing the Matrix
print("User: %.0f " % ratings_crosstab.shape[0] + "--- Movie: %.0f " % ratings_crosstab.shape[1])

print(ratings_crosstab.shape)

User: 943 --- Movie: 1664 
(943, 1664)


In [13]:
X = ratings_crosstab.values.T
X.shape

(1664, 943)

In [14]:
# Decomposing Matrix

SVD = TruncatedSVD(n_components=12, random_state=17)

resultant_matrix = SVD.fit_transform(X)

resultant_matrix.shape

(1664, 12)

In [15]:
# Generating Correlation Matrix
corr_mat = np.corrcoef(resultant_matrix)
corr_mat.shape

(1664, 1664)

In [16]:
# Isolating Star Wars From the Correlation Matrix
movies_names = ratings_crosstab.columns.values
movies_list = list(movie_names['movie title'])

star_wars = movies_list.index('Star Wars (1977)')
print(star_wars)

49


In [17]:
corr_star_wars = corr_mat[star_wars]
corr_star_wars.shape

(1664,)

In [18]:
corr_star_wars[:50]

array([-0.10442202,  0.80609585, -0.06343612,  0.68287662,  0.20096791,
        0.40019502,  0.31943315,  0.71892043,  0.27435215,  0.6738408 ,
        0.78230844,  0.35893529,  0.11525022,  0.070226  , -0.13327566,
        0.25848011,  0.33974213,  0.09798199,  0.06806563,  0.18359194,
        0.14285041,  0.13405622,  0.55221098,  0.06012094,  0.46441653,
        0.48561561,  0.00512004,  0.63754098,  0.09967793,  0.25447834,
        0.85051979,  0.23595032,  0.16199174,  0.05675966,  0.18072585,
        0.50833008,  0.19004817, -0.14201848, -0.17391558,  0.37462918,
        0.15371963,  0.57371509,  0.25067657,  0.2778654 ,  0.51251416,
        0.62943488, -0.11898177,  0.39389534,  0.29404732,  1.        ])

In [19]:
# Each of the rows has Pearson r correlation indicates how well each movie in the dataset is correlated with
# Star Wars (1977) based on user preferences.

In [20]:
just_movie_names = np.array(movie_names.iloc[:1664, 1])

In [23]:
# Recommend Highly Correlated Movie with Star Wars (1977)
(just_movie_names[(corr_star_wars <1.0) & (corr_star_wars > 0.88)])

array(['Incognito (1997)', 'Heathers (1989)',
       'Tetsuo II: Body Hammer (1992)', 'Love Is All There Is (1996)',
       'Mr. Wonderful (1993)'], dtype=object)