# Model Based Movie Recommendation System

### Importing Libraries 

In [3]:
import numpy as np
import pandas as pd
from sklearn.decomposition import TruncatedSVD

### importing data

This data contains the ratings given to different movies by different users.

In [4]:
# The ratings file is tab-separated and has no header row, so the column
# names are supplied here and header=None is stated explicitly.
columns = ['user_id', 'item_id', 'rating', 'timestamp']
data = pd.read_csv(
    'ml-100k/u.data',
    sep='\t',
    header=None,
    names=columns,
)
data.head()

Unnamed: 0,user_id,item_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


This Data contains information about movies
We will only use movie ID and name

In [81]:
# Column names for the pipe-delimited u.item file, passed via names= below.
columns = [
    'item_id', 'movie title', 'release date', 'video release date', 'IMDb URL',
    'unknown', 'Action', 'Adventure', 'Animation', 'Childrens', 'Comedy', 'Crime',
    'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery',
    'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western',
]

movies = pd.read_csv('ml-100k/u.item', sep='|', names=columns, encoding='latin-1')
# Only the id and the title are used by the rest of the notebook.
movie_names = movies.loc[:, ['item_id', 'movie title']]
movie_names.head()

Unnamed: 0,item_id,movie title
0,1,Toy Story (1995)
1,2,GoldenEye (1995)
2,3,Four Rooms (1995)
3,4,Get Shorty (1995)
4,5,Copycat (1995)


### Merging Data Based on Movie IDs 

In [82]:
# Attach each rating's movie title by joining on the shared item_id column
# (default inner join).
movie_rating = data.merge(movie_names, on='item_id')

In [83]:
# Display the merged frame: one row per rating, with the movie title attached.
movie_rating

Unnamed: 0,user_id,item_id,rating,timestamp,movie title
0,196,242,3,881250949,Kolya (1996)
1,63,242,3,875747190,Kolya (1996)
2,226,242,5,883888671,Kolya (1996)
3,154,242,3,879138235,Kolya (1996)
4,306,242,5,876503793,Kolya (1996)
...,...,...,...,...,...
99995,840,1674,4,891211682,Mamma Roma (1962)
99996,655,1640,3,888474646,"Eighth Day, The (1996)"
99997,655,1637,3,888984255,Girls Town (1996)
99998,655,1630,3,887428735,"Silence of the Palace, The (Saimt el Qusur) (1..."


### Grouping Data by movies and finding rating counts 

We will group the data by movie IDs and find the count of ratings received by each movie

In [9]:
# Count the ratings received by each item_id and list the most-rated first.
movie_rating_count = (
    movie_rating
    .groupby('item_id')['rating']
    .count()
    .to_frame()
    .sort_values('rating', ascending=False)
)

In [10]:
# Display the per-movie rating counts, sorted from most to least rated.
movie_rating_count

Unnamed: 0_level_0,rating
item_id,Unnamed: 1_level_1
50,583
258,509
100,508
181,507
294,485
...,...
1576,1
1577,1
1348,1
1579,1


Movie with ID 50 is the most rated movie

In [24]:
# Derive the most-rated item_id from the data instead of hard-coding it:
# value_counts() tallies the ratings per item_id and idxmax() returns the id
# with the largest tally.
most_rated_id = movie_rating['item_id'].value_counts().idxmax()
# Take the title from the first rating row that belongs to that id.
most_rated_movie = movie_rating.loc[
    movie_rating['item_id'] == most_rated_id, 'movie title'
].iloc[0]

In [25]:
# Display the title stored in most_rated_movie.
most_rated_movie

'Star Wars (1977)'

### Finding User Rating of each Movie

Here we will create a pivot table.<br />
It will take users as rows and movies as columns<br />
The cell will be filled by the rating they give to a movie they watched<br />
Null values will be filled by 0

In [84]:
# Users become rows and movie titles become columns. A (user, title) pair with
# several ratings is averaged (pandas' default aggfunc, written out here), and
# a pair with no rating is filled with 0.
user_rating = movie_rating.pivot_table(
    values='rating',
    index='user_id',
    columns='movie title',
    aggfunc='mean',
    fill_value=0,
)

In [85]:
# Display the user-by-title rating matrix (0 where the user gave no rating).
user_rating

movie title,'Til There Was You (1997),1-900 (1994),101 Dalmatians (1996),12 Angry Men (1957),187 (1997),2 Days in the Valley (1996),"20,000 Leagues Under the Sea (1954)",2001: A Space Odyssey (1968),3 Ninjas: High Noon At Mega Mountain (1998),"39 Steps, The (1935)",...,Yankee Zulu (1994),Year of the Horse (1997),You So Crazy (1994),Young Frankenstein (1974),Young Guns (1988),Young Guns II (1990),"Young Poisoner's Handbook, The (1995)",Zeus and Roxanne (1997),unknown,Á köldum klaka (Cold Fever) (1994)
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,0,2,5,0,0,3,4,0,0,...,0,0,0,5,3,0,0,0,4,0
2,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,2,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,2,0,0,0,0,4,0,0,...,0,0,0,4,0,0,0,0,4,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
939,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
940,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
941,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
942,0,0,0,0,0,0,0,3,0,3,...,0,0,0,0,0,0,0,0,0,0


### Compressing the Data by columns

There are 943 users and 1664 distinct movie titles.<br />
We will compress the data from 1664 × 943 (movies × users) to 1664 × 12.

In [59]:
# Reduce to 12 components; the fixed random_state keeps the result repeatable
# from run to run.
svd = TruncatedSVD(
    n_components=12,
    random_state=17,
)

Transpose the user_rating data because we want all the movies uncompressed

In [60]:
# Transpose so that each row is a movie and each column a user, then project
# every movie onto the SVD components.
X = svd.fit_transform(np.transpose(user_rating.values))

In [61]:
# Wrap the reduced matrix in a DataFrame (default integer row and column labels)
# so it displays as a table.
X = pd.DataFrame(data=X)
X

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,1.039994,0.659900,0.045686,0.814038,0.166974,-0.983311,-0.321286,0.307193,-0.036657,-0.733809,-0.307421,-0.502488
1,0.436584,-0.257259,0.352956,-0.668694,-0.293082,-0.005821,-0.231494,-0.566758,0.336974,-0.235076,0.356348,-0.095098
2,12.543744,5.669187,-4.907836,5.021835,7.880469,-0.812796,-0.102903,-4.838583,2.094505,3.865625,0.590287,-0.841715
3,25.663725,-12.267634,6.077468,3.114552,0.713219,1.884833,-3.659443,-0.373329,1.397950,-5.825121,1.229550,-1.668007
4,3.636418,4.217252,2.633292,1.654664,-3.200552,-1.943744,-0.664004,-0.819335,2.333509,-0.272937,-0.727663,-1.538501
...,...,...,...,...,...,...,...,...,...,...,...,...
1659,6.566454,1.511648,-5.562119,-1.528016,-3.108589,-2.727236,-0.578669,-0.009548,-0.904834,-1.267858,1.716259,0.037213
1660,4.787711,2.142408,3.589150,-4.735310,-1.221891,-2.299756,0.182283,-3.077109,1.497814,1.913944,1.546886,-2.662323
1661,0.358930,0.371252,0.022971,0.246609,0.650117,-0.005131,-0.018331,-0.511889,0.017368,-0.075916,0.183633,-0.034075
1662,1.424280,0.814959,-0.490235,-0.654467,-1.029017,-0.322938,0.723553,-0.762313,0.415133,0.158403,0.552850,-0.616945


### Finding Correlation of each movie based on User Rating 

Here we will find the correlation of every movie with every other movie, based on the user ratings.<br />
If a person rated Star Wars highly, they are likely to enjoy the movies that have a high correlation with Star Wars.

In [86]:
# Each row of X is one movie, so treating rows as variables (rowvar=True,
# NumPy's default, spelled out here) yields a movie-by-movie correlation matrix.
movie_corr = pd.DataFrame(data=np.corrcoef(X, rowvar=True))

In [87]:
# Display the movie-by-movie correlation matrix.
movie_corr 


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1654,1655,1656,1657,1658,1659,1660,1661,1662,1663
0,1.000000,-0.108751,0.522660,0.409234,0.641605,0.561322,0.271426,0.222175,0.492033,0.124765,...,0.382802,0.372460,0.400351,0.247988,0.437873,0.447126,0.156613,0.392865,0.218205,0.511112
1,-0.108751,1.000000,0.065397,0.511070,0.309820,0.429945,0.304974,0.474048,0.417974,0.585315,...,0.225862,0.332031,-0.071705,0.517818,0.202988,0.226180,0.751335,0.157371,0.512735,0.245910
2,0.522660,0.065397,1.000000,0.377707,0.287657,0.575654,0.703720,0.409269,0.190444,0.344502,...,0.241067,0.257025,0.465188,0.521846,0.605136,0.565363,0.291041,0.767696,0.444013,0.200189
3,0.409234,0.511070,0.377707,1.000000,0.250579,0.443322,0.750918,0.905796,0.519590,0.871994,...,0.375838,0.424989,0.569751,0.832513,0.522000,0.412438,0.324140,0.186903,0.248225,0.401022
4,0.641605,0.309820,0.287657,0.250579,1.000000,0.586358,0.228048,0.222068,0.600629,0.156082,...,0.279290,0.554805,0.202595,0.144045,0.335594,0.361469,0.576760,0.150962,0.582245,0.523622
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1659,0.447126,0.226180,0.565363,0.412438,0.361469,0.581108,0.551576,0.488732,0.209416,0.141380,...,0.568927,0.250817,0.597840,0.630023,0.969464,1.000000,0.327005,0.146445,0.727496,0.198872
1660,0.156613,0.751335,0.291041,0.324140,0.576760,0.704604,0.268015,0.395407,0.505965,0.310896,...,0.201390,0.388508,0.125838,0.341938,0.331121,0.327005,1.000000,0.245923,0.717627,0.618010
1661,0.392865,0.157371,0.767696,0.186903,0.150962,0.439790,0.340165,0.040586,0.322996,0.324466,...,-0.057199,0.240806,0.023629,0.283088,0.131619,0.146445,0.245923,1.000000,0.181418,0.111203
1662,0.218205,0.512735,0.444013,0.248225,0.582245,0.554654,0.452329,0.390606,0.330936,0.179903,...,0.405362,0.444359,0.274994,0.488155,0.668666,0.727496,0.717627,0.181418,1.000000,0.194100


### Testing 

As we saw above, Star Wars was the most-rated movie.<br />
Let's find the movies that correlate highly with it.

In [None]:
# NOTE: this rebinds movie_names from the (item_id, title) DataFrame built
# earlier to the Index of pivot-table column titles.
movie_names = user_rating.columns
# Plain Python list of the same titles, used for positional lookup.
movies_list = movie_names.tolist()

In [78]:
# Position of the Star Wars column among the pivot-table titles.
target_title = 'Star Wars (1977)'
star_wars_index = movies_list.index(target_title)
star_wars_index

1398

In [79]:
# Select the Star Wars column of the correlation matrix: its correlation with
# every movie. (The original referenced the undefined name `star_war_index`,
# which raises NameError on a fresh kernel.)
star_wars_corr = movie_corr[star_wars_index]

In [80]:
# Keep the titles whose correlation with Star Wars exceeds 0.9.
highly_correlated = star_wars_corr.gt(0.9)
movie_names[highly_correlated]

Index(['Die Hard (1988)', 'Empire Strikes Back, The (1980)',
       'Fugitive, The (1993)', 'Raiders of the Lost Ark (1981)',
       'Return of the Jedi (1983)', 'Star Wars (1977)',
       'Terminator 2: Judgment Day (1991)', 'Terminator, The (1984)',
       'Toy Story (1995)'],
      dtype='object', name='movie title')

In [None]:
# The movies above can be recommended to users who liked Star Wars (1977).