In [47]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

In [48]:
cols = ['User_id', 'item_id', 'rating', 'timestamp']

In [52]:
user_ratings = pd.read_csv('u.data', names = cols, sep= '\t')

In [55]:
user_ratings

Unnamed: 0,User_id,item_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596
...,...,...,...,...
99995,880,476,3,880175444
99996,716,204,5,879795543
99997,276,1090,1,874795795
99998,13,225,2,882399156


In [80]:
cols2 = ['item_id', 'title']

In [81]:
cols2

['item_id', 'Title']

In [82]:
movie_titles = pd.read_csv('u.item', names = cols2, usecols = range(2), encoding = 'latin1', sep= '|')

In [83]:
movie_titles

Unnamed: 0,item_id,Title
0,1,Toy Story (1995)
1,2,GoldenEye (1995)
2,3,Four Rooms (1995)
3,4,Get Shorty (1995)
4,5,Copycat (1995)
...,...,...
1677,1678,Mat' i syn (1997)
1678,1679,B. Monkey (1998)
1679,1680,Sliding Doors (1998)
1680,1681,You So Crazy (1994)


In [96]:
df = user_ratings.merge(movie_titles)

In [93]:
movie_titles

Unnamed: 0,item_id,Title
0,1,Toy Story (1995)
1,2,GoldenEye (1995)
2,3,Four Rooms (1995)
3,4,Get Shorty (1995)
4,5,Copycat (1995)
...,...,...
1677,1678,Mat' i syn (1997)
1678,1679,B. Monkey (1998)
1679,1680,Sliding Doors (1998)
1680,1681,You So Crazy (1994)


In [94]:
user_ratings.head()

Unnamed: 0,User_id,item_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [97]:
df

Unnamed: 0,User_id,item_id,rating,timestamp,Title
0,196,242,3,881250949,Kolya (1996)
1,63,242,3,875747190,Kolya (1996)
2,226,242,5,883888671,Kolya (1996)
3,154,242,3,879138235,Kolya (1996)
4,306,242,5,876503793,Kolya (1996)
...,...,...,...,...,...
99995,840,1674,4,891211682,Mamma Roma (1962)
99996,655,1640,3,888474646,"Eighth Day, The (1996)"
99997,655,1637,3,888984255,Girls Town (1996)
99998,655,1630,3,887428735,"Silence of the Palace, The (Saimt el Qusur) (1..."


In [106]:
df.groupby('Title')['rating'].count().sort_values(ascending=False).head(10) # shows the most highly
#watched films but is perhaps unfair as one film may have been watched by loads more people?

Title
Star Wars (1977)                 583
Contact (1997)                   509
Fargo (1996)                     508
Return of the Jedi (1983)        507
Liar Liar (1997)                 485
English Patient, The (1996)      481
Scream (1996)                    478
Toy Story (1995)                 452
Air Force One (1997)             431
Independence Day (ID4) (1996)    429
Name: rating, dtype: int64

In [109]:
df.groupby('Title')['rating'].mean().sort_values(ascending=False).round().head(20) # a better way to 
# rank the films as it gives the average so does not depend on the number of people who watched them

Title
They Made Me a Criminal (1939)                            5.0
Marlene Dietrich: Shadow and Light (1996)                 5.0
Saint of Fort Washington, The (1993)                      5.0
Someone Else's America (1995)                             5.0
Star Kid (1997)                                           5.0
Great Day in Harlem, A (1994)                             5.0
Aiqing wansui (1994)                                      5.0
Santa with Muscles (1996)                                 5.0
Prefontaine (1997)                                        5.0
Entertaining Angels: The Dorothy Day Story (1996)         5.0
Pather Panchali (1955)                                    5.0
Some Mother's Son (1996)                                  4.0
Maya Lin: A Strong Clear Vision (1994)                    4.0
Anna (1996)                                               4.0
Everest (1998)                                            4.0
Close Shave, A (1995)                                     4.0
Sc

In [111]:
df.groupby('Title')['rating'].sum().sort_values(ascending=False).round().head(7) # shows the most highly
#recommended films but is perhaps unfair as some films may have been watched by loads more people?

Title
Star Wars (1977)                  2541
Fargo (1996)                      2111
Return of the Jedi (1983)         2032
Contact (1997)                    1936
Raiders of the Lost Ark (1981)    1786
Godfather, The (1972)             1769
English Patient, The (1996)       1759
Name: rating, dtype: int64

In [143]:
df['liked'] = (df['rating'] >= 4.5).astype(int) # adds a new column 'liked' and gives a numerical 
# value

In [144]:
df

Unnamed: 0,User_id,item_id,rating,timestamp,Title,liked
0,196,242,3,881250949,Kolya (1996),0
1,63,242,3,875747190,Kolya (1996),0
2,226,242,5,883888671,Kolya (1996),1
3,154,242,3,879138235,Kolya (1996),0
4,306,242,5,876503793,Kolya (1996),1
...,...,...,...,...,...,...
99995,840,1674,4,891211682,Mamma Roma (1962),0
99996,655,1640,3,888474646,"Eighth Day, The (1996)",0
99997,655,1637,3,888984255,Girls Town (1996),0
99998,655,1630,3,887428735,"Silence of the Palace, The (Saimt el Qusur) (1...",0


In [145]:
df.loc[950:960]

Unnamed: 0,User_id,item_id,rating,timestamp,Title,liked
950,293,265,3,888906193,"Hunt for Red October, The (1990)",0
951,648,265,4,884796886,"Hunt for Red October, The (1990)",0
952,479,265,4,879460918,"Hunt for Red October, The (1990)",0
953,554,265,4,876232956,"Hunt for Red October, The (1990)",0
954,738,265,4,892957967,"Hunt for Red October, The (1990)",0
955,709,265,4,879846489,"Hunt for Red October, The (1990)",0
956,495,265,5,888633316,"Hunt for Red October, The (1990)",1
957,710,265,4,883705484,"Hunt for Red October, The (1990)",0
958,660,265,2,891199241,"Hunt for Red October, The (1990)",0
959,592,265,4,882956039,"Hunt for Red October, The (1990)",0


In [247]:
search_result = df[df['Title'] == 'Aiqing wansui (1994)']
search_result

Unnamed: 0,User_id,item_id,rating,timestamp,Title,liked
98402,385,1536,5,879441339,Aiqing wansui (1994),1


In [249]:
df[df['Title'] =='Aiqing wansui (1994)']

Unnamed: 0,User_id,item_id,rating,timestamp,Title,liked
98402,385,1536,5,879441339,Aiqing wansui (1994),1


In [225]:
pivot_df = df.pivot(index='User_id', columns='item_id', values='liked').fillna(0)

In [226]:
pivot_df

item_id,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
User_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
939,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
940,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
941,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
942,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [148]:
user_movie_matrix = df.pivot(index='User_id', columns='item_id', values='liked').fillna(0)

In [227]:
user_movie_matrix

item_id,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
User_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
939,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
940,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
941,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
942,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [228]:
frequent_itemsets = apriori(user_movie_matrix, min_support=0.1, use_colnames=True)
rules = association_rules(frequent_itemsets, metric='lift', min_threshold=1.0)



In [229]:
mov = 96
df[df['item_id'] == mov]['Title'].iloc[0]

'Terminator 2: Judgment Day (1991)'

In [230]:
rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(56),(50),0.199364,0.344645,0.103924,0.521277,1.512504,0.035214,1.368964,0.423219
1,(50),(56),0.344645,0.199364,0.103924,0.301538,1.512504,0.035214,1.146285,0.51704
2,(50),(100),0.344645,0.240721,0.109226,0.316923,1.316557,0.026263,1.111557,0.36689
3,(100),(50),0.240721,0.344645,0.109226,0.453744,1.316557,0.026263,1.199723,0.316673
4,(50),(127),0.344645,0.226935,0.116649,0.338462,1.491445,0.038437,1.168586,0.502795
5,(127),(50),0.226935,0.344645,0.116649,0.514019,1.491445,0.038437,1.348519,0.426238
6,(50),(172),0.344645,0.182397,0.149523,0.433846,2.378587,0.086661,1.444136,0.884379
7,(172),(50),0.182397,0.344645,0.149523,0.819767,2.378587,0.086661,3.636165,0.70888
8,(50),(173),0.344645,0.168611,0.106045,0.307692,1.824867,0.047934,1.200895,0.689725
9,(173),(50),0.168611,0.344645,0.106045,0.628931,1.824867,0.047934,1.766127,0.543686


In [239]:
df

Unnamed: 0,User_id,item_id,rating,timestamp,Title,liked
0,196,242,3,881250949,Kolya (1996),0
1,63,242,3,875747190,Kolya (1996),0
2,226,242,5,883888671,Kolya (1996),1
3,154,242,3,879138235,Kolya (1996),0
4,306,242,5,876503793,Kolya (1996),1
...,...,...,...,...,...,...
99995,840,1674,4,891211682,Mamma Roma (1962),0
99996,655,1640,3,888474646,"Eighth Day, The (1996)",0
99997,655,1637,3,888984255,Girls Town (1996),0
99998,655,1630,3,887428735,"Silence of the Palace, The (Saimt el Qusur) (1...",0


In [232]:
df.iloc[500]

User_id                             320
item_id                              51
rating                                5
timestamp                     884750992
Title        Legends of the Fall (1994)
liked                                 1
Name: 500, dtype: object

In [233]:
movie_name = "Toy Story (1995)"
n_recommendations = 5

In [241]:
movie_id =df[df['Title'] == movie_name]['item_id'].iloc[0]
movie_id

1

In [242]:
recommendations = rules[rules['antecedents'].apply(lambda x: movie_id in x)]
sorted_recommendations = recommendations.sort_values(by='lift', ascending=False).head(n_recommendations)


In [243]:
recommendations

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
