In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

In [5]:
cols = ['user_id', 'item_id', 'rating', 'timestamp']

In [6]:
user_ratings = pd.read_csv('u.data', names = cols, sep= '\t')

In [7]:
user_ratings

Unnamed: 0,user_id,item_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596
...,...,...,...,...
99995,880,476,3,880175444
99996,716,204,5,879795543
99997,276,1090,1,874795795
99998,13,225,2,882399156


In [8]:
cols2 = ['item_id', 'title']

In [9]:
cols2

['item_id', 'title']

In [10]:
movie_titles = pd.read_csv('u.item', names = cols2, usecols = range(2), encoding = 'latin1', sep= '|')

In [11]:
movie_titles

Unnamed: 0,item_id,title
0,1,Toy Story (1995)
1,2,GoldenEye (1995)
2,3,Four Rooms (1995)
3,4,Get Shorty (1995)
4,5,Copycat (1995)
...,...,...
1677,1678,Mat' i syn (1997)
1678,1679,B. Monkey (1998)
1679,1680,Sliding Doors (1998)
1680,1681,You So Crazy (1994)


In [12]:
df = pd.merge(user_ratings,movie_titles,on='item_id')

In [13]:
df

Unnamed: 0,user_id,item_id,rating,timestamp,title
0,196,242,3,881250949,Kolya (1996)
1,63,242,3,875747190,Kolya (1996)
2,226,242,5,883888671,Kolya (1996)
3,154,242,3,879138235,Kolya (1996)
4,306,242,5,876503793,Kolya (1996)
...,...,...,...,...,...
99995,840,1674,4,891211682,Mamma Roma (1962)
99996,655,1640,3,888474646,"Eighth Day, The (1996)"
99997,655,1637,3,888984255,Girls Town (1996)
99998,655,1630,3,887428735,"Silence of the Palace, The (Saimt el Qusur) (1..."


In [64]:
df[df['title'] == 'Aiqing wansui (1994)']

Unnamed: 0,user_id,item_id,rating,timestamp,title,liked
98402,385,1536,5,879441339,Aiqing wansui (1994),1


In [15]:
df.groupby('title')['rating'].count().sort_values(ascending=False).head(10) # shows the most highly
#watched films but is perhaps unfair as one film may have been watched by loads more people?

title
Star Wars (1977)                 583
Contact (1997)                   509
Fargo (1996)                     508
Return of the Jedi (1983)        507
Liar Liar (1997)                 485
English Patient, The (1996)      481
Scream (1996)                    478
Toy Story (1995)                 452
Air Force One (1997)             431
Independence Day (ID4) (1996)    429
Name: rating, dtype: int64

In [16]:
df.groupby('title')['rating'].mean().sort_values(ascending=False).round().head(20) # a better way to 
# rank the films as it gives the average so does not depend on the number of people who watched them

title
They Made Me a Criminal (1939)                            5.0
Marlene Dietrich: Shadow and Light (1996)                 5.0
Saint of Fort Washington, The (1993)                      5.0
Someone Else's America (1995)                             5.0
Star Kid (1997)                                           5.0
Great Day in Harlem, A (1994)                             5.0
Aiqing wansui (1994)                                      5.0
Santa with Muscles (1996)                                 5.0
Prefontaine (1997)                                        5.0
Entertaining Angels: The Dorothy Day Story (1996)         5.0
Pather Panchali (1955)                                    5.0
Some Mother's Son (1996)                                  4.0
Maya Lin: A Strong Clear Vision (1994)                    4.0
Anna (1996)                                               4.0
Everest (1998)                                            4.0
Close Shave, A (1995)                                     4.0
Sc

In [17]:
df.groupby('title')['rating'].sum().sort_values(ascending=False).round().head(7) # shows the most highly
#recommended films but is perhaps unfair as some films may have been watched by loads more people?

title
Star Wars (1977)                  2541
Fargo (1996)                      2111
Return of the Jedi (1983)         2032
Contact (1997)                    1936
Raiders of the Lost Ark (1981)    1786
Godfather, The (1972)             1769
English Patient, The (1996)       1759
Name: rating, dtype: int64

In [19]:
df['liked'] = (df['rating'] >= 3.5).astype(int) # adds a new column 'liked' and gives a numerical 
# value

In [20]:
df

Unnamed: 0,user_id,item_id,rating,timestamp,title,liked
0,196,242,3,881250949,Kolya (1996),0
1,63,242,3,875747190,Kolya (1996),0
2,226,242,5,883888671,Kolya (1996),1
3,154,242,3,879138235,Kolya (1996),0
4,306,242,5,876503793,Kolya (1996),1
...,...,...,...,...,...,...
99995,840,1674,4,891211682,Mamma Roma (1962),1
99996,655,1640,3,888474646,"Eighth Day, The (1996)",0
99997,655,1637,3,888984255,Girls Town (1996),0
99998,655,1630,3,887428735,"Silence of the Palace, The (Saimt el Qusur) (1...",0


In [24]:
user_movie_matrix = df.pivot(index='user_id', columns='item_id', values='liked').fillna(0)

In [25]:
user_movie_matrix

item_id,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
939,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
940,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
941,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
942,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [26]:
frequent_itemsets = apriori(user_movie_matrix, min_support=0.1, use_colnames=True)
rules = association_rules(frequent_itemsets, metric='lift', min_threshold=1.0)



In [42]:
mov = 95
df[df['item_id'] == mov]['title'].iloc[0]

'Aladdin (1992)'

In [62]:
movie_name = "Air Force One (1997)"
n_recommendations = 5
movie_id =df[df['title'] == movie_name]['item_id'].iloc[0]
movie_id
recommendations = rules[rules['antecedents'].apply(lambda x: movie_id in x)]
sorted_recommendations = recommendations.sort_values(by='lift', ascending=False).head(n_recommendations)
sorted_recommendations

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
2245,(300),(313),0.267232,0.301166,0.117709,0.440476,1.462567,0.037228,1.248979,0.431611
2225,(300),(258),0.267232,0.364793,0.128314,0.480159,1.316249,0.030829,1.221925,0.327887


In [60]:
print('We recommend:\n')

for rec in sorted_recommendations['consequents']:
    for mov in list(rec):
        print(df[df['item_id'] == mov]['title'].iloc[0], '\n')

We recommend:

Star Wars (1977) 

Back to the Future (1985) 

Star Wars (1977) 

Forrest Gump (1994) 

Star Wars (1977) 

Braveheart (1995) 

Forrest Gump (1994) 

Shawshank Redemption, The (1994) 

Star Wars (1977) 

