## Imports

In [1]:
import pandas as pd
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

# Load the three CSV tables of the ml-latest-small dataset; the sections
# below join them pairwise on movieId.
movies, ratings, tags = (
    pd.read_csv(csv_path)
    for csv_path in (
        'ml-latest-small/movies.csv',
        'ml-latest-small/ratings.csv',
        'ml-latest-small/tags.csv',
    )
)

## Ratings based on Tags

### Preprocessing

In [2]:
# Pair every rating of a movie with every tag applied to that same movie.
# The join key is movieId only (listing it twice in `on` was redundant), so
# the user who rated (userId_x) and the user who tagged (userId_y) are
# independent of each other in the resulting rows.
tagRatings = pd.merge(ratings, tags, on='movieId')
tagRatings.head()

Unnamed: 0,userId_x,movieId,rating,timestamp_x,userId_y,tag,timestamp_y
0,1,1,4.0,964982703,336,pixar,1139045764
1,1,1,4.0,964982703,474,pixar,1137206825
2,1,1,4.0,964982703,567,fun,1525286013
3,5,1,4.0,847434962,336,pixar,1139045764
4,5,1,4.0,847434962,474,pixar,1137206825


In [3]:
# Turn the rating into its text form ('4.0', '4.5', ...) so that each distinct
# rating value can become its own indicator column.
tagRatings['rating'] = tagRatings['rating'].apply(str)

# Append one 0/1 column per tag, followed by one 0/1 column per rating value.
# NOTE(review): str.get_dummies() splits on '|' by default, so a tag that
# contains '|' would be spread over several columns - confirm none do.
# NOTE: this cell extends tagRatings in place of itself; re-running it without
# re-running the merge cell appends the indicator columns a second time.
tagRatings = pd.concat(
    [
        tagRatings,
        tagRatings['tag'].str.get_dummies(),
        tagRatings['rating'].str.get_dummies(),
    ],
    axis=1,
)

tagRatings.head()

Unnamed: 0,userId_x,movieId,rating,timestamp_x,userId_y,tag,timestamp_y,"""artsy""",06 Oscar Nominated Best Movie - Animation,1900s,...,0.5,1.0,1.5,2.0,2.5,3.0,3.5,4.0,4.5,5.0
0,1,1,4.0,964982703,336,pixar,1139045764,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,1,1,4.0,964982703,474,pixar,1137206825,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,1,1,4.0,964982703,567,fun,1525286013,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,5,1,4.0,847434962,336,pixar,1139045764,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,5,1,4.0,847434962,474,pixar,1137206825,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [4]:
# Keep only the 0/1 indicator columns (tags and rating values); the raw
# identifier, timestamp, tag and rating columns are not items for the miner.
tagRatings = tagRatings.drop(
    columns=[
        'timestamp_x',
        'timestamp_y',
        'tag',
        'rating',
        'userId_x',
        'userId_y',
        'movieId',
    ]
)
tagRatings.head()

Unnamed: 0,"""artsy""",06 Oscar Nominated Best Movie - Animation,1900s,1920s,1950s,1960s,1970s,1980s,1990s,2001-like,...,0.5,1.0,1.5,2.0,2.5,3.0,3.5,4.0,4.5,5.0
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


### Rule discovery

In [5]:
# Mine the sets of tag / rating-value columns that are set together in at
# least 0.2% of the rows (min_support is a fraction of all rows).
# apriori expects a boolean frame: feeding it the 0/1 integer indicators
# triggers mlxtend's non-bool DeprecationWarning and a slower code path, so
# the frame is cast to bool for the call. The computed supports are unchanged.
frequent_itemsets = apriori(tagRatings.astype(bool), min_support=0.002, use_colnames=True)
print(frequent_itemsets)



      support                  itemsets
0    0.002371               (Brad Pitt)
1    0.007495                  (Disney)
2    0.002153                    (EPIC)
3    0.003340        (In Netflix queue)
4    0.003293       (Leonardo DiCaprio)
..        ...                       ...
123  0.002809  (4.0, thought-provoking)
124  0.003212  (thought-provoking, 5.0)
125  0.002444        (4.0, time travel)
126  0.002830       (4.0, twist ending)
127  0.002950       (twist ending, 5.0)

[128 rows x 2 columns]


In [6]:
# Derive association rules from the frequent itemsets above and keep the ones
# whose confidence is at least 0.3.
association_rules(frequent_itemsets, min_threshold=0.3, metric='confidence')

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(Disney),(4.0),0.007495,0.277776,0.00235,0.313501,1.128611,0.000268,1.052039,0.114816
1,(classic),(5.0),0.006968,0.273763,0.00244,0.350154,1.279042,0.000532,1.117553,0.219696
2,(dark comedy),(5.0),0.008816,0.273763,0.002736,0.310311,1.133505,0.000322,1.052993,0.118828
3,(disturbing),(5.0),0.005909,0.273763,0.002015,0.341074,1.245875,0.000398,1.102153,0.198524
4,(great soundtrack),(5.0),0.00557,0.273763,0.002041,0.366436,1.338516,0.000516,1.146273,0.254321
5,(imdb top 250),(5.0),0.006458,0.273763,0.002032,0.314741,1.149686,0.000265,1.0598,0.131043
6,(sci-fi),(5.0),0.010836,0.273763,0.003503,0.323308,1.18098,0.000537,1.073217,0.154925
7,(thought-provoking),(5.0),0.010664,0.273763,0.003212,0.301166,1.100099,0.000292,1.039213,0.091972
8,(time travel),(4.0),0.007418,0.277776,0.002444,0.32948,1.186134,0.000384,1.07711,0.158098


### Summary

## Ratings based on Genres

### Preprocessing

In [7]:
# Attach each movie's title and genre string to every rating it received.
# The join key is movieId only; listing it twice in `on` was redundant.
moviesRatings = pd.merge(movies, ratings, on='movieId')
moviesRatings.head()

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1,4.0,964982703
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5,4.0,847434962
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,7,4.5,1106635946
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,15,2.5,1510577970
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,17,4.5,1305696483


In [8]:
# Turn the rating into its text form ('4.0', '4.5', ...) so that each distinct
# rating value can become its own indicator column.
moviesRatings['rating'] = moviesRatings['rating'].apply(str)

# Append one 0/1 column per genre (the genres field is a '|'-separated list),
# followed by one 0/1 column per rating value.
# NOTE: this cell extends moviesRatings in place of itself; re-running it
# without re-running the merge cell appends the indicator columns again.
moviesRatings = pd.concat(
    [
        moviesRatings,
        moviesRatings['genres'].str.get_dummies('|'),
        moviesRatings['rating'].str.get_dummies(),
    ],
    axis=1,
)

moviesRatings.head()

Unnamed: 0,movieId,title,genres,userId,rating,timestamp,(no genres listed),Action,Adventure,Animation,...,0.5,1.0,1.5,2.0,2.5,3.0,3.5,4.0,4.5,5.0
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1,4.0,964982703,0,0,1,1,...,0,0,0,0,0,0,0,1,0,0
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5,4.0,847434962,0,0,1,1,...,0,0,0,0,0,0,0,1,0,0
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,7,4.5,1106635946,0,0,1,1,...,0,0,0,0,0,0,0,0,1,0
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,15,2.5,1510577970,0,0,1,1,...,0,0,0,0,1,0,0,0,0,0
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,17,4.5,1305696483,0,0,1,1,...,0,0,0,0,0,0,0,0,1,0


In [9]:
# Keep only the 0/1 indicator columns (genres and rating values); the raw
# identifier, title, timestamp, genres and rating columns are not items.
moviesRatings = moviesRatings.drop(
    columns=[
        'timestamp',
        'genres',
        'movieId',
        'title',
        'rating',
        'userId',
    ]
)
moviesRatings.head()

Unnamed: 0,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,...,0.5,1.0,1.5,2.0,2.5,3.0,3.5,4.0,4.5,5.0
0,0,0,1,1,1,1,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
1,0,0,1,1,1,1,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
2,0,0,1,1,1,1,0,0,0,1,...,0,0,0,0,0,0,0,0,1,0
3,0,0,1,1,1,1,0,0,0,1,...,0,0,0,0,1,0,0,0,0,0
4,0,0,1,1,1,1,0,0,0,1,...,0,0,0,0,0,0,0,0,1,0


### Rule discovery

In [10]:
# Mine the sets of genre / rating-value columns that are set together in at
# least 2% of the rows (min_support is a fraction of all rows).
# apriori expects a boolean frame: feeding it the 0/1 integer indicators
# triggers mlxtend's non-bool DeprecationWarning and a slower code path, so
# the frame is cast to bool for the call. The computed supports are unchanged.
frequent_itemsets = apriori(moviesRatings.astype(bool), min_support=0.02, use_colnames=True)
print(frequent_itemsets)



      support                                  itemsets
0    0.303810                                  (Action)
1    0.239607                               (Adventure)
2    0.069301                               (Animation)
3    0.091317                                (Children)
4    0.387292                                  (Comedy)
..        ...                                       ...
146  0.024594                    (Thriller, 4.0, Crime)
147  0.022720                (Thriller, Drama, Mystery)
148  0.027798                     (Romance, 4.0, Drama)
149  0.028115                    (Thriller, 4.0, Drama)
150  0.023146  (Animation, Children, Adventure, Comedy)

[151 rows x 2 columns]


In [11]:
# Derive association rules from the frequent itemsets above and keep the ones
# whose confidence is at least 0.6.
association_rules(frequent_itemsets, min_threshold=0.6, metric='confidence')

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(IMAX),(Action),0.041106,0.30381,0.025854,0.628951,2.070209,0.013365,1.876272,0.539118
1,(Sci-Fi),(Action),0.171,0.30381,0.107362,0.627849,2.066583,0.055411,1.870719,0.622569
2,(IMAX),(Adventure),0.041106,0.239607,0.02517,0.612304,2.555452,0.01532,1.961313,0.634773
3,(Animation),(Children),0.069301,0.091317,0.052947,0.764024,8.36676,0.046619,3.850746,0.946041
4,(War),(Drama),0.048187,0.415804,0.039639,0.822597,1.97833,0.019602,3.29305,0.519559
5,(Mystery),(Thriller),0.076104,0.262327,0.055189,0.725176,2.764397,0.035225,2.684165,0.690832
6,"(Fantasy, Action)",(Adventure),0.031913,0.239607,0.02391,0.749223,3.126885,0.016263,3.03215,0.702616
7,"(Sci-Fi, Adventure)",(Action),0.078821,0.30381,0.059344,0.752894,2.478172,0.035397,2.817371,0.647515
8,"(Thriller, Adventure)",(Action),0.053493,0.30381,0.049377,0.923063,3.038288,0.033126,9.048791,0.708782
9,"(Crime, Action)",(Thriller),0.067248,0.262327,0.043407,0.64548,2.460594,0.025766,2.080766,0.63639


### Summary

## Tags based on Genres

### Preprocessing

In [12]:
# Attach each movie's title and genre string to every tag applied to it.
# The join key is movieId only; listing it twice in `on` was redundant.
moviesTags = pd.merge(movies, tags, on='movieId')
moviesTags.head()

Unnamed: 0,movieId,title,genres,userId,tag,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,336,pixar,1139045764
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,474,pixar,1137206825
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,567,fun,1525286013
3,2,Jumanji (1995),Adventure|Children|Fantasy,62,fantasy,1528843929
4,2,Jumanji (1995),Adventure|Children|Fantasy,62,magic board game,1528843932


In [13]:
# Append one 0/1 column per tag, followed by one 0/1 column per genre (the
# genres field is a '|'-separated list).
# NOTE(review): str.get_dummies() also splits tags on '|' by default, and a
# tag spelled exactly like a genre would yield two columns with the same
# label, since concat does not de-duplicate - confirm neither occurs.
# NOTE: this cell extends moviesTags in place of itself; re-running it without
# re-running the merge cell appends the indicator columns again.
moviesTags = pd.concat(
    [
        moviesTags,
        moviesTags['tag'].str.get_dummies(),
        moviesTags['genres'].str.get_dummies('|'),
    ],
    axis=1,
)

moviesTags.head()

Unnamed: 0,movieId,title,genres,userId,tag,timestamp,"""artsy""",06 Oscar Nominated Best Movie - Animation,1900s,1920s,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,336,pixar,1139045764,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,474,pixar,1137206825,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,567,fun,1525286013,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,2,Jumanji (1995),Adventure|Children|Fantasy,62,fantasy,1528843929,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,2,Jumanji (1995),Adventure|Children|Fantasy,62,magic board game,1528843932,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [14]:
# Keep only the 0/1 indicator columns (tags and genres); the raw identifier,
# title, timestamp, tag and genres columns are not items for the miner.
moviesTags = moviesTags.drop(
    columns=[
        'movieId',
        'userId',
        'tag',
        'genres',
        'timestamp',
        'title',
    ]
)
moviesTags.head()

Unnamed: 0,"""artsy""",06 Oscar Nominated Best Movie - Animation,1900s,1920s,1950s,1960s,1970s,1980s,1990s,2001-like,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Rule discovery

In [15]:
# Mine the sets of tag / genre columns that are set together in at least 0.2%
# of the rows (min_support is a fraction of all rows).
# apriori expects a boolean frame: feeding it the 0/1 integer indicators
# triggers mlxtend's non-bool DeprecationWarning and a slower code path, so
# the frame is cast to bool for the call. The computed supports are unchanged.
frequent_itemsets = apriori(moviesTags.astype(bool), min_support=0.002, use_colnames=True)
print(frequent_itemsets)

      support                                           itemsets
0    0.002715                                        (Australia)
1    0.002172                                        (Christmas)
2    0.006245                                           (Disney)
3    0.002715                                          (England)
4    0.002987                                        (Holocaust)
..        ...                                                ...
657  0.007059  (Sci-Fi, Crime, Thriller, Action, Drama, Mystery)
658  0.007059   (Sci-Fi, Crime, IMAX, Thriller, Action, Mystery)
659  0.007059   (Sci-Fi, IMAX, Thriller, Action, Drama, Mystery)
660  0.007059    (Sci-Fi, Crime, IMAX, Thriller, Drama, Mystery)
661  0.007059  (Sci-Fi, Crime, IMAX, Thriller, Action, Drama,...

[662 rows x 2 columns]




In [16]:
# Derive association rules from the frequent itemsets above and keep the ones
# whose confidence is at least 0.8.
association_rules(frequent_itemsets, min_threshold=0.8, metric='confidence')

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(Disney),(Animation),0.006245,0.064078,0.005430,0.869565,13.570376,0.005030,7.175400,0.932131
1,(Disney),(Children),0.006245,0.050231,0.006245,1.000000,19.908108,0.005931,inf,0.955738
2,(India),(Drama),0.002715,0.568830,0.002172,0.800000,1.406396,0.000628,2.155851,0.289750
3,(Leonardo DiCaprio),(Drama),0.002715,0.568830,0.002715,1.000000,1.757995,0.001171,inf,0.432344
4,(Mafia),(Crime),0.002715,0.217214,0.002172,0.800000,3.683000,0.001582,3.913929,0.730466
...,...,...,...,...,...,...,...,...,...,...
704,"(Thriller, IMAX, Mystery)","(Sci-Fi, Crime, Action, Drama)",0.007059,0.007331,0.007059,1.000000,136.407407,0.007008,inf,0.999727
705,"(Action, IMAX, Drama)","(Thriller, Sci-Fi, Crime, Mystery)",0.008146,0.007602,0.007059,0.866667,113.997619,0.006998,7.442981,0.999368
706,"(Action, IMAX, Mystery)","(Thriller, Sci-Fi, Crime, Drama)",0.007059,0.007331,0.007059,1.000000,136.407407,0.007008,inf,0.999727
707,"(Mystery, IMAX, Drama)","(Thriller, Sci-Fi, Crime, Action)",0.007059,0.007602,0.007059,1.000000,131.535714,0.007006,inf,0.999453
