## Imports

In [93]:
import pandas as pd
import numpy as np
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

# Load the movies, ratings and tags tables from the ml-latest-small directory.
movies, ratings, tags = (
    pd.read_csv(csv_path)
    for csv_path in (
        'ml-latest-small/movies.csv',
        'ml-latest-small/ratings.csv',
        'ml-latest-small/tags.csv',
    )
)

## Ratings based on Tags

In [86]:
# Keep `ratings['rating']` numeric: the preprocessing cell below averages it per
# movie, which fails on a fresh Restart & Run All once the column holds strings.
# The one-decimal text labels are kept in their own Series instead of overwriting
# the source column.
ratingLabels = ratings['rating'].map("{:.1f}".format)

### Preprocessing

In [140]:
# One entry per movie: all of its tags packed into a single '|'-separated string.
tagsForMovie = tags.groupby('movieId')['tag'].agg('|'.join)

# Mean rating per movie, snapped to the nearest 0.5 step.
avgRateForMovie = (
    ratings
    .groupby('movieId')['rating']
    .mean()
    .map(lambda meanValue: round(2 * meanValue) / 2)
)

# Join on the movieId index; only movies present in both Series are kept.
tagRatings = pd.merge(
    avgRateForMovie,
    tagsForMovie,
    left_index=True,
    right_index=True,
)
tagRatings.head()

Unnamed: 0_level_0,rating,tag
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,4.0,pixar|pixar|fun
2,3.5,fantasy|magic board game|Robin Williams|game
3,3.5,moldy|old
5,3.0,pregnancy|remake
7,3.0,remake


In [141]:
# One-hot encode the tags (str.get_dummies splits on '|' by default).
tagDummies = tagRatings['tag'].str.get_dummies()

# Turn the rounded rating into a text label so it can be one-hot encoded as well.
ratingText = tagRatings['rating'].map(str)
ratingDummies = ratingText.str.get_dummies()

# Column order: rating, tag, tag indicators, rating indicators.
tagRatings = pd.concat(
    [tagRatings.assign(rating=ratingText), tagDummies, ratingDummies],
    axis=1,
)

tagRatings.head()

Unnamed: 0_level_0,rating,tag,"""artsy""",06 Oscar Nominated Best Movie - Animation,1900s,1920s,1950s,1960s,1970s,1980s,...,0.5,1.0,1.5,2.0,2.5,3.0,3.5,4.0,4.5,5.0
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,pixar|pixar|fun,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3.5,fantasy|magic board game|Robin Williams|game,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
3,3.5,moldy|old,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
5,3.0,pregnancy|remake,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
7,3.0,remake,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0


In [142]:
# Keep only the indicator columns; the raw text columns are not items for apriori.
tagRatings = tagRatings.drop(columns=['tag', 'rating'])
tagRatings.head()

Unnamed: 0_level_0,"""artsy""",06 Oscar Nominated Best Movie - Animation,1900s,1920s,1950s,1960s,1970s,1980s,1990s,2001-like,...,0.5,1.0,1.5,2.0,2.5,3.0,3.5,4.0,4.5,5.0
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0


### Rule discovery

In [149]:
# Mine itemsets of tags / rating labels that occur in at least 0.2% of the movies.
# The indicator columns hold 0/1 integers; mlxtend's apriori prefers a boolean
# frame, so cast before mining. Supports and itemsets are unchanged by the cast.
frequent_itemsets = apriori(
    tagRatings.astype(bool),
    min_support=0.002,
    use_colnames=True,
)
print(frequent_itemsets)



      support                                  itemsets
0    0.002574                                 (Aardman)
1    0.002574                                  (Action)
2    0.003218                            (Adam Sandler)
3    0.003218                                  (Africa)
4    0.003218                               (Al Pacino)
..        ...                                       ...
468  0.002574             (hallucinatory, 4.0, surreal)
469  0.002574  (philosophy, thought-provoking, surreal)
470  0.002574            (suspense, 4.0, psychological)
471  0.002574   (4.0, thought-provoking, psychological)
472  0.003218                      (sci-fi, 4.0, space)

[473 rows x 2 columns]


In [151]:
# Keep only the rules whose confidence is at least 0.9.
association_rules(frequent_itemsets, metric='confidence', min_threshold=0.9)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(John Grisham),(3.5),0.002574,0.322394,0.002574,1.0,3.101796,0.001744,inf,0.679355
1,(Judaism),(4.0),0.002574,0.428571,0.002574,1.0,2.333333,0.001471,inf,0.572903
2,(Wizards),(Magic),0.002574,0.003218,0.002574,1.0,310.8,0.002566,inf,0.999355
3,(Quentin Tarantino),(4.0),0.003218,0.428571,0.003218,1.0,2.333333,0.001839,inf,0.573273
4,(Samuel L. Jackson),(4.0),0.002574,0.428571,0.002574,1.0,2.333333,0.001471,inf,0.572903
5,(artificial intelligence),(robots),0.003218,0.005792,0.003218,1.0,172.666667,0.003199,inf,0.997418
6,(existentialism),(atmospheric),0.003218,0.020592,0.003218,1.0,48.5625,0.003151,inf,0.982569
7,(baseball),(3.5),0.003218,0.322394,0.003218,1.0,3.101796,0.00218,inf,0.679793
8,(beautiful),(4.0),0.003218,0.428571,0.003218,1.0,2.333333,0.001839,inf,0.573273
9,(cerebral),(4.0),0.003861,0.428571,0.003861,1.0,2.333333,0.002206,inf,0.573643


### Summary

Our preprocessing grouped all tags of each movie and merged them with the movie's average rating (rounded to the nearest 0.5). The rules we discovered show how movies carrying the more popular tags were rated, and also which combinations of tags are the most popular.

## Ratings based on Genres

### Preprocessing

In [7]:
# Attach movie metadata (title, genres) to every individual rating row.
# The join key is listed once; repeating 'movieId' in `on` was redundant.
moviesRatings = pd.merge(movies, ratings, on='movieId')
moviesRatings.head()

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1,4.0,964982703
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5,4.0,847434962
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,7,4.5,1106635946
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,15,2.5,1510577970
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,17,4.5,1305696483


In [8]:
# One-hot encode the '|'-separated genre list of each rated movie.
genreDummies = moviesRatings['genres'].str.get_dummies('|')

# Turn each rating into a text label so it can be one-hot encoded as well.
ratingText = moviesRatings['rating'].map(str)
ratingDummies = ratingText.str.get_dummies()

# Column order: original columns, genre indicators, rating indicators.
moviesRatings = pd.concat(
    [moviesRatings.assign(rating=ratingText), genreDummies, ratingDummies],
    axis=1,
)

moviesRatings.head()

Unnamed: 0,movieId,title,genres,userId,rating,timestamp,(no genres listed),Action,Adventure,Animation,...,0.5,1.0,1.5,2.0,2.5,3.0,3.5,4.0,4.5,5.0
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1,4.0,964982703,0,0,1,1,...,0,0,0,0,0,0,0,1,0,0
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5,4.0,847434962,0,0,1,1,...,0,0,0,0,0,0,0,1,0,0
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,7,4.5,1106635946,0,0,1,1,...,0,0,0,0,0,0,0,0,1,0
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,15,2.5,1510577970,0,0,1,1,...,0,0,0,0,1,0,0,0,0,0
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,17,4.5,1305696483,0,0,1,1,...,0,0,0,0,0,0,0,0,1,0


In [9]:
# Keep only the indicator columns; identifiers and raw text are not items for apriori.
moviesRatings = moviesRatings.drop(
    columns=['timestamp', 'genres', 'movieId', 'title', 'rating', 'userId']
)
moviesRatings.head()

Unnamed: 0,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,...,0.5,1.0,1.5,2.0,2.5,3.0,3.5,4.0,4.5,5.0
0,0,0,1,1,1,1,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
1,0,0,1,1,1,1,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
2,0,0,1,1,1,1,0,0,0,1,...,0,0,0,0,0,0,0,0,1,0
3,0,0,1,1,1,1,0,0,0,1,...,0,0,0,0,1,0,0,0,0,0
4,0,0,1,1,1,1,0,0,0,1,...,0,0,0,0,0,0,0,0,1,0


### Rule discovery

In [10]:
# Mine itemsets of genres / rating labels that occur in at least 2% of the rating rows.
# The indicator columns hold 0/1 integers; mlxtend's apriori prefers a boolean
# frame, so cast before mining. Supports and itemsets are unchanged by the cast.
frequent_itemsets = apriori(
    moviesRatings.astype(bool),
    min_support=0.02,
    use_colnames=True,
)
print(frequent_itemsets)



      support                                  itemsets
0    0.303810                                  (Action)
1    0.239607                               (Adventure)
2    0.069301                               (Animation)
3    0.091317                                (Children)
4    0.387292                                  (Comedy)
..        ...                                       ...
146  0.024594                    (4.0, Crime, Thriller)
147  0.022720                (Mystery, Drama, Thriller)
148  0.027798                     (Romance, Drama, 4.0)
149  0.028115                    (Drama, 4.0, Thriller)
150  0.023146  (Animation, Children, Adventure, Comedy)

[151 rows x 2 columns]


In [11]:
# Keep only the rules whose confidence is at least 0.6.
association_rules(frequent_itemsets, metric='confidence', min_threshold=0.6)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(IMAX),(Action),0.041106,0.30381,0.025854,0.628951,2.070209,0.013365,1.876272,0.539118
1,(Sci-Fi),(Action),0.171,0.30381,0.107362,0.627849,2.066583,0.055411,1.870719,0.622569
2,(IMAX),(Adventure),0.041106,0.239607,0.02517,0.612304,2.555452,0.01532,1.961313,0.634773
3,(Animation),(Children),0.069301,0.091317,0.052947,0.764024,8.36676,0.046619,3.850746,0.946041
4,(War),(Drama),0.048187,0.415804,0.039639,0.822597,1.97833,0.019602,3.29305,0.519559
5,(Mystery),(Thriller),0.076104,0.262327,0.055189,0.725176,2.764397,0.035225,2.684165,0.690832
6,"(Fantasy, Action)",(Adventure),0.031913,0.239607,0.02391,0.749223,3.126885,0.016263,3.03215,0.702616
7,"(Sci-Fi, Adventure)",(Action),0.078821,0.30381,0.059344,0.752894,2.478172,0.035397,2.817371,0.647515
8,"(Adventure, Thriller)",(Action),0.053493,0.30381,0.049377,0.923063,3.038288,0.033126,9.048791,0.708782
9,"(Action, Crime)",(Thriller),0.067248,0.262327,0.043407,0.64548,2.460594,0.025766,2.080766,0.63639


### Summary

In this example, our preprocessing split each movie's genre list into individual genres so we could see how movies were rated depending on them. We found that genre did not have a big impact on a movie's rating: only a few rules linked particular genres to particular scores. We nevertheless decided to keep this result, because we found a rule saying that if a Romance film was rather average (rated 3), it is also a Comedy, which we found funny.

## Tags based on Genres

### Preprocessing

In [12]:
# Attach the per-movie tag string to the movie metadata.
# `tagsForMovie` is a Series indexed by movieId, so the key matches the column in
# `movies` against that index level. The key is listed once; repeating 'movieId'
# in `on` was redundant.
moviesTags = pd.merge(movies, tagsForMovie, on='movieId')
moviesTags.head()

Unnamed: 0,movieId,title,genres,tag
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,pixar|pixar|fun
1,2,Jumanji (1995),Adventure|Children|Fantasy,fantasy|magic board game|Robin Williams|game
2,3,Grumpier Old Men (1995),Comedy|Romance,moldy|old
3,5,Father of the Bride Part II (1995),Comedy,pregnancy|remake
4,7,Sabrina (1995),Comedy|Romance,remake


In [13]:
# One-hot encode the tags (str.get_dummies splits on '|' by default) and the genres.
tagDummies = moviesTags['tag'].str.get_dummies()
genreDummies = moviesTags['genres'].str.get_dummies('|')

# Column order: original columns, tag indicators, genre indicators.
moviesTags = pd.concat([moviesTags, tagDummies, genreDummies], axis=1)

moviesTags.head()

Unnamed: 0,movieId,title,genres,tag,"""artsy""",06 Oscar Nominated Best Movie - Animation,1900s,1920s,1950s,1960s,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,pixar|pixar|fun,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,Jumanji (1995),Adventure|Children|Fantasy,fantasy|magic board game|Robin Williams|game,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3,Grumpier Old Men (1995),Comedy|Romance,moldy|old,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
3,5,Father of the Bride Part II (1995),Comedy,pregnancy|remake,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,7,Sabrina (1995),Comedy|Romance,remake,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0


In [14]:
# Keep only the indicator columns; identifiers and raw text are not items for apriori.
moviesTags = moviesTags.drop(columns=['movieId', 'tag', 'genres', 'title'])
moviesTags.head()

Unnamed: 0,"""artsy""",06 Oscar Nominated Best Movie - Animation,1900s,1920s,1950s,1960s,1970s,1980s,1990s,2001-like,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0


### Rule discovery

In [15]:
# Mine itemsets of tags / genres that occur in at least 0.8% of the tagged movies.
# The indicator columns hold 0/1 integers; mlxtend's apriori prefers a boolean
# frame, so cast before mining. Supports and itemsets are unchanged by the cast.
frequent_itemsets = apriori(
    moviesTags.astype(bool),
    min_support=0.008,
    use_colnames=True,
)
print(frequent_itemsets)

      support                                   itemsets
0    0.013995                                   (Disney)
1    0.083333                         (In Netflix queue)
2    0.008906                                   (aliens)
3    0.020356                              (atmospheric)
4    0.009542                                   (comedy)
..        ...                                        ...
184  0.008906                (Mystery, Horror, Thriller)
185  0.010178           (Drama, Action, Crime, Thriller)
186  0.008906   (Animation, Adventure, Children, Comedy)
187  0.008270  (Fantasy, Animation, Children, Adventure)
188  0.008906          (Mystery, Drama, Crime, Thriller)

[189 rows x 2 columns]




In [16]:
# Keep only the rules whose confidence is at least 0.7.
association_rules(frequent_itemsets, metric='confidence', min_threshold=0.7)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(Disney),(Animation),0.013995,0.054707,0.012087,0.863636,15.786469,0.011321,6.932146,0.949949
1,(Disney),(Children),0.013995,0.05916,0.013995,1.0,16.903226,0.013167,inf,0.954194
2,(In Netflix queue),(Drama),0.083333,0.562341,0.05916,0.709924,1.262443,0.012299,1.508772,0.226784
3,(aliens),(Sci-Fi),0.008906,0.101781,0.008906,1.0,9.825,0.007999,inf,0.90629
4,(atmospheric),(Drama),0.020356,0.562341,0.015903,0.78125,1.389282,0.004456,2.000727,0.286026
5,(comedy),(Comedy),0.009542,0.330153,0.008906,0.933333,2.826975,0.005756,10.04771,0.652491
6,(dark comedy),(Comedy),0.010178,0.330153,0.008906,0.875,2.650289,0.005546,5.358779,0.629086
7,(funny),(Comedy),0.013359,0.330153,0.010178,0.761905,2.307735,0.005768,2.813359,0.574347
8,(mental illness),(Drama),0.009542,0.562341,0.00827,0.866667,1.541176,0.002904,3.282443,0.354528
9,(religion),(Drama),0.013995,0.562341,0.010814,0.772727,1.374126,0.002944,1.9257,0.276129


### Summary

In this experiment, the preprocessing recorded, for each movie, which tags and genres it had. This led us to discover many interesting rules about the relations between them.