**Apply Association Rules**

In [1]:
all_txns = []

with open('groceries.csv') as f:
    
    # read each line
    content = f.readlines()

    # Remove white space from the beginning and end of the line
    txns = [x.strip() for x in content]

    # Iterate through each line and create a list of transactions
    for each_txn in txns:

        # Each transaction will contain a list of item in the transaction
        all_txns.append(each_txn.split(','))


In [2]:
all_txns[0:5]

[['citrus fruit', 'semi-finished bread', 'margarine', 'ready soups'],
 ['tropical fruit', 'yogurt', 'coffee'],
 ['whole milk'],
 ['pip fruit', 'yogurt', 'cream cheese ', 'meat spreads'],
 ['other vegetables',
  'whole milk',
  'condensed milk',
  'long life bakery product']]

**Encoding the transactions**

In [3]:
import pandas as pd
import numpy as np
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules

In [4]:
# Initialize OnehotTransactions
one_hot_encoding = TransactionEncoder()

# Transform the data into one-hot-encoding format
one_hot_txns = one_hot_encoding.fit(all_txns).transform(all_txns)

# Convert the matrix into the dataframe
one_hot_txns_df = pd.DataFrame(one_hot_txns, columns=one_hot_encoding.columns_)

In [5]:
one_hot_txns_df.iloc[5:10, 10:20]

Unnamed: 0,berries,beverages,bottled beer,bottled water,brandy,brown bread,butter,butter milk,cake bar,candles
5,False,False,False,False,False,False,True,False,False,False
6,False,False,False,False,False,False,False,False,False,False
7,False,False,True,False,False,False,False,False,False,False
8,False,False,False,False,False,False,False,False,False,False
9,False,False,False,False,False,False,False,False,False,False


In [6]:
one_hot_txns_df.shape

(9835, 171)

**Generating Association Rules**

In [7]:
len(one_hot_txns_df.columns)

171

In [8]:
frequent_itemsets = apriori(one_hot_txns_df, min_support=0.02, use_colnames=True)

In [9]:
frequent_itemsets.sample(10, random_state=90)

Unnamed: 0,support,itemsets
60,0.020437,"(bottled beer, whole milk)"
52,0.033859,(sugar)
89,0.035892,"(tropical fruit, other vegetables)"
105,0.021047,"(tropical fruit, root vegetables)"
88,0.03274,"(soda, other vegetables)"
16,0.058058,(coffee)
111,0.024504,"(shopping bags, whole milk)"
36,0.079817,(newspapers)
119,0.056024,"(whole milk, yogurt)"
55,0.071683,(whipped/sour cream)


In [10]:
rules = association_rules(frequent_itemsets, metric='lift', min_threshold=1)
rules.sample(5)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,representativity,leverage,conviction,zhangs_metric,jaccard,certainty,kulczynski
100,(soda),(tropical fruit),0.174377,0.104931,0.020844,0.119534,1.139159,1.0,0.002546,1.016585,0.14796,0.080645,0.016314,0.159088
37,(fruit/vegetable juice),(whole milk),0.072293,0.255516,0.02664,0.368495,1.44216,1.0,0.008168,1.178904,0.330488,0.088454,0.151755,0.236376
124,(other vegetables),"(whole milk, yogurt)",0.193493,0.056024,0.022267,0.115081,2.054131,1.0,0.011427,1.066737,0.636294,0.097987,0.062562,0.25627
30,(whole milk),(frankfurter),0.255516,0.058973,0.020539,0.080382,1.363029,1.0,0.00547,1.02328,0.357751,0.069872,0.022751,0.214329
29,(whole milk),(domestic eggs),0.255516,0.063447,0.029995,0.11739,1.850203,1.0,0.013783,1.061117,0.617231,0.1038,0.057597,0.295073


**Top Ten Rules**

In [11]:
rules.sort_values('confidence', ascending=False)[0:10]

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,representativity,leverage,conviction,zhangs_metric,jaccard,certainty,kulczynski
122,"(other vegetables, yogurt)",(whole milk),0.043416,0.255516,0.022267,0.512881,2.007235,1.0,0.011174,1.52834,0.524577,0.080485,0.345695,0.300014
16,(butter),(whole milk),0.055414,0.255516,0.027555,0.497248,1.946053,1.0,0.013395,1.480817,0.514659,0.097237,0.324697,0.302543
24,(curd),(whole milk),0.053279,0.255516,0.026131,0.490458,1.919481,1.0,0.012517,1.461085,0.505984,0.092446,0.315577,0.296363
116,"(other vegetables, root vegetables)",(whole milk),0.047382,0.255516,0.023183,0.48927,1.914833,1.0,0.011076,1.457687,0.501524,0.082879,0.313982,0.289999
114,"(whole milk, root vegetables)",(other vegetables),0.048907,0.193493,0.023183,0.474012,2.44977,1.0,0.013719,1.53332,0.62223,0.105751,0.347821,0.296912
28,(domestic eggs),(whole milk),0.063447,0.255516,0.029995,0.472756,1.850203,1.0,0.013783,1.41203,0.490649,0.1038,0.2918,0.295073
109,(whipped/sour cream),(whole milk),0.071683,0.255516,0.032232,0.449645,1.759754,1.0,0.013916,1.352735,0.465077,0.109273,0.260757,0.287895
91,(root vegetables),(whole milk),0.108998,0.255516,0.048907,0.448694,1.756031,1.0,0.021056,1.350401,0.483202,0.154961,0.259479,0.320049
50,(root vegetables),(other vegetables),0.108998,0.193493,0.047382,0.434701,2.246605,1.0,0.026291,1.426693,0.622764,0.185731,0.299078,0.339789
32,(frozen vegetables),(whole milk),0.048094,0.255516,0.020437,0.424947,1.663094,1.0,0.008149,1.294636,0.418855,0.072172,0.227582,0.252466


**Collaborative Filtering**

In [12]:
rating_df = pd.read_csv('ratings.csv')

In [13]:
rating_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,16,4.0,1217897793
1,1,24,1.5,1217895807
2,1,32,4.0,1217896246
3,1,47,4.0,1217896556
4,1,50,4.0,1217896523


In [14]:
rating_df.drop('timestamp', axis=1, inplace=True)

In [15]:
len(rating_df.userId.unique())

668

In [16]:
len(rating_df.movieId.unique())

10325

We will create a Pivot table or a matrix and represent users as rows and movies as columns.

In [17]:
user_movies_df = rating_df.pivot(index='userId', columns='movieId', values='rating').reset_index(drop=True)

user_movies_df.index = rating_df.userId.unique()
user_movies_df.fillna(0, inplace=True)

user_movies_df.iloc[0:5, 0:10]

movieId,1,2,3,4,5,6,7,8,9,10
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,5.0,0.0,2.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,3.0,0.0,3.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


**Calculating Cosine Similarity between Users**

In [18]:
from sklearn.metrics import pairwise_distances
from scipy.spatial.distance import cosine, correlation

user_sim = 1 - pairwise_distances(user_movies_df.values, metric='cosine')

# Store the results in a dataframe
user_sim_df = pd.DataFrame(user_sim)

# Set the index and columns names to user ids (0 to 668)
user_sim_df.index = rating_df.userId.unique()
user_sim_df.columns = rating_df.userId.unique()

In [19]:
user_sim_df.iloc[0:5, 0:5]

Unnamed: 0,1,2,3,4,5
1,1.0,0.101113,0.210044,0.128766,0.057896
2,0.101113,1.0,0.115559,0.03461,0.032705
3,0.210044,0.115559,1.0,0.058208,0.044426
4,0.128766,0.03461,0.058208,1.0,0.019298
5,0.057896,0.032705,0.044426,0.019298,1.0


In [20]:
user_sim_df.shape

(668, 668)

The matrix shape shows that it contains the cosine similarity between all possible pairs of users. And each cell represents the cosine similarity between two specific users. For example, the similarity between userid 1 and userid 5 is 0.057896.

In [21]:
np.fill_diagonal(user_sim_df.values, 0)
user_sim_df.iloc[0:5, 0:5]

Unnamed: 0,1,2,3,4,5
1,0.0,0.101113,0.210044,0.128766,0.057896
2,0.101113,0.0,0.115559,0.03461,0.032705
3,0.210044,0.115559,0.0,0.058208,0.044426
4,0.128766,0.03461,0.058208,0.0,0.019298
5,0.057896,0.032705,0.044426,0.019298,0.0


**Filtering Similar Users**

In [22]:
user_sim_df.idxmax(axis=1)[0:10]

1     348
2      96
3     245
4     322
5      38
6     195
7     403
8      96
9     473
10    186
dtype: int64

The above result shows user 348 is most similar to user 1, user 96 is most similar to user 2, and so on.

In [23]:
user_sim_df.iloc[1:2, 90:100]

Unnamed: 0,91,92,93,94,95,96,97,98,99,100
2,0.138594,0.075357,0.525623,0.0,0.327027,0.755883,0.03576,0.04149,0.0,0.035751


Now we verify why the users are similar using the movies.csv .

In [24]:
movies_df = pd.read_csv('movies.csv')

movies_df[0:5]

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [25]:
movies_df.drop('genres', axis=1, inplace=True)

**Finding Common Movies of Similar Users**

In [26]:
def get_user_similar_movies(user1, user2):

    # Inner join between movies watched between two users will give the common movies
    # watched.
    common_movies = rating_df[rating_df.userId == user1].merge(rating_df[rating_df.userId == user2], on="movieId", how="inner")

    #join the above result set with movies details
    return common_movies.merge(movies_df, on='movieId')

In [27]:
common_movies = get_user_similar_movies(2, 96)

common_movies[(common_movies.rating_x >= 4.0) & (common_movies.rating_y >= 4.0)]

Unnamed: 0,userId_x,movieId,rating_x,userId_y,rating_y,title
0,2,1,5.0,96,5.0,Toy Story (1995)
6,2,36,5.0,96,5.0,Dead Man Walking (1995)
8,2,58,4.0,96,4.0,"Postman, The (Postino, Il) (1994)"
9,2,62,5.0,96,5.0,Mr. Holland's Opus (1995)
13,2,260,4.0,96,5.0,Star Wars: Episode IV - A New Hope (1977)
14,2,376,4.0,96,4.0,"River Wild, The (1994)"
16,2,608,5.0,96,4.0,Fargo (1996)
21,2,780,4.0,96,4.0,Independence Day (a.k.a. ID4) (1996)
24,2,802,4.0,96,4.0,Phenomenon (1996)
25,2,805,5.0,96,4.0,"Time to Kill, A (1996)"


**Item-Based Similarity**

**Calculating Cosine Similarity between Movies**

In [28]:
rating_mat = rating_df.pivot(index='movieId', columns='userId', values='rating').reset_index(drop=True)

# Fill all NaNs with 0
rating_mat.fillna(0, inplace=True)

# Find the correlation between movies
movie_sim = 1 - pairwise_distances(rating_mat.values, metric="correlation")

# Fill the diagonal with 0, as it represents the auto-correlation of movies
movie_sim_df = pd.DataFrame(movie_sim)

movie_sim_df.iloc[0:5, 0:5]

Unnamed: 0,0,1,2,3,4
0,1.0,0.229124,0.222062,0.079319,0.243736
1,0.229124,1.0,0.108133,0.08561,0.1234
2,0.222062,0.108133,1.0,0.145779,0.469828
3,0.079319,0.08561,0.145779,1.0,0.095993
4,0.243736,0.1234,0.469828,0.095993,1.0


In [29]:
movie_sim_df.shape

(10325, 10325)

The similarity was calculated based on all pairs of 10325 movies.

**Finding Most Similar Movies**

In [30]:
def get_similar_movies(movieid, topN=5):

    # Get index of the movie record in movies_df
    movieidx = movies_df[movies_df.movieId == movieid].index[0]
    movies_df['similarity'] = movie_sim_df.iloc[movieidx]
    top_n = movies_df.sort_values(["similarity"], ascending=False)[0:topN]

    return top_n

In [31]:
movies_df[movies_df.movieId == 858]

Unnamed: 0,movieId,title
695,858,"Godfather, The (1972)"


In [32]:
get_similar_movies(858)

Unnamed: 0,movieId,title,similarity
695,858,"Godfather, The (1972)",1.0
982,1221,"Godfather: Part II, The (1974)",0.673023
974,1213,Goodfellas (1990),0.431297
956,1193,One Flew Over the Cuckoo's Nest (1975),0.386964
969,1208,Apocalypse Now (1979),0.382045


In [33]:
get_similar_movies(69)

Unnamed: 0,movieId,title,similarity
63,69,Friday (1995),1.0
3229,4102,Eddie Murphy Raw (1987),0.438656
3970,5110,Super Troopers (2001),0.433061
1410,1799,Suicide Kings (1997),0.429051
6161,26467,"Day After, The (1983)",0.414761


In [None]:
pip install scikit-surprise