#### **1. Importing Libraries**

In [95]:
import pandas as pd                                                 # Importing for panel data analysis
#-------------------------------------------------------------------------------------------------------------------------------
import numpy as np                                                  # Importing package numpys (For Numerical Python)
#-------------------------------------------------------------------------------------------------------------------------------
import matplotlib.pyplot as plt                                     # Importing pyplot interface of matplotlib
import seaborn as sns                                               # Importing seaborn library for interactive visualization
%matplotlib inline
import random
import math
import time
import os
#--------------------~-----------------------------------------------------------------------------------------------------------
import pyfpgrowth                                                   # For testing the scratch implementation
#-------------------------------------------------------------------------------------------------------------------------------
import warnings                                                     # Importing warning to disable runtime warnings
warnings.filterwarnings("ignore")                                   # Warnings will appear only once

In [96]:
ratings = pd.read_csv('./ml-latest-small/ratings.csv')
print('Shape of the dataset:', ratings.shape)
ratings.head(5)

Shape of the dataset: (100836, 4)


Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [97]:
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100836 entries, 0 to 100835
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     100836 non-null  int64  
 1   movieId    100836 non-null  int64  
 2   rating     100836 non-null  float64
 3   timestamp  100836 non-null  int64  
dtypes: float64(1), int64(3)
memory usage: 3.1 MB


In [98]:
print("Number of unique users:", ratings["userId"].nunique())

Number of unique users: 610


In [99]:
print("Number of unique movies:", ratings["movieId"].nunique())

Number of unique movies: 9724


#### **2. Data preprocessing**

#### Form the transactional data set, which consists of entries of the form <user id, {movies rated above 2}

In [100]:
# let's extract the number of unique movies and its corresponding ratings
group = ratings.groupby('movieId')
df = group.apply(lambda x: x['rating'].unique())
df

movieId
1             [4.0, 4.5, 2.5, 3.5, 3.0, 5.0, 0.5, 2.0, 1.5]
2         [4.0, 3.0, 3.5, 4.5, 2.5, 5.0, 1.5, 1.0, 2.0, ...
3             [4.0, 5.0, 3.0, 3.5, 2.0, 1.0, 2.5, 0.5, 1.5]
4                                      [3.0, 1.0, 2.0, 1.5]
5             [5.0, 3.0, 4.0, 2.0, 3.5, 4.5, 1.5, 2.5, 0.5]
                                ...                        
193581                                                [4.0]
193583                                                [3.5]
193585                                                [3.5]
193587                                                [3.5]
193609                                                [4.0]
Length: 9724, dtype: object

- So, there are movies that have been rated 2 or less. Let's keep only entries where movie ratings are greater than 2. 

In [101]:
ratings_above_2 = ratings[ratings["rating"] > 2.0]
ratings_above_2

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047
100834,610,168252,5.0,1493846352


In [102]:
# Let's extract the number of unique movies that each user might have rated
group = ratings_above_2.groupby('userId')
df = group.apply(lambda x: len(x['movieId'].unique()))
df

userId
1       226
2        28
3        18
4       167
5        40
       ... 
606    1070
607     174
608     670
609      37
610    1233
Length: 610, dtype: int64

In [103]:
count_freq = dict(df)
count_freq

{1: 226,
 2: 28,
 3: 18,
 4: 167,
 5: 40,
 6: 294,
 7: 111,
 8: 43,
 9: 34,
 10: 119,
 11: 59,
 12: 32,
 13: 28,
 14: 42,
 15: 111,
 16: 96,
 17: 105,
 18: 493,
 19: 357,
 20: 210,
 21: 380,
 22: 70,
 23: 120,
 24: 107,
 25: 26,
 26: 19,
 27: 109,
 28: 476,
 29: 78,
 30: 34,
 31: 45,
 32: 99,
 33: 137,
 34: 67,
 35: 22,
 36: 35,
 37: 20,
 38: 63,
 39: 90,
 40: 94,
 41: 170,
 42: 353,
 43: 114,
 44: 38,
 45: 366,
 46: 42,
 47: 111,
 48: 33,
 49: 21,
 50: 236,
 51: 319,
 52: 130,
 53: 20,
 54: 31,
 55: 16,
 56: 46,
 57: 379,
 58: 103,
 59: 101,
 60: 22,
 61: 37,
 62: 357,
 63: 248,
 64: 504,
 65: 34,
 66: 337,
 67: 33,
 68: 1085,
 69: 44,
 70: 61,
 71: 30,
 72: 45,
 73: 187,
 74: 177,
 75: 51,
 76: 87,
 77: 25,
 78: 47,
 79: 60,
 80: 167,
 81: 17,
 82: 207,
 83: 95,
 84: 287,
 85: 27,
 86: 69,
 87: 20,
 88: 52,
 89: 425,
 90: 53,
 91: 495,
 92: 24,
 93: 97,
 94: 44,
 95: 160,
 96: 66,
 97: 35,
 98: 81,
 99: 44,
 100: 141,
 101: 50,
 102: 52,
 103: 362,
 104: 254,
 105: 717,
 106: 33,
 10

In [104]:
# create a new column to record the number of movies rated by each userId
ratings_above_2['count_freq_userId'] = ratings_above_2['userId']
ratings_above_2['count_freq_userId'] = ratings_above_2['count_freq_userId'].map(count_freq)
ratings_above_2

Unnamed: 0,userId,movieId,rating,timestamp,count_freq_userId
0,1,1,4.0,964982703,226
1,1,3,4.0,964981247,226
2,1,6,4.0,964982224,226
3,1,47,5.0,964983815,226
4,1,50,5.0,964982931,226
...,...,...,...,...,...
100831,610,166534,4.0,1493848402,1233
100832,610,168248,5.0,1493850091,1233
100833,610,168250,5.0,1494273047,1233
100834,610,168252,5.0,1493846352,1233


----

- Let's keep only those users who have rated more than 10 movies.

In [105]:
# Now let's remove the rows where the value of 'count_freq_userId' is less than 10.
more_than_10_movies_rated_above_2 = ratings_above_2.drop(ratings_above_2[ratings_above_2['count_freq_userId'] <= 10].index)
print(more_than_10_movies_rated_above_2)

        userId  movieId  rating   timestamp  count_freq_userId
0            1        1     4.0   964982703                226
1            1        3     4.0   964981247                226
2            1        6     4.0   964982224                226
3            1       47     5.0   964983815                226
4            1       50     5.0   964982931                226
...        ...      ...     ...         ...                ...
100831     610   166534     4.0  1493848402               1233
100832     610   168248     5.0  1493850091               1233
100833     610   168250     5.0  1494273047               1233
100834     610   168252     5.0  1493846352               1233
100835     610   170875     3.0  1493846415               1233

[87295 rows x 5 columns]


----

- Let's create the transactional data of the form <user id, {movies rated above 2}>

In [106]:
# make a new dataframe with all unique userId
transactional_df = pd.DataFrame({'userId':more_than_10_movies_rated_above_2.userId.unique()})

# And then just get the list of all unique subreddits they are active in, assigning it to a new column
transactional_df['movies_rated_above_2'] = [set(more_than_10_movies_rated_above_2['movieId'].loc[more_than_10_movies_rated_above_2['userId'] == x['userId']]) 
    for _, x in transactional_df.iterrows()]

transactional_df

Unnamed: 0,userId,movies_rated_above_2
0,1,"{1024, 1, 1025, 3, 2048, 1029, 6, 1030, 1031, ..."
1,2,"{115713, 122882, 48516, 91529, 80906, 91658, 1..."
2,3,"{70946, 2851, 5764, 4518, 26409, 7991, 1275, 2..."
3,4,"{1025, 3079, 3083, 21, 1046, 2583, 4121, 538, ..."
4,5,"{1, 515, 261, 265, 527, 531, 21, 150, 534, 153..."
...,...,...
602,606,"{1, 8195, 6148, 7, 11, 69644, 4109, 15, 17, 18..."
603,607,"{1, 517, 2053, 2054, 1544, 3081, 11, 1036, 257..."
604,608,"{1, 4105, 10, 6157, 16, 21, 31, 32, 2080, 34, ..."
605,609,"{1, 137, 10, 650, 1161, 786, 150, 288, 161, 10..."


In [107]:
print("Number of unique users:", more_than_10_movies_rated_above_2["userId"].nunique())

Number of unique users: 607


In [108]:
print("Number of unique movies:", more_than_10_movies_rated_above_2["movieId"].nunique())

Number of unique movies: 8852


- As we observe, the number of unique users are reduced from 610 to 607 after preprocessing, and the number of unique movies have reduced from 9724 to 8852.

----

- Divide the data set into 80% training set and 20% test set. Remove 20% of
movies watched from each user and create a test set using the removed
movies

In [109]:
# dummy data
dummy_df = pd.DataFrame({'userId':[1,2,4,6,8], 'movies_rated_above_2':[[100,200,300,400,500,600,700], [100,200,300,400], [300,400,500,600,700,800], [500,600,700,800,900,1000,1100,1200], [700,800,900]]})
dummy_df

Unnamed: 0,userId,movies_rated_above_2
0,1,"[100, 200, 300, 400, 500, 600, 700]"
1,2,"[100, 200, 300, 400]"
2,4,"[300, 400, 500, 600, 700, 800]"
3,6,"[500, 600, 700, 800, 900, 1000, 1100, 1200]"
4,8,"[700, 800, 900]"


In [110]:
# dividing dummy df into 80-20 train-test, such that 20% of movies watched from each user is test set.
'''
- Parse through each user
- Randomly shuffle the items in the list and split into 80-20
- extract 20 of each user and make a separate df
'''

'\n- Parse through each user\n- Randomly shuffle the items in the list and split into 80-20\n- extract 20 of each user and make a separate df\n'

In [111]:
cols = ['userId', 'movies_rated_above_2']
train_df = pd.DataFrame(columns=cols)
test_df = pd.DataFrame(columns=cols)


# loop through the rows using iterrows()
for index, row in dummy_df.iterrows():
    # print(row['userId'], row['movies_rated_above_2'])
    print(row['movies_rated_above_2'])
    print("-----")
    n = int(np.ceil(0.2 * len(row['movies_rated_above_2'])))  # initialize a value that represents 20% of the total items in the list.
    test_list = random.sample(row['movies_rated_above_2'], n)  # randomly choose 20% of the values (n) from list and make a sublist.
    print("test_list", test_list)
    print("-----")
    train_list = [i for i in row['movies_rated_above_2'] if i not in test_list] # rest 80% values of list is in train/-list
    print("train_list", train_list) # randomly choose 20% of the values from list and make a sublist
    print("******************************************************")
    
    df_1 = pd.DataFrame({
    'userId': [row['userId']],
    'movies_rated_above_2': [train_list]
    })

    df_2 = pd.DataFrame({
    'userId': [row['userId']],
    'movies_rated_above_2': [test_list]
    })

    train_df = pd.concat([train_df, df_1])
    test_df = pd.concat([test_df, df_2])
    # print("index", index)
    # train_df.loc[index].userId = row['userId']
    # train_df.loc[index].movies_rated_above_2 = train_list

    # test_df.loc[index].userId = row['userId']
    # test_df.loc[index].movies_rated_above_2 = test_list


[100, 200, 300, 400, 500, 600, 700]
-----
test_list [400, 300]
-----
train_list [100, 200, 500, 600, 700]
******************************************************
[100, 200, 300, 400]
-----
test_list [400]
-----
train_list [100, 200, 300]
******************************************************
[300, 400, 500, 600, 700, 800]
-----
test_list [300, 800]
-----
train_list [400, 500, 600, 700]
******************************************************
[500, 600, 700, 800, 900, 1000, 1100, 1200]
-----
test_list [1000, 600]
-----
train_list [500, 700, 800, 900, 1100, 1200]
******************************************************
[700, 800, 900]
-----
test_list [800]
-----
train_list [700, 900]
******************************************************


In [112]:
dummy_df

Unnamed: 0,userId,movies_rated_above_2
0,1,"[100, 200, 300, 400, 500, 600, 700]"
1,2,"[100, 200, 300, 400]"
2,4,"[300, 400, 500, 600, 700, 800]"
3,6,"[500, 600, 700, 800, 900, 1000, 1100, 1200]"
4,8,"[700, 800, 900]"


In [113]:
train_df

Unnamed: 0,userId,movies_rated_above_2
0,1,"[100, 200, 500, 600, 700]"
0,2,"[100, 200, 300]"
0,4,"[400, 500, 600, 700]"
0,6,"[500, 700, 800, 900, 1100, 1200]"
0,8,"[700, 900]"


In [114]:
test_df

Unnamed: 0,userId,movies_rated_above_2
0,1,"[400, 300]"
0,2,[400]
0,4,"[300, 800]"
0,6,"[1000, 600]"
0,8,[800]


----

- Divide the data set into 80% training set and 20% test set. Remove 20% of
movies watched from each user and create a test set using the removed
movies

In [115]:
# extending the operations of dummy data on the original data

transactional_df

Unnamed: 0,userId,movies_rated_above_2
0,1,"{1024, 1, 1025, 3, 2048, 1029, 6, 1030, 1031, ..."
1,2,"{115713, 122882, 48516, 91529, 80906, 91658, 1..."
2,3,"{70946, 2851, 5764, 4518, 26409, 7991, 1275, 2..."
3,4,"{1025, 3079, 3083, 21, 1046, 2583, 4121, 538, ..."
4,5,"{1, 515, 261, 265, 527, 531, 21, 150, 534, 153..."
...,...,...
602,606,"{1, 8195, 6148, 7, 11, 69644, 4109, 15, 17, 18..."
603,607,"{1, 517, 2053, 2054, 1544, 3081, 11, 1036, 257..."
604,608,"{1, 4105, 10, 6157, 16, 21, 31, 32, 2080, 34, ..."
605,609,"{1, 137, 10, 650, 1161, 786, 150, 288, 161, 10..."


In [116]:
# dividing transactional_df df into 80-20 train-test, such that 20% of movies watched from each user is test set.
'''
- Parse through each user
- Randomly shuffle the items in the list and split into 80-20
- extract 20 of each user and make a separate df
'''

cols = ['userId', 'movies_rated_above_2']
train_df = pd.DataFrame(columns=cols)
test_df = pd.DataFrame(columns=cols)

# loop through the rows using iterrows()
for index, row in transactional_df.iterrows():
    # print(row['userId'], row['movies_rated_above_2'])
    # print(row['movies_rated_above_2'])
    # print("-----")
    n = int(np.ceil(0.2 * len(row['movies_rated_above_2']))) # initialize a value that represents 20% of the total items in the list.
    test_list = random.sample(list(row['movies_rated_above_2']), n)  # randomly choose 20% of the values (n) from list and make a sublist.
    # print("test_list", test_list)
    # print("-----")
    train_list = [i for i in row['movies_rated_above_2'] if i not in test_list] # rest 80% values of list is in train_list
    # print("train_list", train_list)
    # print("******************************************************")
    
    df_1 = pd.DataFrame({
    'userId': [row['userId']],
    'movies_rated_above_2': [train_list]
    })

    df_2 = pd.DataFrame({
    'userId': [row['userId']],
    'movies_rated_above_2': [test_list]
    })

    train_df = pd.concat([train_df, df_1])
    test_df = pd.concat([test_df, df_2])

In [117]:
train_df

Unnamed: 0,userId,movies_rated_above_2
0,1,"[1024, 1, 1025, 3, 2048, 1029, 6, 1030, 1031, ..."
0,2,"[122882, 48516, 80906, 91658, 131724, 77455, 1..."
0,3,"[70946, 2851, 5764, 4518, 26409, 1275, 2288, 8..."
0,4,"[3079, 3083, 1046, 2583, 4121, 538, 2076, 2078..."
0,5,"[1, 261, 265, 527, 531, 21, 150, 534, 410, 34,..."
...,...,...
0,606,"[1, 8195, 7, 11, 69644, 4109, 15, 17, 2073, 28..."
0,607,"[1, 517, 2053, 2054, 1544, 3081, 11, 1036, 257..."
0,608,"[1, 4105, 10, 6157, 16, 21, 31, 32, 2080, 34, ..."
0,609,"[1, 10, 650, 1161, 786, 150, 288, 161, 1056, 2..."


In [118]:
test_df

Unnamed: 0,userId,movies_rated_above_2
0,1,"[2654, 3703, 2291, 1552, 362, 2093, 235, 1224,..."
0,2,"[71535, 91529, 89774, 79132, 115713, 106782]"
0,3,"[6835, 5746, 7991, 5919]"
0,4,"[1073, 1196, 4273, 1136, 2351, 648, 1500, 176,..."
0,5,"[457, 595, 344, 608, 515, 153, 232, 50]"
...,...,...
0,606,"[2424, 6202, 51037, 2065, 307, 72603, 1221, 49..."
0,607,"[695, 3020, 1974, 3386, 150, 5060, 2737, 1918,..."
0,608,"[661, 1801, 88, 196, 5220, 7090, 3275, 6934, 1..."
0,609,"[1059, 454, 296, 208, 731, 742, 137, 231]"


In [119]:
# let's confirm if the first row of the transactional data has been split into 80-20.
print(len(transactional_df["movies_rated_above_2"].iloc[0]))
print(len(train_df["movies_rated_above_2"].iloc[0]))
print(len(test_df["movies_rated_above_2"].iloc[0]))

226
180
46


----

- Saving the 80% of training data and 20% of test data in the csv

In [120]:
# train_df.to_csv('./output/transactional_df_train.csv', index=False)
# test_df.to_csv('./output/transactional_df_test.csv', index=False)

----

#### **3. Association rule mining**

- From the training set, extract the set of all association rules of form X→Y, <br />
where X contains a single movie and Y contains the set of movies from the training set <br />
by employing the apriori or FPgrowth approach and set some minsup and minconf (eg : 50 and 0.1 respectively) <br />

In [121]:
# reading the training transactional data
# train_df = pd.read_csv('./output/transactional_df_train.csv')
# print('Shape of the dataset:', train_df.shape)
# train_df.head(5)

In [122]:
# reading the training transactional data
print('Shape of the dataset:', train_df.shape)
train_df.head(5)

Shape of the dataset: (607, 2)


Unnamed: 0,userId,movies_rated_above_2
0,1,"[1024, 1, 1025, 3, 2048, 1029, 6, 1030, 1031, ..."
0,2,"[122882, 48516, 80906, 91658, 131724, 77455, 1..."
0,3,"[70946, 2851, 5764, 4518, 26409, 1275, 2288, 8..."
0,4,"[3079, 3083, 1046, 2583, 4121, 538, 2076, 2078..."
0,5,"[1, 261, 265, 527, 531, 21, 150, 534, 410, 34,..."


In [123]:
# the type of the rows in the second column of the transactional dataframe 'train_df'
type(train_df.movies_rated_above_2.iloc[0])

list

In [124]:
train_df.movies_rated_above_2.iloc[0] 

[1024,
 1,
 1025,
 3,
 2048,
 1029,
 6,
 1030,
 1031,
 1032,
 2054,
 2058,
 1042,
 2580,
 1049,
 2078,
 3617,
 1060,
 1573,
 2596,
 552,
 553,
 2090,
 1580,
 2096,
 1073,
 50,
 1587,
 2099,
 1080,
 2105,
 2616,
 1089,
 1090,
 1092,
 2116,
 70,
 2628,
 1097,
 3147,
 590,
 592,
 1617,
 596,
 2644,
 1625,
 2137,
 2139,
 3671,
 2141,
 2143,
 2657,
 3168,
 101,
 1127,
 110,
 1136,
 2174,
 2700,
 2193,
 661,
 151,
 2716,
 3740,
 3744,
 673,
 163,
 3243,
 1197,
 1198,
 3247,
 3253,
 1206,
 1208,
 1213,
 1214,
 1220,
 1732,
 1222,
 2761,
 1226,
 3273,
 3793,
 216,
 2268,
 733,
 223,
 736,
 2273,
 3809,
 231,
 1256,
 1258,
 2797,
 1265,
 1777,
 1270,
 1275,
 1278,
 1793,
 1282,
 260,
 2826,
 1291,
 780,
 1804,
 1805,
 2329,
 804,
 2858,
 2353,
 2872,
 3386,
 316,
 2366,
 1348,
 333,
 2387,
 2899,
 1377,
 356,
 2916,
 2406,
 3439,
 3440,
 3441,
 1396,
 3448,
 3450,
 2427,
 1408,
 1920,
 2947,
 2949,
 1927,
 2959,
 2450,
 919,
 3479,
 923,
 2459,
 3489,
 1954,
 1445,
 2470,
 423,
 2985,
 2987,
 9

In [125]:
train_df_key_value = dict(zip(train_df['userId'], train_df['movies_rated_above_2']))
train_df_key_value

{1: [1024,
  1,
  1025,
  3,
  2048,
  1029,
  6,
  1030,
  1031,
  1032,
  2054,
  2058,
  1042,
  2580,
  1049,
  2078,
  3617,
  1060,
  1573,
  2596,
  552,
  553,
  2090,
  1580,
  2096,
  1073,
  50,
  1587,
  2099,
  1080,
  2105,
  2616,
  1089,
  1090,
  1092,
  2116,
  70,
  2628,
  1097,
  3147,
  590,
  592,
  1617,
  596,
  2644,
  1625,
  2137,
  2139,
  3671,
  2141,
  2143,
  2657,
  3168,
  101,
  1127,
  110,
  1136,
  2174,
  2700,
  2193,
  661,
  151,
  2716,
  3740,
  3744,
  673,
  163,
  3243,
  1197,
  1198,
  3247,
  3253,
  1206,
  1208,
  1213,
  1214,
  1220,
  1732,
  1222,
  2761,
  1226,
  3273,
  3793,
  216,
  2268,
  733,
  223,
  736,
  2273,
  3809,
  231,
  1256,
  1258,
  2797,
  1265,
  1777,
  1270,
  1275,
  1278,
  1793,
  1282,
  260,
  2826,
  1291,
  780,
  1804,
  1805,
  2329,
  804,
  2858,
  2353,
  2872,
  3386,
  316,
  2366,
  1348,
  333,
  2387,
  2899,
  1377,
  356,
  2916,
  2406,
  3439,
  3440,
  3441,
  1396,
  3448,
  3450,


In [126]:
test_df_key_value = dict(zip(test_df['userId'], test_df['movies_rated_above_2']))
test_df_key_value

{1: [2654,
  3703,
  2291,
  1552,
  362,
  2093,
  235,
  1224,
  1210,
  349,
  1676,
  2991,
  1298,
  2997,
  2115,
  3702,
  2542,
  2414,
  1240,
  3729,
  2640,
  543,
  1023,
  296,
  2161,
  527,
  2641,
  47,
  2692,
  1196,
  2094,
  1620,
  608,
  2948,
  2648,
  1644,
  157,
  954,
  2944,
  367,
  648,
  3639,
  2395,
  4006,
  2571,
  593],
 2: [71535, 91529, 89774, 79132, 115713, 106782],
 3: [6835, 5746, 7991, 5919],
 4: [1073,
  1196,
  4273,
  1136,
  2351,
  648,
  1500,
  176,
  1923,
  348,
  4902,
  232,
  937,
  914,
  1517,
  1907,
  1057,
  2204,
  2997,
  4381,
  4347,
  608,
  2926,
  1895,
  3408,
  21,
  4033,
  3809,
  4074,
  1025,
  4034,
  3358,
  2858,
  2843],
 5: [457, 595, 344, 608, 515, 153, 232, 50],
 6: [374,
  460,
  43,
  575,
  45,
  435,
  780,
  11,
  171,
  383,
  647,
  592,
  159,
  314,
  212,
  569,
  364,
  304,
  145,
  508,
  416,
  510,
  310,
  267,
  135,
  318,
  65,
  86,
  472,
  609,
  1049,
  243,
  353,
  330,
  835,
  432,

In [127]:
# #Save train_dict and test_dict 
# import pickle 
# with open('train_dict.pickle', 'wb') as handle:
#     pickle.dump(train_df_key_value, handle, protocol=pickle.HIGHEST_PROTOCOL)
# #Save train_dict and test_dict 
# with open('test_dict.pickle', 'wb') as handle:
#     pickle.dump(test_df_key_value, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [128]:
train_df_key_value[1]

[1024,
 1,
 1025,
 3,
 2048,
 1029,
 6,
 1030,
 1031,
 1032,
 2054,
 2058,
 1042,
 2580,
 1049,
 2078,
 3617,
 1060,
 1573,
 2596,
 552,
 553,
 2090,
 1580,
 2096,
 1073,
 50,
 1587,
 2099,
 1080,
 2105,
 2616,
 1089,
 1090,
 1092,
 2116,
 70,
 2628,
 1097,
 3147,
 590,
 592,
 1617,
 596,
 2644,
 1625,
 2137,
 2139,
 3671,
 2141,
 2143,
 2657,
 3168,
 101,
 1127,
 110,
 1136,
 2174,
 2700,
 2193,
 661,
 151,
 2716,
 3740,
 3744,
 673,
 163,
 3243,
 1197,
 1198,
 3247,
 3253,
 1206,
 1208,
 1213,
 1214,
 1220,
 1732,
 1222,
 2761,
 1226,
 3273,
 3793,
 216,
 2268,
 733,
 223,
 736,
 2273,
 3809,
 231,
 1256,
 1258,
 2797,
 1265,
 1777,
 1270,
 1275,
 1278,
 1793,
 1282,
 260,
 2826,
 1291,
 780,
 1804,
 1805,
 2329,
 804,
 2858,
 2353,
 2872,
 3386,
 316,
 2366,
 1348,
 333,
 2387,
 2899,
 1377,
 356,
 2916,
 2406,
 3439,
 3440,
 3441,
 1396,
 3448,
 3450,
 2427,
 1408,
 1920,
 2947,
 2949,
 1927,
 2959,
 2450,
 919,
 3479,
 923,
 2459,
 3489,
 1954,
 1445,
 2470,
 423,
 2985,
 2987,
 9

----

In [129]:
from collections import deque 

def traversetree(root):
    queue = deque([(root, root, 0)])
    while queue:
        parent_node, node, level = queue.popleft()
        print(f"{level = }")
        print(f"Parent: {parent_node.item}, Parent count: {parent_node.count}, Data: {node.item}, Count: {node.count}")
        for node_name in node.children:
            queue.append((node, node.children[node_name], level + 1))

def traverseheader(header_table):
    for key in header_table.keys():
        node = header_table[key]
        while node is not None:
            print(f"Header item: {key}, Link data: {node.item}, Link count: {node.count}")
            node = node.link 

In [130]:
#Global variable
id = 0
class Node:
    def __init__(self, item, count, parent):
        self.item = item           # Item value
        self.count = count         # Support count of the itemset
        self.parent = parent       # Parent node
        self.children = {}         # Children nodes (item: Node)
        self.link = None 

class FPGrowth:
    def __init__(self, data, minsup):
        self.data = data

    
    def find_frequent_items(self,data, minsup):
        header_table = {}
        for _, item_ls in data.items():
            for item in item_ls:
                header_table[item] = header_table.get(item, 0) + 1
        
        #Sort the dictionary
        # print(f"Before sorting {header_table = }")
        header_table = {k: v for k, v in sorted(header_table.items(), key=lambda item: (item[1], item[0]), reverse=True)}
        # print(f"After sorting {header_table = }")
        header_table = {k:-1 for k,v in header_table.items() if v>minsup}
        self.l = [*header_table.keys()]
        return header_table 
    
    #Constructing an FPTree
    def construct_fptree(self, data, header_table):
        root = Node(None,0,None)
        for _, transaction in data.items():
            ordered_transaction = [item for item in transaction if item in self.l]
            ordered_transaction.sort(key = lambda x:self.l.index(x))
            current_node = root
            # print(f"{ordered_transaction = }")
            for item in ordered_transaction:
                if item in current_node.children:
                    #Update the count of the already existing node
                    child_node = current_node.children[item]
                    child_node.count += 1
                else:
                    #Create a new node 
                    child_node = Node(item, 1, current_node)
                    current_node.children[item] = child_node
                    #Update header table
                    if item in header_table: #Why does this exist?
                        if header_table[item] == -1:
                            header_table[item] = child_node
                        else:
                            header_node = header_table[item]
                            while header_node.link is not None:
                                header_node =  header_node.link
                            header_node.link = child_node 
                current_node = child_node 
        return root, header_table

    #Mining an FPTree
    def mine_frequent_patterns(self, header_table, min_support, prefix=[]):
        global id
        frequent_patterns = []
        # Sort items in header table in descending order of frequency
        sorted_items = [item for item in header_table.keys()]
        sorted_items.sort(key=lambda x: (header_table[x].count, x))
        for item in sorted_items:
            new_prefix = prefix + [item]
            support = 0
            # Build the conditional pattern base
            conditional_dataset = {}
            node = header_table[item]
            while node is not None:
                count = node.count
                support += count 
                path = []
                current_node = node.parent
                while current_node.parent is not None:
                    path.append(current_node.item)
                    current_node = current_node.parent
                for _ in range(count):
                    conditional_dataset[id] = path
                    id += 1
                node = node.link
            frequent_patterns.append((new_prefix, support))
 
            
            # Recursively mine the conditional FP-tree
            conditional_header_table = self.find_frequent_items(conditional_dataset, min_support)
            root, conditional_header_table = self.construct_fptree(conditional_dataset, conditional_header_table)
            # print(f"Conditional prefix tree for prefix: {new_prefix}")
            # traversetree(root)
            # print()
            if conditional_header_table:
                frequent_patterns.extend(self.mine_frequent_patterns(conditional_header_table, min_support, new_prefix))
  
        return frequent_patterns
        

minsup = 50
FPGrowth_obj = FPGrowth(train_df_key_value, minsup)
header_table = FPGrowth_obj.find_frequent_items(train_df_key_value,minsup)
root, header_table = FPGrowth_obj.construct_fptree(train_df_key_value, header_table)
frequent_patterns = FPGrowth_obj.mine_frequent_patterns(header_table, minsup, [])
print(f"{frequent_patterns = }")
#For debugging
# traversetree(root)
# traverseheader(header_table)

frequent_patterns = [([1], 162), ([1, 527], 53), ([1, 589], 65), ([1, 589, 356], 51), ([1, 2959], 53), ([1, 593], 75), ([1, 593, 296], 52), ([1, 593, 356], 53), ([1, 110], 70), ([1, 110, 356], 51), ([1, 480], 71), ([1, 480, 356], 60), ([1, 318], 78), ([1, 318, 356], 57), ([1, 260], 77), ([1, 260, 356], 58), ([1, 2571], 75), ([1, 2571, 356], 58), ([1, 296], 83), ([1, 296, 356], 63), ([1, 356], 100), ([2], 77), ([2, 356], 57), ([6], 83), ([6, 593], 51), ([10], 91), ([10, 480], 52), ([10, 592], 52), ([10, 296], 54), ([10, 589], 54), ([10, 356], 68), ([11], 52), ([16], 63), ([21], 58), ([32], 137), ([32, 50], 59), ([32, 110], 56), ([32, 150], 54), ([32, 457], 59), ([32, 480], 61), ([32, 589], 65), ([32, 592], 57), ([32, 608], 63), ([32, 1196], 52), ([32, 1210], 51), ([32, 1], 64), ([32, 260], 63), ([32, 2571], 64), ([32, 47], 67), ([32, 356], 75), ([32, 356, 296], 56), ([32, 593], 65), ([32, 593, 296], 51), ([32, 318], 75), ([32, 318, 296], 52), ([32, 318, 356], 53), ([32, 296], 79), ([34]

In [131]:
#Testing with in-built python package
transactions = [['f', 'a', 'c', 'd', 'g', 'i', 'm', 'p'], ['a', 'b', 'c', 'f', 'l', 'm', 'o'],['b', 'f', 'h', 'j', 'o'], \
               ['b', 'c', 'k', 's', 'p'],['a', 'f', 'c', 'e', 'l', 'p', 'm', 'n']]
patterns = pyfpgrowth.find_frequent_patterns(transactions, 3)
print(f"{patterns = }")

patterns = {('a', 'c'): 3, ('a', 'f'): 3, ('a', 'm'): 3, ('c', 'm'): 3, ('a', 'c', 'm'): 3, ('f', 'm'): 3, ('a', 'f', 'm'): 3, ('p',): 3, ('c', 'p'): 3, ('b',): 3, ('f',): 4, ('c',): 4}
