In [1]:
%pip install networkx

Note: you may need to restart the kernel to use updated packages.


In [95]:
import pandas as pd
import numpy as np
import json
from collections import defaultdict, namedtuple, OrderedDict
import networkx as nx
import matplotlib.pyplot as plt

In [3]:
rating_csv = 'dataset/ratings.csv'
movies_csv = 'dataset/movies.csv'

# Preprocessing

In [4]:
ratings_df = pd.read_csv(rating_csv)
ratings_df.head(10)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
5,1,70,3.0,964982400
6,1,101,5.0,964980868
7,1,110,4.0,964982176
8,1,151,5.0,964984041
9,1,157,5.0,964984100


In [5]:
movies_df = pd.read_csv(movies_csv)
movies_df.head(10)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
5,6,Heat (1995),Action|Crime|Thriller
6,7,Sabrina (1995),Comedy|Romance
7,8,Tom and Huck (1995),Adventure|Children
8,9,Sudden Death (1995),Action
9,10,GoldenEye (1995),Action|Adventure|Thriller


## Cleaning

In [6]:
ratings_df = ratings_df.drop(labels=['timestamp'], axis=1)

In [7]:
ratings_df.head(100)

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0
...,...,...,...
95,1,1445,3.0
96,1,1473,4.0
97,1,1500,4.0
98,1,1517,5.0


## Transactional dataset

In [8]:
ratings_df = ratings_df[ratings_df['rating'] > 2]

In [9]:
ratings_df.head(100)

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0
...,...,...,...
96,1,1473,4.0
97,1,1500,4.0
98,1,1517,5.0
99,1,1552,4.0


In [10]:
user_counts = pd.DataFrame(ratings_df['userId'].value_counts().sort_values())
user_counts

Unnamed: 0_level_0,count
userId,Unnamed: 1_level_1
442,2
508,6
293,10
329,11
431,13
...,...
610,1233
448,1255
599,1794
474,1853


In [11]:
active_users = user_counts[user_counts['count'] > 10].index
ratings_df = ratings_df[ratings_df['userId'].isin(active_users)]

In [12]:
ratings_df

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0
...,...,...,...
100831,610,166534,4.0
100832,610,168248,5.0
100833,610,168250,5.0
100834,610,168252,5.0


In [13]:
transactions = (ratings_df
                 .groupby('userId')['movieId']
                 .apply(lambda x: (list(set(x)), len(x)))
                 .reset_index())

transactions[['movies', 'count']] = pd.DataFrame(transactions['movieId'].tolist(), index=transactions.index)
transactions.drop(columns=['movieId'], inplace=True)

transactions

Unnamed: 0,userId,movies,count
0,1,"[1024, 1, 1025, 3, 2048, 1029, 6, 1030, 1031, ...",226
1,2,"[115713, 122882, 48516, 91529, 80906, 91658, 1...",28
2,3,"[70946, 2851, 5764, 4518, 26409, 7991, 1275, 2...",18
3,4,"[1025, 3079, 3083, 21, 1046, 2583, 4121, 538, ...",167
4,5,"[1, 515, 261, 265, 527, 531, 21, 150, 534, 153...",40
...,...,...,...
602,606,"[1, 8195, 6148, 7, 11, 69644, 4109, 15, 17, 18...",1070
603,607,"[1, 517, 2053, 2054, 1544, 3081, 11, 1036, 257...",174
604,608,"[1, 4105, 10, 6157, 16, 21, 31, 32, 2080, 34, ...",670
605,609,"[1, 137, 10, 650, 1161, 786, 150, 288, 161, 10...",37


In [14]:
transactions.to_csv('transaction_dataset.csv', header=True)

In [15]:
transactions = pd.read_csv('transaction_dataset.csv')
transactions.head(10)

Unnamed: 0.1,Unnamed: 0,userId,movies,count
0,0,1,"[1024, 1, 1025, 3, 2048, 1029, 6, 1030, 1031, ...",226
1,1,2,"[115713, 122882, 48516, 91529, 80906, 91658, 1...",28
2,2,3,"[70946, 2851, 5764, 4518, 26409, 7991, 1275, 2...",18
3,3,4,"[1025, 3079, 3083, 21, 1046, 2583, 4121, 538, ...",167
4,4,5,"[1, 515, 261, 265, 527, 531, 21, 150, 534, 153...",40
5,5,6,"[2, 3, 4, 5, 6, 7, 8, 515, 10, 11, 516, 13, 52...",294
6,6,7,"[1, 8207, 34319, 42002, 3114, 1584, 50, 58, 16...",111
7,7,8,"[2, 11, 141, 527, 21, 150, 282, 539, 32, 34, 2...",43
8,8,9,"[3328, 4993, 5378, 5890, 5893, 1674, 5902, 373...",34
9,9,10,"[7169, 33794, 6155, 54286, 72720, 86548, 30749...",119


In [16]:
def split_movies(movies, test_size=0.2):
    movies = json.loads(movies)  # Convert set to list
    split_idx = int(len(movies) * (1 - test_size))  # Index for 80/20 split
    train_movies = set(movies[:split_idx])  # First 80% for training
    test_movies = set(movies[split_idx:])  # Last 20% for test
    return (train_movies, test_movies)

transactions[['trainMovies', 'testMovies']] = transactions['movies'].apply(
    lambda movies: pd.Series(split_movies(movies))
)
transactions = transactions.drop(labels='movies', axis=1)

In [17]:
transactions.head(10)

Unnamed: 0.1,Unnamed: 0,userId,count,trainMovies,testMovies
0,0,1,226,"{1024, 1, 1025, 3, 2048, 1029, 6, 1030, 1031, ...","{2459, 3489, 1954, 1445, 2470, 423, 4006, 2985..."
1,1,2,28,"{115713, 122882, 48516, 91529, 80906, 91658, 1...","{46970, 80489, 71535, 74458, 6874, 8798}"
2,2,3,18,"{70946, 2851, 5764, 4518, 3703, 26409, 2288, 8...","{1371, 5181, 7899, 5919}"
3,3,4,167,"{1025, 3079, 3083, 21, 1046, 2583, 4121, 538, ...","{904, 908, 910, 912, 914, 919, 920, 1947, 3996..."
4,4,5,40,"{1, 515, 261, 265, 527, 531, 21, 150, 534, 153...","{608, 232, 364, 110, 367, 247, 253, 349}"
5,5,6,294,"{2, 3, 4, 5, 6, 7, 8, 515, 10, 11, 516, 13, 52...","{509, 510, 405, 410, 412, 415, 416, 505, 419, ..."
6,6,7,111,"{1, 34319, 8207, 42002, 3114, 1584, 50, 58, 16...","{4995, 3977, 33162, 6539, 920, 3994, 924, 1954..."
7,7,8,43,"{2, 11, 141, 527, 21, 150, 282, 539, 32, 34, 2...","{235, 364, 236, 110, 367, 252, 377, 380, 253}"
8,8,9,34,"{3328, 4993, 5378, 5890, 5893, 1674, 5902, 373...","{5481, 5872, 6001, 371, 627, 1270, 2300}"
9,9,10,119,"{7169, 33794, 6155, 54286, 72720, 86548, 30749...","{103335, 103339, 78772, 81845, 104374, 81847, ..."


In [18]:
inp = transactions[['userId', 'trainMovies']]
inp.columns = ['Id', 'tr']
n_inp = inp.shape[0]

inp


Unnamed: 0,Id,tr
0,1,"{1024, 1, 1025, 3, 2048, 1029, 6, 1030, 1031, ..."
1,2,"{115713, 122882, 48516, 91529, 80906, 91658, 1..."
2,3,"{70946, 2851, 5764, 4518, 3703, 26409, 2288, 8..."
3,4,"{1025, 3079, 3083, 21, 1046, 2583, 4121, 538, ..."
4,5,"{1, 515, 261, 265, 527, 531, 21, 150, 534, 153..."
...,...,...
602,606,"{1, 8195, 6148, 7, 11, 69644, 4109, 15, 17, 18..."
603,607,"{1, 2053, 517, 2054, 1544, 3081, 11, 1036, 257..."
604,608,"{1, 4105, 10, 6157, 16, 21, 31, 32, 2080, 34, ..."
605,609,"{1, 137, 10, 1161, 650, 786, 150, 288, 161, 10..."


In [19]:
min_sup = 50
min_conf = 0.1

## FPTree

### 1-Itemsets

In [20]:
it_set_1 = set()

for index, row in inp.iterrows():
  Id, tr = row
  it_set_1 = it_set_1.union(tr)

it_set_1 = list(it_set_1)

print('distinct movies:', len(it_set_1))

mp = {}

for index, row in inp.iterrows():
  Id, tr = row
  for mId in tr:
    if mId in mp:
      mp[mId]+=1
    else:
      mp[mId]=1
      
it_df_1 = pd.DataFrame(list(mp.items()), columns=['mId', 'Support'])
it_df_1 = it_df_1.sort_values(by='Support',ascending=False)
it_df_1 = it_df_1[it_df_1['Support'] >= min_sup]

print('distinct movies after filtering minsup:', it_df_1.shape[0])

it_set_1 = it_df_1['mId'].tolist()

it_df_1

distinct movies: 7762
distinct movies after filtering minsup: 264


Unnamed: 0,mId,Support
196,318,307
141,296,289
51,593,265
12,2571,264
155,356,240
...,...,...
512,353,50
545,5816,50
1291,19,50
1719,786,50


### sorting movies based on frequency

In [21]:
def filter_and_sort_by_freq(movies):
    movies = list(movies)
    filtered_movies = filter(lambda x: (x in it_set_1), movies)
    sorted_movies = sorted(filtered_movies, key=lambda x: it_set_1.index(x))
    return sorted_movies

inp['tr'] = inp['tr'].apply(filter_and_sort_by_freq)
inp

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  inp['tr'] = inp['tr'].apply(filter_and_sort_by_freq)


Unnamed: 0,Id,tr
0,1,"[296, 593, 2571, 356, 260, 527, 1, 1196, 50, 1..."
1,2,"[318, 58559, 79132, 1704, 68157, 48516, 99114,..."
2,3,[]
3,4,"[593, 260, 1196, 1198, 2858, 588, 2762, 648, 6..."
4,5,"[318, 296, 589, 527, 1, 50, 150, 592, 588, 590..."
...,...,...
602,606,"[318, 296, 593, 2571, 356, 260, 589, 527, 1, 1..."
603,607,"[318, 296, 593, 2571, 260, 589, 527, 1, 1196, ..."
604,608,"[318, 296, 593, 2571, 356, 260, 589, 527, 1, 1..."
605,609,"[318, 296, 356, 589, 1, 150, 592, 590, 457, 10..."


### Construct FPTree

In [109]:
import uuid


class FPNode:
    def __init__(self, item, count, parent) -> None:
        self.item = item
        self.parent = parent
        self.children = {}
        self.link = None
        self.count = count

    def increment(self, count):
        self.count = self.count + count


class FPTree:
    def __init__(self, transactions, min_sup) -> None:
        self.transactions = transactions
        self.min_sup = min_sup
        self.header_table = defaultdict(list)
        self.root = None

    def fit(self):
        self.root = self._build_tree()

    def _build_tree(self):
        root = FPNode(None, 1, None)
        for transaction in self.transactions:
            self._insert_transaction(root, transaction)
        return root

    def _insert_transaction(self, node, transaction):
        if len(transaction) == 0:
            return

        first_item = transaction[0]
        if first_item in node.children:
            node.children[first_item].increment(1)
        else:
            new_node = FPNode(first_item, 1, node)
            node.children[first_item] = new_node
            self.header_table[first_item].append(new_node)

        # Recursively insert the rest of the transaction
        remaining_tr = transaction[1:]
        self._insert_transaction(node.children[first_item], remaining_tr)

    def _prefix_path(self, node):
        path = []
        while node and node.parent.item is not None:
            path.append(node.item)
            node = node.parent
        return path[::-1]
    
    def mine_patterns(self):
        patterns = {}
        final_cond_base = []
        # Process items in reverse order of their frequency (from the header table)
        for item in reversed(it_set_1):
            conditional_patterns = []
            for node in self.header_table[item]:
                path = self._prefix_path(node)
                # print('Path for ', item, ':', path[1:10],'...')
                if path:
                    conditional_patterns.append( (path, node.count))

            leaf_item_freq = OrderedDict()
            for pattern in conditional_patterns:
                path, freq = pattern
                for node in path:
                    if node not in leaf_item_freq:
                        leaf_item_freq[node] = freq
                    else:
                        leaf_item_freq[node] += freq
            
            leaf_item_freq = {k:v for k,v in leaf_item_freq.items() if v >= self.min_sup}
            
            for pattern in conditional_patterns:
              path, freq = pattern
              temp = []
              for node in path:
                if(node in leaf_item_freq):
                  temp.append(node)
              
              final_cond_base.append(temp)
            
            if(len(conditional_patterns)==1):
              patterns[tuple(sorted(list(conditional_patterns[0][0])))] = conditional_patterns[0][1]
            else:
              conditional_tree = FPTree([p[0:len(p)-1] for p, _ in conditional_patterns], self.min_sup)
              conditional_tree.fit()
              conditional_patterns_freq = conditional_tree.mine_patterns()
              for pattern, count in conditional_patterns_freq.items():
                  patterns[tuple(sorted(list(pattern) + [item]))] = count

            item_count = sum([count for _, count in conditional_patterns])
            if item_count >= self.min_sup:
                patterns[(item,)] = item_count

        # unique_cond_base_set = set(map(tuple,final_cond_base))
        # unique_cond_base_list =list(unique_cond_base_set)

        return patterns

In [110]:
fpTree = FPTree(inp['tr'].tolist(), min_sup)
fpTree.fit()
patterns = fpTree.mine_patterns()
print(len(patterns))
# fpTree.visualize_tree(max_depth=2)

RecursionError: maximum recursion depth exceeded

In [77]:
print(fpTree.transactions)

[[296, 593, 2571, 356, 260, 527, 1, 1196, 50, 1198, 47, 2858, 1210, 592, 780, 590, 648, 608, 2959, 1197, 1291, 1214, 1270, 110, 1089, 2329, 1213, 1265, 316, 1240, 1136, 1073, 1580, 1206, 2716, 1097, 3147, 733, 1208, 2628, 6, 2115, 1222, 1258, 349, 223, 3793, 1220, 1080, 736, 2797, 2916, 2291, 2692, 2174, 367, 2700, 231, 1090, 163, 2353, 553, 1732, 2640, 1617, 1278, 919, 2268, 3253, 552, 1573, 596, 1127, 2078], [318, 58559, 79132, 1704, 68157, 48516, 99114, 91529], [], [593, 260, 1196, 1198, 2858, 588, 2762, 648, 608, 595, 1197, 1291, 1213, 1265, 1136, 1073, 1580, 4896, 1259, 21, 2683, 1219, 1080, 1225, 2692, 2174, 2791, 357, 1288, 3408, 4246, 1304, 1732, 1203, 708, 2078], [318, 296, 589, 527, 1, 50, 150, 592, 588, 590, 595, 597, 457, 34, 344, 153, 39, 21, 300, 594, 36, 596], [318, 593, 356, 589, 527, 47, 150, 592, 780, 32, 588, 590, 608, 595, 110, 165, 293, 597, 316, 10, 1073, 34, 344, 364, 153, 2, 587, 6, 329, 539, 161, 292, 349, 339, 185, 208, 141, 736, 16, 367, 104, 594, 62, 337, 23