In [39]:
import pandas as pd
import numpy as np
import json

In [2]:
rating_csv = 'dataset/ratings.csv'
movies_csv = 'dataset/movies.csv'

# Preprocessing

In [3]:
ratings_df = pd.read_csv(rating_csv)
ratings_df.head(10)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
5,1,70,3.0,964982400
6,1,101,5.0,964980868
7,1,110,4.0,964982176
8,1,151,5.0,964984041
9,1,157,5.0,964984100


In [4]:
movies_df = pd.read_csv(movies_csv)
movies_df.head(10)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
5,6,Heat (1995),Action|Crime|Thriller
6,7,Sabrina (1995),Comedy|Romance
7,8,Tom and Huck (1995),Adventure|Children
8,9,Sudden Death (1995),Action
9,10,GoldenEye (1995),Action|Adventure|Thriller


## Cleaning

In [5]:
ratings_df = ratings_df.drop(labels=['timestamp'], axis=1)

In [6]:
ratings_df.head(100)

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0
...,...,...,...
95,1,1445,3.0
96,1,1473,4.0
97,1,1500,4.0
98,1,1517,5.0


## Transactional dataset

In [7]:
ratings_df = ratings_df[ratings_df['rating'] > 2]

In [8]:
ratings_df.head(100)

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0
...,...,...,...
96,1,1473,4.0
97,1,1500,4.0
98,1,1517,5.0
99,1,1552,4.0


In [9]:
user_counts = pd.DataFrame(ratings_df['userId'].value_counts().sort_values())
user_counts

Unnamed: 0_level_0,count
userId,Unnamed: 1_level_1
442,2
508,6
293,10
329,11
431,13
...,...
610,1233
448,1255
599,1794
474,1853


In [10]:
active_users = user_counts[user_counts['count'] > 10].index
ratings_df = ratings_df[ratings_df['userId'].isin(active_users)]

In [34]:
ratings_df

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0
...,...,...,...
100831,610,166534,4.0
100832,610,168248,5.0
100833,610,168250,5.0
100834,610,168252,5.0


In [35]:
transactions = (ratings_df
                 .groupby('userId')['movieId']
                 .apply(lambda x: (list(set(x)), len(x)))
                 .reset_index())

transactions[['movies', 'count']] = pd.DataFrame(transactions['movieId'].tolist(), index=transactions.index)
transactions.drop(columns=['movieId'], inplace=True)

transactions

Unnamed: 0,userId,movies,count
0,1,"[1024, 1, 1025, 3, 2048, 1029, 6, 1030, 1031, ...",226
1,2,"[115713, 122882, 48516, 91529, 80906, 91658, 1...",28
2,3,"[70946, 2851, 5764, 4518, 26409, 7991, 1275, 2...",18
3,4,"[1025, 3079, 3083, 21, 1046, 2583, 4121, 538, ...",167
4,5,"[1, 515, 261, 265, 527, 531, 21, 150, 534, 153...",40
...,...,...,...
602,606,"[1, 8195, 6148, 7, 11, 69644, 4109, 15, 17, 18...",1070
603,607,"[1, 517, 2053, 2054, 1544, 3081, 11, 1036, 257...",174
604,608,"[1, 4105, 10, 6157, 16, 21, 31, 32, 2080, 34, ...",670
605,609,"[1, 137, 10, 650, 1161, 786, 150, 288, 161, 10...",37


In [36]:
transactions.to_csv('transaction_dataset.csv', header=True)

In [59]:
transactions = pd.read_csv('transaction_dataset.csv')
transactions.head(10)

Unnamed: 0.1,Unnamed: 0,userId,movies,count
0,0,1,"[1024, 1, 1025, 3, 2048, 1029, 6, 1030, 1031, ...",226
1,1,2,"[115713, 122882, 48516, 91529, 80906, 91658, 1...",28
2,2,3,"[70946, 2851, 5764, 4518, 26409, 7991, 1275, 2...",18
3,3,4,"[1025, 3079, 3083, 21, 1046, 2583, 4121, 538, ...",167
4,4,5,"[1, 515, 261, 265, 527, 531, 21, 150, 534, 153...",40
5,5,6,"[2, 3, 4, 5, 6, 7, 8, 515, 10, 11, 516, 13, 52...",294
6,6,7,"[1, 8207, 34319, 42002, 3114, 1584, 50, 58, 16...",111
7,7,8,"[2, 11, 141, 527, 21, 150, 282, 539, 32, 34, 2...",43
8,8,9,"[3328, 4993, 5378, 5890, 5893, 1674, 5902, 373...",34
9,9,10,"[7169, 33794, 6155, 54286, 72720, 86548, 30749...",119


In [60]:
def split_movies(movies, test_size=0.2):
    movies = json.loads(movies)  # Convert set to list
    split_idx = int(len(movies) * (1 - test_size))  # Index for 80/20 split
    train_movies = set(movies[:split_idx])  # First 80% for training
    test_movies = set(movies[split_idx:])  # Last 20% for test
    return (train_movies, test_movies)

transactions[['trainMovies', 'testMovies']] = transactions['movies'].apply(
    lambda movies: pd.Series(split_movies(movies))
)
transactions = transactions.drop(labels='movies', axis=1)

In [61]:
transactions.head(10)

Unnamed: 0.1,Unnamed: 0,userId,count,trainMovies,testMovies
0,0,1,226,"{1024, 1, 1025, 3, 2048, 1029, 6, 1030, 1031, ...","{2459, 3489, 1954, 1445, 2470, 423, 4006, 2985..."
1,1,2,28,"{115713, 122882, 48516, 91529, 80906, 91658, 1...","{46970, 80489, 71535, 74458, 6874, 8798}"
2,2,3,18,"{70946, 2851, 5764, 4518, 3703, 26409, 2288, 8...","{1371, 5181, 7899, 5919}"
3,3,4,167,"{1025, 3079, 3083, 21, 1046, 2583, 4121, 538, ...","{904, 908, 910, 912, 914, 919, 920, 1947, 3996..."
4,4,5,40,"{1, 515, 261, 265, 527, 531, 21, 150, 534, 153...","{608, 232, 364, 110, 367, 247, 253, 349}"
5,5,6,294,"{2, 3, 4, 5, 6, 7, 8, 515, 10, 11, 516, 13, 52...","{509, 510, 405, 410, 412, 415, 416, 505, 419, ..."
6,6,7,111,"{1, 34319, 8207, 42002, 3114, 1584, 50, 58, 16...","{4995, 3977, 33162, 6539, 920, 3994, 924, 1954..."
7,7,8,43,"{2, 11, 141, 527, 21, 150, 282, 539, 32, 34, 2...","{235, 364, 236, 110, 367, 252, 377, 380, 253}"
8,8,9,34,"{3328, 4993, 5378, 5890, 5893, 1674, 5902, 373...","{5481, 5872, 6001, 371, 627, 1270, 2300}"
9,9,10,119,"{7169, 33794, 6155, 54286, 72720, 86548, 30749...","{103335, 103339, 78772, 81845, 104374, 81847, ..."
