In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import networkx as nx
%matplotlib inline

In [24]:
file_path = './Data/letterbox_anonym.csv'
lboxd_data = pd.read_csv(file_path, sep=';', index_col=0)

In [25]:
lboxd_data

Unnamed: 0,user,title,rating
0,144,Puss in Boots: The Last Wish,4.5
1,144,The Guardians of the Galaxy Holiday Special,4.0
2,144,Dinosaur Hotel 2,2.0
3,144,Strange World,2.5
4,144,Zen - Grogu and Dust Bunnies,3.0
...,...,...,...
1433507,290,Newark Athlete,0.0
1433508,290,Roundhay Garden Scene,0.0
1433509,290,Sallie Gardner at a Gallop,0.0
1433510,290,This Land Is Mine,0.0


#### Looking at the dataset
- We have ratings from 0 to 5, 0 probably meaning that the movie was watched but not rated by the user, still useful for community detection though
- We have 108'276 movies
- We have 557 users

In [26]:
print('Unique ratings:', sorted(lboxd_data['rating'].unique()))
print('Number of unique movies:', len(lboxd_data['title'].unique()))
print('Number of unique users:', len(lboxd_data['user'].unique()))

Unique ratings: [0.0, 0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 4.0, 4.5, 5.0]
Number of unique movies: 108276
Number of unique users: 557


There are 22 missing movies
- The rows are only missing the title of the movie, the user id and rating is present.
- Not sure what caused this problem but it is not possible to infer the titles as users are anonymous and the dataset doesn't have supporting data.

In [27]:
for column in lboxd_data.columns:

    i_nans = lboxd_data[lboxd_data[column].isna()].index.tolist()
    n_nans = sum(lboxd_data[column].isna())
    print(f'[{column}] column has [{n_nans}] missing values.')
    print(lboxd_data.iloc[i_nans, :])

[user] column has [0] missing values.
Empty DataFrame
Columns: [user, title, rating]
Index: []
[title] column has [22] missing values.
         user title  rating
44351      77   NaN     3.0
58668     206   NaN     0.0
121472     66   NaN     3.5
127009     87   NaN     3.5
160605    115   NaN     0.0
164796    253   NaN     4.0
240451    495   NaN     3.0
545240    302   NaN     0.0
550152     14   NaN     4.5
562797     12   NaN     1.0
687934    441   NaN     3.5
709728    148   NaN     1.0
790208    100   NaN     3.5
802929    190   NaN     5.0
845228    204   NaN     0.0
866134     33   NaN     1.5
905982    481   NaN     0.0
1001948   458   NaN     0.0
1060901   273   NaN     0.0
1194825   412   NaN     0.0
1247769   256   NaN     0.0
1250540   298   NaN     0.0
[rating] column has [0] missing values.
Empty DataFrame
Columns: [user, title, rating]
Index: []


Removing the rows with missing values
- clean_lboxd has 0 missing values in all columns.
- clean_lboxd has 22 less rows than the original dataset.

In [28]:
clean_lboxd = lboxd_data.dropna()

for column in clean_lboxd.columns:

    n_nans = sum(clean_lboxd[column].isna())
    print(f'[{column}] column has [{n_nans}] missing values.')

print('Number of ratings lost:',len(lboxd_data) - len(clean_lboxd))

[user] column has [0] missing values.
[title] column has [0] missing values.
[rating] column has [0] missing values.
Number of ratings lost: 22


Casting the columns to their appropriate datatypes
- To ensure that we have no problems with the types

In [31]:
clean_lboxd = clean_lboxd.astype({'user':int, 'title':str, 'rating':float})
clean_lboxd.dtypes

user        int32
title      object
rating    float64
dtype: object

In [33]:
clean_lboxd

Unnamed: 0,user,title,rating
0,144,Puss in Boots: The Last Wish,4.5
1,144,The Guardians of the Galaxy Holiday Special,4.0
2,144,Dinosaur Hotel 2,2.0
3,144,Strange World,2.5
4,144,Zen - Grogu and Dust Bunnies,3.0
...,...,...,...
1433507,290,Newark Athlete,0.0
1433508,290,Roundhay Garden Scene,0.0
1433509,290,Sallie Gardner at a Gallop,0.0
1433510,290,This Land Is Mine,0.0


#### Adding an ID columns as an alternative
- In case we feel that some movie titles are hard to work with
- Starting at 1000 to ensure there is no overlapping with the user ids (go up to 557)

In [34]:
# Add a unique ID for each movie starting at 1000 to ensure that the user IDs are not the same as the movie IDs
# Each movie title has to be assigned an unique ID
clean_lboxd['movie_id'] = clean_lboxd['title'].astype('category').cat.codes + 1000
clean_lboxd

Unnamed: 0,user,title,rating,movie_id
0,144,Puss in Boots: The Last Wish,4.5,66829
1,144,The Guardians of the Galaxy Holiday Special,4.0,87522
2,144,Dinosaur Hotel 2,2.0,25219
3,144,Strange World,2.5,77851
4,144,Zen - Grogu and Dust Bunnies,3.0,108598
...,...,...,...,...
1433507,290,Newark Athlete,0.0,58824
1433508,290,Roundhay Garden Scene,0.0,70149
1433509,290,Sallie Gardner at a Gallop,0.0,70959
1433510,290,This Land Is Mine,0.0,98758


In [35]:
# Check the minimum and maximum movie IDs
print(clean_lboxd['movie_id'].min())
print(clean_lboxd['movie_id'].max())

1000
109274


In [36]:
# Check minimum and maximum user IDs
print(clean_lboxd['user'].min())
print(clean_lboxd['user'].max())

0
556


In [37]:
# Check the number of unique movie IDs
len(clean_lboxd['movie_id'].unique())

108275

In [38]:
# Check which movie title corresponds to the movie ID 1000
print(clean_lboxd[clean_lboxd['movie_id'] == 1000])

# Check how many times that movie_id appears in the dataset
print(clean_lboxd['movie_id'].value_counts()[1000])

         user                  title  rating  movie_id
404493    167  !Women Art Revolution     4.5      1000
1040524   264  !Women Art Revolution     3.0      1000
2


#### Sampling from the dataset
- Our dataset is quite large (1M4 ratings), and while testing bilouvain we found that runtime was extremely long (didn't converge after almost 20h)
- To ensure that we can test our algorithm we need to sample the dataset, however it's hard to do sampling while keeping network structure.
- A way I thought of doing so is by removing movies that appear only once in the dataset (meaning only one user rated them) as they might not have a significant impact on the communities.

In [39]:
# Make a new df without the movies that appear only once
movies_count = clean_lboxd['title'].value_counts()
movies_to_keep = movies_count[movies_count > 1].index
new_lboxd_data = clean_lboxd[clean_lboxd['title'].isin(movies_to_keep)]
new_lboxd_data

Unnamed: 0,user,title,rating,movie_id
0,144,Puss in Boots: The Last Wish,4.5,66829
1,144,The Guardians of the Galaxy Holiday Special,4.0,87522
2,144,Dinosaur Hotel 2,2.0,25219
3,144,Strange World,2.5,77851
4,144,Zen - Grogu and Dust Bunnies,3.0,108598
...,...,...,...,...
1433506,290,Blacksmith Scene,0.0,13842
1433507,290,Newark Athlete,0.0,58824
1433508,290,Roundhay Garden Scene,0.0,70149
1433509,290,Sallie Gardner at a Gallop,0.0,70959


In [40]:
len(clean_lboxd) - len(new_lboxd_data)

45105

In [11]:
# Export the new df to a csv file
new_lboxd_data.to_csv('./Data/lboxd_no_single_IDcoded.csv', index=False)

In [13]:
# Export the original clean df to a csv file
clean_lboxd.to_csv('./Data/lboxd_IDcoded.csv', index=False)

In [10]:
# Convert the new_lboxd_data to a networkx bipartite graph with users and movies as nodes
G = nx.Graph()
G.add_nodes_from(new_lboxd_data['user'].unique(), bipartite=0)
G.add_nodes_from(new_lboxd_data['title'].unique(), bipartite=1)
G.add_edges_from(new_lboxd_data[['user', 'title']].values)

In [11]:
# Check if the graph is bipartite
nx.is_bipartite(G)

True

In [15]:
# Check the number of nodes and edges
print(nx.number_of_nodes(G))
print(nx.number_of_edges(G))

# Get the sets of nodes
users = {n for n, d in G.nodes(data=True) if d['bipartite'] == 0}
movies = {n for n, d in G.nodes(data=True) if d['bipartite'] == 1}

# Print the number of nodes in the sets
print(len(users))
print(len(movies))

63727
1388385
557
63170
