In [1]:
import numpy as np
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
import time
import os

In [2]:
data_filepath = "./Data/letterbox_anonym.csv"
letterboxd = pd.read_csv(data_filepath, sep=";", index_col=0)

letterboxd

Unnamed: 0,user,title,rating
0,144,Puss in Boots: The Last Wish,4.5
1,144,The Guardians of the Galaxy Holiday Special,4.0
2,144,Dinosaur Hotel 2,2.0
3,144,Strange World,2.5
4,144,Zen - Grogu and Dust Bunnies,3.0
...,...,...,...
1433507,290,Newark Athlete,0.0
1433508,290,Roundhay Garden Scene,0.0
1433509,290,Sallie Gardner at a Gallop,0.0
1433510,290,This Land Is Mine,0.0


In [14]:
clean_lbxd = letterboxd.dropna()
clean_lbxd = clean_lbxd.rename(columns={'title': 'name'})
clean_lbxd

Unnamed: 0,user,name,rating
0,144,Puss in Boots: The Last Wish,4.5
1,144,The Guardians of the Galaxy Holiday Special,4.0
2,144,Dinosaur Hotel 2,2.0
3,144,Strange World,2.5
4,144,Zen - Grogu and Dust Bunnies,3.0
...,...,...,...
1433507,290,Newark Athlete,0.0
1433508,290,Roundhay Garden Scene,0.0
1433509,290,Sallie Gardner at a Gallop,0.0
1433510,290,This Land Is Mine,0.0


In [29]:
len(clean_lbxd['name'].unique())

108275

In [None]:
for rating in sorted(clean_lbxd['rating'].unique()):
    
    n_ratings = len(clean_lbxd[clean_lbxd['rating'] == rating])
    print(f"{rating}* -> {n_ratings}")

### Movie Databases

In [None]:
# Ended up not needing to use these databases (as they don't contain that many movies we have)

#mov_imdb_path = "./Dataset/movies_imdb.csv"
#mov_imdb = pd.read_csv(mov_imdb_path)

#mov_data_path = "./Dataset/movie_data.csv"
#mov_data = pd.read_csv(mov_data_path)

**Lbxd Movie Data**

Datasets not added to github because way too big !

In [4]:
mov_lbxd_path = "./Dataset/movies.csv"
mov_lbxd = pd.read_csv(mov_lbxd_path)

genre_lbxd_path = "./Dataset/lbxd_genres.csv"
genre_lbxd = pd.read_csv(genre_lbxd_path)

countries_lbxd_path = "./Dataset/lbxd_countries.csv"
countries_lbxd = pd.read_csv(countries_lbxd_path)

In [5]:
mov_lbxd = mov_lbxd.drop(['tagline', 'description'], axis=1)
mov_lbxd

Unnamed: 0,id,name,date,minute,rating
0,1000001,Barbie,2023.0,114.0,3.91
1,1000002,Parasite,2019.0,133.0,4.57
2,1000003,Everything Everywhere All at Once,2022.0,140.0,4.32
3,1000004,Fight Club,1999.0,139.0,4.27
4,1000005,Interstellar,2014.0,169.0,4.32
...,...,...,...,...,...
896395,1896389,伝七捕物帖 銀蛇呪文,,98.0,
896396,1896390,太阳的子民 Sa'icelen,,38.0,
896397,1896391,柴咲コウ CONCERT TOUR 2023 ACTOR'S THE BEST,,,
896398,1896392,相知相守民歌45演唱会,,262.0,


In [None]:
genre_lbxd

In [None]:
countries_lbxd

**Augmenting the dataset of lbxd movies with genre and country**

In [6]:
# Merging the movies from letterboxd dataset with the genres letterboxd dataset
merged_df = pd.merge(mov_lbxd, genre_lbxd, on='id', how='outer')

# Grouping according to id (because there are duplicate ids on genre)
# then aggregating according the name, date, minute, rating, and genre as a list to contain all genres
merged_df = merged_df.groupby(['id'], as_index=False).agg({
    'name': 'first',
    'date': 'first',
    'minute': 'first',
    'rating': 'first',
    'genre': list
})
merged_df

Unnamed: 0,id,name,date,minute,rating,genre
0,1000001,Barbie,2023.0,114.0,3.91,"[Comedy, Fantasy, Adventure]"
1,1000002,Parasite,2019.0,133.0,4.57,"[Comedy, Thriller, Drama]"
2,1000003,Everything Everywhere All at Once,2022.0,140.0,4.32,"[Science Fiction, Action, Adventure]"
3,1000004,Fight Club,1999.0,139.0,4.27,[Drama]
4,1000005,Interstellar,2014.0,169.0,4.32,"[Science Fiction, Drama, Adventure]"
...,...,...,...,...,...,...
896381,1896389,伝七捕物帖 銀蛇呪文,,98.0,,[nan]
896382,1896390,太阳的子民 Sa'icelen,,38.0,,[nan]
896383,1896391,柴咲コウ CONCERT TOUR 2023 ACTOR'S THE BEST,,,,[Music]
896384,1896392,相知相守民歌45演唱会,,262.0,,[nan]


In [19]:
# Same as above but by using the previously merged dataframe to add the countries
augm_mov_lbxd = pd.merge(merged_df, countries_lbxd, on='id', how='outer')
augm_mov_lbxd = augm_mov_lbxd.groupby(['id'], as_index=False).agg({
    'name': 'first',
    'date': 'first',
    'minute': 'first',
    'rating': 'first',
    'genre': 'first', # can't set it as a list again, else it duplicates the values
    'country': list
})

# This final dataset contains movie names, their genre (if it was found) and their country (if it was found)
# Sadly we have missing values for a lot of the information -> date, minute, rating, genre, country
augm_mov_lbxd = augm_mov_lbxd.rename(columns={'rating': 'mean_rating'})
augm_mov_lbxd

Unnamed: 0,id,name,date,minute,mean_rating,genre,country
0,1000001,Barbie,2023.0,114.0,3.91,"[Comedy, Fantasy, Adventure]","[UK, USA]"
1,1000002,Parasite,2019.0,133.0,4.57,"[Comedy, Thriller, Drama]",[South Korea]
2,1000003,Everything Everywhere All at Once,2022.0,140.0,4.32,"[Science Fiction, Action, Adventure]",[USA]
3,1000004,Fight Club,1999.0,139.0,4.27,[Drama],"[Germany, USA]"
4,1000005,Interstellar,2014.0,169.0,4.32,"[Science Fiction, Drama, Adventure]","[UK, USA]"
...,...,...,...,...,...,...,...
896381,1896389,伝七捕物帖 銀蛇呪文,,98.0,,[nan],[nan]
896382,1896390,太阳的子民 Sa'icelen,,38.0,,[nan],[nan]
896383,1896391,柴咲コウ CONCERT TOUR 2023 ACTOR'S THE BEST,,,,[Music],[Japan]
896384,1896392,相知相守民歌45演唱会,,262.0,,[nan],[nan]


In [None]:
mov_data

### Checking how many movies are inside the lbxd movie database

In [16]:
# Unique movie names from letterboxd ratings dataset
lbxd_movies = clean_lbxd['name'].unique()

In [10]:
start = time.time()
movies_fromdb = augm_mov_lbxd['name'].unique()
movies_in_db = []
movies_not_db = []
lbxd_movies = clean_lbxd['title'].unique()
moviesdb_set = set(movies_fromdb) # Improves performance from 17min to 0.15 seconds
for movie in lbxd_movies:
    if movie in moviesdb_set:
        movies_in_db.append(movie)
    else:
        movies_not_db.append(movie)

print(len(movies_in_db))
print(len(movies_not_db))
end = time.time()
total_time = end - start
print(f"Total time: {total_time/60}")

104680
3595
Total time: 0.004617702960968017


### Merging our ratings dataset with the augmented lbxd movie database according to the movie name
- 3595 movies could not be found on the movie database
- From the 104680 that were found, some can be missing values in ['date', 'minute', 'mean_rating', 'genre', 'country']
- In conclusion we have a lot of missing values, however this can still be useful for visualizing the communities later on and there's not much I can do to compensate for these missing values (as some movies don't have that inforamtion on databases and doing it manually would take too much time)

In [27]:
movies_in_db_df = augm_mov_lbxd[augm_mov_lbxd['name'].isin(lbxd_movies)]

augm_df = pd.merge(clean_lbxd['name'], movies_in_db_df[['name', 'date', 'minute', 'mean_rating', 'genre', 'country']], on='name', how='left')

augm_df = augm_df.groupby(['name'], as_index=False).agg({
    'date': 'first',
    'minute': 'first',
    'mean_rating': 'first',
    'genre': 'first',
    'country': 'first',

})

augm_df = augm_df.replace({None: np.nan})
augm_df

Unnamed: 0,name,date,minute,mean_rating,genre,country
0,!Women Art Revolution,,,,,
1,#005,2020.0,3.0,,"[Animation, Documentary]",[Spain]
2,#1 Cheerleader Camp,2010.0,95.0,2.44,[Comedy],[USA]
3,#1 Fan: A Darkomentary,2005.0,13.0,3.45,[Comedy],[USA]
4,#21xoxo,2019.0,9.0,,[Animation],[Belgium]
...,...,...,...,...,...,...
108270,死んだ僕の彼女,2020.0,4.0,,[nan],[Japan]
108271,火,2014.0,15.0,,[nan],[China]
108272,鬼鎮 (Ghosttown),2020.0,65.0,,[Western],[USA]
108273,소드 아트 온라인 1기,,,,,


In [39]:
# Printing the number of missing values for each column

for column in augm_df.columns:

    miss_vals = sum(augm_df[column].isna())

    print(f"{column}: {miss_vals} missing values")

name: 0 missing values
date: 4178 missing values
minute: 5295 missing values
mean_rating: 48254 missing values
genre: 3595 missing values
country: 3595 missing values


### Saving augmented movies dataframe to csv

In [40]:
# Saving the augmented movies dataframe to a csv file
augm_df.to_csv('./Data/augmented_movies.csv')