In [1]:
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
%matplotlib inline

In [3]:
# Utility function
import ast

# The genre and country columns hold lists, but the CSV stores them as strings, so they must be parsed back into lists
def convert_to_list(s):
    """Parse a stringified list (e.g. "['Drama', 'War']") back into a Python object.

    Parameters
    ----------
    s : object
        A single cell value.

    Returns
    -------
    object
        * ``np.nan`` if ``s`` is a non-list-like missing value (``None``, ``nan``, ...).
        * The result of ``ast.literal_eval(s)`` if ``s`` is a string that starts
          with ``[`` and ends with ``]`` and is a valid Python literal.
        * ``s`` unchanged in every other case, including strings that look like
          lists but cannot be parsed (such as ``"[nan]"``) and values that are
          already lists.

    Notes
    -----
    Values that are already list-like are returned untouched, so applying this
    function a second time to an already-converted column is a no-op instead of
    raising ``ValueError``.
    """
    # Strings are the only values that may need parsing
    if isinstance(s, str):
        looks_like_list = s.startswith('[') and s.endswith(']')
        if not looks_like_list:
            return s
        try:
            return ast.literal_eval(s)
        # This catches cases where the list contains a nan -> [nan]
        except (ValueError, SyntaxError):
            return s

    # pd.isna returns an element-wise array for list-likes, whose truth value is
    # ambiguous, so only non-list-like values are tested for missingness
    if not pd.api.types.is_list_like(s) and pd.isna(s):
        return np.nan

    # Anything else (numbers, already-parsed lists, ...) is passed through
    return s

In [4]:
# Read the augmented Letterboxd ratings table from disk
data_filepath = './Data/augm_lboxd.csv'
letterboxd = pd.read_csv(data_filepath)

# genre and country are written to the CSV as strings; parse each of them back into lists
for list_column in ('genre', 'country'):
    letterboxd[list_column] = letterboxd[list_column].apply(convert_to_list)

In [5]:
# Display the loaded ratings table
letterboxd

Unnamed: 0,user,title,rating,movie_id,date,minute,mean_rating,genre,country
0,144,Puss in Boots: The Last Wish,4.5,66829,2022.0,103.0,4.16,"[Comedy, Action, Adventure, Family, Animation,...",[USA]
1,144,The Guardians of the Galaxy Holiday Special,4.0,87522,2022.0,45.0,3.25,"[Comedy, Adventure, Science Fiction]",[USA]
2,144,Dinosaur Hotel 2,2.0,25219,2022.0,80.0,,[Horror],[UK]
3,144,Strange World,2.5,77851,2022.0,102.0,2.84,"[Science Fiction, Animation, Adventure, Fantas...",[USA]
4,144,Zen - Grogu and Dust Bunnies,3.0,108598,2022.0,3.0,3.11,"[Fantasy, Animation, Science Fiction]","[Japan, USA]"
...,...,...,...,...,...,...,...,...,...
1433485,290,Newark Athlete,0.0,58824,1891.0,1.0,2.62,[Documentary],[USA]
1433486,290,Roundhay Garden Scene,0.0,70149,1888.0,1.0,3.27,[Documentary],[UK]
1433487,290,Sallie Gardner at a Gallop,0.0,70959,1878.0,1.0,3.69,"[Documentary, Animation]",[USA]
1433488,290,This Land Is Mine,0.0,98758,1943.0,103.0,3.73,"[Drama, War]",[USA]


### Sampling by keeping only movies with more than a given number of ratings

In [6]:
# Build one filtered copy of the ratings table per rating-count threshold:
# sampled_dataframes[t] keeps only the rows whose title has MORE than t ratings.
# NOTE(review): counts are keyed by 'title', so different films that share a title
# would be pooled together - confirm whether 'movie_id' should be used instead.
sampled_dataframes = {}
movie_counts = letterboxd['title'].value_counts()
movie_range = range(0, 551, 10)  # thresholds 0, 10, ..., 550

# Look up each row's title count once, instead of rebuilding an isin() mask
# against the list of surviving titles on every iteration
ratings_per_row = letterboxd['title'].map(movie_counts)

for i in movie_range:
    # Rows with a missing title map to NaN, and NaN > i is False, so they are dropped
    sampled_dataframes[i] = letterboxd[ratings_per_row > i]

In [15]:
# List the rating-count thresholds for which a filtered frame was stored
sampled_dataframes.keys()

dict_keys([0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200, 210, 220, 230, 240, 250, 260, 270, 280, 290, 300, 310, 320, 330, 340, 350, 360, 370, 380, 390, 400, 410, 420, 430, 440, 450, 460, 470, 480, 490, 500, 510, 520, 530, 540, 550])

### Sampling by taking all users but only a random subset of movies (of varying size)

In [12]:
# Build one filtered copy of the ratings table per sample size:
# sampled_random_dataframes[n] keeps only the rows whose title is among n randomly drawn titles
all_movies = letterboxd['title'].unique()
movie_range = range(500, 10000, 500)  # change the numbers here to change the range of samples

# Wrap the unique titles in a Series once so that .sample() can draw from them
title_pool = pd.Series(all_movies)

sampled_random_dataframes = {}
for movie_number in movie_range:
    # A fixed random_state makes every draw reproducible
    drawn_titles = title_pool.sample(movie_number, random_state=42)
    in_draw = letterboxd['title'].isin(drawn_titles)
    sampled_random_dataframes[movie_number] = letterboxd[in_draw]

In [14]:
# List the sample sizes (number of randomly drawn titles) for which a filtered frame was stored
sampled_random_dataframes.keys()

dict_keys([500, 1000, 1500, 2000, 2500, 3000, 3500, 4000, 4500, 5000, 5500, 6000, 6500, 7000, 7500, 8000, 8500, 9000, 9500])