This notebook does two things: merges the tropes and the letterboxd data, and then creates a dataframe with tropes as the columns

# Import libs and setup github root

In [320]:
import pandas as pd
import ast
import numpy as np

In [321]:
import os

def find_repo_root(start_path):
    """
    useful general function for finding the (first, closest) repo root so github file paths work the same on different machines 
    """
    current_path = os.path.abspath(start_path)
    
    while True:
        # Check for the existence of the .git directory or other indicators
        if os.path.isdir(os.path.join(current_path, '.git')) or \
           os.path.isfile(os.path.join(current_path, 'README.md')):
            return current_path
        
        parent_path = os.path.dirname(current_path)
        
        # Stop if we reach the root directory
        if parent_path == current_path:
            break
        
        current_path = parent_path

    return None  # Return None if not found

root = find_repo_root(os.getcwd())
print(root)

c:\Users\fitsl\Documents\Programming\UVM Programming Classes\PoCS\pocs_project


# Load, clean, and merge the trope and review data

In [322]:
df = pd.read_csv(f"{root}/Data/letterboxd_search.csv", index_col=0)
df.columns
df['Tropes'].iloc[0]

' ArgentineMedia,BlackMagic,ChekhovsGun,ChekhovsGunman,DramaticThunder,DueToTheDead,EvilMentor,ExactWords,FilmsOf20102014,GrayRainOfDepression,GreatOffscreenWar,HesBack,HistoricalHeroUpgrade,HowWeGotHere,ItAlwaysRainsAtFunerals,OffscreenMomentOfAwesome,RightfulKingReturns,SexyDiscretionShot,TheExile,ThouShaltNotKill,VeryLooselyBasedOnATrueStory,WeWantOurJerkBack,WhatsUpKingDude'

In [323]:
df[df['Movie'] == 'Alien']

Unnamed: 0,Movie,Tropes,Movie_strip,year,imdb_,letterboxd_search
14077,Alien,"AFatherToHisMen,AIIsACrapshoot,AbilityOverApp...",Alien,2024.0,18412256,https://letterboxd.com/search/Alien+2024/


In [324]:
df['nu_tropes'] = df['Tropes'].apply(lambda x: x.split(","))
# for trope in df[df['Movie'] == "Borat"]['nu_tropes']:
#     print(trope)

In [325]:
df.drop(columns=['Tropes', 'year', 'Movie_strip'], inplace=True)
df.rename(columns= {
    'Movie' : 'Title'
}, inplace=True)
df_old = df.copy()

In [326]:
df_comments = pd.read_parquet(f"{root}/Data/Whole_sets/cleaned_n=8_comments.parquet")
df_comments['Year'] = df_comments['Movie'].apply(lambda x: x.get('year'))
df_comments['url'] = df_comments['Movie'].apply(lambda x: x.get('url'))
df_comments.drop(columns=['reviews_extracted_lower', 'reviews_mash', 'Movie'], inplace=True)


In [327]:
df_comments.iloc[0]

letterboxd_search    https://letterboxd.com/search/Puerta+De+Hierro...
comments             [{'stars': '★★★★', 'review': 'Yo no entendí bi...
reviews_extracted    [Yo no entendí bien la película, Perón sabía q...
review_stars         [8.0, 9.0, 4.0, 10.0, 6.0, 5.0, 1.0, 10.0, 8.0...
review_dates         [09 Nov 2020, 17 Jul 2020, 08 Mar 2021, 21 Aug...
genres                                                              []
avg_rating                                               3.30 out of 5
directors                        [Víctor Laplace, Dieguillo Fernández]
Year                                                              2013
url                  https://letterboxd.com/film/puerta-de-hierro-e...
Name: 0, dtype: object

In [328]:
df_comments.head()

Unnamed: 0,letterboxd_search,comments,reviews_extracted,review_stars,review_dates,genres,avg_rating,directors,Year,url
0,https://letterboxd.com/search/Puerta+De+Hierro...,"[{'stars': '★★★★', 'review': 'Yo no entendí bi...","[Yo no entendí bien la película, Perón sabía q...","[8.0, 9.0, 4.0, 10.0, 6.0, 5.0, 1.0, 10.0, 8.0...","[09 Nov 2020, 17 Jul 2020, 08 Mar 2021, 21 Aug...",[],3.30 out of 5,"[Víctor Laplace, Dieguillo Fernández]",2013,https://letterboxd.com/film/puerta-de-hierro-e...
1,https://letterboxd.com/search/Better+Off+Dead....,"[{'stars': '★★★★½', 'review': ""this fucking gu...",[this fucking guy took a shower with socks on ...,"[9.0, 10.0, 8.0, 8.0, 10.0, 10.0, 9.0, 8.0, 7....","[18 Dec 2021, 06 Apr 2016, 29 Feb 2020, 25 Aug...","[Romance, Comedy]",3.54 out of 5,[Savage Steve Holland],1985,https://letterboxd.com/film/better-off-dead/
2,https://letterboxd.com/search/Team+America+Wor...,"[{'stars': '★★★★', 'review': 'Matt Damon knock...","[Matt Damon knocks it out of the park., it's b...","[8.0, 6.0, 8.0, 8.0, 7.0, 9.0, 7.0, 4.0, nan, ...","[, 16 Feb 2016, , 16 Oct 2019, 03 Sep 2016, 02...","[Action, Adventure, Comedy]",3.47 out of 5,[Trey Parker],2004,https://letterboxd.com/film/team-america-world...
3,https://letterboxd.com/search/The+Hills+Have+E...,"[{'stars': '½', 'review': ""I'm so tired of rap...",[I'm so tired of rape and sexual assault again...,"[1.0, 4.0, 7.0, 5.0, 8.0, 7.0, 3.0, 6.0, 9.0, ...","[24 Oct 2018, 20 Oct 2017, 18 Oct 2022, 02 Feb...","[Horror, Thriller]",3.01 out of 5,[Alexandre Aja],2006,https://letterboxd.com/film/the-hills-have-eye...
4,https://letterboxd.com/search/The+Face+Of+Fu+M...,"[{'stars': '★★★', 'review': 'It was the West G...",[It was the West German involvement in this pr...,"[6.0, 8.0, 5.0, 5.0, 6.0, 8.0, 1.0, 5.0, 6.0, ...","[28 Jan 2024, 08 Jan 2022, 20 Jan 2024, 28 Mar...","[Thriller, Crime]",2.94 out of 5,[Don Sharp],1965,https://letterboxd.com/film/the-face-of-fu-man...


In [329]:
df_merged = pd.merge(
    left=df_comments,
    right=df,
    on='letterboxd_search',
    how='left'
)

In [330]:
df_merged.columns

Index(['letterboxd_search', 'comments', 'reviews_extracted', 'review_stars',
       'review_dates', 'genres', 'avg_rating', 'directors', 'Year', 'url',
       'Title', 'imdb_', 'nu_tropes'],
      dtype='object')

# Pivot the dataframe to make it trope-centered 

## Setup and first trial

In [331]:
tropes = df_merged['nu_tropes'].explode()
tropes = tropes.unique().tolist()
df_t = pd.DataFrame({'trope' : tropes})
df_t

Unnamed: 0,trope
0,ArgentineMedia
1,BlackMagic
2,ChekhovsGun
3,ChekhovsGunman
4,DramaticThunder
...,...
37238,MultiTaskedConversation
37239,NelsonMandela
37240,TheRunningMan
37241,BullyHunter


In [332]:
is_float = df_merged['nu_tropes'].apply(lambda x: isinstance(x, float))

# Display rows where 'Rating' is a float
weird_cols = df_merged[is_float]
weird_cols['nu_tropes'].isna()

282    True
283    True
284    True
285    True
286    True
287    True
288    True
289    True
290    True
291    True
Name: nu_tropes, dtype: bool

In [333]:
def trope_grabber(trope, df, return_cols=['letterboxd_search', 'comments', 'reviews_extracted', 'review_stars',
       'review_dates', 'genres', 'avg_rating', 'directors', 'Year', 'url',
       'Title', 'imdb_']):
    

    def is_trope_in(x, trope):
        if isinstance(x, np.ndarray):
            x = x.tolist()  # Convert numpy array to list
        return trope in x if isinstance(x, list) else False

    try:
        # Boolean mask for matching tropes
        mask = df['nu_tropes'].apply(lambda x: pd.isna(x) if pd.isna(trope) else is_trope_in(x, trope))
        # Boolean mask for matching tropes
        mask = mask.astype(bool)  # Convert to boolean type if needed

        movies = df[mask]

        return movies[return_cols].values.tolist()
    except Exception as e:
        print(trope, e)

trope_grabber('Not', df_merged )

[['https://letterboxd.com/search/Waynes+World+1992/',
  '[{\'stars\': \'★★★★\', \'review\': "Garth lowkey builds a robot in this movie and we don\'t talk about that enough", \'date\': \'18 Jul 2020\'}, {\'stars\': \'★★★★\', \'review\': "If the people who don\'t like this movie were an ice cream flavour, they\'d be pralines and dick.", \'date\': \'\'}, {\'stars\': \'★★★½\', \'review\': \'Wayne and \\xa0 \\xa0 \\xa0 \\xa0 \\xa0 \\xa0 \\xa0 \\xa0 \\xa0 \\xa0 \\xa0 \\xa0 \\xa0 \\xa0 \\xa0 \\xa0 \\xa0 \\xa0 \\xa0 \\xa0 \\xa0 \\xa0 \\xa0 \\xa0 \\xa0 Bill andGarth \\xa0 \\xa0 \\xa0 \\xa0 \\xa0 \\xa0 \\xa0 \\xa0 \\xa0 \\xa0 \\xa0 \\xa0 \\xa0 \\xa0 \\xa0 \\xa0 \\xa0 \\xa0 \\xa0 \\xa0 \\xa0 \\xa0 \\xa0 \\xa0 \\xa0 \\xa0 \\xa0 \\xa0 \\xa0 TedExcellent🤝\', \'date\': \'09 May 2019\'}, {\'stars\': \'★★★★★\', \'review\': "Why is it that if a man kills another man in battle, it\'s called heroic, yet if he kills a man in the heat of passion, it\'s called murder?", \'date\': \'20 Jan 2022\'}, {\'stars\'

In [334]:
df_merged.columns

Index(['letterboxd_search', 'comments', 'reviews_extracted', 'review_stars',
       'review_dates', 'genres', 'avg_rating', 'directors', 'Year', 'url',
       'Title', 'imdb_', 'nu_tropes'],
      dtype='object')

In [335]:
return_cols=['letterboxd_search', 'comments', 'reviews_extracted', 'review_stars',
       'review_dates', 'genres', 'avg_rating', 'directors', 'Year', 'url',
       'Title', 'imdb_',]

In [336]:
df_t1 = df_t.iloc[-1000:-1].copy()
df_t1['matched_movies']= df_t1['trope'].apply(lambda trope: trope_grabber(trope, df_merged))

for i, col in enumerate(return_cols):
    df_t1[col] = df_t1['matched_movies'].apply(
        lambda x: x[i] if isinstance(x, list) and len(x) > i else None
    )
df_t1.shape

(999, 14)

In [337]:
df_t.shape

(37243, 1)

## Tropify the whole thing

In [346]:
dft_1 = df_t.copy()
dft_1['matched_movies']= dft_1['trope'].apply(lambda trope: trope_grabber(trope, df_merged))

print("movies matched")


nan setting an array element with a sequence.
movies matched


In [349]:
dft_1.shape

(37243, 2)

In [350]:
df_t2 = dft_1.copy()
for i, col in enumerate(return_cols):
    df_t2[col] = df_t2['matched_movies'].apply(lambda x: [lst[i] for lst in x] if x is not None else [])
df_t2.drop(columns=['matched_movies'], inplace=True)
df_t2.shape

(37243, 13)

In [351]:
df_t2

Unnamed: 0,trope,letterboxd_search,comments,reviews_extracted,review_stars,review_dates,genres,avg_rating,directors,Year,url,Title,imdb_
0,ArgentineMedia,[https://letterboxd.com/search/Puerta+De+Hierr...,"[[{'stars': '★★★★', 'review': 'Yo no entendí b...","[[Yo no entendí bien la película, Perón sabía ...","[[8.0, 9.0, 4.0, 10.0, 6.0, 5.0, 1.0, 10.0, 8....","[[09 Nov 2020, 17 Jul 2020, 08 Mar 2021, 21 Au...","[[], [Comedy], [History, Drama], [Thriller, Cr...","[3.30 out of 5, 2.34 out of 5, 3.95 out of 5, ...","[[Víctor Laplace, Dieguillo Fernández], [Marco...","[2013, 2020, 1985, 2000]",[https://letterboxd.com/film/puerta-de-hierro-...,"[Puerta De Hierro, Corazon Loco, The Offic...","[2512204.0, 11827806.0, 89276.0, 247586.0]"
1,BlackMagic,[https://letterboxd.com/search/Puerta+De+Hierr...,"[[{'stars': '★★★★', 'review': 'Yo no entendí b...","[[Yo no entendí bien la película, Perón sabía ...","[[8.0, 9.0, 4.0, 10.0, 6.0, 5.0, 1.0, 10.0, 8....","[[09 Nov 2020, 17 Jul 2020, 08 Mar 2021, 21 Au...","[[], [Drama, Mystery, Horror, Thriller], [Come...","[3.30 out of 5, 3.09 out of 5, 3.52 out of 5, ...","[[Víctor Laplace, Dieguillo Fernández], [Iain ...","[2013, 2005, 1998, 1973, 1982, 2009, 1990, 200...",[https://letterboxd.com/film/puerta-de-hierro-...,"[Puerta De Hierro, The Skeleton Key, Pract...","[2512204.0, 397101.0, 120791.0, 68288.0, 84749..."
2,ChekhovsGun,[https://letterboxd.com/search/Puerta+De+Hierr...,"[[{'stars': '★★★★', 'review': 'Yo no entendí b...","[[Yo no entendí bien la película, Perón sabía ...","[[8.0, 9.0, 4.0, 10.0, 6.0, 5.0, 1.0, 10.0, 8....","[[09 Nov 2020, 17 Jul 2020, 08 Mar 2021, 21 Au...","[[], [Action, Adventure, Comedy], [Horror, Thr...","[3.30 out of 5, 3.47 out of 5, 3.01 out of 5, ...","[[Víctor Laplace, Dieguillo Fernández], [Trey ...","[2013, 2004, 2006, 2006, 2013, 2001, 2006, 200...",[https://letterboxd.com/film/puerta-de-hierro-...,"[Puerta De Hierro, Team America World Pol...","[2512204.0, 372588.0, 454841.0, 454841.0, 1549..."
3,ChekhovsGunman,[https://letterboxd.com/search/Puerta+De+Hierr...,"[[{'stars': '★★★★', 'review': 'Yo no entendí b...","[[Yo no entendí bien la película, Perón sabía ...","[[8.0, 9.0, 4.0, 10.0, 6.0, 5.0, 1.0, 10.0, 8....","[[09 Nov 2020, 17 Jul 2020, 08 Mar 2021, 21 Au...","[[], [Drama], [Thriller, Drama, Action], [Scie...","[3.30 out of 5, 3.59 out of 5, 3.17 out of 5, ...","[[Víctor Laplace, Dieguillo Fernández], [Kōzab...","[2013, 1951, 2007, 2016, 2019, 2010, 2014, 201...",[https://letterboxd.com/film/puerta-de-hierro-...,"[Puerta De Hierro, Clothes Of Deception, S...","[2512204.0, 43681.0, 822854.0, 2543164.0, 4364..."
4,DramaticThunder,[https://letterboxd.com/search/Puerta+De+Hierr...,"[[{'stars': '★★★★', 'review': 'Yo no entendí b...","[[Yo no entendí bien la película, Perón sabía ...","[[8.0, 9.0, 4.0, 10.0, 6.0, 5.0, 1.0, 10.0, 8....","[[09 Nov 2020, 17 Jul 2020, 08 Mar 2021, 21 Au...","[[], [Action, Horror, Thriller, Science Fictio...","[3.30 out of 5, 2.92 out of 5, 3.68 out of 5, ...","[[Víctor Laplace, Dieguillo Fernández], [Steph...","[2013, 1994, 1992, 1999, 1982, 1961, 2005, 201...",[https://letterboxd.com/film/puerta-de-hierro-...,"[Puerta De Hierro, Death Machine, Death Be...","[2512204.0, 109575.0, 104070.0, 139809.0, 8365..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...
37238,MultiTaskedConversation,[https://letterboxd.com/search/Matewan+1987/],"[[{'stars': '★★★★★', 'review': '""You work, the...","[[""You work, they don't. That's all you get to...","[[10.0, 10.0, nan, 10.0, 9.0, 9.0, 9.0, 10.0, ...","[[20 Jun 2020, 07 Sep 2024, 02 Dec 2014, 27 Oc...","[[History, Drama]]",[4.09 out of 5],[[John Sayles]],[1987],[https://letterboxd.com/film/matewan/],[Matewan],[93509.0]
37239,NelsonMandela,[https://letterboxd.com/search/The+Running+Man...,[[]],[[]],[[]],[[]],"[[Science Fiction, Thriller]]",[None found],[[Edgar Wright]],[2025],[https://letterboxd.com/film/the-running-man-2...,[The Running Man],[14107334.0]
37240,TheRunningMan,[https://letterboxd.com/search/The+Running+Man...,[[]],[[]],[[]],[[]],"[[Science Fiction, Thriller]]",[None found],[[Edgar Wright]],[2025],[https://letterboxd.com/film/the-running-man-2...,[The Running Man],[14107334.0]
37241,BullyHunter,[https://letterboxd.com/search/Nancy+Drew+And+...,"[[{'stars': '★', 'review': 'nancy drew used ap...",[[nancy drew used apple earpods with an androi...,"[[2.0, 7.0, nan, nan, 4.0, nan, 6.0, nan, 4.0,...","[[18 Apr 2019, 09 May 2023, 14 Feb 2019, 14 Se...","[[Family, Mystery, Crime]]",[2.55 out of 5],[[Katt Shea]],[2019],[https://letterboxd.com/film/nancy-drew-and-th...,[Nancy Drew And The Hidden Staircase],[8323104.0]


It's too big to save the whole thing, sadly. But we can save smaller pieces. (in fact, probably everything except for the comments)

In [None]:
df_t3 = df_t2[['trope', 'Title', 'letterboxd_search']].copy()
df_t3
df_t3['length'] =df_t3['Title'].apply(lambda x: len(x))
df_t3.sort_values(by='length', ascending=False)
df_t3.to_csv(f"{root}/Data/Trope_storage/movies_by_trope_with_lettersearch_and_length.csv")

# Get the genre count for each trope

In [370]:
genres = df_merged['genres'].explode().unique().tolist()
genres = genres[1:]
genres

['Romance',
 'Comedy',
 'Action',
 'Adventure',
 'Horror',
 'Thriller',
 'Crime',
 'Drama',
 'History',
 'Mystery',
 'War',
 'Science Fiction',
 'TV Movie',
 'Fantasy',
 'Music',
 'Family',
 'Documentary',
 'Western',
 'Animation']

In [379]:
df_t2['genres'].iloc[0]
def genre_counter(genre, lst_of_lists):
    try:
        flattened_genres = np.concatenate(lst_of_lists)
        return np.sum(flattened_genres == genre)
    except:
        return 0

df_t2['genres'].iloc[:5].apply(lambda x: genre_counter('Drama', x))


0      2
1      4
2    625
3    198
4     33
Name: genres, dtype: int32

In [380]:
df_genre = df_t2[['trope', 'genres']].copy()
for genre in genres:
    df_genre[genre] = df_genre['genres'].apply(lambda x : genre_counter(genre, x))


df_genre

Unnamed: 0,trope,genres,Romance,Comedy,Action,Adventure,Horror,Thriller,Crime,Drama,...,Mystery,War,Science Fiction,TV Movie,Fantasy,Music,Family,Documentary,Western,Animation
0,ArgentineMedia,"[[], [Comedy], [History, Drama], [Thriller, Cr...",0,1,0,0,0,1,1,2,...,0,0,0,0,0,0,0,0,0,0
1,BlackMagic,"[[], [Drama, Mystery, Horror, Thriller], [Come...",2,6,3,4,19,5,0,4,...,4,0,1,0,12,1,3,0,0,1
2,ChekhovsGun,"[[], [Action, Adventure, Comedy], [Horror, Thr...",195,601,575,410,346,543,291,625,...,200,49,315,27,203,45,161,22,42,24
3,ChekhovsGunman,"[[], [Drama], [Thriller, Drama, Action], [Scie...",67,184,151,97,72,150,114,198,...,67,18,69,9,42,10,36,10,13,8
4,DramaticThunder,"[[], [Action, Horror, Thriller, Science Fictio...",11,36,21,20,32,20,10,33,...,15,2,17,1,27,3,13,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37238,MultiTaskedConversation,"[[History, Drama]]",0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
37239,NelsonMandela,"[[Science Fiction, Thriller]]",0,0,0,0,0,1,0,0,...,0,0,1,0,0,0,0,0,0,0
37240,TheRunningMan,"[[Science Fiction, Thriller]]",0,0,0,0,0,1,0,0,...,0,0,1,0,0,0,0,0,0,0
37241,BullyHunter,"[[Family, Mystery, Crime]]",0,0,0,0,0,0,1,0,...,1,0,0,0,0,0,1,0,0,0


In [382]:
df_genre.drop(columns='genres').to_csv(f"{root}/Data/liteweight/genre_counts_by_trope.csv")

In [None]:
df.sort_values(by='length', ascending=False).head(50)


Unnamed: 0,trope,Movie_Titles,letterboxd_searches,length
19175,,"[Puerta De Hierro, Better Off Dead..., A ...",['https://letterboxd.com/search/Puerta+De+Hier...,14533
33945,Film,"[Puerta De Hierro, Better Off Dead..., A ...",['https://letterboxd.com/search/Puerta+De+Hier...,14306
20608,O,"[Puerta De Hierro, Better Off Dead..., A ...",['https://letterboxd.com/search/Puerta+De+Hier...,14296
30457,V,"[Puerta De Hierro, Better Off Dead..., A ...",['https://letterboxd.com/search/Puerta+De+Hier...,9274
35354,Go,"[Puerta De Hierro, Better Off Dead..., Tea...",['https://letterboxd.com/search/Puerta+De+Hier...,8213
4186,It,"[Puerta De Hierro, Better Off Dead..., A ...",['https://letterboxd.com/search/Puerta+De+Hier...,6341
32643,Not,"[Puerta De Hierro, Better Off Dead..., Tea...",['https://letterboxd.com/search/Puerta+De+Hier...,5732
32849,Hero,"[Puerta De Hierro, Team America World Pol...",['https://letterboxd.com/search/Puerta+De+Hier...,5291
37985,Mon,"[Better Off Dead..., Team America World P...",['https://letterboxd.com/search/Better+Off+Dea...,4972
34838,Death,"[Team America World Police, The Hills Hav...",['https://letterboxd.com/search/Team+America+W...,4924
