In [258]:
import pandas as pd
import re
import os 
import ast

In [259]:


def find_repo_root(start_path):
    """
    useful general function for finding the (first, closest) repo root so github file paths work the same on different machines 
    """
    current_path = os.path.abspath(start_path)
    
    while True:
        # Check for the existence of the .git directory or other indicators
        if os.path.isdir(os.path.join(current_path, '.git')) or \
           os.path.isfile(os.path.join(current_path, 'README.md')):
            return current_path
        
        parent_path = os.path.dirname(current_path)
        
        # Stop if we reach the root directory
        if parent_path == current_path:
            break
        
        current_path = parent_path

    return None  # Return None if not found

root = find_repo_root(os.getcwd())


def fk_apply_literal(x):
    try:
        return ast.literal_eval(x)
    except Exception as e: 
        print(e, x)
        return None




In [260]:
df = pd.read_csv(f"{root}/Data/movie_n=8_comments.csv")
df = df.dropna(subset=['Movie', 'comments'])
df['Movie'][0]

"{'title': 'puerta-de-hierro-el-exilio-de-peron', 'url': 'https://letterboxd.com/film/puerta-de-hierro-el-exilio-de-peron/', 'directors': ['Víctor Laplace', 'Dieguillo Fernández'], 'rating': '3.30 out of 5', 'year': '2013', 'genres': []}"

In [261]:
def extract_reviews(reviews_str):
    try:
        # Convert the string representation of the list to an actual list
        reviews_list = ast.literal_eval(reviews_str)
        # Extract 'review' from each dictionary
        return [review['review'] for review in reviews_list]
    except (ValueError, SyntaxError):
        return []
df['reviews_extracted'] = df['comments'].apply(extract_reviews)
df['reviews_mash'] = df['reviews_extracted'].apply(lambda x: '\n'.join(x))
print(df['reviews_mash'][2])
print(df['Movie'][2])

Matt Damon knocks it out of the park.
it's better than anomalisa
Chris warned us about Cats but we didn't listen.
I choose to view this as a documentary.
Matt damon
The puking scene is comedy gold.
don't think this is quite as funny as i once did (and the "eww, gay stuff" vibe is even less welcome now), but i'd definitely forgotten about its visual virtuosity and how well it absorbed the motifs of American action movies. the best gags are simple nuts-and-bolts jokes about the limitations of the marionettes themselves, like when an elaborate fight is just a bunch of wriggling dolls, or when they have to make one look like a staggering drunk. well, that and the sex scene.
THE POLITICAL SATIRE: On the one hand, you've got the destructive excesses of American foreign policy. On the other, you've got some Hollywood libs who are a bit annoying. I think I know what Matt and Trey hate more.THE ACTION-MOVIE SATIRE: Replicates the cliches of bad Hollywood action movies so faithfully that it's...

In [262]:
df['Movie'] = df['Movie'].apply(lambda x : fk_apply_literal(x))
df['genres'] = df['Movie'].apply(lambda x: x.get('genres'))
df['avg_rating'] = df['Movie'].apply(lambda x: x.get('rating'))
df['directors'] = df['Movie'].apply(lambda x: x.get('director'))


In [263]:
df.iloc[3]

letterboxd_search    https://letterboxd.com/search/The+Hills+Have+E...
Movie                {'title': 'the-hills-have-eyes-2006', 'url': '...
comments             [{'stars': '½', 'review': "I'm so tired of rap...
reviews_extracted    [I'm so tired of rape and sexual assault again...
reviews_mash         I'm so tired of rape and sexual assault agains...
genres                                              [Horror, Thriller]
avg_rating                                               3.01 out of 5
directors                                                         None
Name: 3, dtype: object

In [264]:
df['genres'].value_counts().head(15)

genres
[Drama]               81
[Comedy]              70
[Horror]              36
[Documentary]         26
[Drama, Comedy]       22
[Comedy, Romance]     22
[Romance, Drama]      21
[Romance, Comedy]     21
[]                    17
[Action]              17
[Drama, Romance]      16
[Crime, Drama]        11
[Comedy, Drama]       11
[Horror, Thriller]    11
[Thriller, Horror]    11
Name: count, dtype: int64

In [265]:
exploded_genres = df['genres'].explode()
genre_types = exploded_genres.unique().tolist()
print(genre_types)



[nan, 'Romance', 'Comedy', 'Action', 'Adventure', 'Horror', 'Thriller', 'Crime', 'Drama', 'History', 'Mystery', 'War', 'Science Fiction', 'TV Movie', 'Fantasy', 'Music', 'Family', 'Documentary', 'Western', 'Animation']


In [266]:
df_drama =  df[df['genres'].apply(lambda lst: "Drama" in lst)].copy()
df_drama['genres'].value_counts()


genres
[Drama]                                     81
[Drama, Comedy]                             22
[Romance, Drama]                            21
[Drama, Romance]                            16
[Crime, Drama]                              11
                                            ..
[Fantasy, Drama, Adventure]                  1
[Science Fiction, Drama, Romance]            1
[Comedy, Drama, Fantasy]                     1
[Horror, Drama, Science Fiction, Action]     1
[Family, Drama]                              1
Name: count, Length: 222, dtype: int64

In [267]:
print(len(df_drama['reviews_mash'].iloc[1]))
drama_string = df['reviews_mash'].str.cat(sep=' ')
drama_string = re.sub(r"\\.\s*", '', drama_string)
print(len(drama_string))


30106
30249444


In [268]:
def text_to_freqdf(text=None, text_file=None, lower_b=True):
    """
    Strip all the words out of a file and put them into a frequency dataframe.
    By default casts everything to lowercase. Gets it ready for allotaxonometer.

    :param text str: a string with the text.
    :param text_file str:  The path to the text_file. 
    :param lower_b Bool: A boolean indicating whether to preserve case.
                             Enter False if you want to preserve capital versus not. 
    :returns pd.DataFrame: A DataFrame with word frequencies.
    """

    if text:
        if lower_b:
            text = text.lower()
    else:
        if text_file:
            with open(text_file, 'r') as f:
                text = f.read()
            if lower_b:
                text = text.lower()
        else:
            raise("Input either a string of text or a textfile.")
    
    words = re.findall(r'\b\w+\b', text)
    df = pd.DataFrame(words, columns=['types'])
    df = df.value_counts()
    df = df.reset_index(name='counts')
    df['probs'] = df['counts'] / df['counts'].sum()
    df['total_unique'] = len(df)
    return df


In [269]:
drama_freq = text_to_freqdf(text=drama_string)
drama_freq.to_csv(f'{root}/Data/Subset Data/genre_freq/drama_word_freq.csv', index=False)



In [270]:
def search_list(lst, genre):
    if genre != 'nan':
        return genre in lst
    else:
        if pd.isna(lst) or lst == '[]':
            return True
        else:
            return False

In [271]:
## get genre frequency csvs for all the movies we have so far

genre_types = genre_types[1:]
for genre in genre_types:
    print(f"starting {genre}")
    gdf  =  df[df['genres'].apply(lambda lst: genre in lst)].copy()
    g_string = gdf['reviews_mash'].str.cat(sep=' ')
    g_string = re.sub(r"\\.\s*", '', g_string)
    g_freq = text_to_freqdf(text=g_string)
    g_freq.to_csv(f'{root}/Data/Subset Data/genre_freq/{genre}_word_freq.csv', index=False)
    g_freq_short = g_freq.iloc[:10000].copy()
    g_freq_short.to_csv(f'{root}/Data/Subset Data/genre_freq_short/{genre}_word_freq_short.csv', index=False)


starting Romance
starting Comedy
starting Action
starting Adventure
starting Horror
starting Thriller
starting Crime
starting Drama
starting History
starting Mystery
starting War
starting Science Fiction
starting TV Movie
starting Fantasy
starting Music
starting Family
starting Documentary
starting Western
starting Animation
