In [150]:
import pandas as pd
import re
import os 
import ast
import spacy

nlp = spacy.load("en_core_web_sm")

In [151]:


def find_repo_root(start_path):
    """
    useful general function for finding the (first, closest) repo root so github file paths work the same on different machines 
    """
    current_path = os.path.abspath(start_path)
    
    while True:
        # Check for the existence of the .git directory or other indicators
        if os.path.isdir(os.path.join(current_path, '.git')) or \
           os.path.isfile(os.path.join(current_path, 'README.md')):
            return current_path
        
        parent_path = os.path.dirname(current_path)
        
        # Stop if we reach the root directory
        if parent_path == current_path:
            break
        
        current_path = parent_path

    return None  # Return None if not found

root = find_repo_root(os.getcwd())

root = root.replace('\\', '/')

def fk_apply_literal(x):
    try:
        return ast.literal_eval(x)
    except Exception as e: 
        print(e, x)
        return None




In [152]:
df = pd.read_csv(f"{root}/Data/movie_n=8_comments.csv")
df = df.dropna(subset=['Movie', 'comments'])
df.drop_duplicates(keep='first')
df['Movie'][0]

"{'title': 'puerta-de-hierro-el-exilio-de-peron', 'url': 'https://letterboxd.com/film/puerta-de-hierro-el-exilio-de-peron/', 'directors': ['Víctor Laplace', 'Dieguillo Fernández'], 'rating': '3.30 out of 5', 'year': '2013', 'genres': []}"

In [153]:
# import spacy
# import pandas as pd
# import time

# # Load SpaCy model
# nlp = spacy.load("en_core_web_sm")

# # Sample large DataFrame with a 'reviews' column
# data = {
#     'reviews': ["This product is great! I love it." * 10] * 10000  # 10000 rows, repeating review
# }
# df = pd.DataFrame(data)

# # Define the tokenization function
# def tokenize_review(review):
#     doc = nlp(review)
#     return [token.text for token in doc if not token.is_punct and not token.is_space]

# # Measure time for tokenizing the entire DataFrame column
# start_time = time.time()
# df['tokenized_reviews'] = df['reviews'].apply(tokenize_review)
# end_time = time.time()

# # Print elapsed time
# print(f"Tokenization took {end_time - start_time:.6f} seconds.")



def tokenize_reviews(reviews):
    texts, lemmas = [],[]
    for review in reviews:
        doc = nlp(review)
        text = [token.text for token in doc if not token.is_space]
        lemma = [token.lemma_ for token in doc if not token.is_space]
        text = [''.join(text[i:i+2]) if text[i] == "I" and text[i+1] == "'m" else text[i] for i in range(len(text)-1)]
        lemmas = [''.join(lemma[i:i+2]) if lemma[i] == "I" and lemma[i+1] == "'m" else lemma[i] for i in range(len(lemma)-1)]
        texts.append(text)
        lemmas.append(lemma)
    return (texts, lemmas)



In [154]:
def prune_spoilers(reviews):
    return [review.replace("This review may contain spoilers.I can handle the truth.", "") for review in reviews]

def extract_reviews(reviews_str):
    try:
        # Convert the string representation of the list to an actual list
        reviews_list = ast.literal_eval(reviews_str)
        # Extract 'review' from each dictionary
        return [review['review'] for review in reviews_list]
    except (ValueError, SyntaxError):
        return []
    
def convert_to_1_10(rating_str):
    # Count full stars and check for half star
    full_stars = 0 + rating_str.count('★')
    half_star = 0.5 if '½' in rating_str else 0

    # Combine the full stars and half star, then scale to 1-10
    score = full_stars + half_star
    score_1_10 = score * 2

    return int(score_1_10)

def extract_rating(reviews_str):
   try:
        # Convert the string representation of the list to an actual list
        reviews_list = ast.literal_eval(reviews_str)
        # Extract 'stars' from each dictionary
        stars =  [review['stars'] for review in reviews_list]
        return [convert_to_1_10(star) if star else pd.NA for star in stars]
   except:
       return []


def extract_date(reviews_str):
    reviews_list = ast.literal_eval(reviews_str)
    return [review['date'] for review in reviews_list]


df['reviews_extracted'] = df['comments'].apply(extract_reviews)
df['reviews_extracted'] = df['reviews_extracted'].apply(lambda review: prune_spoilers(review))
df['review_stars'] = df['comments'].apply(extract_rating)
df['review_dates'] = df['comments'].apply(extract_date)
df['reviews_extracted_lower'] = df['reviews_extracted'].apply(lambda review: prune_spoilers(review))
df['reviews_mash'] = df['reviews_extracted'].apply(lambda x: '\n'.join(x))


print(len(df['reviews_mash'][2]))
print(df['Movie'][2])

23846
{'title': 'team-america-world-police', 'url': 'https://letterboxd.com/film/team-america-world-police/', 'directors': ['Trey Parker'], 'rating': '3.47 out of 5', 'year': '2004', 'genres': ['Action', 'Adventure', 'Comedy']}


In [155]:
df['comments'].iloc[0]

'[{\'stars\': \'★★★★\', \'review\': \'Yo no entendí bien la película, Perón sabía que López Rega le tendía una trampa?\', \'date\': \'09 Nov 2020\'}, {\'stars\': \'★★★★½\', \'review\': \'Me pareció excelente. Impecable en términos históricos y con una forma de contar la historia bastante original y que funciona muy bien.Más que un Perón épico exiliado, está el Perón humano, que sufre la vil proscripción y sigue llorando por Eva. Increíblemente amena, maneja 18 años de historia muy bien explicados, sumado a la ambientación que está bien y la actuación de Víctor Laplace que está igualito al General.Tal vez pesa mucho que se haya hecho en pleno…\', \'date\': \'17 Jul 2020\'}, {\'stars\': \'★★\', \'review\': \'This review may contain spoilers.I can handle the truth.Esta película cuando la vi por primera vez me gustó mucho, pero volviéndola a ver siento que tiene muchísimos problemas. Es una película que no se calla nunca, todas las escenas tienen diálogos, es "Perón pensando en voz alta" y

In [156]:
df.iloc[10]

letterboxd_search          https://letterboxd.com/search/The+Dawns+Here+A...
Movie                      {'title': 'the-dawns-here-are-quiet', 'url': '...
comments                   [{'stars': '★★★★½', 'review': 'Absolutely fasc...
reviews_extracted          [Absolutely fascinating movie about female sol...
review_stars               [9, 9, 8, 8, 3, 7, 10, 10, 8, 4, 6, 8, 8, <NA>...
review_dates               [12 Jun 2019, 09 May 2020, 01 May 2022, 19 Jul...
reviews_extracted_lower    [Absolutely fascinating movie about female sol...
reviews_mash               Absolutely fascinating movie about female sold...
Name: 10, dtype: object

In [157]:
df['Movie'] = df['Movie'].apply(lambda x : fk_apply_literal(x))
df['genres'] = df['Movie'].apply(lambda x: x.get('genres'))
df['avg_rating'] = df['Movie'].apply(lambda x: x.get('rating'))
df['directors'] = df['Movie'].apply(lambda x: x.get('directors'))
df.to_parquet(f"{root}/Data/Whole_sets/cleaned_n=8_comments.parquet")

In [159]:
df['Movie'].iloc[0]

{'title': 'puerta-de-hierro-el-exilio-de-peron',
 'url': 'https://letterboxd.com/film/puerta-de-hierro-el-exilio-de-peron/',
 'directors': ['Víctor Laplace', 'Dieguillo Fernández'],
 'rating': '3.30 out of 5',
 'year': '2013',
 'genres': []}

In [None]:
# df_small['reviews_extracted'].iloc[3]

In [None]:
df.iloc[3]

letterboxd_search          https://letterboxd.com/search/The+Hills+Have+E...
Movie                      {'title': 'the-hills-have-eyes-2006', 'url': '...
comments                   [{'stars': '½', 'review': "I'm so tired of rap...
reviews_extracted          [I'm so tired of rape and sexual assault again...
reviews_extracted_lower    [I'm so tired of rape and sexual assault again...
reviews_mash               I'm so tired of rape and sexual assault agains...
genres                                                    [Horror, Thriller]
avg_rating                                                     3.01 out of 5
directors                                                               None
Name: 3, dtype: object

In [None]:
df['genres'].value_counts().head(15)

genres
[Drama]               770
[Comedy]              621
[Horror]              342
[Documentary]         231
[Drama, Romance]      220
[Comedy, Romance]     203
[Comedy, Drama]       201
[Romance, Drama]      181
[Drama, Comedy]       179
[Romance, Comedy]     143
[]                    124
[Thriller, Horror]    121
[Horror, Thriller]    119
[Drama, Crime]        108
[Action]              103
Name: count, dtype: int64

In [None]:
## of note -- neet to clean out any "This review may contain spoilers.I can handle the truth."
for reviews in df['reviews_extracted'].iloc[0:10]:
    for review in reviews:
        if  "this review may".lower() in review.lower():
            print(review)
            print('\n')

This review may contain spoilers!This Francois Ozon guy is pretty good, isn't he? Last year, I decided to dip my toes into Ozon's filmography with his 2003 film Swimming Pool. I absolutely loved that film and was keen to check out more of his work. Therefore, it was a pleasant surprise to stumble across Frantz, one of his more recent films, on BBC iplayer. As a result of enjoying my previous experience with Ozon's work, I did have high…




In [None]:
exploded_genres = df['genres'].explode()
genre_types = exploded_genres.unique().tolist()
print(genre_types)



[nan, 'Romance', 'Comedy', 'Action', 'Adventure', 'Horror', 'Thriller', 'Crime', 'Drama', 'History', 'Mystery', 'War', 'Science Fiction', 'TV Movie', 'Fantasy', 'Music', 'Family', 'Documentary', 'Western', 'Animation']


In [None]:
df_drama =  df[df['genres'].apply(lambda lst: "Drama" in lst)].copy()
df_drama['genres'].value_counts()


genres
[Drama]                                770
[Drama, Romance]                       220
[Comedy, Drama]                        201
[Romance, Drama]                       181
[Drama, Comedy]                        179
                                      ... 
[Drama, Romance, Mystery, Thriller]      1
[Comedy, Crime, Romance, Drama]          1
[Drama, Romance, Comedy, Family]         1
[Action, Western, Drama]                 1
[Drama, Romance, Horror]                 1
Name: count, Length: 975, dtype: int64

In [None]:
print(len(df_drama['reviews_mash'].iloc[1]))
drama_string = df['reviews_mash'].str.cat(sep=' ')
drama_string = re.sub(r"\\.\s*", '', drama_string)
print(len(drama_string))


29882
278197391


In [None]:
from collections import Counter
def text_to_freqdf(text=None, text_file=None, lower_b=True, n_gram=1):
    """
    Strip all the words out of a file and put them into a frequency dataframe.
    By default casts everything to lowercase. Gets it ready for allotaxonometer.

    :param text str: a string with the text.
    :param text_file str:  The path to the text_file. 
    :param lower_b Bool: A boolean indicating whether to preserve case.
                             Enter False if you want to preserve capital versus not. 
    :returns pd.DataFrame: A DataFrame with word frequencies.
    """

    if text:
        if lower_b:
            text = text.lower()
    else:
        if text_file:
            with open(text_file, 'r') as f:
                text = f.read()
            if lower_b:
                text = text.lower()
        else:
            raise ValueError("Input either a string of text or a textfile.")
    
    words = re.findall(r"\b\w+'?\w*\b", text)
    if n_gram == 1:
        df = pd.DataFrame(words, columns=['types'])
        df = df.value_counts()
        df = df.reset_index(name='counts')
        df['probs'] = df['counts'] / df['counts'].sum()
        total_unique = len(df)
        df['total_unique'] = total_unique
        return df
    else:
        bigrams = [f'{words[i]} {words[i+1]}' for i in range(len(words)-1)]
        bigram_counts = Counter(bigrams)
        bigram_df = pd.DataFrame(bigram_counts.items(), columns=['types', 'counts'])
        bigram_df['probs'] = bigram_df['counts'] / bigram_df['counts'].sum()
        bigram_df['total_unique'] = len(bigram_df)
        return bigram_df
    



In [None]:
drama_freq = text_to_freqdf(text=drama_string, n_gram=2)
drama_freq.to_csv(f'{root}/Data/Subset Data/Bigrams/genre_freq/drama_word_freq.csv', index=False)

In [None]:
def search_list(lst, genre):
    if genre != 'nan':
        return genre in lst
    else:
        if pd.isna(lst) or lst == '[]':
            return True
        else:
            return False

In [None]:
## get genre frequency csvs for all the movies we have so far
genre_types = genre_types[1:]
for genre in genre_types:
    print(f"starting {genre}")
    gdf  =  df[df['genres'].apply(lambda lst: genre in lst)].copy()
    g_string = gdf['reviews_mash'].str.cat(sep=' ')
    g_string = re.sub(r"\\.\s*", '', g_string)



    g_freq = text_to_freqdf(text=g_string)
    g_freq.to_csv(f'{root}/Data/Subset Data/Unigrams/genre_freq/{genre}_word_freq.csv', index=False)
    g_freq_short = g_freq.iloc[:10000].copy()
    g_freq_short.to_csv(f'{root}/Data/Subset Data/Unigrams/genre_freq_short/{genre}_word_freq_short.csv', index=False)
    g_freq_short_trimmed = g_freq.iloc[150:10150].copy()                                                                             
    g_freq_short_trimmed.to_csv(f'{root}/Data/Subset Data/Unigrams/genre_freq_short_trimmed/{genre}_word_freq_short_trimmed.csv', index=False)


    g_freq_b = text_to_freqdf(text=g_string, n_gram=2)
    g_freq_b.to_csv(f'{root}/Data/Subset Data/Bigrams/genre_freq/{genre}_bi_word_freq.csv', index=False)
    g_freq_b_short = g_freq_b.iloc[:10000].copy()
    g_freq_b_short.to_csv(f'{root}/Data/Subset Data/Bigrams/genre_freq_short/{genre}_bi_word_freq_short.csv', index=False)
    g_freq_b_short_trimmed = g_freq_b.iloc[150:10150].copy()                                                                             
    g_freq_b_short_trimmed.to_csv(f'{root}/Data/Subset Data/Bigrams/genre_freq_short_trimmed/{genre}_bi_word_freq_short_trimmed.csv', index=False)
    with open(f"{root}/Data/Subset Data/genre_text/{genre}_words.txt", 'w') as f:
        f.write(g_string)


starting Action
starting Adventure
starting Horror
starting Thriller
starting Crime
starting Drama
starting History
starting Mystery
starting War
starting Science Fiction
starting TV Movie
starting Fantasy
starting Music
starting Family
starting Documentary
starting Western
starting Animation
