In [None]:
import pandas as pd
import re
import os 
import ast
from Utility.toolbox import find_repo_root

root = find_repo_root()

In [3]:
df = pd.read_csv(f"{root}/Data/2020_trope_data/Scraped_Data/NER_parsed_reviews.csv")
df['NER_cleaned_reviews'] = df['NER_cleaned_reviews'].apply(ast.literal_eval)
df['reviews_mash'] = df['NER_cleaned_reviews'].apply(lambda x: '\n'.join(x))
df['length_reviews_mash'] = df['reviews_mash'].apply(len)
print("average length in characters: ", df['length_reviews_mash'].mean())



average length in characters:  24350.100201317717


In [4]:
df.columns

Index(['letterboxd_search', 'url', 'actors', 'roles', 'studio',
       'reviews_extracted', 'review_stars', 'review_dates',
       'letterboxd_directors', 'letterboxd_year', 'letterboxd_rating',
       'letterboxd_genres', 'NameIMDB', 'IMDB_rating', 'IMDB_ID',
       'letter_USD_Budget', 'letter_US_Gross', 'letter_WW_Gross',
       'NER_cleaned_reviews', 'reviews_mash', 'length_reviews_mash'],
      dtype='object')

In [5]:
df['reviews_mash'].iloc[0]



In [6]:
exploded_genres = df['letterboxd_genres'].explode()
genre_types = exploded_genres.unique().tolist()
print(genre_types)

["['Horror']", "['History', 'Drama']", "['Drama', 'Romance']", "['Drama', 'Crime', 'Action']", "['Comedy']", "['Drama', 'Action', 'Crime']", "['Science Fiction', 'Comedy', 'Drama']", "['Drama', 'Romance', 'Comedy']", "['TV Movie', 'Thriller']", "['Drama', 'History', 'War']", "['Drama', 'Crime', 'Romance']", "['Documentary']", "['Drama', 'Crime']", "['Horror', 'Comedy']", "['Western']", "['Drama', 'Mystery', 'War', 'Comedy']", "['Documentary', 'Animation']", "['Comedy', 'Animation']", "['Fantasy', 'Action']", "['Drama', 'Fantasy', 'Family']", "['Family', 'TV Movie', 'Drama', 'Fantasy']", "['Fantasy', 'Drama']", "['Family', 'Comedy']", "['Comedy', 'TV Movie']", "['Drama', 'TV Movie']", "['Fantasy', 'Horror']", "['TV Movie', 'Romance']", "['Comedy', 'Family', 'Romance']", "['Science Fiction', 'Crime']", "['Thriller', 'Action', 'Crime']", "['Drama']", "['Drama', 'Family', 'TV Movie']", "['Mystery', 'Science Fiction', 'Horror']", "['Drama', 'Thriller']", "['Fantasy', 'Mystery', 'Horror']", 

In [7]:
unique_genres = set()
for genre in genre_types:
    unique_genres.update(ast.literal_eval(genre)) 
unique_genres = list(unique_genres)

In [8]:
from collections import Counter
def text_to_freqdf(text=None, text_file=None, lower_b=True, n_gram=1):
    """
    Strip all the words out of a file and put them into a frequency dataframe.
    By default casts everything to lowercase. Gets it ready for allotaxonometer.

    :param text str: a string with the text.
    :param text_file str:  The path to the text_file. 
    :param lower_b Bool: A boolean indicating whether to preserve case.
                             Enter False if you want to preserve capital versus not. 
    :returns pd.DataFrame: A DataFrame with word frequencies.
    """

    if text:
        if lower_b:
            text = text.lower()
    else:
        if text_file:
            with open(text_file, 'r') as f:
                text = f.read()
            if lower_b:
                text = text.lower()
        else:
            raise ValueError("Input either a string of text or a textfile.")
    
    words = re.findall(r"\b\w+'?\w*\b", text)
    if n_gram == 1:
        df = pd.DataFrame(words, columns=['types'])
        df = df.value_counts()
        df = df.reset_index(name='counts')
        df['probs'] = df['counts'] / df['counts'].sum()
        total_unique = len(df)
        df['total_unique'] = total_unique
        return df
    else:
        bigrams = [f'{words[i]} {words[i+1]}' for i in range(len(words)-1)]
        bigram_counts = Counter(bigrams)
        bigram_df = pd.DataFrame(bigram_counts.items(), columns=['types', 'counts'])
        bigram_df['probs'] = bigram_df['counts'] / bigram_df['counts'].sum()
        bigram_df['total_unique'] = len(bigram_df)
        return bigram_df


In [9]:
## get genre frequency csvs for all the movies we have so far
genre_types = unique_genres[1:]
for genre in genre_types:
    print(f"starting {genre}")
    gdf  =  df[df['letterboxd_genres'].apply(lambda lst: genre in lst)].copy()
    g_string = gdf['reviews_mash'].str.cat(sep=' ')
    g_string = re.sub(r"\\.\s*", '', g_string)


    g_freq = text_to_freqdf(text=g_string)
    g_freq.to_csv(f'{root}/Data/Subset_Data_NER/Unigrams/genre_freq/{genre}_word_freq.csv', index=False)
    g_freq_short = g_freq.iloc[:10000].copy()
    g_freq_short.to_csv(f'{root}/Data/Subset_Data_NER/Unigrams/genre_freq_short/{genre}_word_freq_short.csv', index=False)
    g_freq_short_trimmed = g_freq.iloc[150:10150].copy()                                                                             
    g_freq_short_trimmed.to_csv(f'{root}/Data/Subset Data/Unigrams/genre_freq_short_trimmed/{genre}_word_freq_short_trimmed.csv', index=False)


    g_freq_b = text_to_freqdf(text=g_string, n_gram=2)
    g_freq_b.to_csv(f'{root}/Data/Subset_Data_NER/Bigrams/genre_freq/{genre}_bi_word_freq.csv', index=False)
    g_freq_b_short = g_freq_b.iloc[:10000].copy()
    g_freq_b_short.to_csv(f'{root}/Data/Subset_Data_NER/Bigrams/genre_freq_short/{genre}_bi_word_freq_short.csv', index=False)
    g_freq_b_short_trimmed = g_freq_b.iloc[150:10150].copy()                                                                             
    g_freq_b_short_trimmed.to_csv(f'{root}/Data/Subset_Data_NER/Bigrams/genre_freq_short_trimmed/{genre}_bi_word_freq_short_trimmed.csv', index=False)
    with open(f"{root}/Data/Subset_Data_NER/genre_text/{genre}_words.txt", 'w') as f:
        f.write(g_string)


starting Fantasy
starting Adventure
starting Documentary
starting TV Movie
starting Comedy
starting Animation
starting Science Fiction
starting Drama
starting Thriller
starting Western
starting Music
starting History
starting Mystery
starting Family
starting Horror
starting Action
starting Romance
starting Crime
