# Variables of Interest Generation

In [4]:
import functools
import json
import time

import pandas as pd
from nltk import corpus
from nltk.sentiment import SentimentIntensityAnalyzer

@functools.lru_cache(maxsize=None)
def _load_words_dictionary(path):
    """Parse the JSON word list at *path* once and reuse it on later calls."""
    with open(path, encoding='utf-8') as dictionary_file:
        return json.load(dictionary_file)


def check_if_word_exists(word, words_dictionary=None):
    """Return True if *word* (case-insensitive) is in the English word list.

    Args:
        word: The word to look up; it is lower-cased before the lookup.
        words_dictionary: Optional mapping keyed by lower-case words. When
            omitted, the word list is read from
            '../data/words_dictionary.json' and cached, so the file is
            parsed only once no matter how many words are checked.

    Returns:
        True when the lower-cased word maps to a non-None value, else False.
    """
    if words_dictionary is None:
        # english words dataset from https://www.kaggle.com/datasets/bwandowando/479k-english-words
        words_dictionary = _load_words_dictionary('../data/words_dictionary.json')
    return words_dictionary.get(word.lower()) is not None

# Read the cleaned movie data, derive sentiment / length / made-up-word
# features from each title and overview, and write the enriched table out.
df = pd.read_csv('../data/movies_cleaned.csv')

sia = SentimentIntensityAnalyzer()

# Loop invariants, computed once instead of for every row / every word.
# The stopword list is turned into a set so membership tests are O(1) and
# the corpus is not queried again for each title word.
total_rows = len(df)
english_stopwords = set(corpus.stopwords.words("english"))

# for getting the remaining time estimate
start_time = time.time()

for index, row in df.iterrows():
    print(f'Processing row {index} of {total_rows}')
    # NOTE: the estimate treats `index` as the row's position, which holds
    # for the default integer index produced by read_csv above.
    time_remaining = ((time.time() - start_time) / (index + 1)) * (total_rows - index)
    minutes_remaining = time_remaining // 60
    print(f'Estimated time remaining: {minutes_remaining} minutes, {time_remaining % 60} seconds')

    # generate polarity scores
    title = row['title']
    title_sentiment = sia.polarity_scores(title)
    df.at[index, 'title_sentiment'] = title_sentiment['compound']

    # str() so that a non-string overview value is still scored as text
    overview = str(row['overview'])
    overview_sentiment = sia.polarity_scores(overview)
    df.at[index, 'overview_sentiment'] = overview_sentiment['compound']

    # generate length features
    df.at[index, 'title_length'] = len(title)

    title_words = title.split()
    df.at[index, 'num_words_in_title'] = len(title_words)

    # A blank / whitespace-only title has no words; report 0.0 rather than
    # dividing by zero.
    if title_words:
        average_title_word_length = sum(len(word) for word in title_words) / len(title_words)
    else:
        average_title_word_length = 0.0
    df.at[index, 'average_title_word_length'] = average_title_word_length

    # check if there is a word in the title that does not exist in the dictionary
    # first strip out title punctuation, numbers, and stopwords like "of, the, a, etc."
    letters_only_title = ''.join(char for char in title if char.isalpha() or char.isspace())
    title_words_not_in_dict = [
        word
        for word in letters_only_title.split()
        if word.lower() not in english_stopwords and not check_if_word_exists(word)
    ]
    df.at[index, 'has_made_up_word'] = len(title_words_not_in_dict) > 0
    df.at[index, 'made_up_word_count'] = len(title_words_not_in_dict)


df.to_csv('../data/movies_with_vars.csv', index=False)
# Last expression of the cell: displays summary statistics in the notebook.
df.describe()

Processing row 0 of 7242
Estimated time remaining: 0.0 minutes, 19.67491865158081 seconds
['Godzilla']
Processing row 1 of 7242
Estimated time remaining: 30.0 minutes, 57.410629630088806 seconds
[]
Processing row 2 of 7242
Estimated time remaining: 33.0 minutes, 8.222163518269781 seconds
[]
Processing row 3 of 7242
Estimated time remaining: 29.0 minutes, 56.34311485290527 seconds
[]
Processing row 4 of 7242
Estimated time remaining: 28.0 minutes, 7.743713283538682 seconds
[]
Processing row 5 of 7242
Estimated time remaining: 25.0 minutes, 7.942129810651295 seconds
[]
Processing row 6 of 7242
Estimated time remaining: 24.0 minutes, 23.718858310154474 seconds
[]
Processing row 7 of 7242
Estimated time remaining: 22.0 minutes, 44.2594800889492 seconds
[]
Processing row 8 of 7242
Estimated time remaining: 22.0 minutes, 32.819777170816906 seconds
[]
Processing row 9 of 7242
Estimated time remaining: 22.0 minutes, 47.81740379333496 seconds
[]
Processing row 10 of 7242
Estimated time remainin

Unnamed: 0,id,popularity,vote_average,vote_count,title_sentiment,overview_sentiment,title_length,num_words_in_title,average_title_word_length,made_up_word_count
count,7242.0,7242.0,7242.0,7242.0,7242.0,7242.0,7242.0,7242.0,7242.0,7242.0
mean,241980.6,39.559858,6.507677,2164.695388,-0.018072,-0.114353,16.238332,2.890914,5.407279,0.177851
std,314072.4,114.886118,0.811147,3409.316786,0.244845,0.628845,9.704562,1.787952,1.822065,0.416966
min,5.0,9.95,2.895,11.0,-0.9375,-0.9957,1.0,1.0,1.0,0.0
25%,10135.25,17.909,6.0,350.0,0.0,-0.7184,9.0,2.0,4.0,0.0
50%,42522.0,23.541,6.5125,942.5,0.0,-0.199,14.0,2.0,5.0,0.0
75%,438668.0,36.824,7.09075,2371.25,0.0,0.4767,20.0,4.0,6.0,0.0
max,1246596.0,5741.978,8.706,35469.0,0.891,0.9958,104.0,20.0,17.0,4.0
