# Variables of Interest Generation

In [8]:
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk import corpus
import pandas as pd
import time
import json

# English words dataset from
# https://www.kaggle.com/datasets/bwandowando/479k-english-words
# Loaded once at start-up; the title checks further down look words up in it.
# The context manager closes the file handle as soon as parsing finishes
# instead of leaving it open until garbage collection.
with open('../data/words_dictionary.json', encoding='utf-8') as words_file:
    words_dictionary = json.load(words_file)

def check_if_word_exists(word, words_dictionary):
    """Report whether ``word`` has an entry in ``words_dictionary``.

    The lookup is done on the lower-cased form of ``word``. A key whose
    stored value is ``None`` counts as missing, exactly like an absent key.

    Returns:
        True when a non-None value is stored for the word, False otherwise.
    """
    entry = words_dictionary.get(word.lower())
    return entry is not None

# Derive per-movie text features (sentiment, length and "made-up word"
# indicators) from the cleaned movie table and write the enriched table out.
df = pd.read_csv('../data/movies_cleaned.csv')

sia = SentimentIntensityAnalyzer()

# Build the stopword lookup once. Calling corpus.stopwords.words() inside the
# per-word filter repeated that call for every word of every title and tested
# membership against a list; a set gives the same answers with O(1) lookups.
english_stopwords = set(corpus.stopwords.words("english"))

# Wall-clock start of the run. NOTE: nothing in this cell computes a
# remaining-time estimate from it yet; it is only recorded.
start_time = time.time()

total_rows = len(df)

for index, row in df.iterrows():
    print(f'Processing row {index} of {total_rows}')

    # generate polarity scores (only the compound score is stored)
    title = row['title']
    title_sentiment = sia.polarity_scores(title)
    df.at[index, 'title_sentiment'] = title_sentiment['compound']

    # str() so a missing overview (NaN) is still scored as text
    overview = str(row['overview'])
    overview_sentiment = sia.polarity_scores(overview)
    df.at[index, 'overview_sentiment'] = overview_sentiment['compound']

    # generate length features
    df.at[index, 'title_length'] = len(title)

    title_words = title.split()
    df.at[index, 'num_words_in_title'] = len(title_words)

    # An empty or whitespace-only title has no words; report 0.0 rather than
    # raising ZeroDivisionError.
    if title_words:
        average_title_word_length = (
            sum(len(word) for word in title_words) / len(title_words)
        )
    else:
        average_title_word_length = 0.0
    df.at[index, 'average_title_word_length'] = average_title_word_length

    # Check whether the title contains a word that is not in the dictionary.
    # First strip punctuation and digits from the title, then ignore
    # stopwords such as "of", "the", "a".
    letters_only_title = ''.join(
        char for char in title if char.isalpha() or char.isspace()
    )
    title_words_not_in_dict = [
        word
        for word in letters_only_title.split()
        if word.lower() not in english_stopwords
        and not check_if_word_exists(word, words_dictionary)
    ]
    df.at[index, 'has_made_up_word'] = len(title_words_not_in_dict) > 0
    df.at[index, 'made_up_word_count'] = len(title_words_not_in_dict)


df.to_csv('../data/movies_with_vars.csv', index=False)
df.describe()

Processing row 0 of 6507
Processing row 1 of 6507
Processing row 2 of 6507
Processing row 3 of 6507
Processing row 4 of 6507
Processing row 5 of 6507
Processing row 6 of 6507
Processing row 7 of 6507
Processing row 8 of 6507
Processing row 9 of 6507
Processing row 10 of 6507
Processing row 11 of 6507
Processing row 12 of 6507
Processing row 13 of 6507
Processing row 14 of 6507
Processing row 15 of 6507
Processing row 16 of 6507
Processing row 17 of 6507
Processing row 18 of 6507
Processing row 19 of 6507
Processing row 20 of 6507
Processing row 21 of 6507
Processing row 22 of 6507
Processing row 23 of 6507
Processing row 24 of 6507
Processing row 25 of 6507
Processing row 26 of 6507
Processing row 27 of 6507
Processing row 28 of 6507
Processing row 29 of 6507
Processing row 30 of 6507
Processing row 31 of 6507
Processing row 32 of 6507
Processing row 33 of 6507
Processing row 34 of 6507
Processing row 35 of 6507
Processing row 36 of 6507
Processing row 37 of 6507
Processing row 38 of 6

Unnamed: 0,id,popularity,vote_average,title_sentiment,overview_sentiment,title_length,num_words_in_title,average_title_word_length,made_up_word_count
count,6507.0,6507.0,6507.0,6507.0,6507.0,6507.0,6507.0,6507.0,6507.0
mean,223087.7,25.875517,6.455688,-0.016061,-0.115678,16.223605,2.887659,5.409678,0.177501
std,301292.4,11.720481,0.794394,0.244928,0.627946,9.645656,1.780141,1.818371,0.418226
min,5.0,9.95,2.895,-0.9375,-0.9877,1.0,1.0,1.0,0.0
25%,10052.0,17.486,5.975,0.0,-0.7184,10.0,2.0,4.0,0.0
50%,32823.0,22.174,6.492,0.0,-0.2023,14.0,2.0,5.0,0.0
75%,408884.5,31.149,7.0,0.0,0.46105,20.0,4.0,6.0,0.0
max,1237835.0,63.605,8.675,0.891,0.9958,104.0,20.0,17.0,4.0
