In [22]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import nltk
# Fetch the VADER lexicon only when it is not already installed locally, so
# re-running this cell does not contact the download server every time.
try:
    nltk.data.find('sentiment/vader_lexicon.zip')
except LookupError:
    nltk.download('vader_lexicon')

from nltk.sentiment import SentimentIntensityAnalyzer

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/exide/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [23]:
# Load the post-Shark-Tank dataset and discard the "Unnamed: 0" column
# (presumably a row index written by an earlier to_csv call -- confirm).
df = pd.read_csv("data/post_sharktank.csv").drop(columns=["Unnamed: 0"])
print(df.columns)

Index(['name', 'blurb', 'disable_communication', 'country', 'deadline',
       'created_at', 'launched_at', 'staff_pick', 'backers_count',
       'usd_pledged', 'spotlight', 'name_len', 'name_len_clean', 'blurb_len',
       'blurb_len_clean', 'create_to_launch_days', 'launch_to_deadline_days',
       'launch_to_state_change_days', 'SuccessfulBool', 'USorGB', 'TOPCOUNTRY',
       'LaunchedTuesday', 'DeadlineWeekend', 'usd_goal', 'cat_Academic',
       'cat_Apps', 'cat_Blues', 'cat_Comedy', 'cat_Experimental',
       'cat_Festivals', 'cat_Flight', 'cat_Gadgets', 'cat_Hardware',
       'cat_Immersive', 'cat_Makerspaces', 'cat_Misc', 'cat_Musical',
       'cat_Places', 'cat_Plays', 'cat_Restaurants', 'cat_Robots',
       'cat_Shorts', 'cat_Software', 'cat_Sound', 'cat_Spaces',
       'cat_Thrillers', 'cat_Wearables', 'cat_Web', 'cat_Webseries',
       'usd_goal (log)', 'sharktank_wonderful', 'sharktank_wonderful_eval',
       'sharktank_daymond', 'sharktank_daymond_eval', 'sharktank_barbar

In [24]:
eval_cols = [
    'sharktank_wonderful_eval',
    'sharktank_daymond_eval',
    'sharktank_barbara_eval',
    'sharktank_mark_eval',
]

# Scores outside the closed range [0, 10] are treated as invalid and blanked
# out (set to NaN); values that are already NaN stay NaN.
for col in eval_cols:
    scores = df[col]
    df[col] = scores.where(scores.between(0, 10))

# Per-row mean of the valid scores across all 4 columns (NaNs are skipped).
row_mean = df[eval_cols].mean(axis=1)
print(row_mean)

# Impute each missing score with its row's mean. A row with no valid score
# at all has a NaN mean, so it remains NaN after this step.
df = df.assign(**{col: df[col].fillna(row_mean) for col in eval_cols})

0        7.617429
1        5.183316
2        6.790570
3        4.655704
4        6.059008
           ...   
20627    5.940384
20628    4.386917
20629    6.381973
20630    5.310835
20631    5.078759
Length: 20632, dtype: float64


In [25]:
# VADER analyzer shared by every column scored below
sia = SentimentIntensityAnalyzer()


def _compound_score(text):
    """Return the VADER 'compound' polarity score for one piece of text."""
    return sia.polarity_scores(text)['compound']


# text columns that each get a companion "<column>_sentiment" column
cols_to_analyze = [
    'blurb',
    'sharktank_wonderful',
    'sharktank_daymond',
    'sharktank_barbara',
    'sharktank_mark',
]

for col in cols_to_analyze:
    # missing text is scored as the empty string so the analyzer never sees NaN
    text = df[col].fillna('')
    df[f"{col}_sentiment"] = text.apply(_compound_score)


In [26]:
# Save post-sentiment data
print(df.columns)

# Shuffle the rows with a fixed seed so that re-running the notebook writes
# the same row order every time (an unseeded sample() changes on each run).
SHUFFLE_SEED = 42
df = df.sample(frac=1, random_state=SHUFFLE_SEED).reset_index(drop=True)

# NOTE(review): the row index is still written as an unnamed first column.
# This is kept on purpose: any reader that drops "Unnamed: 0" after loading
# this file would fail if it disappeared -- confirm with the downstream
# notebooks before switching to index=False.
df.to_csv("data/post_sentiment.csv")

Index(['name', 'blurb', 'disable_communication', 'country', 'deadline',
       'created_at', 'launched_at', 'staff_pick', 'backers_count',
       'usd_pledged', 'spotlight', 'name_len', 'name_len_clean', 'blurb_len',
       'blurb_len_clean', 'create_to_launch_days', 'launch_to_deadline_days',
       'launch_to_state_change_days', 'SuccessfulBool', 'USorGB', 'TOPCOUNTRY',
       'LaunchedTuesday', 'DeadlineWeekend', 'usd_goal', 'cat_Academic',
       'cat_Apps', 'cat_Blues', 'cat_Comedy', 'cat_Experimental',
       'cat_Festivals', 'cat_Flight', 'cat_Gadgets', 'cat_Hardware',
       'cat_Immersive', 'cat_Makerspaces', 'cat_Misc', 'cat_Musical',
       'cat_Places', 'cat_Plays', 'cat_Restaurants', 'cat_Robots',
       'cat_Shorts', 'cat_Software', 'cat_Sound', 'cat_Spaces',
       'cat_Thrillers', 'cat_Wearables', 'cat_Web', 'cat_Webseries',
       'usd_goal (log)', 'sharktank_wonderful', 'sharktank_wonderful_eval',
       'sharktank_daymond', 'sharktank_daymond_eval', 'sharktank_barbar