In [None]:
import pandas as pd

In [None]:
import glob

# Collect every file in the data directory. glob returns matches in arbitrary
# (filesystem-dependent) order, so sort them to make the preview below and the
# row order of everything derived from this list reproducible across machines.
filenames = sorted(glob.glob("data/*"))
filenames[:5]

In [None]:
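# Optional: download the NRC emotion lexicon (uncomment the command below to run it).
# NOTE: `-P data` saves the file inside data/, but the lexicon is later read from the
# working directory, and glob("data/*") would also pick it up as if it were a speech.
# Adjust the target directory (or move the file) before relying on this command.
# !wget -nc https://nyc3.digitaloceanspaces.com/ml-files-distro/v1/upshot-trump-emolex/data/NRC-emotion-lexicon-wordlevel-alphabetized-v0.92.txt -P data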

In [None]:
# Read the full text of every file, in the same order as `filenames`.
# A context manager guarantees each handle is closed as soon as it has been read,
# instead of leaving one open file per speech for the garbage collector.
speeches = []
for filename in filenames:
    with open(filename) as speech_file:
        speeches.append(speech_file.read())
len(speeches)

In [None]:
# One row per speech: the raw text next to the path it was loaded from.
speeches_df = pd.DataFrame(dict(text=speeches, filename=filenames))
speeches_df.head(n=3)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Raw word counts: one row per speech, one column per word seen in any speech.
vec = CountVectorizer()
matrix = vec.fit_transform(speeches_df['text'])
# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# and fall back to the old method only on versions that predate it.
get_names = getattr(vec, "get_feature_names_out", None) or vec.get_feature_names
vocab = get_names()
wordcount_df = pd.DataFrame(matrix.toarray(), columns=vocab)
wordcount_df.head()

In [None]:
# Load the NRC emotion lexicon (long format: word / emotion / association flag)
# and reshape it so each word is one row with one column per emotion.
filepath = "NRC-emotion-lexicon-wordlevel-alphabetized-v0.92.txt"
emolex_df = (
    pd.read_csv(
        filepath,
        sep='\t',
        skiprows=45,
        names=["word", "emotion", "association"],
        # Keep literal strings such as "null" as words rather than parsing them as NaN.
        keep_default_na=False,
    )
    .pivot(index='word', columns='emotion', values='association')
    .reset_index()
)
emolex_df.head()

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Count only the words that appear in the emotion lexicon. With use_idf=False and
# norm='l1' (the letter L followed by the digit one) each row is divided by its own
# total, so every value is that word's share of the speech's lexicon-word occurrences.
vec = TfidfVectorizer(vocabulary=emolex_df.word,
                      use_idf=False,
                      norm='l1')
matrix = vec.fit_transform(speeches_df.text)
# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# and fall back to the old method only on versions that predate it.
get_names = getattr(vec, "get_feature_names_out", None) or vec.get_feature_names
vocab = get_names()
wordcount_df = pd.DataFrame(matrix.toarray(), columns=vocab)
wordcount_df.head()

In [None]:
# For each emotion, the lexicon words flagged (association == 1) with that emotion.
angry_words = emolex_df.loc[emolex_df['anger'] == 1, 'word']
trust_words = emolex_df.loc[emolex_df['trust'] == 1, 'word']
anticipation_words = emolex_df.loc[emolex_df['anticipation'] == 1, 'word']
disgust_words = emolex_df.loc[emolex_df['disgust'] == 1, 'word']
fear_words = emolex_df.loc[emolex_df['fear'] == 1, 'word']
joy_words = emolex_df.loc[emolex_df['joy'] == 1, 'word']
negative_words = emolex_df.loc[emolex_df['negative'] == 1, 'word']
positive_words = emolex_df.loc[emolex_df['positive'] == 1, 'word']
sadness_words = emolex_df.loc[emolex_df['sadness'] == 1, 'word']
surprise_words = emolex_df.loc[emolex_df['surprise'] == 1, 'word']


In [None]:
# Anger score per speech: add up the columns of all anger-flagged words, row by row.
anger_columns = wordcount_df.loc[:, angry_words]
speeches_df['anger'] = anger_columns.sum(axis='columns')
speeches_df.head(n=3)

In [None]:
# Score the remaining emotions the same way as anger: for each one, sum the
# columns of its flagged words across every speech. The mapping order fixes the
# order in which the new columns are appended.
emotion_word_lists = {
    'anticipation': anticipation_words,
    'disgust': disgust_words,
    'fear': fear_words,
    'joy': joy_words,
    'negative': negative_words,
    'positive': positive_words,
    'sadness': sadness_words,
    'surprise': surprise_words,
    'trust': trust_words,
}
for emotion, words in emotion_word_lists.items():
    speeches_df[emotion] = wordcount_df[words].sum(axis=1)

In [None]:
def name_edit(filename):
    """Return characters 5 through 8 (zero-based) of ``filename``.

    The slice is purely positional: for a path such as ``data/1790-x.txt`` it
    skips the five-character ``data/`` prefix and yields the next four characters.
    Shorter strings give a shorter (possibly empty) result.
    """
    start, stop = 5, 9
    return filename[start:stop]

In [None]:
# Derive the year (still a string at this point) from each speech's file path.
speeches_df['year'] = speeches_df['filename'].map(name_edit)

In [None]:
# Convert the extracted year strings to 64-bit integers so they sort and plot numerically.
speeches_df = speeches_df.astype({'year': 'int64'})

In [None]:
# Put the speeches in chronological order (ascending year).
speeches_df = speeches_df.sort_values('year', ascending=True)

In [None]:
# Preview the first five speeches after sorting.
speeches_df.head(n=5)

In [None]:
import seaborn as sns
# `plt` is the conventional alias for the pyplot interface; binding it to the
# top-level `matplotlib` package would make calls such as plt.twinx() fail.
import matplotlib.pyplot as plt

In [None]:
# Draw anger and positive scores on the same axes, one labelled line each, so the
# legend tells them apart.
ax = sns.lineplot(data=speeches_df, x='filename', y='anger', label='anger')
sns.lineplot(data=speeches_df, x='filename', y='positive', label='positive', ax=ax)
# Rotate the tick labels matplotlib already generated rather than overwriting them
# by position: replacement labels only line up if the frame's row order happens to
# match the order in which the points were plotted.
ax.tick_params(axis='x', labelrotation=90)
# Both lines share this axis, so use a neutral label instead of the last column plotted.
ax.set_ylabel('score');


In [None]:
# Keep only the year and the ten emotion scores for plotting.
emotion_score_columns = [
    'year',
    'anger', 'anticipation', 'disgust', 'fear', 'joy',
    'negative', 'positive', 'sadness', 'surprise', 'trust',
]
df = speeches_df[emotion_score_columns]



In [None]:
# Use the year as the index so it becomes the x-axis when the whole frame is plotted.
# Note: this consumes the 'year' column, so the cell cannot be re-run on its own output.
df = df.set_index(keys='year', drop=True)

In [None]:
# import matplotlib.pyplot as plt
# sns.lineplot(data=df.column1, color="g")
# ax2 = plt.twinx()
# sns.lineplot(data=df.column2, color="b", ax=ax2)

In [None]:
# Apply the figure size first: rcParams only affect figures created afterwards,
# so setting it after plotting left this chart at the default size on a fresh run.
sns.set(rc={'figure.figsize':(20,8)})
# One line per emotion column, with the frame's index on the x-axis.
ax = sns.lineplot(data=df)
# Move the legend outside the plotting area so it does not cover the lines.
sns.move_legend(ax, "upper left", bbox_to_anchor=(1, 1))

In [None]:
# Save the speeches together with their emotion scores and year; the row index is not written.
speeches_df.to_csv(path_or_buf='sentiment.csv', index=False)