Using the main text you chose at the start of the seasomester, plot happiness time series in the following ways using the labMT lexicon


(a) Process (destroy) your text so that it is a simple text file with one 1-gram
per line—a vector of 1-grams.
To the extent possible, keep punctuation in as separate 1-grams. Periods,
commas, semicolons, em dashes, ellipses, ldots.

In [1]:
# !pip install matplotlib
# !pip install numpy
# !pip install shifterator
# !pip install nltk


import matplotlib as mpl
import numpy as np
import shifterator
import nltk
nltk.download('punkt')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\alexp\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True




In [2]:
#take all the .txt files for WoT and compile them into one
import os
rootdir = "C:/Users/alexp/Documents/GitHub/WoT_SentimentAnalysis/txt"

lines = []
for subdir, dirs, files in os.walk(rootdir):
    for file in files:
        #print(file)
        with open(os.path.join(subdir, file), encoding="utf8") as f:
            lines.append(f.readlines())

In [37]:
import re

#ft = fulltext
ft=""
for line in lines:
    
    ft += str(line)

without_line_breaks = ft.replace("\\n', '\\n', '", " ")


print(without_line_breaks[1000:10000])

ke the thick-scattered dust of diamonds slowly dimming. The fat sickle of the moon hung low, giving barely light to make out the silhouettes of the men guarding the fireless camp in the sprawling copse of oak and leatherleaf. Fires would have given them away to the Aiel. He had fought the Aiel long before this war began, on the Shienaran marches, a matter of duty to friends. Aielmen were bad enough in daylight. Facing them in the night was as close to staking your life on the toss of a coin as made no difference. Of course, sometimes they found you without fires. Resting a gauntleted hand on his sword in its scabbard, he pulled his cloak back around himself and continued his round of the sentries through calf-deep snow. It was an ancient sword, made with the One Power before the Breaking of the World, during the War of the Shadow, when the Dark One had touched the world for a time. Only legends remained of that Age, except perhaps for what the Aes Sedai might know, yet the blade was ha

In [None]:
#how fast is this?
ft_tokenized = nltk.word_tokenize(without_line_breaks)


In [13]:
#Okay here need to put into a frequency dict of word:freq
#!pip install collections

import collections

freq_counts = collections.Counter(ft)

freq_counts

Counter({'[': 54,
         "'": 417523,
         'F': 15265,
         'o': 1397688,
         'r': 1074700,
         ' ': 4548419,
         'H': 43513,
         'a': 1560195,
         'i': 1120077,
         'e': 2454623,
         't': 1575202,
         '\\': 212221,
         'n': 1524444,
         ',': 507759,
         'N': 22646,
         'w': 472093,
         'd': 937617,
         'f': 363104,
         'v': 161845,
         'C': 17658,
         's': 1106325,
         '1': 958,
         'T': 90842,
         'h': 1253994,
         'k': 183160,
         '2': 605,
         'A': 64641,
         'W': 34592,
         'u': 502198,
         'l': 807230,
         '3': 480,
         'P': 18699,
         'c': 363069,
         '4': 415,
         'L': 24775,
         'g': 421644,
         '5': 306,
         'm': 420503,
         '6': 203,
         'S': 65828,
         'p': 250993,
         '7': 259,
         'I': 69639,
         '8': 197,
         'y': 350189,
         '9': 341,
         'B': 26803

In [None]:
def get_happiness_map(one_grams: list):
    """Build happiness dictionary for easy lookup.
    Args:
        one_grams (list): list of one grams for text
    """
    # from  https://hedonometer.org/words/labMT-en-v1/
    happiness_df = pd.read_csv('Hedonometer.csv')
    unique = set(one_grams)
    print(f'Generating happiness map for {len(unique)} words...')
    scores = {}
    not_found = []
    for word in unique:
        try:
            score = happiness_df[happiness_df['Word'] == word]['Happiness Score']
            scores[word] = float(score)
        except:
            not_found.append(word)
            continue
    print(f'Generated!')
    return scores, not_found

In [None]:
def get_happiness(window: list, happiness_map: dict, not_found: list) -> float:
    """Get the happiness score for a window (list of words).
    Args:
        window (list): list of words assumed to be in order of appearance
        happiness_map (dict): how we score each word, direct mapping
        not_found (list): list of words not found in our map
    Returns:
        float: happines score for window
    """
    # make sure all words being checked are in our map
    clean_window = [word for word in window if word not in not_found]
    if len(clean_window) == 0:
        word_scores = 0.0
    else:
        word_scores = np.nanmean(np.array([happiness_map[word] for word in clean_window]))
    return word_scores

In [None]:
def get_average_happiness(one_grams: list, happiness_map: dict, not_found: list):
    happiness_vals = []
    print('Calculating Average Happiness...')
    for gram in one_grams:
        if gram not in not_found:
            score = happiness_map[gram]
            happiness_vals.append(score)
    print('Done!')
    return np.nanmean(happiness_vals)

In [None]:
def filter_one_grams(one_grams: list, tolerance: float, happiness_map: dict, not_found: list, average_happiness: float):
    filtered = []
    lower_bound = average_happiness - tolerance
    upper_bound = average_happiness + tolerance
    for gram in one_grams:
        if gram in not_found:
            continue
        if lower_bound < happiness_map[gram] < upper_bound:
            continue
        else:
            filtered.append(gram)
    return filtered

In [None]:
def _happiness_worker(window_bounds: tuple, size: int):
        """Anonymous function to calculate score for a window."""
        start = window_bounds[0]
        stop = window_bounds[1]
        window_grams = one_grams[start:stop]
        return get_happiness(window_grams, happiness_map, not_found)

In [None]:
sizes = [1, 1.5, 2, 2.5, 3, 3.5, 4]
blocked = [0.5, 1, 1.5, 2, 2.5, 3, 3.5]
window_sizes = [int(10**val) for val in sizes]
happiness_map, not_found = get_happiness_map(one_grams)
average_happiness = get_average_happiness(one_grams, happiness_map, not_found)
to_plot = pd.read_csv('b_data.csv')
for i, size in enumerate(window_sizes):
    if size in to_plot.columns:
        continue
    group = np.zeros(len(one_grams),)  # initialize so function below can access it
    args = [((j, j + size), size) for j in range(len(one_grams))]
    for j, arg in enumerate(args):
        print(f'{j}/{len(args)} for step size {size}')
        group[j] = _happiness_worker(*arg)
    to_plot[size] = group
    to_plot.to_csv('b_data.csv')
to_plot = pd.read_csv('c_data.csv')
for i, size in enumerate(window_sizes):
    if size in to_plot.columns:
        continue
    tolerance = blocked[i]
    filtered_grams = filter_one_grams(one_grams, tolerance, happiness_map, not_found, average_happiness)
    group = np.zeros(len(filtered_grams),)  # initialize so function below can access it
    args = [((j, j + size), size) for j in range(len(filtered_grams))]
    for j, arg in enumerate(args):
        print(f'{j}/{len(args)} for step size {size}')
        group[j] = _happiness_worker(*arg)
    to_plot[size] = group
    to_plot.to_csv('c_data.csv')

Citations:

Dodds, Peter Sheridan, Kameron Decker Harris, Isabel M. Kloumann, Catherine A. Bliss, and Christopher M. Danforth. “Temporal patterns of happiness and information in a global social network: Hedonometrics and Twitter.” PLoS ONE 6, no. 12 (2011).