# CLS Sentiment analysis horror books

In [None]:
# Mount Google Drive into the Colab runtime so the shared project folder becomes reachable.
from google.colab import drive
drive.mount('/content/drive')

# Switch the working directory to the shared-task folder; the relative file names
# opened further down are resolved against it.
%cd /content/drive/MyDrive/CLS_Shared_task

In [None]:
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter, defaultdict
from nltk.tokenize import sent_tokenize
import nltk
nltk.download('punkt_tab')
from tqdm import tqdm

## Loading the data

Read the json file:

In [None]:
#the json file is a list of dictionaries. Each dictionary stands for one story.
# The encoding is stated explicitly so that reading the corpus does not depend on
# the platform's default text encoding.
with open('horror_tales_data_complete.json', encoding='utf-8') as f:
    corpus = json.load(f)

print(corpus[0])

In [None]:
# Build the story table; 'year' is coerced to a number and unparseable values become NaN.
horror = pd.DataFrame(corpus).assign(
    year=lambda frame: pd.to_numeric(frame['year'], errors='coerce')
)

horror.head()

Extract the titles, lengths and the unpreprocessed, raw text of the stories from the dataframe as lists:

In [None]:
# Take the columns out of the dataframe as plain lists. This runs before the
# preprocessing cell, so `stories` holds the unprocessed texts.
stories = horror['text'].tolist()   #list of stories (raw text)
titles = horror['title'].tolist()   #list of matching titles
length = horror['length'].tolist()  #list of lengths

print(titles)

In [None]:
# check if the number of titles and stories match (titles first, then stories)
for collection in (titles, stories):
    print(len(collection))

Inspecting the dataset:

In [None]:
#inspecting the dataset to make sure everything is correct
print(horror.shape[0])           # number of stories (rows)
horror['length'].value_counts()  # number of stories per length category

In [None]:
# Count every (author1, author2) combination, keep the 20 most frequent and
# reverse them so that the largest bar is drawn at the top.
author_pair_counts = horror[['author1', 'author2']].value_counts()
author_pair_counts.iloc[:20].iloc[::-1].plot.barh(figsize=(8, 12));
#the 20 most frequent authors in the dataset
#we can see that 17 "sets of" authors wrote more than one text of our dataset;
#Bishop and Lovecraft collaborated on 2 texts, and the rest 16 are single authors.

## Preprocessing

In [None]:
# Normalise the texts in three passes (the 'text' column is overwritten):
# 1. lowercase and trim leading/trailing whitespace
lowered = horror['text'].str.lower().str.strip()
# 2. collapse every run of whitespace into a single space
collapsed = lowered.map(lambda t: ' '.join(t.split()))
# 3. keep only alphabetic and whitespace characters (drops punctuation and digits)
horror['text'] = collapsed.map(
    lambda t: ''.join(filter(lambda c: c.isalpha() or c.isspace(), t))
)
horror

In [None]:
#bag-of-words model: one Counter of token frequencies per story
horror['bow'] = [Counter(text.split()) for text in horror['text']]
horror.head()

## Analysis based on positive/negative sentiments

### NRC Lexicon and look-up dictionary (pos/neg)

In [None]:
# Read the tab-separated NRC lexicon (no header row) and name its three columns.
emo = pd.read_csv('NRC-Emotion-Lexicon-Wordlevel-v0.92.txt', sep='\t', header=None)
emo.columns = ['word', 'emotion', 'score']
# Keep only 'positive'/'negative' rows whose score is non-zero.
is_polarity = emo['emotion'].isin({'positive', 'negative'})
has_score = emo['score'] != 0
emo = emo[is_polarity & has_score]
emo.sample(5)   #random peek: words with their negative or positive sentiment

In [None]:
#inspect the words that are positive and negative at the same time: not many/frequent
word_counts = emo['word'].value_counts()
# Words that occur exactly twice in the filtered lexicon
words_with_two = word_counts[word_counts == 2].index
ambivalent_rows = emo[emo['word'].isin(words_with_two)]
for _, row in ambivalent_rows.iterrows():
    print(row)
count = len(ambivalent_rows)
print(count)

In [None]:
#note: a word may carry several sentiments (positive AND negative); the lookup keeps all of them
emotions = emo['emotion'].unique()
emo_lookup = defaultdict(list)

# word -> list of every sentiment the lexicon associates with it
for entry in emo.itertuples(index=False):
    emo_lookup[entry.word].append(entry.emotion)

In [None]:
#test
# .get() is used instead of [...] because indexing a defaultdict inserts every
# missing word as a new (empty) entry, which would silently grow the lookup.
for word in 'love hate adore teens'.split():
    print(word, ':', emo_lookup.get(word, []))

In [None]:
# Count, for every story, how many tokens are associated with each sentiment.
emo_rows = []

for story in tqdm(horror['bow']):
    story_emotions = Counter()
    for word, n_occurrences in story.items():
        # Not every word is in the lookup dictionary; .get() skips unknown words
        # without inserting them into the defaultdict.
        # The loop variable is called `emotion` (not `emo`) so that it does not
        # overwrite the `emo` lexicon DataFrame used by the cells above.
        for emotion in emo_lookup.get(word, ()):
            story_emotions[emotion] += n_occurrences  #add the word's count (see b-o-w) to that sentiment
    emo_rows.append([story_emotions[e] for e in emotions])  #one row of counts per story

emo_presence = pd.DataFrame(emo_rows, columns=emotions)
emo_presence.head(10)

### Analysing the corpus

In [None]:
# Corpus-wide total per sentiment, smallest total first
sentiment_totals = emo_presence.sum().sort_values()
sentiment_totals.plot.barh();
#surprisingly, positive words are more common than negative ones

Apply normalisation so that text length does not affect the results of our analysis:

In [None]:
# Token total per story (= row): the sum of all counts in its bag-of-words
horror['word_count'] = horror['bow'].map(lambda bow: sum(bow.values()))
horror.head()

In [None]:
#Create a separate dataframe for the normalised data
# Each sentiment count is divided by the story's word count (row-wise alignment on
# the index); undefined ratios (NaN) are replaced by 0. This single expression
# replaces a loop whose variable `emo` overwrote the `emo` lexicon DataFrame.
norm_emo_presence = emo_presence.div(horror['word_count'], axis=0).fillna(0)
norm_emo_presence

The top ten most emotional texts:

In [None]:
#Ten stories with the highest sum of normalised positive and negative values,
#i.e. the ten stories with the largest share of words associated with a positive or negative sentiment.
horror.loc[norm_emo_presence.sum(axis=1).sort_values(ascending=False).index[:10]]

Top ten least emotional texts:

In [None]:
#The ten texts with the smallest share of words associated with a positive or negative sentiment (= the least emotional texts)
horror.loc[norm_emo_presence.sum(axis=1).sort_values(ascending=True).index[:10]]

Top ten most positive texts:

In [None]:
#The ten most positive texts (largest normalised share of positive words first)
horror.loc[norm_emo_presence['positive'].sort_values(ascending=False).index[:10]]

Top ten most negative texts:

In [None]:
#The ten most negative texts (largest normalised share of negative words first)
horror.loc[norm_emo_presence['negative'].sort_values(ascending=False).index[:10]]
#Stories by Robert E. Howard appear notably often.

### Visualizing possible correlations with scatter plots

Initial observations in the top-10s:
- high emotional intensity/positive ~ earlier publication, longer texts
- low emotional intensity/negative ~ later publication, shorter texts

In [None]:
# Emotional intensity = normalised positive share + normalised negative share
horror['emotional_intensity'] = norm_emo_presence['positive'] + norm_emo_presence['negative']

# make sure 'year' is numeric before it is used as the x coordinate
horror['year'] = pd.to_numeric(horror['year'], errors='coerce')

fig, ax = plt.subplots(figsize=(8, 6))

# one scatter series (and one colour) per length category
length_categories = horror['length'].unique()
colors = plt.cm.tab10(np.linspace(0, 1, len(length_categories)))
for cat, color in zip(length_categories, colors):
    subset = horror.loc[horror['length'] == cat]
    ax.scatter(subset['year'], subset['emotional_intensity'],
               color=color, alpha=0.7, label=cat)

ax.set_xticks([1800, 1850, 1900, 1950])
ax.set_title('Emotional Intensity ~ Year of Publication ~ Length')
ax.set_xlabel('Year')
ax.set_ylabel('Emotional Intensity')
ax.legend(title='Length')

ax.grid(True, linestyle='--', alpha=0.4)
fig.tight_layout()
plt.show()

Observations:
- The emotional intensity across the genre seems to be **declining** over the time period in question.
- The plot also shows that most short stories and novellas in the dataset fall into the period 1900-1950, meaning that if we observe a trend in this group of texts, it might be due to length, time, or even author.
- On the other hand, novels are relatively evenly distributed across the timespan.

### Clustering
We use Ward's linkage to cluster horror stories with similar positive/negative sentiment patterns. The Euclidean distance is used to calculate the distance between the stories. The smaller the distance between stories is, the more similar they are.
 - Generally, shorter texts / longer texts cluster together respectively
 - The earliest works are rather close to each other
 - Works from the same author, e.g., Lovecraft/Howard, are close to each other (for the 17 authors with more than one work)

In [None]:
# colored according to length

import scipy.spatial.distance as scidist
import scipy.cluster.hierarchy as hierarchy
import matplotlib.pyplot as plt
from matplotlib.patches import Patch
# Visualize
def plot_tree_noveltype(linkage_object, labels, lengthlist, figsize=(10, 40), ax=None):
    """Draw a left-oriented dendrogram whose leaf labels are coloured by length category.

    Parameters
    ----------
    linkage_object : linkage matrix, as returned by ``hierarchy.linkage``.
    labels : list of str, one label per clustered story, in the original row order.
    lengthlist : sequence with the length category of each story, aligned with ``labels``.
    figsize : size of the figure that is created when ``ax`` is None.
    ax : optional matplotlib Axes to draw on.

    Categories other than the four listed below are drawn in black.
    """
    length_colours = {
        'novella': 'red',
        'short story': 'blue',
        'novel': 'green',
        'novelette': 'orange',
    }
    if ax is None:
        fig, ax = plt.subplots(figsize=figsize)
    with plt.rc_context({'lines.linewidth': 1.0}):
        dendrogram = hierarchy.dendrogram(
            linkage_object, labels=labels, ax=ax,
            link_color_func=lambda c: 'black',
            orientation='left',
            leaf_font_size=10)
    # Remove ticks and spines
    ax.xaxis.set_ticks_position('none')
    ax.yaxis.set_ticks_position('none')
    for s in ax.spines.values():
        s.set_visible(False)
    # Colour each leaf label by length. dendrogram['leaves'] holds the original row
    # index of every leaf in plotting order, so the category is found by position;
    # looking the label text up with labels.index() would pick the first match
    # and miscolour stories that share a label (e.g. identical titles).
    categories = list(lengthlist)
    for tick, leaf in zip(ax.get_yticklabels(), dendrogram['leaves']):
        tick.set_color(length_colours.get(categories[leaf], 'black'))
    #add a legend (on `ax`, so it also lands on a caller-supplied Axes)
    legend_length_types = [
        Patch(color=colour, label=category)
        for category, colour in length_colours.items()
    ]
    ax.legend(handles=legend_length_types, title="Length Type", loc='upper left')


# 1. Calculate pairwise distances (based on the distribution of the positive/negative emotions for each story)
# pdist returns the condensed (flattened upper-triangle) distance matrix of the rows of norm_emo_presence.
dm = scidist.pdist(norm_emo_presence, 'euclidean')

# 2. Establish branch structure (linkage_object, linking the most similar texts)
# Ward's method is applied to the condensed distances computed above.
linkage_object = hierarchy.linkage(dm, method='ward')

# below is a version which also displays the authors:
def format_authors(row):
    """Return the author string for one story row.

    Gives "author1 & author2" when ``row['author2']`` is present, and just
    ``row['author1']`` when the second author is missing (NaN/None) or empty.
    """
    second_author = row['author2']
    if pd.isna(second_author) or second_author == '':
        return row['author1']
    return f"{row['author1']} & {second_author}"

# Author string per story (row-wise apply of format_authors)
horror['all_authors'] = horror.apply(format_authors, axis=1)
# Leaf labels of the form "title (authors)"; titles and dataframe rows are paired in order
combined_labels = [f"{title} ({authors})" for title, authors in zip(titles, horror['all_authors'])]

plot_tree_noveltype(linkage_object, combined_labels, length) # titles instead of combined_labels for the version without authors

In [None]:
import seaborn as sns
# Clustered heatmap of the normalised sentiment shares; rows follow the Ward linkage computed above
dendro_heatmap = sns.clustermap(
    norm_emo_presence,
    row_linkage=linkage_object,
    col_cluster=False, #no dendrogram for the columns (at the top of heatmap)
    yticklabels=combined_labels,
    cmap='coolwarm',
    figsize=(15, 27)
)

# length category -> label colour (any other category is drawn in black)
length_palette = {
    'novella': 'red',
    'short story': 'blue',
    'novel': 'green',
    'novelette': 'orange',
}

#colour every row label by the length category of its story
yticklabels = dendro_heatmap.ax_heatmap.get_yticklabels()
for label in yticklabels:
    story_title = label.get_text()
    idx_title = combined_labels.index(story_title) #position of the story in our label list
    label.set_color(length_palette.get(length[idx_title], 'black'))

#add a legend
legend_length_types = [
    Patch(color=colour, label=category)
    for category, colour in length_palette.items()
]
dendro_heatmap.ax_heatmap.legend(
    handles=legend_length_types,
    title="Length Type",
    loc='upper right',
    bbox_to_anchor=(2, 1) #adjust this to manually move the legend
)

#flip heatmap and row dendrogram vertically so the bottom-up order matches the dendrogram drawn with orientation='left'
dendro_heatmap.ax_heatmap.invert_yaxis()
dendro_heatmap.ax_row_dendrogram.invert_yaxis()

dendro_heatmap.ax_cbar.set_position((0.9, 0, .03, .09)) #adjust the position of the colourbar
dendro_heatmap.ax_cbar.set_title('colourbar')

plt.show()

In [None]:
# according to year

def plot_tree_yeargroups(linkage_object, labels, yearlist, figsize=(10, 40), ax=None):
    """Draw a left-oriented dendrogram whose leaf labels are coloured by publication period.

    Parameters
    ----------
    linkage_object : linkage matrix, as returned by ``hierarchy.linkage``.
    labels : list of str, one label per clustered story, in the original row order.
    yearlist : sequence (list or pandas Series) with the publication year of each
        story, aligned with ``labels`` by position. Missing years (NaN) are allowed.
    figsize : size of the figure that is created when ``ax`` is None.
    ax : optional matplotlib Axes to draw on.
    """
    # (exclusive upper bound, colour, legend label), in chronological order
    periods = [
        (1800, 'purple', 'before 1800'),
        (1850, 'blue', '1800-1849'),
        (1900, 'green', '1850-1899'),
        (1950, 'orange', '1900-1949'),
        (2000, 'red', '1950-1999'),
    ]
    late_colour = 'black'     # year 2000 and later
    unknown_colour = 'grey'   # year is missing

    if ax is None:
        fig, ax = plt.subplots(figsize=figsize)
    with plt.rc_context({'lines.linewidth': 1.0}):
        dendrogram = hierarchy.dendrogram(
            linkage_object, labels=labels, ax=ax,
            link_color_func=lambda c: 'black',
            orientation='left',
            leaf_font_size=10)

    # Remove ticks and spines
    ax.xaxis.set_ticks_position('none')
    ax.yaxis.set_ticks_position('none')
    for s in ax.spines.values():
        s.set_visible(False)

    # Colour each leaf label by period. dendrogram['leaves'] holds the original row
    # index of every leaf in plotting order, so the year is found by position;
    # looking the label text up with labels.index() would miscolour duplicate labels.
    years = list(yearlist)
    has_unknown_year = False
    for tick, leaf in zip(ax.get_yticklabels(), dendrogram['leaves']):
        year = years[leaf]
        if pd.isna(year):
            # int(NaN) raises, so stories without a usable year get their own colour
            has_unknown_year = True
            tick.set_color(unknown_colour)
            continue
        year = int(year)
        tick.set_color(next((colour for bound, colour, _ in periods if year < bound), late_colour))

    #add a legend (on `ax`, so it also lands on a caller-supplied Axes)
    legend_years = [Patch(color=colour, label=text) for _, colour, text in periods]
    legend_years.append(Patch(color=late_colour, label='after 1999'))
    if has_unknown_year:
        legend_years.append(Patch(color=unknown_colour, label='year unknown'))
    ax.legend(handles=legend_years, title="Period", loc='upper left')

# Same distances and Ward linkage as in the length-coloured dendrogram cell, recomputed here
dm = scidist.pdist(norm_emo_presence, 'euclidean')
linkage_object = hierarchy.linkage(dm, method='ward')

plot_tree_yeargroups(linkage_object, combined_labels, horror['year']) # used the year column from the horror dataframe directly, in which values were converted to numeric (when making the scatter plot)

In [None]:
import seaborn as sns
#plot it with a heatmap
dendro_heatmap = sns.clustermap(
    norm_emo_presence,
    row_linkage=linkage_object,
    col_cluster=False, #no dendrogram for the columns (at the top of heatmap)
    yticklabels=combined_labels,
    cmap='coolwarm',
    figsize=(15, 27)
)

# Publication periods as (exclusive upper bound, colour, legend label), in chronological order
year_periods = [
    (1800, 'purple', 'before 1800'),
    (1850, 'blue', '1800-1849'),
    (1900, 'green', '1850-1899'),
    (1950, 'orange', '1900-1949'),
    (2000, 'red', '1950-1999'),
]
late_colour = 'black'     # year 2000 and later
unknown_colour = 'grey'   # year is missing (NaN after the numeric coercion)

#add color based on year ranges
has_unknown_year = False
yticklabels = dendro_heatmap.ax_heatmap.get_yticklabels()
for label in yticklabels:
    story_title = label.get_text()
    idx_title = combined_labels.index(story_title)
    year = horror['year'].iloc[idx_title]  # positional lookup, aligned with combined_labels
    if pd.isna(year):
        # int(NaN) raises, so stories without a usable year get their own colour
        has_unknown_year = True
        label.set_color(unknown_colour)
        continue
    year = int(year)
    label.set_color(next((colour for bound, colour, _ in year_periods if year < bound), late_colour))

#add a legend
legend_years = [Patch(color=colour, label=text) for _, colour, text in year_periods]
legend_years.append(Patch(color=late_colour, label='after 1999'))
if has_unknown_year:
    legend_years.append(Patch(color=unknown_colour, label='year unknown'))

dendro_heatmap.ax_heatmap.legend(
    handles=legend_years,
    title="Period",  # this legend shows publication periods, not length types
    loc='upper right',
    bbox_to_anchor=(2, 1) #adjust this to manually move the legend
)

#need to flip the heatmap and the dendrogram vertically to have the same bottom-up order as the previous code (since we specified 'orientation=left' there)
dendro_heatmap.ax_heatmap.invert_yaxis()
dendro_heatmap.ax_row_dendrogram.invert_yaxis()

dendro_heatmap.ax_cbar.set_position((0.9, 0, .03, .09)) #adjust the position of the colourbar
dendro_heatmap.ax_cbar.set_title('colourbar')

plt.show()

## Diachronic sentiment analysis (emotional arcs of the stories)
based on positive/negative emotions

### sentence-level:

Plotting emotional arcs at sentence level requires the original sentences (punctuation) to be preserved.

In [None]:
# Reread the horror dataframe (the 'text' column of `horror` was overwritten by the
# preprocessing step, and sentence splitting needs the original punctuation):
# The encoding is stated explicitly so the read does not depend on the platform default.
with open('horror_tales_data_complete.json', encoding='utf-8') as f:
    corpus = json.load(f)

#Create dataframe
horror_raw = pd.DataFrame(corpus)

# convert year to numeric
horror_raw['year'] = pd.to_numeric(horror_raw['year'], errors='coerce')

# Column-wise extraction replaces the row-by-row iterrows() loop.
stories_raw = horror_raw['text'].tolist()  #list of stories (raw text)
titles = horror_raw['title'].tolist()      #list of matching titles
length = horror_raw['length'].tolist()     #list of lengths

In [None]:
# NRC Lexicon and look-up dictionary
# Reload the lexicon and keep only 'positive'/'negative' rows with a non-zero score.
emo = pd.read_csv('NRC-Emotion-Lexicon-Wordlevel-v0.92.txt', sep='\t', header=None)
emo.columns = ['word', 'emotion', 'score']
is_polarity = emo['emotion'].isin({'positive', 'negative'})
has_score = emo['score'] != 0
emo = emo[is_polarity & has_score]
emo.sample(5)   #not the last expression of the cell, so this sample is not displayed

#This look-up dictionary only maps a single sentiment to words
#(for a word listed with both sentiments, the later lexicon row overwrites the earlier one)
word2sent = {word: sentiment for word, sentiment in zip(emo['word'], emo['emotion'])}

In [None]:
# function for DCT
from scipy.fftpack import dct
def get_dct_transform(raw_values, low_pass_size=5, x_reverse_len=100, scale_vals=False, scale_range=False):
    """Smooth a value sequence with a low-pass filter in the DCT domain.

    The input is transformed with an orthonormal DCT, only the first
    ``low_pass_size`` coefficients are kept, they are zero-padded to
    ``x_reverse_len`` and transformed back (type-3 DCT), so the result always
    has ``x_reverse_len`` points.

    Parameters
    ----------
    raw_values : list or numpy.ndarray of numbers.
    low_pass_size : number of leading DCT coefficients to keep.
    x_reverse_len : length of the returned array.
    scale_vals : if True, return the result as z-scores (mean 0, std 1).
    scale_range : if True, return the result rescaled to the range [0, 1].

    Raises
    ------
    ValueError
        If ``raw_values`` is not a list/ndarray, if ``low_pass_size`` exceeds the
        input length, or if both scaling flags are set.
    """
    if not isinstance(raw_values, (list, np.ndarray)):
        raise ValueError("Input must be a numeric list or numpy array")
    signal = np.array(raw_values)
    if len(signal) < low_pass_size:
        raise ValueError("low_pass_size must be less than or equal to the length of raw_values input vector")

    # forward transform, truncated to the low-frequency coefficients
    low_freq = dct(signal, norm='ortho')[:low_pass_size]
    zero_tail = np.zeros(x_reverse_len - low_pass_size)
    # inverse transform of the zero-padded coefficients
    smoothed = dct(np.concatenate([low_freq, zero_tail]), type=3, norm='ortho')

    if scale_vals and scale_range:
        raise ValueError("ERROR: scale_vals and scale_range cannot both be true.")
    if scale_vals:
        centred = smoothed - np.mean(smoothed)
        return centred / np.std(smoothed)
    if scale_range:
        lowest = np.min(smoothed)
        return (smoothed - lowest) / (np.max(smoothed) - lowest)

    return smoothed

In [None]:
def sentiment_plots_sentence_level(texts, labels, dct=False):
    """Plot the sentence-level emotional arc of one or more raw texts on a shared axis.

    Each text is split into sentences; a sentence's valence is the number of its
    distinct positive words minus the number of its distinct negative words
    (looked up in ``word2sent``).

    Parameters
    ----------
    texts : list of str, raw texts with punctuation intact (needed for sentence splitting).
    labels : list of str, one legend label per text.
    dct : if False, plot a moving average of the valence (window = 10% of the
        sentences) against the relative position in the text; if True, plot the
        smoothed curve returned by ``get_dct_transform``.
    """
    polarities = ['positive', 'negative']
    fig, ax = plt.subplots(figsize=(20, 5))

    for text, label in zip(texts, labels):
        df2 = pd.DataFrame(sent_tokenize(text), columns=['sentence']) #tokenizing at sentence level
        if df2.empty:
            continue  # a text without sentences has nothing to plot

        #Preprocessing the sentences
        df2['clean_sentence'] = df2['sentence'].str.lower().str.strip()
        df2['clean_sentence'] = [''.join(c for c in t if c.isalpha() or c.isspace()) for t in df2['clean_sentence']]
        df2['vocabulary'] = df2['clean_sentence'].str.split().apply(set) #unique words of each sentence

        #Count the positive and negative words in the vocabulary of each sentence
        sentiment_scores = []
        for bow in df2['vocabulary']:
            sent_cnts = Counter(word2sent[word] for word in bow if word in word2sent)
            sentiment_scores.append([sent_cnts[e] for e in polarities])

        # axis=1 places the scores NEXT TO the sentences. The default (axis=0) stacks
        # them underneath, which doubles the rows, gives the first half a valence of 0
        # and squeezes the real curve into half of the x-range.
        scores = pd.DataFrame(sentiment_scores, columns=polarities, index=df2.index)
        df2 = pd.concat([df2, scores], axis=1)
        df2['valence'] = df2['positive'] - df2['negative'] #valence score of each sentence

        if not dct:
            window = max(1, len(df2) // 10)  # at least 1, also for texts with fewer than 10 sentences
            df2['moving_average'] = df2['valence'].rolling(window=window).mean()
            df2['position'] = df2.index / len(df2)
            df2.plot('position', 'moving_average', ax=ax, label=label)
        else:   #If dct=True then the DCT transformation will be applied.
            transformed_values = get_dct_transform(df2['valence'].values, x_reverse_len=100)
            ax.plot(transformed_values, label=label)

    ax.axhline(0, ls='--', c='lightgrey')
    ax.set_xlabel('Narrative Time', fontsize=12)
    ax.set_ylabel('Emotional Valence', fontsize=12)
    fig.tight_layout()
    ax.legend()

sentiment plot for one work:

In [None]:
# The function expects lists, so the single raw story and its title are each wrapped in a one-element list.
sentiment_plots_sentence_level([stories_raw[2]], [titles[2]], dct=False)

Sentiment plots per 50-year period (shown here for the works published before 1800):

In [None]:
#filter by 50-year timespans: here, every work published before 1800
is_before_1800 = horror_raw["year"] < 1800
horror_bf1800 = horror_raw.loc[is_before_1800]

bf1800_stories = list(horror_bf1800["text"])
bf1800_titles = list(horror_bf1800["title"])

sentiment_plots_sentence_level(bf1800_stories, bf1800_titles, dct=False)

Sentiment plots based on length type:

In [None]:
# horror_novels = horror[horror["length"] == "novel"]
# novels_stories = horror_novels["text"].tolist()
# novels_titles = horror_novels["title"].tolist()
# sentiment_plots_sentence_level(novels_stories, novels_titles, dct=False)

In [None]:
# plots with all works from one length category/time period are likely to be very cluttered.
# we can specify both the length and the year of publication.

is_novel = horror_raw['length'] == 'novel'
after_1750 = horror_raw['year'].gt(1750)
before_1800 = horror_raw['year'].lt(1800)
df_to_plot = horror_raw[is_novel & after_1750 & before_1800]

if len(df_to_plot) == 0:
    print("No stories found")
else:
    sentiment_plots_sentence_level(list(df_to_plot['text']), list(df_to_plot['title']), dct=False)

### chunk-level:
It is also possible to try to split the texts into a fixed number of chunks for all texts (instead of sentences/paragraphs). This way, the plots of short/long texts won't seem to differ too much in granularity and will become more comparable, and the shape of the curve (roughly) remains. However, this approach could also be problematic because chunking may split sentences or narrative sequences, and for short/long texts, 1 chunk = different length/"unit"/narrative function, etc.

The function below works on already preprocessed texts.

In [None]:
def sentiment_plots(texts, labels, dct=False, n_chunks=100):
    """Plot the chunk-level emotional arc of one or more preprocessed texts on a shared axis.

    Every text is cut into ``n_chunks`` word chunks of (almost) equal size; a chunk's
    valence is the number of its distinct positive words minus the number of its
    distinct negative words (looked up in ``word2sent``).

    Parameters
    ----------
    texts : list of str, already preprocessed texts (lowercase, no punctuation);
        tokens are looked up in the lexicon exactly as they appear.
    labels : list of str, one legend label per text.
    dct : if False, plot a moving average of the valence (window = 10% of the
        chunks) against the relative position in the text; if True, plot the
        smoothed curve returned by ``get_dct_transform``.
    n_chunks : number of chunks per text (default 100, the previously fixed value).
    """
    polarities = ['positive', 'negative']
    fig, ax = plt.subplots(figsize=(20, 5))

    for text, label in zip(texts, labels):
        # Split the text into n_chunks chunks by word count; the first `remainder`
        # chunks get one extra word, and chunks stay empty for very short texts.
        words = text.split()
        base_size, remainder = divmod(len(words), n_chunks)
        chunks = []
        start_idx = 0
        for i in range(n_chunks):
            end_idx = start_idx + base_size + (1 if i < remainder else 0)
            chunks.append(' '.join(words[start_idx:end_idx]))
            start_idx = end_idx

        df = pd.DataFrame(chunks, columns=['chunk'])
        df['vocabulary'] = df['chunk'].str.split().apply(set)

        sentiment_scores = []
        for bow in df['vocabulary']:
            sent_cnts = Counter(word2sent[word] for word in bow if word in word2sent)
            sentiment_scores.append([sent_cnts[e] for e in polarities])

        # axis=1 places the scores NEXT TO the chunks. The default (axis=0) stacks
        # them underneath, which doubles the rows and fills the first half with zeros.
        scores = pd.DataFrame(sentiment_scores, columns=polarities, index=df.index)
        df = pd.concat([df, scores], axis=1)
        df['valence'] = df['positive'] - df['negative']

        if not dct:
            window = max(1, len(df) // 10)  # at least 1, also for n_chunks < 10
            df['moving_average'] = df['valence'].rolling(window=window).mean()
            df['position'] = df.index / len(df)
            # Only the leading NaNs of the rolling mean are dropped. Averages that are
            # exactly 0 are real data points and are kept.
            df_plot = df.dropna(subset=['moving_average'])
            df_plot.plot('position', 'moving_average', ax=ax, label=label)
        else:
            transformed_values = get_dct_transform(df['valence'].values, x_reverse_len=100)
            ax.plot(transformed_values, label=label)

    ax.axhline(0, ls='--', c='lightgrey')
    ax.set_xlabel('Narrative Time', fontsize=12)
    ax.set_ylabel('Emotional Valence', fontsize=12)
    fig.tight_layout()
    ax.legend()


In [None]:
# the function takes lists of texts and titles as input: for one text, "create" a list
# sentiment_plots expects preprocessed text, so the text is taken from horror['text']
# (lowercased, punctuation removed) rather than from the raw `stories` list, whose
# capitalised or punctuated tokens would not match the lexicon.
sentiment_plots([horror['text'].iloc[2]], [titles[2]], dct=False)

In [None]:
# for multiple texts, directly use (slices of) the lists:
# the texts come from horror['text'] (preprocessed) rather than from the raw `stories`
# list, because sentiment_plots looks tokens up exactly as they appear in the text.
sentiment_plots(horror['text'].iloc[0:3].tolist(), titles[0:3], dct=False)

## Analysis based on 8 emotions

### NRC Lexicon and look-up dictionary (8 emotions)

In [None]:
# Reload the lexicon, this time dropping the 'positive'/'negative' rows so that
# only the remaining emotion categories with a non-zero score are kept.
emo8 = pd.read_csv('NRC-Emotion-Lexicon-Wordlevel-v0.92.txt', sep='\t', header=None)
emo8.columns = ['word', 'emotion', 'score']
is_polarity = emo8['emotion'].isin({'positive', 'negative'})
emo8 = emo8[~is_polarity]
emo8 = emo8[emo8['score'] != 0]
emo8.sample(5)   #not the last expression of the cell, so this sample is not displayed
emo8.head()

In [None]:
# The distinct emotion categories left in the lexicon
emo8['emotion'].unique()

In [None]:
#note: a word may be associated with several emotions; the lookup keeps all of them
emotions8 = emo8['emotion'].unique()
emo8_lookup = defaultdict(list)

# word -> list of every emotion the lexicon associates with it
for entry in emo8.itertuples(index=False):
    emo8_lookup[entry.word].append(entry.emotion)

In [None]:
#test
# .get() is used instead of [...] because indexing a defaultdict inserts every
# missing word as a new (empty) entry, which would silently grow the lookup.
for word in 'love hate adore tenacious'.split():
    print(word, ':', emo8_lookup.get(word, []))

In [None]:
# Count, for every story, how many tokens are associated with each emotion.
emo8_presence = []

for story in tqdm(horror['bow']):
    story_emotions8 = Counter()
    for word, frequency in story.items():
        if word not in emo8_lookup:   #not every word is in our lookup dictionary
            continue
        for emotion in emo8_lookup[word]:
            story_emotions8[emotion] += frequency  #add the word's count (see b-o-w) to each of its emotions
    emo8_presence.append([story_emotions8[e] for e in emotions8]) #one row of emotion counts per text

emo8_presence = pd.DataFrame(emo8_presence, columns=emotions8)
emo8_presence.head(10)

### Analysis (distribution 8 emotions)
- Trust, fear, anticipation and sadness as the top 4 emotions: confirms findings in previous study on Lovecraft's works and shows that it may be a general feature of horror fiction.
- Distinctive personal style of an author: Robert Howard's works are remarkably associated with negative emotions, fear and sadness.

In [None]:
# Corpus-wide total per emotion, smallest total first
emotion_totals = emo8_presence.sum().sort_values()
emotion_totals.plot.barh();
#Trust, fear and anticipation are the most common in the entire corpus


In [None]:
#Create a separate dataframe for the normalised data:
#every emotion count divided by the story's word count, with undefined ratios set to 0
norm_emo8_presence = pd.DataFrame({
    emotion: (emo8_presence[emotion] / horror['word_count']).fillna(0)
    for emotion in emo8_presence
})
norm_emo8_presence

Top ten most emotional stories:

In [None]:
#The ten stories with the largest share of words associated with our eight emotions.
horror.loc[norm_emo8_presence.sum(axis=1).sort_values(ascending=False).index[:10]]

Top ten least emotional texts:

In [None]:
#The ten texts with the smallest share of words associated with the 8 emotions (= the least emotional texts)
horror.loc[norm_emo8_presence.sum(axis=1).sort_values(ascending=True).index[:10]]

Top ten texts associated with the emotion 'fear':

In [None]:
#The ten texts with the largest normalised share of 'fear' words
horror.loc[norm_emo8_presence['fear'].sort_values(ascending=False).index[:10]]
#Similarly to the most negative texts, the stories by Robert E. Howard are very common.

Top ten texts that consist of the most words associated with 'joy':

In [None]:
#The ten texts with the largest normalised share of 'joy' words
horror.loc[norm_emo8_presence['joy'].sort_values(ascending=False).index[:10]]

Top ten texts that consist of the most words associated with 'sadness':

In [None]:
#The ten texts with the largest normalised share of 'sadness' words
horror.loc[norm_emo8_presence['sadness'].sort_values(ascending=False).index[:10]]
#The stories by Robert E. Howard appear multiple times.

Top ten texts that consist of the most words associated with 'disgust':

In [None]:
#The ten texts with the largest normalised share of 'disgust' words
horror.loc[norm_emo8_presence['disgust'].sort_values(ascending=False).index[:10]]

Top ten texts that consist of the most words associated with 'anticipation':

In [None]:
#The ten texts with the largest normalised share of 'anticipation' words
horror.loc[norm_emo8_presence['anticipation'].sort_values(ascending=False).index[:10]]

Top ten texts that consist of the most words associated with 'trust':

In [None]:
#The ten texts with the largest normalised share of 'trust' words
horror.loc[norm_emo8_presence['trust'].sort_values(ascending=False).index[:10]]

### Clustering (based on distribution of 8 emotions)

In [None]:
# Spot check: show the length category stored for the second story.
# NOTE(review): looks like a leftover inspection cell; nothing below reads its output.
length[1]

In [None]:
import scipy.spatial.distance as scidist
import scipy.cluster.hierarchy as hierarchy
import matplotlib.pyplot as plt
# Visualize
def plot_tree_noveltype(linkage_object, labels, lengthlist, figsize=(10, 40), ax=None):
    """Draw a left-oriented dendrogram whose leaf labels are coloured by length category.

    Parameters
    ----------
    linkage_object : linkage matrix, as returned by ``hierarchy.linkage``.
    labels : list of str, one label per clustered story, in the original row order.
    lengthlist : sequence with the length category of each story, aligned with ``labels``.
    figsize : size of the figure that is created when ``ax`` is None.
    ax : optional matplotlib Axes to draw on.

    Categories other than the four listed below are drawn in black.
    """
    length_colours = {
        'novella': 'red',
        'short story': 'blue',
        'novel': 'green',
        'novelette': 'orange',
    }
    if ax is None:
        fig, ax = plt.subplots(figsize=figsize)
    with plt.rc_context({'lines.linewidth': 1.0}):
        dendrogram = hierarchy.dendrogram(
            linkage_object, labels=labels, ax=ax,
            link_color_func=lambda c: 'black',
            orientation='left',
            leaf_font_size=10)
    # Remove ticks and spines
    ax.xaxis.set_ticks_position('none')
    ax.yaxis.set_ticks_position('none')
    for s in ax.spines.values():
        s.set_visible(False)
    # Colour each leaf label by length. dendrogram['leaves'] holds the original row
    # index of every leaf in plotting order, so the category is found by position;
    # looking the label text up with labels.index() would pick the first match
    # and miscolour stories that share a label (e.g. identical titles).
    categories = list(lengthlist)
    for tick, leaf in zip(ax.get_yticklabels(), dendrogram['leaves']):
        tick.set_color(length_colours.get(categories[leaf], 'black'))
    #add a legend (on `ax`, so it also lands on a caller-supplied Axes)
    legend_length_types = [
        Patch(color=colour, label=category)
        for category, colour in length_colours.items()
    ]
    ax.legend(handles=legend_length_types, title="Length Type", loc='upper left')


# 1. Calculate pairwise distances (based on the distribution of the 8 emotions for each story)
# pdist returns the condensed (flattened upper-triangle) distance matrix of the rows of norm_emo8_presence.
dm8 = scidist.pdist(norm_emo8_presence, 'euclidean')

# 2. Establish branch structure (linkage_object, linking the most similar texts)
# Note: this overwrites the linkage_object computed for the positive/negative analysis.
linkage_object = hierarchy.linkage(dm8, method='ward')

# Leaves are labelled with the bare titles here (no author suffix).
plot_tree_noveltype(linkage_object, titles, length)

In [None]:
# Clustered heatmap of the normalised emotion shares; rows follow the Ward linkage computed above
dendro_heatmap = sns.clustermap(
    norm_emo8_presence,
    row_linkage=linkage_object,
    col_cluster=True, #Change this to False to remove dendrogram for the emotions (at the top of heatmap)
    yticklabels=combined_labels,
    cmap='coolwarm',
    figsize=(15, 27)
)

# length category -> label colour (any other category is drawn in black)
length_palette = {
    'novella': 'red',
    'short story': 'blue',
    'novel': 'green',
    'novelette': 'orange',
}

#colour every row label by the length category of its story
yticklabels = dendro_heatmap.ax_heatmap.get_yticklabels()
for label in yticklabels:
    story_title = label.get_text()
    idx_title = combined_labels.index(story_title) #position of the story in our label list
    label.set_color(length_palette.get(length[idx_title], 'black'))

#add a legend
legend_length_types = [
    Patch(color=colour, label=category)
    for category, colour in length_palette.items()
]
dendro_heatmap.ax_heatmap.legend(
    handles=legend_length_types,
    title="Length Type",
    loc='upper right',
    bbox_to_anchor=(2, 1.) #adjust this to manually move the legend
)

#flip heatmap and row dendrogram vertically so the order matches the dendrogram drawn with orientation='left'
dendro_heatmap.ax_heatmap.invert_yaxis()
dendro_heatmap.ax_row_dendrogram.invert_yaxis()

dendro_heatmap.ax_cbar.set_position((0.9, 0, .03, .09)) #adjust the position of the colourbar
dendro_heatmap.ax_cbar.set_title('colourbar')

plt.show()

In [None]:
# colored according to year

# Same distances and Ward linkage as in the length-coloured cell above, recomputed here
dm8 = scidist.pdist(norm_emo8_presence, 'euclidean')
linkage_object = hierarchy.linkage(dm8, method='ward')

# Leaves are labelled with the bare titles; the years come from the horror dataframe
plot_tree_yeargroups(linkage_object, titles, horror['year'])

In [None]:
import seaborn as sns
# Clustered heatmap: rows are ordered by the precomputed linkage, columns are clustered by seaborn
dendro_heatmap = sns.clustermap(
    norm_emo8_presence,
    row_linkage=linkage_object,
    col_cluster=True, #Change this to False to remove dendrogram for the emotions (at the top of heatmap)
    yticklabels=combined_labels,
    cmap='coolwarm',
    figsize=(15, 27)
)

# Year ranges as (exclusive upper bound, colour, legend label), checked in order;
# a year that is not below any bound gets the "after 1999" colour.
year_ranges = [
    (1800, 'purple', 'before 1800'),
    (1850, 'blue', '1800-1849'),
    (1900, 'green', '1850-1899'),
    (1950, 'orange', '1900-1949'),
    (2000, 'red', '1950-1999'),
]
latest_colour, latest_label = 'black', 'after 1999'
unknown_colour = 'gray'  # used for stories whose year is missing

#add color based on year ranges
has_unknown_year = False
yticklabels = dendro_heatmap.ax_heatmap.get_yticklabels()
for label in yticklabels:
    story_title = label.get_text()
    idx_title = combined_labels.index(story_title)
    # positional look-up: idx_title is a list position, not a DataFrame index label
    year = horror['year'].iloc[idx_title]

    if pd.isna(year):
        # 'year' is parsed with errors='coerce', so unparseable values are NaN;
        # int(NaN) would raise, so such stories get their own colour instead
        has_unknown_year = True
        label.set_color(unknown_colour)
        continue

    year = int(year)  # Convert to integer
    label.set_color(
        next((colour for upper, colour, _text in year_ranges if year < upper), latest_colour)
    )

#add a legend (the "unknown year" entry only appears if it was actually used)
legend_years = [Patch(color=colour, label=text) for _upper, colour, text in year_ranges]
legend_years.append(Patch(color=latest_colour, label=latest_label))
if has_unknown_year:
    legend_years.append(Patch(color=unknown_colour, label='unknown year'))

dendro_heatmap.ax_heatmap.legend(
    handles=legend_years,
    title="Year Range",
    loc='upper right',
    bbox_to_anchor=(2, 1) #adjust this to manually move the legend
)

# Invert the y-axis of the heatmap and of the row dendrogram so the rows appear in the
# same bottom-up order as the dendrogram plotted earlier (which used orientation='left')
dendro_heatmap.ax_heatmap.invert_yaxis()
dendro_heatmap.ax_row_dendrogram.invert_yaxis()

dendro_heatmap.ax_cbar.set_position((0.9, 0, .03, .09)) #adjust the position of the colourbar
dendro_heatmap.ax_cbar.set_title('colourbar')

plt.show()

### Emotional arcs (based on the top 4 emotions)

In [None]:
# The four emotions tracked in the emotional arcs
top4emo = ['trust', 'fear', 'anticipation', 'sadness']

# Word-level lexicon file: tab-separated, no header row, three fields per line
emo4 = pd.read_csv(
    'NRC-Emotion-Lexicon-Wordlevel-v0.92.txt',
    sep='\t',
    header=None,
    names=['word', 'emotion', 'score'],
)

# Keep only the rows for the top 4 emotions that carry a non-zero score
is_top4 = emo4['emotion'].isin(top4emo)
has_score = emo4['score'] != 0
emo4 = emo4[is_top4 & has_score]
emo4

In [None]:
# Sanity check: how often each score value occurs in the filtered lexicon
emo4['score'].value_counts()

In [None]:
# Look-up dictionary: word -> emotion label, restricted to the top 4 emotions.
# NOTE: a word that appears in several lexicon rows keeps only the emotion of its
# last row, because a later assignment to the same key overwrites the earlier one.
word2emotion = {
    word: emotion
    for word, emotion in zip(emo4['word'], emo4['emotion'])
    if emotion in top4emo
}

In [None]:
def emo4_plot(text, label):
    """
    Plot how the top 4 emotions change throughout a text.

    The text is lower-cased, reduced to alphabetic tokens and cut into 100
    consecutive chunks of (almost) equal word count. Within a chunk, each
    distinct word adds 1 to every emotion in `top4emo` that the `emo4`
    lexicon lists for it. The counts are divided by the chunk's word count,
    smoothed with a rolling mean over 10 chunks and drawn as one line per
    emotion.

    Parameters:
        text (str): The text to analyze
        label (str): The title/label of the text

    Returns:
        None. The figure is shown and then closed.
    """
    n_chunks = 100

    # Define a color palette for the emotions
    emotion_colors = {
        top4emo[0]: 'red',      # trust
        top4emo[1]: 'blue',     # fear
        top4emo[2]: 'green',    # anticipation
        top4emo[3]: 'purple'    # sadness
    }

    # Word -> set of emotions, built from the `emo4` lexicon frame.
    # A set is needed because one word can be listed under several emotions; a plain
    # word -> emotion mapping keeps only one of them and undercounts the others.
    # Building it here also makes the plot independent of whatever lexicon the
    # module-level `word2emotion` happens to hold when the function is called.
    word_emotions = {}
    for word, emotion, score in zip(emo4['word'], emo4['emotion'], emo4['score']):
        if score != 0 and emotion in top4emo:
            word_emotions.setdefault(word, set()).add(emotion)

    # Normalise tokens the same way the notebook preprocesses the corpus
    # (lower-case, letters only) so that e.g. "Fear," can match a lexicon entry.
    # NOTE(review): assumes the lexicon words are lower-case without punctuation.
    words = []
    for token in text.lower().split():
        cleaned = ''.join(ch for ch in token if ch.isalpha())
        if cleaned:
            words.append(cleaned)

    # Chunking: the first `remainder` chunks get one extra word
    base_size, remainder = divmod(len(words), n_chunks)
    rows = []
    start_idx = 0
    for i in range(n_chunks):
        end_idx = start_idx + base_size + (1 if i < remainder else 0)
        chunk = words[start_idx:end_idx]
        start_idx = end_idx

        # Count each distinct word of the chunk once per associated emotion
        counts = dict.fromkeys(top4emo, 0.0)
        for word in set(chunk):
            for emotion in word_emotions.get(word, ()):
                counts[emotion] += 1

        # Normalize by chunk length; an empty chunk (text shorter than 100 words)
        # has no defined intensity and is stored as NaN, which the rolling mean skips
        rows.append({
            emotion: (counts[emotion] / len(chunk)) if chunk else float('nan')
            for emotion in top4emo
        })

    df = pd.DataFrame(rows, columns=top4emo)
    df['position'] = df.index / len(df)

    window = max(len(df) // 10, 1)
    fig, ax = plt.subplots(figsize=(20, 5))
    for emotion in top4emo:
        smoothed = df[emotion].rolling(window=window, min_periods=1).mean()
        ax.plot(df['position'], smoothed,
                label=emotion,
                color=emotion_colors[emotion],
                linewidth=2)

    ax.set_title(f'Top 4 Emotion Trajectories in "{label}"', fontsize=14)
    ax.set_xlabel('Narrative Time', fontsize=12)
    ax.set_ylabel('Emotion Intensity', fontsize=12)
    ax.grid(True, linestyle='--', alpha=0.3)
    ax.legend(fontsize=10)
    fig.tight_layout()
    plt.show()
    # Release the figure: this function is called once per story in loops
    plt.close(fig)

In [None]:
# for an individual text: plot the top-4 emotion arcs of the story at position 2,
# using the matching entry of `titles` as the plot label
emo4_plot(stories[2], titles[2])

In [None]:
#for multiple texts: one figure per story.
#Iterating over the paired lists keeps the loop in step with the corpus size,
#whatever the number of stories is.
for story, title in zip(stories, titles):
    emo4_plot(story, title)

In [None]:
# Top-4 emotion plots grouped by length type, one group after the other in this fixed order
for entry in ['short story', 'novel', 'novella', 'novelette']:
    print(f"\n_______\nPlots for {entry}:\n_______\n")

    # walk the three parallel lists together and plot only the stories of this type
    for story, title, story_length in zip(stories, titles, length):
        if story_length == entry:
            emo4_plot(story, title)

In [None]:
# The two least common emotions
bottom2emo = ['surprise', 'disgust']

# Word-level lexicon file: tab-separated, no header row, three fields per line
emo2 = pd.read_csv(
    'NRC-Emotion-Lexicon-Wordlevel-v0.92.txt',
    sep='\t',
    header=None,
    names=['word', 'emotion', 'score'],
)

# Keep only the rows for the bottom 2 emotions that carry a non-zero score
is_bottom2 = emo2['emotion'].isin(bottom2emo)
has_score = emo2['score'] != 0
emo2 = emo2[is_bottom2 & has_score]
emo2

In [None]:
# Look-up dictionary: word -> emotion label, restricted to the bottom 2 emotions.
# NOTE: this rebinds the module-level name `word2emotion`, replacing the top-4
# look-up built earlier in the notebook. A word that appears in several lexicon
# rows keeps only the emotion of its last row.
word2emotion = {
    word: emotion
    for word, emotion in zip(emo2['word'], emo2['emotion'])
    if emotion in bottom2emo
}

In [None]:
def emo2_plot(text, label):
    """
    Plot how the bottom 2 emotions change throughout a text.

    The text is lower-cased, reduced to alphabetic tokens and cut into 100
    consecutive chunks of (almost) equal word count. Within a chunk, each
    distinct word adds 1 to every emotion in `bottom2emo` that the `emo2`
    lexicon lists for it. The counts are divided by the chunk's word count,
    smoothed with a rolling mean over 10 chunks and drawn as one line per
    emotion.

    Parameters:
        text (str): The text to analyze
        label (str): The title/label of the text

    Returns:
        None. The figure is shown and then closed.
    """
    n_chunks = 100

    # Define a color palette for the emotions
    emotion_colors = {
        bottom2emo[0]: 'red',      # surprise
        bottom2emo[1]: 'blue'     # disgust
    }

    # Word -> set of emotions, built from the `emo2` lexicon frame.
    # A set is needed because one word can be listed under both emotions; a plain
    # word -> emotion mapping keeps only one of them and undercounts the other.
    # Building it here also makes the plot independent of whatever lexicon the
    # module-level `word2emotion` happens to hold when the function is called.
    word_emotions = {}
    for word, emotion, score in zip(emo2['word'], emo2['emotion'], emo2['score']):
        if score != 0 and emotion in bottom2emo:
            word_emotions.setdefault(word, set()).add(emotion)

    # Normalise tokens the same way the notebook preprocesses the corpus
    # (lower-case, letters only) so that e.g. "Disgust," can match a lexicon entry.
    # NOTE(review): assumes the lexicon words are lower-case without punctuation.
    words = []
    for token in text.lower().split():
        cleaned = ''.join(ch for ch in token if ch.isalpha())
        if cleaned:
            words.append(cleaned)

    # Chunking: the first `remainder` chunks get one extra word
    base_size, remainder = divmod(len(words), n_chunks)
    rows = []
    start_idx = 0
    for i in range(n_chunks):
        end_idx = start_idx + base_size + (1 if i < remainder else 0)
        chunk = words[start_idx:end_idx]
        start_idx = end_idx

        # Count each distinct word of the chunk once per associated emotion
        counts = dict.fromkeys(bottom2emo, 0.0)
        for word in set(chunk):
            for emotion in word_emotions.get(word, ()):
                counts[emotion] += 1

        # Normalize by chunk length; an empty chunk (text shorter than 100 words)
        # has no defined intensity and is stored as NaN, which the rolling mean skips
        rows.append({
            emotion: (counts[emotion] / len(chunk)) if chunk else float('nan')
            for emotion in bottom2emo
        })

    df = pd.DataFrame(rows, columns=bottom2emo)
    df['position'] = df.index / len(df)

    window = max(len(df) // 10, 1)
    fig, ax = plt.subplots(figsize=(20, 5))
    for emotion in bottom2emo:
        smoothed = df[emotion].rolling(window=window, min_periods=1).mean()
        ax.plot(df['position'], smoothed,
                label=emotion,
                color=emotion_colors[emotion],
                linewidth=2)

    ax.set_title(f'Bottom 2 Emotion Trajectories in "{label}"', fontsize=14)
    ax.set_xlabel('Narrative Time', fontsize=12)
    ax.set_ylabel('Emotion Intensity', fontsize=12)
    ax.grid(True, linestyle='--', alpha=0.3)
    ax.legend(fontsize=10)
    fig.tight_layout()
    plt.show()
    # Release the figure: this function is called once per story in loops
    plt.close(fig)

In [None]:
#for multiple texts: one figure per story.
#Iterating over the paired lists keeps the loop in step with the corpus size,
#whatever the number of stories is.
for story, title in zip(stories, titles):
    emo2_plot(story, title)

In [None]:
# Bottom-2 emotion plots grouped by length type, one group after the other in this fixed order
for entry in ['short story', 'novel', 'novella', 'novelette']:
    print(f"\n_____\nPlots for {entry}:\n_____\n")

    # walk the three parallel lists together and plot only the stories of this type
    for story, title, story_length in zip(stories, titles, length):
        if story_length == entry:
            emo2_plot(story, title)