# Literary style

A notebook for "Case Study 2: Literary Style" in Gijs Aangenendt, Maria Skeppstedt & Karl Berglund (2025), "Applied NLP for Humanities Research", in XXXX.

Code written by Karl Berglund.

# 0. PREPARATIONS

### 0.1 Import libraries

In [None]:
import pandas as pd #for working with dataframes
import glob #for grabbing multiple files
import seaborn as sns #for making plots
import matplotlib.pyplot as plt #for making plots
from scipy.stats import linregress #for linear regression

### 0.2 Import metadata

In [None]:
#path to metadata file (a csv-file that is read into a dataframe)
metadata_path = '/Users/karbe804/Desktop/literary_style.csv'
metadata = pd.read_csv(metadata_path)

In [None]:
#preview the first five rows of the metadata dataframe
metadata.head(n=5)

In [None]:
#number of rows in the metadata dataframe
metadata.shape[0]

In [None]:
#count how many rows belong to each value in the "Category" column
category_counts = metadata["Category"].value_counts()
category_counts

### 0.3 Import corpus data

In [None]:
#make an alphabetically sorted list of all .conll files in the corpus folder
#the order matters: the results computed per file further down are added to the
#metadata dataframe by position, not by file name
text_files = sorted(glob.glob("/Users/karbe804/Desktop/literary_style/*.conll")) #path to folder

In [None]:
#check out paths to corpus
#(only the first five entries of the sorted list are shown)
text_files[:5]

In [None]:
#number of file paths found in the corpus folder
len(text_files)

# 1. PUNCTUATION

### 1.1 Calculate punctuation

In [None]:
#choose punctuation, here semicolon
chosen_punctuation = ";"

#create empty lists to fill, one value per novel (same order as text_files)
chosen_punctuation_rel = []
tokens_total = []

for file_path in text_files: #loop over all novels in corpus (each path in the list of paths)

    #parse text data to pandas dataframe (tab-separated, four columns, no header row)
    #note: pandas automatically deletes blank rows
    #NOTE(review): read_csv applies its default quote handling here; if the corpus contains
    #bare " tokens, fields or rows may get merged -- confirm against the files and consider
    #passing quoting=csv.QUOTE_NONE if that turns out to be the case
    text_data = pd.read_csv(file_path, sep="\t", names=["token", "ling","POS","lemma"])

    #mark every row that is a real token (paragraph breaks are not counted as tokens)
    is_token = text_data["token"] != "PARAGRAPH_BREAK"

    #mark every real token that equals the chosen punctuation
    is_chosen = is_token & (text_data["token"] == chosen_punctuation)

    #count by summing the boolean columns (True counts as 1)
    tokens = int(is_token.sum())
    punctuation = int(is_chosen.sum())

    #stop with a message that names the file instead of a bare ZeroDivisionError
    if tokens == 0:
        raise ValueError(f"No tokens found in {file_path}")

    rel_freqs = punctuation/tokens #calculate relative frequencies of chosen punctuation

    #append results to lists
    chosen_punctuation_rel.append(rel_freqs)
    tokens_total.append(tokens)

In [None]:
#attach the two result lists to the metadata dataframe as new columns
metadata = metadata.assign(tokens=tokens_total, semicolons_rel=chosen_punctuation_rel)

In [None]:
#preview the first five rows, now including the added columns
metadata.head(n=5)

In [None]:
#check total amount of tokens in corpus (sum of the "tokens" column)
corpus_tokens = metadata["tokens"].sum()
corpus_tokens

### 1.2 Make bar plot

In [None]:
plt.figure(figsize=(10,5)) #set size of figure

metric_to_plot = "semicolons_rel" #choose metric to plot

#sort data descending according to metric in use
sorted_data = metadata.sort_values(metric_to_plot, ascending=False)

#create labels for legend (listed in the same order as the categories in "order" below)
new_labels = ['Canonized authors \nwithout large readerships',
              'Canonized authors \nwith large readerships',
              'Popular genre fiction authors \nwith large readerships']

#set order for legend
order = ['kanon', 'kanon_publ','publik']

#build one "author – title" label per novel
#the combined label (instead of the title alone) is used as x value, so every novel gets its
#own bar even if two novels share a title, and bars and tick labels cannot get out of sync
bar_labels = sorted_data["Author"] + " – " + sorted_data["Title"]

#make a bar plot per novel for the chosen metric and with the specified legend order
barplot = sns.barplot(x=bar_labels, y=sorted_data[metric_to_plot],
            hue=sorted_data["Category"], dodge=False, hue_order=order)

#rotate and shrink the tick labels that the bar plot created
plt.xticks(rotation=90, size=6)

#specify axes labels
plt.ylabel(metric_to_plot)
plt.xlabel(None) #here the xticks provide enough information

#get the legend handles from the bar plot (its automatic labels are replaced by new_labels)
handles, _ = barplot.get_legend_handles_labels()

#add legend
plt.legend(handles, new_labels, loc='upper right', bbox_to_anchor=(1, 1))

#export figure, set file path to where you want to place the file
#the metric is part of the file name so this figure does not overwrite the other figures
plt.savefig(f"/yourpath/barplot_{metric_to_plot}.png", format='png', dpi=300, bbox_inches='tight')

# 2. SENTENCES AND PARAGRAPHS

### 2.1 Calculate sentence and paragraph lengths

In [None]:
#create empty lists to fill, one value per novel (same order as text_files)
sentence_length_mean = []
paragraph_length_mean = []

for file_path in text_files: #loop over all novels in corpus (each path in the list of paths)

    #parse text data to pandas dataframe (tab-separated, four columns, no header row)
    #note: pandas automatically deletes blank rows
    text_data = pd.read_csv(file_path, sep="\t", names=["token", "ling","POS","lemma"])

    #count paragraph breaks by summing a boolean column (True counts as 1)
    paragraph_breaks = int((text_data["token"] == "PARAGRAPH_BREAK").sum())

    #all remaining rows are tokens (paragraph breaks are not counted as tokens)
    tokens = len(text_data) - paragraph_breaks

    #count major delimiters ("MAD") in the second column of the dataframe
    major_delimiters = int((text_data["ling"] == "MAD").sum())

    #stop with a message that names the file instead of a bare ZeroDivisionError
    if paragraph_breaks == 0:
        raise ValueError(f"No paragraph breaks found in {file_path}")
    if major_delimiters == 0:
        raise ValueError(f"No major delimiters (MAD) found in {file_path}")

    paragraph_len = tokens/paragraph_breaks #calculate average paragraph length
    sentence_len = tokens/major_delimiters #calculate average sentence length

    #append results to lists
    paragraph_length_mean.append(paragraph_len)
    sentence_length_mean.append(sentence_len)

In [None]:
#store both lists of mean lengths as new columns in the metadata dataframe
new_columns = {"paragraph_mean": paragraph_length_mean, "sents_mean": sentence_length_mean}
for column_name, column_values in new_columns.items():
    metadata[column_name] = column_values

In [None]:
#preview the first five rows, now including the two length columns
metadata.head(n=5)

### 2.2 Make scatterplot

In [None]:
plt.figure(figsize=(10,5)) #set size of figure

#set values for scatter plot
x_value = "sents_mean"
y_value = "paragraph_mean"

#create new labels for legend (listed in the same order as the categories in "order" below)
new_labels = ['Canonized authors \nwithout large readerships',
              'Canonized authors \nwith large readerships',
              'Popular genre fiction authors \nwith large readerships']

#set order for legend
order = ['kanon', 'kanon_publ','publik']

#make a scatter plot per novel for the chosen metrics and with the specified legend order
scatterplot = sns.scatterplot(x=metadata[x_value], y=metadata[y_value],
            hue=metadata["Category"], hue_order=order)

#set axes labels
plt.xlabel("Mean sentence length (tokens)")
plt.ylabel("Mean paragraph length (tokens)")

#get the legend handles from the scatter plot (its automatic labels are replaced by new_labels)
handles, _ = scatterplot.get_legend_handles_labels()

#add legend
plt.legend(handles, new_labels, loc='lower right')

#write author and title next to certain outliers only
#the list and the offsets are the same for every novel, so they are set once before the loop
list_of_exceptions = ["Enhörningarna", "De dömdas ö", "Chitambo","Chefen fru Ingeborg",
                      "Kvinnan och nåden","Guds vackra värld I–II","Pennskaftet"]
x_offset = 0.3 #this offset is to make graph prettier
y_offset = 0

#keep only the rows whose title is in the list, then loop over those rows
novels_to_label = metadata[metadata["Title"].isin(list_of_exceptions)]

for x_pos, y_pos, author, title in zip(novels_to_label[x_value], novels_to_label[y_value],
                                       novels_to_label["Author"], novels_to_label["Title"]):

    plt.text(x=x_pos + x_offset,
             y=y_pos + y_offset,
             s=author+" - "+title, #"author - title"
             fontdict=dict(color="black", size=4))

#export figure, set file path to where you want to place the file
#the plotted columns are part of the file name so this figure does not overwrite the other figures
plt.savefig(f"/yourpath/scatterplot_{x_value}_{y_value}.png", format='png', dpi=300, bbox_inches='tight')

### 2.3 Calculate mean scores on group level

In [None]:
#split the metadata into one subset per category
category = metadata["Category"]
canon = metadata.loc[category == "kanon"]
canon_publ = metadata.loc[category == "kanon_publ"]
popular = metadata.loc[category == "publik"]

In [None]:
#mean sentence length per subset, printed in the order: canon, canon_publ, popular
sentence_means = [subset["sents_mean"].mean() for subset in (canon, canon_publ, popular)]
print(*sentence_means)

In [None]:
#mean paragraph length per subset, printed in the order: canon, canon_publ, popular
paragraph_means = [subset["paragraph_mean"].mean() for subset in (canon, canon_publ, popular)]
print(*paragraph_means)

# 3. PARTS OF SPEECH

### 3.1 Calculate parts of speech

In [None]:
#create empty lists to fill, one value per novel (same order as text_files)
adjs_rel_list = []
verbs_rel_list = []

for file_path in text_files: #loop over all novels in corpus (each path in the list of paths)

    #parse text data to pandas dataframe (tab-separated, four columns, no header row)
    #note: pandas automatically deletes blank rows
    text_data = pd.read_csv(file_path, sep="\t", names=["token", "ling","POS","lemma"])

    #strip all paragraph breaks
    red_text_data = text_data[text_data["token"] != "PARAGRAPH_BREAK"]

    tokens = len(red_text_data) #everything in orig csv except blanks and paragraph breaks ==> tokens

    #stop with a message that names the file instead of a bare ZeroDivisionError
    if tokens == 0:
        raise ValueError(f"No tokens found in {file_path}")

    #third column in dataframe, with info about parts of speech
    pos_tags = red_text_data["POS"]

    #count adjectives and verbs by summing boolean columns (True counts as 1)
    adjectives = int((pos_tags == "ADJ").sum())
    verbs = int((pos_tags == "VERB").sum())

    adjectives_rel = adjectives/tokens #calculate relative frequency of adjectives
    verbs_rel = verbs/tokens #calculate relative frequency of verbs

    #append results to lists
    adjs_rel_list.append(adjectives_rel)
    verbs_rel_list.append(verbs_rel)

In [None]:
#attach the relative frequencies of adjectives and verbs to the metadata dataframe as new columns
metadata = metadata.assign(adjs_rel=adjs_rel_list, verbs_rel=verbs_rel_list)

In [None]:
#preview the first five rows, now including the parts-of-speech columns
metadata.head(n=5)

### 3.2 Make scatterplot

In [None]:
plt.figure(figsize=(10,5)) #set size of figure

#set values for scatter plot
x_value = "verbs_rel"
y_value = "adjs_rel"

#create new labels for legend (listed in the same order as the categories in "order" below)
new_labels = ['Canonized authors \nwithout large readerships',
              'Canonized authors \nwith large readerships',
              'Popular genre fiction authors \nwith large readerships']

#set order for legend
order = ['kanon', 'kanon_publ','publik']

#make a scatter plot per novel for the chosen metrics and with the specified legend order
scatterplot = sns.scatterplot(x=metadata[x_value], y=metadata[y_value],
            hue=metadata["Category"], hue_order=order)

#get the legend handles from the scatter plot (its automatic labels are replaced by new_labels)
handles, _ = scatterplot.get_legend_handles_labels()

#add regression line
sns.regplot(x=metadata[x_value], y=metadata[y_value], scatter=False, color="red",
            line_kws={'linewidth': 1, 'alpha': 0.3})

#add axes labels
#this is done after the regression line, since regplot labels the axes with the column names
plt.xlabel("Verbs (share)")
plt.ylabel("Adjectives (share)")

#write author and title next to certain outliers only
#the list and the offsets are the same for every novel, so they are set once before the loop
list_of_exceptions = ["Kvinnan och nåden", "Astarte","Chitambo","Kris","Kvartetten som sprängdes",
                      "Snörmakare Lekholm får en idé","Min död är min","Vingslag i natten",
                      "Till och från Högåsen","Barmhärtighet",
                      "Kejsarn av Portugallien","På dessa skuldror",]
x_offset = 0.0005 #this offset is to make graph prettier
y_offset = 0

#keep only the rows whose title is in the list, then loop over those rows
novels_to_label = metadata[metadata["Title"].isin(list_of_exceptions)]

for x_pos, y_pos, author, title in zip(novels_to_label[x_value], novels_to_label[y_value],
                                       novels_to_label["Author"], novels_to_label["Title"]):

    plt.text(x=x_pos + x_offset,
             y=y_pos + y_offset,
             s=author+" - "+title, #"author - title"
             fontdict=dict(color="black", size=4))

#add legend
plt.legend(handles, new_labels, loc='upper right')

#export figure, set file path to where you want to place the file
#the plotted columns are part of the file name so this figure does not overwrite the other figures
plt.savefig(f"/yourpath/scatterplot_{x_value}_{y_value}.png", format='png', dpi=300, bbox_inches='tight')

### 3.3 Calculate linear regression R value

In [None]:
#linear regression of the share of adjectives on the share of verbs
#the columns are named explicitly, so the result does not depend on which plotting cell
#last set x_value and y_value
slope, intercept, r_value, p_value, std_err = linregress(metadata["verbs_rel"],metadata["adjs_rel"])
print(r_value)

### 3.4 Calculate means on group level

In [None]:
#split the metadata into one subset per category
#the subsets are made again here so that they include the columns added in section 3.1
canon, canon_publ, popular = (
    metadata[metadata["Category"] == category_name]
    for category_name in ("kanon", "kanon_publ", "publik")
)

In [None]:
#mean share of adjectives per subset, printed in the order: canon, canon_publ, popular
adjective_means = [subset["adjs_rel"].mean() for subset in (canon, canon_publ, popular)]
print(*adjective_means)

In [None]:
#mean share of verbs per subset, printed in the order: canon, canon_publ, popular
verb_means = [subset["verbs_rel"].mean() for subset in (canon, canon_publ, popular)]
print(*verb_means)

# 4. WRITE CSV-FILE INCLUDING ADDED COLUMNS

In [None]:
#write the metadata dataframe, including all added columns, to a new csv-file
#the path is given directly to pandas, which then opens and closes the file itself
#the encoding is stated explicitly so that the Swedish characters in the titles are written
#the same way on every computer
metadata.to_csv("/yourpath/literary_style_added.csv", encoding="utf-8")