In [None]:
import numpy as np
import pandas as pd
import statistics
from scipy.stats import pearsonr,spearmanr
from ast import literal_eval
import numpy as np

Note that:
- "Split 1" in the thesis report refers to "fold 5"/"DEFAULT" here. This data split was used for Experiment 1.
- "val"/"validation" here refers to the "development set" mentioned in the thesis report.

# Open results

In [None]:
# Read the experiment output and discard its first row of data
all_results = pd.read_excel('sgt.xlsx')[1:]

# Wug Test results are stored as stringified Python objects, so literal_eval()
# is used to turn every cell back into lists.
# ALL_BEAMS: per model, the production probability of each nonce verb based on the
#            beam predictions, e.g. 'bize' -> 'bized' 0.85, 'boze' 0.00, other 0.15
ALL_BEAMS = []
for beam_cell in all_results[6]:
    ALL_BEAMS.append(literal_eval(beam_cell))

# ALL_TOP1S: the top 1 prediction for each wug, for all models, flattened into
#            one long list (the first two fields of every stored prediction)
ALL_TOP1S = []
for top1_cell in all_results[7]:
    for prediction in literal_eval(top1_cell):
        ALL_TOP1S.append([prediction[0], prediction[1]])

# Loading Wug Test nonce verbs 

The nonce verbs are loaded in the same way as in the experiment notebook.

In [None]:
def all_wug_data():
    """Load the Wug Test nonce verbs and their candidate past-tense forms.

    Reads four UTF-8 text files from the working directory, one form per line,
    and removes all whitespace inside each line:

    - "KCamericanwugs.txt": A&H nonce verb present tense (K&C AE transcription)
    - "reg.txt":            A&H regular inflection
    - "irreg1.txt":         A&H irregular inflection
    - "irreg2.txt":         A&H 2nd option irregular inflection

    Returns:
        A tuple (wugs, wugs_reg, wugs_irreg1, wugs_irreg2) of lists of strings.
        A blank line in a file yields an empty string in the matching list.
    """
    def _read_forms(path):
        # "".join(line.split()) drops spaces between symbols and the newline
        with open(path, encoding="utf8") as handle:
            return ["".join(line.split()) for line in handle]

    sources = ("KCamericanwugs.txt", "reg.txt", "irreg1.txt", "irreg2.txt")
    wugs, wugs_reg, wugs_irreg1, wugs_irreg2 = (_read_forms(path) for path in sources)
    return wugs, wugs_reg, wugs_irreg1, wugs_irreg2

In [None]:
# Nonce verbs and their regular / irregular candidate past-tense forms
wugs, wugs_reg, wugs_irreg1, wugs_irreg2 = all_wug_data()

# The "other" class has no surface form of its own: one empty label per wug
wugs_other = [""] * len(wugs)

# Prepare to aggregate the beam probabilities: per wug, one [form, probability]
# accumulator for each class (regular, irregular 1, irregular 2, other)
ALL_results = [
    [[reg_form, 0.0], [irreg1_form, 0.0], [irreg2_form, 0.0], [other_form, 0.0]]
    for reg_form, irreg1_form, irreg2_form, other_form
    in zip(wugs_reg, wugs_irreg1, wugs_irreg2, wugs_other)
]

# Function to compute all correlations

This works in the same way as in the experiment notebook, except that this extended version also takes the A&H rating data and the top 1 predictions into account.

In [None]:
def _report_class_correlations(class_heading, model_values, human_values, collected):
    """Print Spearman and Pearson correlations for one inflection class.

    Appends [coefficient, p-value] to `collected`, Spearman first, then Pearson.
    """
    print(class_heading)
    for heading, correlate in (("SPEARMAN:", spearmanr), ("PEARSON:", pearsonr)):
        corr, p_val = correlate(model_values, human_values)
        collected.append([corr, p_val])
        print(heading)
        print(f"r = {corr:.4f} (p = {p_val:.4f})")


def print_correlations(results,top1_forms):
    """Correlate model predictions with the A&H production and rating data.

    Four comparisons are printed and returned, each for the regular and the
    irregular 1 class: beam probabilities vs. A&H production probabilities,
    beam probabilities vs. A&H ratings, and the same two for the top 1
    production probabilities.

    Relies on the module-level lists `wugs`, `wugs_reg` and `wugs_irreg1`, and
    reads "AHresults.xlsx" and "AHresults2.xlsx" (columns "reg" and "irreg1")
    from the working directory on every call.

    Args:
        results: one entry per wug; entry[0][1] is used as the regular and
            entry[1][1] as the irregular 1 beam probability.
        top1_forms: flat list of top 1 predictions, one per wug per Wug Test,
            test after test in wug order; form[0] is compared with the regular
            and irregular 1 form of the corresponding wug.

    Returns:
        (BEAM_PROD_CORR, BEAM_RATING_CORR, TOP1_PROD_CORR, TOP1_RATING_CORR).
        Each is a list of four [coefficient, p-value] pairs ordered as regular
        Spearman, regular Pearson, irregular Spearman, irregular Pearson.
    """
    BEAM_PROD_CORR,BEAM_RATING_CORR,TOP1_PROD_CORR,TOP1_RATING_CORR = [],[],[],[]

    # Open A&H experiment results (production experiment and rating experiment)
    AH_df = pd.read_excel("AHresults.xlsx")
    AH_df_ratings = pd.read_excel("AHresults2.xlsx")

    # Structure model beam results in a similar way
    beam_results = [[r[0][1],r[1][1]] for r in results]
    beam_results_df = pd.DataFrame(beam_results, index=wugs, columns=['reg', 'irreg1'])
    beam_reg = beam_results_df["reg"]
    beam_irreg = beam_results_df["irreg1"]
    prod_reg = AH_df["reg"]
    prod_irreg = AH_df["irreg1"]

    # Correlation between BEAM and production/rating probabilities A&H
    print("Production probabilities vs. Beam probabilities\n")
    _report_class_correlations("Regular class", beam_reg, prod_reg, BEAM_PROD_CORR)
    _report_class_correlations("Irregular 1 class", beam_irreg, prod_irreg, BEAM_PROD_CORR)

    print("\n\n")

    print("Ratings vs. Beam probabilities")
    rate_reg = AH_df_ratings["reg"]
    rate_irreg = AH_df_ratings["irreg1"]
    _report_class_correlations("Regular class", beam_reg, rate_reg, BEAM_RATING_CORR)
    _report_class_correlations("Irregular 1 class", beam_irreg, rate_irreg, BEAM_RATING_CORR)

    # Correlation between top 1 prediction and production/rating probabilities A&H
    print("\n\n")
    print("Production probabilities vs. Top 1 Production Probabilities\n")

    # Per wug: how often the top 1 prediction was the regular / irregular 1 form
    top1_count = [[[wug_reg,0],[wug_irreg1,0]] for wug_reg,wug_irreg1 in zip(wugs_reg,wugs_irreg1)]
    n_wugs = len(top1_count)  # number of nonce verbs per Wug Test (58 in this experiment)

    for position, form in enumerate(top1_forms):
        # top1_forms concatenates the tests, so the wug number restarts every n_wugs items
        reg_slot, irreg_slot = top1_count[position % n_wugs]
        if form[0] == reg_slot[0]:
            reg_slot[1] += 1
        elif form[0] == irreg_slot[0]:
            irreg_slot[1] += 1

    # Number of Wug Tests (i.e. models) contained in top1_forms
    division = len(top1_forms) / n_wugs
    for reg_slot, irreg_slot in top1_count:
        reg_slot[1] /= division
        irreg_slot[1] /= division

    top1_results = [[r[0][1],r[1][1]] for r in top1_count]
    top1_results_df = pd.DataFrame(top1_results, index=wugs, columns=['reg', 'irreg1'])
    top1_reg = top1_results_df["reg"]
    top1_irreg = top1_results_df["irreg1"]

    _report_class_correlations("Regular class", top1_reg, prod_reg, TOP1_PROD_CORR)
    _report_class_correlations("Irregular 1 class", top1_irreg, prod_irreg, TOP1_PROD_CORR)

    print("\n\n")

    print("Ratings vs. Top 1 probabilities")
    _report_class_correlations("Regular class", top1_reg, rate_reg, TOP1_RATING_CORR)
    _report_class_correlations("Irregular 1 class", top1_irreg, rate_irreg, TOP1_RATING_CORR)

    return BEAM_PROD_CORR,BEAM_RATING_CORR,TOP1_PROD_CORR,TOP1_RATING_CORR

# ANALYSIS:

## AGGREGATE CORRELATION (n=25)

In [None]:
# Aggregate all (n=25) beam production probabilities.
# The accumulators are reset first, so that re-running this cell does not add
# the probabilities on top of an earlier, already normalized aggregation.
for wug_entry in ALL_results:
    for form_entry in wug_entry:
        form_entry[1] = 0.0

for model in ALL_BEAMS:
    model = np.array(model)  # kept from the original; the values are read back with float()
    for wug_idx, wug in enumerate(model):
        for form_idx, form in enumerate(wug):
            ALL_results[wug_idx][form_idx][1] += float(form[1])

# Normalize the probabilities by the number of models (25 in this experiment)
n_models = len(ALL_BEAMS)
for wug_entry in ALL_results:
    for form_entry in wug_entry:
        form_entry[1] /= n_models

all_correlations = print_correlations(ALL_results, ALL_TOP1S)

## AGGREGATE CORRELATION PER FOLD (5 * n=5)

In [None]:
# For each of the five folds
for fold in range(5):
    print("\n\n\nCORRELATION FOLD ", fold+1, '\n')

    # Fresh accumulators: per wug one [form, summed probability] pair per class
    fold_results = [
        [[wug_reg, 0.0], [wug_irreg1, 0.0], [wug_irreg2, 0.0], [wug_other, 0.0]]
        for wug_reg, wug_irreg1, wug_irreg2, wug_other
        in zip(wugs_reg, wugs_irreg1, wugs_irreg2, wugs_other)
    ]

    # Aggregate the predictions from the 5 runs of the current fold.
    # The inner indices have their own names so they no longer overwrite the fold index.
    where, towhere = fold*5, (fold+1)*5  # for each fold take the next five runs
    for model in ALL_BEAMS[where:towhere]:
        model = np.array(model)
        for wug_idx, wug in enumerate(model):
            for form_idx, form in enumerate(wug):
                fold_results[wug_idx][form_idx][1] += float(form[1])
    # The sums are not divided by 5: Pearson and Spearman correlations are
    # unaffected by a constant positive scaling of one variable.

    # Top 1 forms are all stored in one list, so take the five chunks of
    # 58 nonce verbs that belong to the current fold
    fold_top1 = ALL_TOP1S[where*58:towhere*58]
    fold_correlations = print_correlations(fold_results, fold_top1)

# AVERAGE CORRELATION (n=25)

Compute correlation for each individual model

Ignore the print below the next cell, it prints the individual correlations, not the averages

In [None]:
print("\n IGNORE THE PRINT BELOW! (these are not the averages, but individual correlations)\n\n\n\n\n")

# One correlation result per simulation (25 in total)
all_individual_correlations = []
for run_idx in range(len(ALL_BEAMS)):
    # Every simulation owns a consecutive chunk of 58 top 1 predictions
    chunk_start = run_idx * 58
    run_top1 = ALL_TOP1S[chunk_start:chunk_start + 58]
    run_correlations = print_correlations(ALL_BEAMS[run_idx], run_top1)
    all_individual_correlations.append(run_correlations)

Compute the average of the computed correlations

In [None]:
# Sort the individual correlation coefficients of the beam predictions into one
# list per measure. Every run result is indexed as run[data][measure][0], where
#   data:    0 = A&H production data, 1 = A&H rating data
#   measure: 0 = reg spearman, 1 = reg pearson, 2 = irreg spearman, 3 = irreg pearson
#   [0]:     the coefficient (the p-value at [1] is not averaged)
beam_prod = [[run[0][measure][0] for run in all_individual_correlations] for measure in range(4)]
beam_rate = [[run[1][measure][0] for run in all_individual_correlations] for measure in range(4)]
beam_prod0, beam_prod1, beam_prod2, beam_prod3 = beam_prod
beam_rate0, beam_rate1, beam_rate2, beam_rate3 = beam_rate

# Compute the average correlations, rounded to two decimals
avg_bps = [round(statistics.mean(coefficients), 2) for coefficients in beam_prod]
avg_brs = [round(statistics.mean(coefficients), 2) for coefficients in beam_rate]

print("beam probs vs A&H production probs:")
print(avg_bps)
print("beam probs vs A&H rating:")
print(avg_brs)

print("\n\nstructured as: avg reg spearman, avg reg pearson, avg irreg spearman, avg irreg pearson")

# AVERAGE ACCURACY (n=25) REAL VERBS

In [None]:
# Per-run accuracies on the real verbs: T = training, V = validation (development)
overall_T, irreg_T, reg_T = [], [], []
overall_V, irreg_V, reg_V = [], [], []

# Columns 3 and 4 hold one stringified "[overall, irregular, regular]" list per run
for train_cell, dev_cell in zip(all_results[3], all_results[4]):
    train_fields = train_cell.strip('][').split(', ')
    dev_fields = dev_cell.strip('][').split(', ')
    for position, bucket in enumerate((overall_T, irreg_T, reg_T)):
        bucket.append(float(train_fields[position]))
    for position, bucket in enumerate((overall_V, irreg_V, reg_V)):
        bucket.append(float(dev_fields[position]))

categories = [
    ("Training Overall", overall_T),
    ("Training Irregular", irreg_T),
    ("Training Regular", reg_T),
    ("Validation Overall", overall_V),
    ("Validation Irregular", irreg_V),
    ("Validation Regular", reg_V),
]

# For each category, print the average accuracy and the standard deviation
for label, data in categories:
    mean_acc = statistics.mean(data)
    sd_acc = statistics.stdev(data)
    print(f"{label}\t {mean_acc:.2f}% \t (SD = {sd_acc:.2f})")


## Per fold (5 * n=5)

REMINDER: in the report 'split 1' is actually 'Fold 5' here. This is the data split used for Experiment 1

In [None]:
# Same computation as for all runs together, but now per fold: the average and
# standard deviation over the 5 runs that belong to each of the five folds
for fold_no in range(5):

    print("\nFold", fold_no + 1)

    # Positions of the 5 runs of the current fold
    first_run = 5 * fold_no
    last_run = first_run + 5

    # Split the stringified "[overall, irregular, regular]" lists into their fields
    train_rows = [cell.strip('][').split(', ') for cell in all_results[3][first_run:last_run]]
    dev_rows = [cell.strip('][').split(', ') for cell in all_results[4][first_run:last_run]]

    overall_T = [float(fields[0]) for fields in train_rows]
    irreg_T = [float(fields[1]) for fields in train_rows]
    reg_T = [float(fields[2]) for fields in train_rows]
    overall_V = [float(fields[0]) for fields in dev_rows]
    irreg_V = [float(fields[1]) for fields in dev_rows]
    reg_V = [float(fields[2]) for fields in dev_rows]

    categories = [("Training Overall", overall_T),("Training Irregular", irreg_T),("Training Regular", reg_T),
                  ("Validation Overall", overall_V), ("Validation Irregular", irreg_V), ("Validation Regular", reg_V)]

    # Average and standard deviation for the current fold
    for label, data in categories:
        fold_mean = statistics.mean(data)
        fold_sd = statistics.stdev(data)
        print(f"{label}\t {fold_mean:.2f}% \t (SD = {fold_sd:.2f})")


# AVERAGE NUMBER OF TRAINING EPOCHS

In [None]:

def print_average_epochs(results, start, end, fold_name):
    """Print `fold_name` followed by the mean number of epochs in results[start:end].

    Every value in the slice is converted with int(); the sum is divided by
    (end - start), not by the number of values actually found in the slice.
    """
    total_epochs = 0
    for raw_epochs in results[start:end]:
        total_epochs += int(raw_epochs)
    n_runs = end - start
    print(f"{fold_name}")
    print(f"Average epochs = {total_epochs / n_runs:.2f}")

# Number of epochs data from results
epochs_data = all_results[2]

# Plain list of ints: slicing a list is always positional, so run 1 sits at
# position 0. (Integer slices on a pandas Series are positional too, which is why
# starting at index 1 skipped the first run and left only 4 runs for fold 5.)
epoch_counts = [int(e) for e in epochs_data]
n_runs = len(epoch_counts)

# Global average number of epochs
print("Global average epochs:")
print(f"{sum(epoch_counts) / n_runs:.2f}")
print(" ")

# Average number of epochs per fold: consecutive chunks of 5 runs, in the same
# order as used for the per-fold correlations and accuracies
runs_per_fold = 5
for fold_no, start_index in enumerate(range(0, n_runs, runs_per_fold), start=1):
    # min() keeps the divisor correct if the last chunk holds fewer runs
    end_index = min(start_index + runs_per_fold, n_runs)
    print_average_epochs(epoch_counts, start_index, end_index, f"Fold {fold_no}")


# ERROR CLASSIFICATION

In [None]:
# Result columns with the stringified lists of wrongly inflected verbs
exp_reg_train, exp_irreg_train, exp_reg_val, exp_irreg_val = (
    all_results[column] for column in (9, 10, 11, 12)
)

# Turn every cell back into a Python object: one list of errors per model
all_train_irreg_errors = list(map(literal_eval, exp_irreg_train))
all_train_reg_errors = list(map(literal_eval, exp_reg_train))
all_val_irreg_errors = list(map(literal_eval, exp_irreg_val))
all_val_reg_errors = list(map(literal_eval, exp_reg_val))

In [None]:
def analyze_errors(irreg_errors, reg_errors):
    """Classify the inflection errors in `irreg_errors` and print summary counts.

    Args:
        irreg_errors: one list of errors per model. Each error is indexable as
            error[0] = present form, error[1] = the model's (incorrect)
            inflection, error[2][0] = correct past tense form.
        reg_errors: one list of errors per model; only its sizes are used, for
            the total error count.

    Returns:
        (obvious_overregs, check_overregs, blendings, copy, OTHER): lists of the
        error entries per class. Copy errors are stored in both `copy` and
        `OTHER`. The printed over-regularization count is
        len(obvious_overregs) + len(check_overregs).
    """
    obvious_overregs, check_overregs, blendings, copy, OTHER = [], [], [], [], []

    total_num_errors = sum(len(model) for model in reg_errors)

    # Loop through the aggregate set of incorrectly inflected verbs
    for model in irreg_errors:
        total_num_errors += len(model)
        for error in model:
            present = error[0]
            inflection = error[1]
            correct_past = error[2][0]

            # Regular verbs (correct past = present + a regular suffix).
            # NOTE(review): "«" and "È" are presumably transcription symbols of
            # the data files -- confirm before changing these literals.
            if correct_past in (present + "d", present + "«d", present + "t", "È" + present + 't'):
                OTHER.append(error)

            # If the incorrect inflection of the verb is ...

            # the present form, it is a COPY ERROR
            elif inflection == present:
                copy.append(error)
                OTHER.append(error)  # OTHER includes copy errors, because the copy class is not reported
            # the present form + d/ed, it is OVERREGULARISATION
            elif inflection in (present + "d", present + "«d"):
                obvious_overregs.append(error)
            # the correct past form with final t instead of d, it is NOT overregularisation.
            # endswith() instead of [-1], so an empty form cannot raise IndexError.
            elif (correct_past[:-1] == inflection[:-1]
                  and correct_past.endswith('d') and inflection.endswith('t')):
                OTHER.append(error)
            # the other present forms + t are OVERREGULARISATIONs (kept in a separate list for manual double checking)
            elif inflection == present + "t":
                check_overregs.append(error)
            # the correct past tense form plus a regular suffix, it is a BLENDING ERROR
            elif inflection in (correct_past + "d", correct_past + "«d", correct_past + "t"):
                blendings.append(error)
            else:
                OTHER.append(error)

    # Over-regularization and blending statistics
    num_overregs = len(obvious_overregs) + len(check_overregs)
    num_blendings = len(blendings)

    print("Total number of errors (aggregated):", total_num_errors)
    print("Total number of over-regularization errors:", num_overregs)
    print("Total number of blending errors:", num_blendings)

    return obvious_overregs, check_overregs, blendings, copy, OTHER

In [None]:
# Classify the errors made on the training data
print("Training data analysis:")
train_error_classes = analyze_errors(all_train_irreg_errors, all_train_reg_errors)
obvious_overregs, check_overregs, blendings, copy, OTHER = train_error_classes

# Classify the errors made on the validation (development) data
print("\nValidation data analysis:")
val_error_classes = analyze_errors(all_val_irreg_errors, all_val_reg_errors)
val_obvious_overregs, val_check_overregs, val_blendings, val_copy, val_OTHER = val_error_classes


# Regular inflection overall

In [None]:
# Average probability of inflecting a nonce verb regularly, based on the
# aggregate beam results (n=25): add up the regular-class probability of every wug
regular_form_prob = 0
for wug_idx in range(len(ALL_results)):
    regular_form_prob = regular_form_prob + ALL_results[wug_idx][0][1]
print("Overall production probability for the regular form: ", round(100*regular_form_prob/58,2))

# Number of wugs for which the regular class has the largest probability, i.e.
# at least as large as that of irregular 1, irregular 2 and 'other'
regular_form_beam = len([
    beams for beams in ALL_results
    if all(beams[0][1] >= beams[rival][1] for rival in (1, 2, 3))
])
print("Percentage regular inflection highest beam prob (aggregated n=25): ", round(100*regular_form_beam/58,2), "(",regular_form_beam," out of 58 wugs).")

# VISUALIZATIONS

I save the results to an Excel file so that I can plot the graphs in Excel:

In [None]:
# Orthographic spelling of the nonce verbs: tab-separated file with one header
# line; the first field of every row is the present tense form.
# The encoding is stated explicitly, like for the other text files in this notebook.
with open("wugs_ortho.txt", encoding="utf8") as wugs_ortho:
    ortho_wugs = [wug.split("\t") for wug in wugs_ortho][1:]
pres_ortho_wugs = [wug[0] for wug in ortho_wugs]

regular,irregular1,irregular2,other=[],[],[],[]

# One row per wug with the aggregated class probabilities as percentages
plot_rows = []
for i, w in enumerate(ALL_results):
    # A wug without a second irregular form (empty label) gets probability 0
    irreg2_prob = w[2][1] if w[2][0] != "" else float(0)

    plot_rows.append([pres_ortho_wugs[i], 100*w[0][1], 100*w[1][1], 100*irreg2_prob, 100*w[3][1]])
    regular.append(100*w[0][1])
    irregular1.append(100*w[1][1])
    irregular2.append(100*irreg2_prob)
    other.append(100*w[3][1])

df_results = pd.DataFrame(plot_rows, columns=['wug', 'regular', 'irregular 1', 'irregular 2', 'other'])

df_results.to_excel("plot.xlsx")

# I plot the results in excel 

In [None]:
# Display the table of per-wug class percentages as the cell output
df_results