In [1]:
import os
import pandas as pd
from scipy.stats import spearmanr
import matplotlib.pyplot as plt
import seaborn as sns

from src import config

In [2]:
def get_spearman_corr(df, column, name, conc, plot=False):
    """Print and return the Spearman correlation between concreteness and a score.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the score to compare; it is joined to ``conc`` on the index.
    column : str
        Name of the column in ``df`` to correlate with concreteness.
    name : str
        Human-readable label for the score, used only in the printed report.
    conc : pandas.DataFrame
        Frame with a ``'concreteness'`` column, sharing its index with ``df``.
    plot : bool, default False
        If True, also draw a seaborn joint plot, save it under
        ``config.FIGURES_DIR`` and show it.

    Returns
    -------
    tuple
        ``(correlation, p_value)`` as returned by ``scipy.stats.spearmanr``.
    """
    # Join only the two columns being compared. Merging the full `conc` frame
    # would let NaNs in unrelated columns discard otherwise valid rows in the
    # dropna() below, and a same-named extra column would get a merge suffix.
    merged = pd.merge(
        left=conc[['concreteness']],
        right=df[[column]],
        left_index=True,
        right_index=True,
        how='inner',
    ).dropna()

    # get spearman correlation
    correlation, p_value = spearmanr(merged['concreteness'], merged[column])

    # print results
    print(f'Concreteness and {name}')
    print(f'Spearman Correlation Coefficient: {correlation:.4f}')
    print(f'P-value: {p_value:.4e}')

    if plot:
        # plot results and save figure
        g = sns.jointplot(x="concreteness", y=column, data=merged,
                          kind="scatter", joint_kws={"s": 20, "alpha": 0.2})
        # File name kept unchanged so previously generated figure paths stay valid.
        fig_name = f"joinplot_concreteness_{column}.png"
        plt.subplots_adjust(left=0.2)
        plt.savefig(os.path.join(config.FIGURES_DIR, fig_name), bbox_inches='tight')
        plt.show()

    # Hand the statistics back so callers can collect them instead of parsing stdout.
    return correlation, p_value
    

In [3]:
def run(language):
    """Report the concreteness correlations for one language.

    Reads the historic polysemy, contemporary polysemy and concreteness CSVs
    (semicolon-separated) from ``config.PROCESSED_DATA_DIR/<language>`` and
    prints the Spearman correlation of concreteness with three scores.
    """
    language_dir = os.path.join(config.PROCESSED_DATA_DIR, language)

    def read_table(file_name, index_col):
        # Every processed file of a language shares the same folder and separator.
        return pd.read_csv(os.path.join(language_dir, file_name), sep=';', index_col=index_col)

    hist = read_table(f'hist_polysemy_score_{language}.csv', 0)
    contemp = read_table(f'contemp_polysemy_score_{language}.csv', 0)
    conc = read_table(f'concreteness_{language}.csv', 'Word')

    # (source frame, column to correlate, label used in the printed report)
    comparisons = (
        (hist, 'slope', f'Polysemy Score Evolution {language}'),
        (contemp, 'contemp_polysemy_score', f'Contemporary Polysemy Score {language}'),
        (hist, 'polysemy_score_1990', f'Historic Polysemy Score 1990s {language}'),
    )
    for scores, score_column, label in comparisons:
        get_spearman_corr(scores, score_column, label, conc)

## German

In [4]:
# Print the three concreteness correlations for the German data.
run(language="german")

Concreteness and Polysemy Score Evolution german
Spearman Correlation Coefficient: 0.2739
P-value: 1.1263e-26
Concreteness and Contemporary Polysemy Score german
Spearman Correlation Coefficient: 0.1302
P-value: 4.4611e-12
Concreteness and Historic Polysemy Score 1990s german
Spearman Correlation Coefficient: 0.0213
P-value: 3.6167e-01


## English

In [5]:
# Print the three concreteness correlations for the English data.
run(language="english")

Concreteness and Polysemy Score Evolution english
Spearman Correlation Coefficient: -0.0542
P-value: 4.0462e-11
Concreteness and Contemporary Polysemy Score english
Spearman Correlation Coefficient: 0.3751
P-value: 0.0000e+00
Concreteness and Historic Polysemy Score 1990s english
Spearman Correlation Coefficient: -0.3520
P-value: 0.0000e+00


## French

In [6]:
# Print the three concreteness correlations for the French data.
run(language="french")

Concreteness and Polysemy Score Evolution french
Spearman Correlation Coefficient: -0.3043
P-value: 6.8459e-25
Concreteness and Contemporary Polysemy Score french
Spearman Correlation Coefficient: 0.0626
P-value: 1.9309e-02
Concreteness and Historic Polysemy Score 1990s french
Spearman Correlation Coefficient: -0.4069
P-value: 1.4084e-49


## Compare cutoff percentiles 

In [7]:
# Load the reference dataset: how many rows each word has in the definitions
# file (presumably one row per listed sense -- TODO confirm against the data).
polysemy_reference = (
    pd.read_csv("data/external/english/concreteness_w_definition.csv", usecols=["Word"])
    .value_counts()
    # pandas >= 2.0 names this Series 'count' while older versions leave it
    # unnamed (column 0 after reset_index); pin the label to 0 so the column
    # lookup in the comparison cell works on either version.
    .rename(0)
    .reset_index()
    .set_index('Word', drop=True)
)

In [8]:
# For each cutoff percentile, correlate the contemporary polysemy score with
# the per-word count held in the reference dataset.
# The count column's label depends on the pandas version (0 before 2.0,
# 'count' from 2.0 on), so read it from the frame instead of hard-coding it.
reference_column = polysemy_reference.columns[0]

for cutoff_percentile in [75, 80, 85, 90, 95, 96, 97, 98, 99]:
    print(f"Cutoff Percentile: {cutoff_percentile}")
    contemp = pd.read_csv(f"data/processed/english/contemp_polysemy_score_english_{cutoff_percentile}.csv", sep=';', index_col=0)
    merged = contemp.merge(polysemy_reference, how='inner', left_index=True, right_index=True)
    # Correlate only the two columns of interest: a full-frame corr() would
    # raise on pandas >= 2.0 if the score file carried any non-numeric column.
    pair = merged[['contemp_polysemy_score', reference_column]]
    correlation = pair.corr('spearman').loc['contemp_polysemy_score', reference_column]
    print(f"Correlation: {correlation}")
    print()

Cutoff Percentile: 75
Correlation: 0.09696642928195091

Cutoff Percentile: 80
Correlation: 0.10326978117923387

Cutoff Percentile: 85
Correlation: 0.11168319754831471

Cutoff Percentile: 90
Correlation: 0.12709812713166904

Cutoff Percentile: 95
Correlation: 0.15493007744129927

Cutoff Percentile: 96
Correlation: 0.1598535210175586

Cutoff Percentile: 97
Correlation: 0.16541104820575556

Cutoff Percentile: 98
Correlation: 0.16891976528187158

Cutoff Percentile: 99
Correlation: 0.16370377867411312

