In [1]:
import os
import warnings
import pandas as pd
from scipy.stats import spearmanr
import matplotlib.pyplot as plt
import seaborn as sns

import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.genmod.families import Gaussian
from statsmodels.genmod.families.links import identity
from sklearn.preprocessing import StandardScaler

from src import config
from src import process

In [2]:
# Install a global filter that silences every RuntimeWarning for the rest of the
# kernel session, so such warnings do not appear in the cell outputs below.
warnings.filterwarnings("ignore", category=RuntimeWarning)

In [3]:
def get_spearman_corr(df, column, name, conc, plot=False):
    """Report the Spearman rank correlation between concreteness and ``df[column]``.

    ``conc`` and ``df[[column]]`` are inner-joined on their indexes, rows that
    contain any missing value are dropped, and the correlation between the
    ``'concreteness'`` column and ``column`` is computed and printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the score to compare; must contain ``column``.
    column : str
        Name of the column in ``df`` to correlate with concreteness.
    name : str
        Human-readable label used in the printed header.
    conc : pandas.DataFrame
        Frame with a ``'concreteness'`` column, indexed compatibly with ``df``
        (presumably by word -- confirm against the caller).
    plot : bool, default False
        When True, draw a joint scatter plot, save it under
        ``config.FIGURES_DIR`` and show it.

    Returns
    -------
    tuple
        ``(correlation, p_value)`` as returned by ``scipy.stats.spearmanr``,
        so callers can reuse the statistics instead of parsing the printout.
    """
    # create merged dataset (only index labels present in both frames survive)
    merged = pd.merge(left=conc, right=df[[column]], left_index=True, right_index=True, how='inner')
    merged = merged.dropna()

    # get spearman correlation
    correlation, p_value = spearmanr(merged['concreteness'], merged[column])

    # print results
    print(f'Concreteness and {name}')
    print(f'Spearman Correlation Coefficient: {correlation:.4f}')
    print(f'P-value: {p_value:.4e}')

    if plot:
        # plot results and save figure
        grid = sns.jointplot(x="concreteness", y=column, data=merged,
                             kind="scatter", joint_kws={"s": 20, "alpha": 0.2})
        # NOTE(review): the 'joinplot' spelling is kept so existing figure paths stay valid.
        fig_name = f"joinplot_concreteness_{column}.png"
        plt.subplots_adjust(left=0.2)
        # Save through the grid itself so the file always contains the figure
        # drawn above, independent of which figure pyplot considers current.
        grid.savefig(os.path.join(config.FIGURES_DIR, fig_name), bbox_inches='tight')
        plt.show()

    return correlation, p_value
    

In [4]:
def get_glm(df, column, name, conc, freq, plot=False):
    """Fit and print a Gaussian GLM of ``column`` on concreteness and frequency.

    ``conc``, ``df[[column]]`` and ``freq`` are inner-joined on their indexes,
    rows with any missing value are dropped, and every column is standardized
    with ``StandardScaler``. The model ``column ~ concreteness*frequency``
    (both main effects plus their interaction) is then fitted with a Gaussian
    family and identity link, and its summary is printed under ``name``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the response; must contain ``column``.
    column : str
        Name of the response column; it is interpolated into the formula.
    name : str
        Human-readable label printed above the model summary.
    conc : pandas.DataFrame
        Frame providing the ``concreteness`` predictor.
    freq : pandas.DataFrame
        Frame expected to provide the ``frequency`` predictor used by the
        formula (assumption based on the formula -- confirm against
        ``process.get_most_frequent_words``).
    plot : bool, default False
        Currently unused; kept so the signature mirrors ``get_spearman_corr``.

    Returns
    -------
    The fitted statsmodels GLM results object, so callers can inspect
    coefficients programmatically instead of reading the printed summary.
    """
    # create merged dataset (only index labels present in all three frames survive)
    merged = pd.merge(left=conc, right=df[[column]], left_index=True, right_index=True, how='inner')
    merged = pd.merge(left=merged, right=freq, left_index=True, right_index=True, how='inner')
    merged = merged.dropna()

    # scale merged dataset; fit_transform returns a bare array, so the index is
    # passed back explicitly to keep each row attached to its original label
    # NOTE(review): this requires every merged column to be numeric -- confirm.
    scaler = StandardScaler()
    scaled_data = scaler.fit_transform(merged)
    scaled = pd.DataFrame(scaled_data, columns=merged.columns, index=merged.index)

    # glm(polysemy ~ conc*freq, data = data, family = gaussian(link="identity"))
    # NOTE(review): newer statsmodels releases appear to prefer the capitalised
    # `Identity` link class over this lowercase alias -- confirm for the pinned version.
    formula = f"{column} ~ concreteness*frequency"
    model = smf.glm(formula=formula, data=scaled, family=Gaussian(link=identity())).fit()

    # label the summary so consecutive model printouts can be told apart
    print(name)
    print(model.summary())
    return model

In [5]:
def run(language):
    """Run every concreteness analysis for one language and print the results.

    Reads the historic and contemporary polysemy scores and the concreteness
    ratings from ``config.PROCESSED_DATA_DIR/<language>``, obtains frequency
    data through ``process.get_most_frequent_words``, and then, for each of
    three score columns, prints the Spearman correlation followed by the GLM
    summary. Consecutive analyses are separated by blank lines.

    Parameters
    ----------
    language : str
        Language key used for the data sub-folder, the file names and the
        lookup in ``config.FREQUENCY_FILENAMES``.
    """
    data_dir = os.path.join(config.PROCESSED_DATA_DIR, language)

    # all processed files are ';'-separated
    hist = pd.read_csv(os.path.join(data_dir, f'hist_polysemy_score_{language}.csv'), sep=';', index_col=0)
    contemp = pd.read_csv(os.path.join(data_dir, f'contemp_polysemy_score_{language}.csv'), sep=';', index_col=0)
    conc = pd.read_csv(os.path.join(data_dir, f'concreteness_{language}.csv'), sep=';', index_col='Word')
    freq = process.get_most_frequent_words(
        input_dir=os.path.join(config.EXTERNAL_DATA_DIR, language),
        input_file=config.FREQUENCY_FILENAMES[language],
        language=language,
        nr_words=20_000,
        vocab_only=False)

    # (scores, column, label for the correlation, label for the GLM), in report order
    analyses = (
        (hist, 'slope',
         f'Polysemy Score Evolution {language}',
         f"Polysemy score Evolution with Frequency dependence {language}"),
        (contemp, 'contemp_polysemy_score',
         f'Contemporary Polysemy Score {language}',
         f"Contemporary Polysemy score with Frequency dependence {language}"),
        (hist, 'polysemy_score_1990',
         f'Historic Polysemy Score 1990s {language}',
         f"History Polysemy score 1990s with Frequency dependence {language}"),
    )

    for position, (scores, column, corr_label, glm_label) in enumerate(analyses):
        if position:
            # visual gap between analyses; nothing is printed before the first one
            print("\n\n\n")
        get_spearman_corr(scores, column, corr_label, conc)
        get_glm(scores, column, glm_label, conc, freq)

## German

In [6]:
# Print the Spearman correlations and GLM summaries for the German data.
run(language="german")

Concreteness and Polysemy Score Evolution german
Spearman Correlation Coefficient: 0.2739
P-value: 1.1263e-26
                 Generalized Linear Model Regression Results                  
Dep. Variable:                  slope   No. Observations:                 1468
Model:                            GLM   Df Residuals:                     1464
Model Family:                Gaussian   Df Model:                            3
Link Function:               identity   Scale:                         0.91962
Method:                          IRLS   Log-Likelihood:                -2019.5
Date:                Wed, 13 Dec 2023   Deviance:                       1346.3
Time:                        08:32:42   Pearson chi2:                 1.35e+03
No. Iterations:                     3   Pseudo R-squ. (CS):            0.08619
Covariance Type:            nonrobust                                         
                             coef    std err          z      P>|z|      [0.025      0.975]
---------

## English

In [7]:
# Print the Spearman correlations and GLM summaries for the English data.
run(language="english")

Concreteness and Polysemy Score Evolution english
Spearman Correlation Coefficient: -0.0542
P-value: 4.0462e-11
                 Generalized Linear Model Regression Results                  
Dep. Variable:                  slope   No. Observations:                14842
Model:                            GLM   Df Residuals:                    14838
Model Family:                Gaussian   Df Model:                            3
Link Function:               identity   Scale:                         0.99593
Method:                          IRLS   Log-Likelihood:                -21028.
Date:                Wed, 13 Dec 2023   Deviance:                       14778.
Time:                        08:32:43   Pearson chi2:                 1.48e+04
No. Iterations:                     3   Pseudo R-squ. (CS):           0.004352
Covariance Type:            nonrobust                                         
                             coef    std err          z      P>|z|      [0.025      0.975]
-------

## French

In [8]:
# Print the Spearman correlations and GLM summaries for the French data.
run(language="french")

Concreteness and Polysemy Score Evolution french
Spearman Correlation Coefficient: -0.3043
P-value: 6.8459e-25
                 Generalized Linear Model Regression Results                  
Dep. Variable:                  slope   No. Observations:                 1454
Model:                            GLM   Df Residuals:                     1450
Model Family:                Gaussian   Df Model:                            3
Link Function:               identity   Scale:                         0.94165
Method:                          IRLS   Log-Likelihood:                -2017.4
Date:                Wed, 13 Dec 2023   Deviance:                       1365.4
Time:                        08:32:44   Pearson chi2:                 1.37e+03
No. Iterations:                     3   Pseudo R-squ. (CS):            0.06267
Covariance Type:            nonrobust                                         
                             coef    std err          z      P>|z|      [0.025      0.975]
--------

## Compare cutoff percentiles 

In [9]:
# load reference dataset: count how many rows each word has in the
# definitions file, then index the resulting counts by word
polysemy_reference = (
    pd.read_csv("data/external/english/concreteness_w_definition.csv", usecols=["Word"])
    .value_counts()
    .reset_index()
    .set_index('Word', drop=True)
)

In [10]:
# load other reference: a ';'-separated file read from the current working
# directory, with its first column used as the index
polysemy_reference_ = pd.read_csv('polysemy_from_diacronic.csv', sep=';', index_col=0)

In [11]:
# For each cutoff percentile, correlate the contemporary polysemy score computed
# with that cutoff against the reference polysemy measure.
for cutoff_percentile in (75, 80, 85, 90, 95, 96, 97, 98, 99):
    print(f"Cutoff Percentile: {cutoff_percentile}")
    contemp = pd.read_csv(
        f"data/processed/english/contemp_polysemy_score_english_{cutoff_percentile}.csv",
        sep=';',
        index_col=0,
    )
    # keep only the index labels present in both the scores and the reference
    merged = pd.merge(contemp, polysemy_reference_, how='inner', left_index=True, right_index=True)
    # change 'word' to 0 for other dataset (presumably `polysemy_reference`)
    correlation = merged.corr(method='spearman').loc['contemp_polysemy_score', 'word']
    # the trailing newline leaves an empty line between percentiles
    print(f"Correlation: {correlation}\n")

Cutoff Percentile: 75
Correlation: 0.1856562251272132

Cutoff Percentile: 80
Correlation: 0.188895760230789

Cutoff Percentile: 85
Correlation: 0.18798944245805527

Cutoff Percentile: 90
Correlation: 0.18664202957580978

Cutoff Percentile: 95
Correlation: 0.17560989044088063

Cutoff Percentile: 96
Correlation: 0.1666502282402278

Cutoff Percentile: 97
Correlation: 0.15882297459383482

Cutoff Percentile: 98
Correlation: 0.14201753593960414

Cutoff Percentile: 99
Correlation: 0.1133414986139982

