In [2]:
"""
    Code for processing Kuperman norms and getting results using handcrafted psycholinguistic features
    For convenience, we have also included preprocessed versions of both datasets in the repo
    Author: Antonio Laverghetta Jr.
    alaverghett@usf.edu
"""
import pandas as pd
import nltk
import numpy as np
from nltk.corpus import wordnet as wn
from scipy.stats import entropy, pearsonr
from os import listdir, rename
from os.path import isfile, join
from statistics import mean
from matplotlib import pyplot as plt

In [2]:
# Load the OpenSubtitles word-frequency table.
# Columns are separated by runs of two or more whitespace characters. A regex
# separator like this is only supported by pandas' Python parsing engine, so we
# request it explicitly instead of relying on the silent fallback (which emits
# a ParserWarning).
frequencies = pd.read_csv(
    'opensubtitles_frequencies.txt',
    sep=r'\s{2,}',
    engine='python',
)
frequencies

Unnamed: 0,text,frequency
0,you,28787591
1,i,27086011
2,the,22761659
3,to,17099834
4,a,14484562
...,...,...
1656991,drain-pipes,1
1656992,baskests,1
1656993,baldinis,1
1656994,anythning,1


In [3]:
# Build the Kuperman feature table: one row per lemma with its word frequency,
# letter count, noun indicator, WordNet synset count and AoA label.
kuperman = (
    pd.read_excel("kuperman_norms.xls")
    # Remove the columns of the raw spreadsheet that are not used as features.
    .drop(columns=[
        'Freq_pm', 'Nphon', 'Nsyll', 'AoA_Kup', 'Perc_known', 'Perc_known_lem',
        'AoA_Bird_lem', 'AoA_Bristol_lem', 'AoA_Cort_lem', 'AoA_Schock',
        'Word', 'Alternative.spelling',
    ])
    # Rows with a missing value in any remaining column are discarded.
    .dropna()
    # Keep only the first row for each lemma.
    .drop_duplicates(subset=['Lemma_highest_PoS'])
    .rename(columns={
        "Lemma_highest_PoS": "text",
        "AoA_Kup_lem": "labels",
        "Dom_PoS_SUBTLEX": "noun",
        "pos_kept": 'pos',
    })
    # Inner join: lemmas without an entry in the frequency table are dropped.
    .merge(frequencies, on='text', how='inner')
)

# Binary noun indicator derived from `pos`: 1 for nouns, 0 for everything else.
# This overwrites the values carried over from the renamed Dom_PoS_SUBTLEX column.
kuperman['noun'] = (kuperman['pos'] == 'Noun').astype('int64')


def _synset_count(word):
    """Number of WordNet synsets for `word`; words with no synsets count as 1."""
    return max(len(wn.synsets(word)), 1)


# get synsets (a single WordNet lookup per word)
kuperman['synsets'] = kuperman['text'].map(_synset_count).astype('int64')

kuperman = kuperman[['text','noun','Nletters','frequency','synsets','labels','pos']]
kuperman

Unnamed: 0,text,noun,Nletters,frequency,synsets,labels,pos
0,a,0,1,14484562,7,2.893384,Article
1,aardvark,1,8,233,1,9.890000,Noun
2,abacus,1,6,256,2,8.690000,Noun
3,abalone,0,7,435,1,12.230000,Verb
4,abandon,0,7,9208,7,8.320000,Verb
...,...,...,...,...,...,...,...
30402,zoom,0,4,3399,5,8.570000,Verb
30403,zoon,1,4,39,1,12.430000,Noun
30404,zucchini,1,8,658,2,6.790000,Noun
30405,zwieback,1,8,13,1,16.100000,Noun


In [5]:
# Persist the Kuperman feature table. The DataFrame index is written as well,
# so it ends up as the first, unnamed column of the CSV file.
kuperman.to_csv('kuperman_psycholinguistic_features.csv', index=True)

In [3]:
"""
Now for the Wordbank data.
Note that this code does NOT annotate for part of speech, as we manually performed this step.
"""

# NOTE(review): this cell rebinds the name `kuperman` to the Wordbank table.
# Re-running the cell that saves 'kuperman_psycholinguistic_features.csv' after
# this one would therefore write Wordbank data to that file.
kuperman = (
    pd.read_csv("AoA_baseline_features.csv")
    .rename(columns={"Word": "text", "AoA": "labels"})
    # Inner join: words without an entry in the frequency table are dropped.
    .merge(frequencies, on='text', how='inner')
)


def _synset_count(word):
    """Number of WordNet synsets for `word`; words with no synsets count as 1."""
    return max(len(wn.synsets(word)), 1)


# get synsets (a single WordNet lookup per word) and word lengths
kuperman['synsets'] = kuperman['text'].map(_synset_count).astype('int64')
kuperman['Nletters'] = kuperman['text'].map(len).astype('int64')

# `category` and `noun` are not created here; they are expected to already be
# present in the input CSV.
kuperman = kuperman[['text','category','noun','Nletters','frequency','synsets','labels']]
kuperman

Unnamed: 0,text,category,noun,Nletters,frequency,synsets,labels
0,cockadoodledoo,sounds,0,14,1,1,25
1,grrr,sounds,0,4,710,1,9
2,meow,sounds,0,4,2889,2,17
3,ouch,sounds,0,4,11779,1,18
4,vroom,sounds,0,5,700,2,17
...,...,...,...,...,...,...,...
595,because,connecting_words,0,7,880112,1,31
596,but,connecting_words,0,3,3631462,1,33
597,if,connecting_words,0,2,2630800,1,0
598,so,connecting_words,0,2,3434152,11,37


In [3]:
"""
    Map Wordbank norms onto a set of discrete labels
"""
wordbank = pd.read_csv("Wordbank_psycholinguistic_features.csv")

# (exclusive lower bound, inclusive upper bound, discrete class) for each bin.
label_bins = [
    (0.0, 20.0, 0),
    (20.0, 25.0, 1),
    (25.0, 52.0, 2),
]

# Compare against a snapshot of the raw labels: the classes written below
# (1 and 2) would otherwise fall into the first bin and be re-mapped.
raw_labels = wordbank['labels'].copy()
for lower, upper, label_class in label_bins:
    in_bin = (raw_labels > lower) & (raw_labels <= upper)
    wordbank.loc[in_bin, 'labels'] = label_class

# NOTE: values outside (0, 52] -- including exactly 0 and missing values --
# match no bin and are left unchanged.

wordbank.to_csv("Wordbank_psycholinguistic_features_classification.csv")