# Linear Regression: ElasicNet penalization

## New Version: K-fold + Fold-wise: Test on one fold, Train on the rest, with spearman correlation per fold averaged

- ElasticNet penalization
- perforance measures: R2, MAE, spearman rho

In [None]:
import pandas as pd
import numpy as np

from numpy import arange
from collections import defaultdict
from sklearn.linear_model import ElasticNet
from sklearn.metrics import mean_absolute_error,r2_score
from scipy.stats import spearmanr, kendalltau
from statistics import mean, stdev
from sklearn.model_selection import GridSearchCV
import matplotlib.pyplot as plt

In [None]:
# split dataset into N folds

dataset = 'filtered'
wordclass = 'nouns'
#wordclass = 'verbs'
#wordclass = 'adj'

foldN = 10

df = '/compLing/students/hiwi-theses/data/aylin.wahl/MUDCATData/M32_CosDisTable/NORMnoNAtypes_'+wordclass+'_'+dataset+'_CosDisTable32.csv'
#df= '/compLing/students/hiwi-theses/data/aylin.wahl/MUDCATData/M4_POStable/NORMnoNAtypes_'+wordclass+'_filtered_POStable.csv'
#df = '/compLing/students/hiwi-theses/data/aylin.wahl/MUDCATData/M3_CosDisTable/NORMnoNAtypes_'+wordclass+'_'+dataset+'_CosDisTable.csv'
#df = '/compLing/students/hiwi-theses/data/aylin.wahl/MUDCATData/original_data/targets_Brysbaert_' +dataset+'/'+'NORMnoNAtypes_'+wordclass+'_'+dataset+'_AssocTable.csv'
df = pd.read_csv(df)
df = df.drop(['Unnamed: 0'], axis=1)
#df = df.to_numpy()

#name folds in kfold column
from sklearn import model_selection
kf = model_selection.KFold(n_splits=foldN)   # initiate the k-fold class from model_selection module
for fold, (trn_, val_) in enumerate(kf.split(X=df)):  # fill the new kfold column
    df.loc[val_, 'kfold'] = fold
#split dataset df0 up to df(foldN-1)
folds = []
for num in range(0,foldN):
    globals()['df'+str(num)] = df[df['kfold'] == num]
    globals()['df'+str(num)] = globals()['df'+str(num)].drop(['kfold'],axis=1) #delete kfold column
    folds.append(globals()['df'+str(num)])  #list with the folddataframes

with open("/compLing/students/hiwi-theses/projects/aylin.wahl/M32_noNAfinaltypes_LinElasticNet_Stats.txt", "a") as f:
    print('New Version: K-fold + Fold-wise',file=f)
    print('NORMnoNAtypes_'+wordclass+'_'+dataset+'_'+str(foldN)+': (fullamount,X-1=featureamount) '+str(df.shape),file =f)
    print('NORMnoNAtypes_'+wordclass+'_'+dataset+'_'+str(foldN)+': (foldamount,X-1=featureamount) '+str(df0.shape),file=f)
#

In [None]:
# loop: ElasticNet Regression training on foldN-1, prediction on leftout fold, spearmancorrelation for those predictions


#ElasticNet REGRESSION WITH GRIDSEARCH HYPERPARAMETERS
r2 = []
mae = []
spearmancorr = []
spearmanp = []
kendallp = []
kendallcorr = []
bestestimator = []
bestscore = []
bestparams = []
bestindex = []
score = []

for i in range(0,len(folds)):
    # getting train and test folds for this itteration
    print('TESTED FOLD INDEX:' + str(i))
    subfolds = folds.copy() #copy folds dataframes 
    del subfolds[i]         #delete testing dataframe
    train = pd.concat(subfolds)  #concat all dataframes except testing one for training
    train = train.to_numpy()
    test = folds[i].to_numpy()  #get testing fold
    X = train[:, 1:]    #assoc vectors: predictors
    y = train[:, 0]     #CS: response variable
    Xtest = test[:, 1:] #assoc vectors: predictors
    ytest = test[:, 0]  #CS: response variable
    
    # ElasticNet REGRESSION
    alphas = [1e-05]        # arange(1e-6, 1e-4, 5e-5) 
    l1ratio = [0]    # 0 means L2 peanlty, 1 means L1 penalty, between mix
    tol = [1e-2]     # optimization tolerance (with 1e-4 not converging)
    params = [{ 'alpha' : alphas,
               'l1_ratio' : l1ratio,
               'tol' : tol
              }]
    model = ElasticNet(copy_X = True, max_iter = 1000, selection ='random', random_state = 1)  #define model
    # grid searching through best hyperparameters, cv is set to do no crossvalidation, refit does fit the model with the best parameters so model can be used for prediction afterwards
    cv = GridSearchCV(model, param_grid=params, cv=[(slice(None), slice(None))],scoring='neg_mean_absolute_error', return_train_score= False,refit=True)
    cv.fit(X,y)
    bestestimator.append(cv.best_estimator_)
    bestscore.append(cv.best_score_)
    bestparams.append(cv.best_params_)
    bestindex.append(cv.cv_results_['params'][cv.best_index_])
    sc = cv.score(X,y)
    score.append(sc)

    # Predicting test fold
    #setting up output dictionary
    out = {'CS HUMAN': [],'CS MODEL': []}
    out = defaultdict(list)
    #itterating through vectors
    v = 0
    while v<= (test.shape[0]-1):
        vector = Xtest[v,:]
        pred = round(float(cv.predict([vector])),2)
        out['CS MODEL'].append(pred)             #adding predicted value
        out['CS HUMAN'].append(ytest[v])         #adding human CS
        v += 1
    
    #correlation measures of human and model concreteness scores
    corr,pval = spearmanr(out['CS HUMAN'],out['CS MODEL'])
    corr1,pval1 = kendalltau(out['CS HUMAN'],out['CS MODEL'])
    rtwo = r2_score(out['CS HUMAN'],out['CS MODEL'])
    r2.append(round(rtwo,3))
    mae.append(mean_absolute_error(out['CS HUMAN'],out['CS MODEL']))
    spearmancorr.append(round(corr,3))
    spearmanp.append(pval)
    kendallp.append(pval1)
    kendallcorr.append(round(corr1,3))
    print('SPEARMAN CORRELATION: '+ str(corr))
    print('R2: ' + str(rtwo))
    print('MAE: ' + str(mean_absolute_error(out['CS HUMAN'],out['CS MODEL'])))

with open("/compLing/students/hiwi-theses/projects/aylin.wahl/M32_noNAfinaltypes_LinElasticNet_Stats.txt", "a") as f:
#if close to -1/1 both scores are pos/neg correlating, if p-value smaller 0.05 correl statistically significant
    #print('Best Estimator: '+str(cv.best_estimator_),file=f)
    print('Scorer: '+str(cv.scorer_),file=f)
    print('Number of Splits: '+str(cv.n_splits_),file=f)
    print('Number of Features: '+str(cv.n_features_in_),file=f)
    print('Best Scores: ' + str(bestscore), file=f)
    print('Best Scores Mean: ' + str(mean(bestscore)), file=f)
    print('Best Params: ' + str(bestparams), file=f)
    print('Seconds for Best fitting: '+str(cv.refit_time_),file=f)
    print('Spearman correaltion: ' + str(spearmancorr), file=f)
    print('Spearman corr Mean: ' + str(mean(spearmancorr)), file=f)
    print('Spearman Stddev.: '+str(stdev(spearmancorr)),file=f)
    print('Spearman p-value: ' + str(mean(spearmanp)), file=f)
    print('Spearman p-value Mean: ' + str(spearmanp), file=f)
    print('Kendall correlation: ' + str(kendallcorr), file=f)
    print('Kendall corr Mean: ' + str(mean(kendallcorr)), file=f)
    print('Kendall Stddev.: '+str(stdev(kendallcorr)),file=f)
    print('Kendall p-value: ' + str(kendallp), file=f)
    print('Kendall p-value Mean: ' + str(mean(kendallp)), file=f)
    print('MeanAbsoluteError on testdata: '+str(mae),file=f)
    print('Mean of MeanAbsoluteError on testdata: '+str(mean(mae)),file=f)
    print('R2 on testdata: ' + str(r2), file=f)
    print('R2 Mean on testdata: ' + str(mean(r2)), file=f) 
    print('Score R2 on traindata: '+str(score),file=f)
    print('Best Estimator: '+ str(bestestimator),file=f)    #see if always the same then enable the one above without listing
    print('Best Parametersettings: ' +str(bestindex),file=f)
    print("--------------------------------------------------------------------",file=f)

with open("/compLing/students/hiwi-theses/projects/aylin.wahl/CVRESULTSM32_noNAfinaltypes_LinElasticNet.txt", "a") as f:
    print('K-fold ElasticNet, Linear, '+wordclass+', '+str(foldN),file=f)
    print('alphas: '+str(alphas)+', l1-ratio: '+str(l1ratio)+', tol: '+str(tol),file=f)
    print(cv.cv_results_,file=f)
    print('----------------------------------------------------------------------------------------------------------',file=f)

print('FINISHED :)')

## New Version: 1-Fold - Extreme Set: Test on every Nth datapoint of the extreme dataset, Training on filtered dataset minus the extremes for testing, spearman on the predictions of the extreme testset

- penalization ElasticNet
- performance measures: MAE, speramn rho, R2


In [None]:
# imports
import pandas as pd
import numpy as np

from numpy import arange
from collections import defaultdict
from sklearn.linear_model import ElasticNet
from sklearn.metrics import mean_absolute_error,r2_score
from scipy.stats import spearmanr, kendalltau
from statistics import mean
from sklearn.model_selection import GridSearchCV
import matplotlib.pyplot as plt

In [None]:
# collect every nth out of extreme dataset as test

dataset = 'extreme'
wordclass = 'nouns'
#wordclass = 'verbs'
#wordclass = 'adj'

nth = 4


df = '/compLing/students/hiwi-theses/data/aylin.wahl/MUDCATData/original_data/targets_Brysbaert_' +dataset+'/'+'NORMnoNAtypes_'+wordclass+'_'+dataset+'_AssocTable.csv'
train = '/compLing/students/hiwi-theses/data/aylin.wahl/MUDCATData/M32_CosDisTable/NORMnoNAtypes_'+wordclass+'_filtered_CosDisTable32.csv'
#train = '/compLing/students/hiwi-theses/data/aylin.wahl/MUDCATData/M4_POStable/NORMnoNAtypes_'+wordclass+'_filtered_POStable.csv'
#train = '/compLing/students/hiwi-theses/data/aylin.wahl/MUDCATData/M3_CosDisTable/NORMnoNAtypes_'+wordclass+'_filtered_CosDisTable.csv'
#train = '/compLing/students/hiwi-theses/data/aylin.wahl/MUDCATData/original_data/targets_Brysbaert_filtered/NORMnoNAtypes_'+wordclass+'_filtered_AssocTable.csv'
train = pd.read_csv(train)
print(train.shape)
df = pd.read_csv(df)

#get every nth abst and conc target of extreme
abst = df.loc[df['CS'] < 2.5,:]
conc = df.loc[df['CS'] > 2.5,:]
target = []
target.append(abst.iloc[::nth, :] ) #select every nth row and add dataframe to list
target.append(conc.iloc[::nth, :] ) #select every nth row and add dataframe to list
target = pd.concat(target)
print(target.shape)
testtargets = list(target['Unnamed: 0']) # get test targets

#select testtargets out of filtered dataframe to have same vectors
test = pd.DataFrame(columns=list(train.columns))
for i in testtargets:
    test = test.append(train.loc[train['Unnamed: 0'] == i], ignore_index = True)
print(test.shape)

# delete testtargets out of filtered train dataframe
for i in testtargets:   #itterating through test targets
    train.drop(train.loc[train['Unnamed: 0']==i].index, inplace=True)  #dropping rows in df that are in test
print(train.shape)
#shuffle and clean train and test
train = train.drop(['Unnamed: 0'], axis=1) #drop targets for regression 
test = test.drop(['Unnamed: 0'], axis=1) #drop targets for regression
train = train.sample(frac=1).reset_index(drop=True) #shuffeling rows
test = test.sample(frac=1).reset_index(drop=True) #shuffeling rows
train = train.to_numpy()
test = test.to_numpy()
print(train.shape)
print(test.shape)

with open("/compLing/students/hiwi-theses/projects/aylin.wahl/M32_noNAfinaltypes_LinElasticNet_Stats.txt", "a") as f:
    print('New Version: 1-Fold - Extreme Set',file=f)
    print('NORMnoNAtypes_'+wordclass+'_'+dataset+'_'+'1/'+str(nth)+': (trainamount,X-1=featureamount) '+str(train.shape),file =f)
    print('NORMnoNAtypes_'+wordclass+'_'+dataset+'_'+'1/'+str(nth)+': (testamount,X-1=featureamount) '+str(test.shape),file=f)



In [None]:
# regression: test on 1/n of dataframe and training on rest, spearmancollection for those predictions
 
X = train[:, 1:]    #assoc vectors: predictors
y = train[:, 0]     #CS: response variable
Xtest = test[:, 1:] #assoc vectors: predictors
ytest = test[:, 0]  #CS: response variable

# ElasticNet REGRESSION
alphas = [1e-4]        # arange(1e-6, 1e-4, 5e-5) 
l1ratio = [0.75]    # 0 means L2 peanlty, 1 means L1 penalty, between mix
tol = [1e-2]     # optimization tolerance (with 1e-4 not converging)
params = [{ 'alpha' : alphas,
            'l1_ratio' : l1ratio,
            'tol' : tol
        }]
model = ElasticNet(copy_X = True, max_iter = 1000, selection ='random', random_state = 1)  #define model
# grid searching through best hyperparameters, cv is set to do no crossvalidation, refit does fit the model with the best parameters so model can be used for prediction afterwards
cv = GridSearchCV(model, param_grid=params, cv=[(slice(None), slice(None))],scoring='neg_mean_absolute_error', return_train_score= False,refit=True)
cv.fit(X,y)

# Predicting test fold
#setting up output dictionary
out = {'CS HUMAN': [],'CS MODEL': []}
out = defaultdict(list)
#itterating through vectors
v = 0
while v<= (test.shape[0]-1):
    vector = Xtest[v,:]
    pred = round(float(cv.predict([vector])),2)
    out['CS MODEL'].append(pred)             #adding predicted value
    out['CS HUMAN'].append(ytest[v])         #adding human CS
    v += 1

# Correaltion measures on all predicted CS values in 'out'
#correlation measures of human and model concreteness scores
corr,pval = spearmanr(out['CS HUMAN'],out['CS MODEL'])
corr1,pval1 = kendalltau(out['CS HUMAN'],out['CS MODEL'])
#print(out)
print('SPEARMAN CORRELATION: '+ str(corr))

with open("/compLing/students/hiwi-theses/projects/aylin.wahl/M32_noNAfinaltypes_LinElasticNet_Stats.txt", "a") as f:
#if close to -1/1 both scores are pos/neg correlating, if p-value smaller 0.05 correl statistically significant
    print('Scorer: '+str(cv.scorer_),file=f)
    print('Number of Splits: '+str(cv.n_splits_),file=f)
    print('Number of Features: '+str(cv.n_features_in_),file=f)
    print('Best Scores: ' + str(cv.best_score_), file=f)
    print('Best Params: ' + str(cv.best_params_), file=f)
    print('Seconds for Best fitting: '+str(cv.refit_time_),file=f)
    print('Spearman correaltion: ' + str(corr), file=f)
    print('Spearman p-value: ' + str(pval), file=f)
    print('Kendall correlation: ' + str(corr1), file=f)
    print('Kendall p-value: ' + str(pval1), file=f)
    print('MeanAbsoluteError on testdata: '+str(mean_absolute_error(out['CS HUMAN'],out['CS MODEL'])),file=f)
    print('R2 on testdata: ' + str(r2_score(out['CS HUMAN'],out['CS MODEL'])), file=f)
    print('Score R2 on traindata: '+str(cv.score(X,y)),file=f)
    print('Best Estimator: '+ str(cv.best_estimator_),file=f)
    print('Best Parametersettings: ' +str(cv.cv_results_['params'][cv.best_index_]),file=f)
    print("--------------------------------------------------------------------",file=f)

    
with open("/compLing/students/hiwi-theses/projects/aylin.wahl/CVRESULTSM32_noNAfinaltypes_LinElasticNet.txt", "a") as f:
    print('1-fold ElasticNet, Linear, '+wordclass+', '+str(nth),file=f)
    print('alphas: '+str(alphas)+', l1-ratio: '+str(l1ratio)+', tol: '+str(tol),file=f)
    print(cv.cv_results_,file=f)
    print('----------------------------------------------------------------------------------------------------------',file=f)

print('FINISHED :)')

# New Version: 4-Fold - Extreme Set: Test on every Nth datapoint of the extreme dataset, Training on filtered dataset minus the extremes for testing, spearman on the predictions of the extreme testset


In [1]:
# imports
import pandas as pd
import numpy as np

from numpy import arange
from collections import defaultdict
from sklearn.linear_model import ElasticNet
from sklearn.metrics import mean_absolute_error,r2_score
from scipy.stats import spearmanr, kendalltau
from statistics import mean
from sklearn.model_selection import GridSearchCV
import matplotlib.pyplot as plt

In [2]:
# collect every nth out of extreme dataset as test

dataset = 'extreme'
wordclass = 'nouns'
#wordclass = 'verbs'
#wordclass = 'adj'

nth = 4


df = '/compLing/students/hiwi-theses/data/aylin.wahl/MUDCATData/original_data/targets_Brysbaert_' +dataset+'/'+'NORMnoNAtypes_'+wordclass+'_'+dataset+'_AssocTable.csv'
train = '/compLing/students/hiwi-theses/data/aylin.wahl/MUDCATData/M32_CosDisTable/NORMnoNAtypes_'+wordclass+'_filtered_CosDisTable32.csv'
#train = '/compLing/students/hiwi-theses/data/aylin.wahl/MUDCATData/M4_POStable/NORMnoNAtypes_'+wordclass+'_filtered_POStable.csv'
#train = '/compLing/students/hiwi-theses/data/aylin.wahl/MUDCATData/M3_CosDisTable/NORMnoNAtypes_'+wordclass+'_filtered_CosDisTable.csv'
#train = '/compLing/students/hiwi-theses/data/aylin.wahl/MUDCATData/original_data/targets_Brysbaert_filtered/NORMnoNAtypes_'+wordclass+'_filtered_AssocTable.csv'
train = pd.read_csv(train)
print(train.shape)
df = pd.read_csv(df)

#get every nth abst and conc target of extreme for FOLD1
abst = df.loc[df['CS'] < 2.5,:]
conc = df.loc[df['CS'] >= 2.5,:]
target1 = []
target1.append(abst.iloc[::4, :] ) #select every nth row and add dataframe to list
target1.append(conc.iloc[::4, :] ) #select every nth row and add dataframe to list
target1 = pd.concat(target1)
testtargets1 = list(target1['Unnamed: 0']) # get test targets fold 1
#select testtargets out of filtered dataframe to have same vectors
test1 = pd.DataFrame(columns=list(train.columns))
for i in testtargets1:
    test1 = test1.append(train.loc[train['Unnamed: 0'] == i], ignore_index = True)
print(test1.shape)
# delete testtargets out of filtered train dataframe
train1 = train.copy()
for i in testtargets1:   #itterating through test targets
    train1.drop(train1.loc[train1['Unnamed: 0']==i].index, inplace=True)  #dropping rows in df that are in test
# delete testtargets out of extreme abst and conc 
for i in testtargets1:   #itterating through test targets
    if i in list(abst['Unnamed: 0']):
        abst.drop(abst.loc[abst['Unnamed: 0']==i].index, inplace=True)  #dropping rows in df that are in test
    else:
        conc.drop(conc.loc[conc['Unnamed: 0']==i].index, inplace=True)  #dropping rows in df that are in test
print('Train1 Fold: '+str(train1.shape))

#get every nth abst and conc target of extreme for FOLD2
target2 = []
target2.append(abst.iloc[::3, :] ) #select every nth row and add dataframe to list
target2.append(conc.iloc[::3, :] ) #select every nth row and add dataframe to list
target2 = pd.concat(target2)
testtargets2 = list(target2['Unnamed: 0']) # get test targets fold 1
#select testtargets out of filtered dataframe to have same vectors
test2 = pd.DataFrame(columns=list(train.columns))
for i in testtargets2:
    test2 = test2.append(train.loc[train['Unnamed: 0'] == i], ignore_index = True)
print(test2.shape)
# delete testtargets out of filtered train dataframe
train2 = train.copy()
for i in testtargets2:   #itterating through test targets
    train2.drop(train2.loc[train2['Unnamed: 0']==i].index, inplace=True)  #dropping rows in df that are in test
# delete testtargets out of extreme abst and conc 
for i in testtargets2:   #itterating through test targets
    if i in list(abst['Unnamed: 0']):
        abst.drop(abst.loc[abst['Unnamed: 0']==i].index, inplace=True)  #dropping rows in df that are in test
    else:
        conc.drop(conc.loc[conc['Unnamed: 0']==i].index, inplace=True)  #dropping rows in df that are in test
print('Train2 Fold: '+str(train2.shape))

#get every nth abst and conc target of extreme for FOLD3
target3 = []
target3.append(abst.iloc[::2, :] ) #select every nth row and add dataframe to list
target3.append(conc.iloc[::2, :] ) #select every nth row and add dataframe to list
target3 = pd.concat(target3)
testtargets3 = list(target3['Unnamed: 0']) # get test targets fold 1
#select testtargets out of filtered dataframe to have same vectors
test3 = pd.DataFrame(columns=list(train.columns))
for i in testtargets3:
    test3 = test3.append(train.loc[train['Unnamed: 0'] == i], ignore_index = True)
print(test3.shape)
# delete testtargets out of filtered train dataframe
train3 = train.copy()
for i in testtargets3:   #itterating through test targets
    train3.drop(train3.loc[train3['Unnamed: 0']==i].index, inplace=True)  #dropping rows in df that are in test
# delete testtargets out of extreme abst and conc 
for i in testtargets3:   #itterating through test targets
    if i in list(abst['Unnamed: 0']):
        abst.drop(abst.loc[abst['Unnamed: 0']==i].index, inplace=True)  #dropping rows in df that are in test
    else:
        conc.drop(conc.loc[conc['Unnamed: 0']==i].index, inplace=True)  #dropping rows in df that are in test
print('Train3 Fold: '+str(train2.shape))

#get every nth abst and conc target of extreme for FOLD4
target4 = []
target4.append(abst) #select every nth row and add dataframe to list
target4.append(conc) #select every nth row and add dataframe to list
target4 = pd.concat(target4)
testtargets4 = list(target4['Unnamed: 0']) # get test targets fold 1
#select testtargets out of filtered dataframe to have same vectors
test4 = pd.DataFrame(columns=list(train.columns))
for i in testtargets4:
    test4 = test4.append(train.loc[train['Unnamed: 0'] == i], ignore_index = True)
print(test4.shape)
# delete testtargets out of filtered train dataframe
train4 = train.copy()
for i in testtargets4:   #itterating through test targets
    train4.drop(train4.loc[train4['Unnamed: 0']==i].index, inplace=True)  #dropping rows in df that are in test
# delete testtargets out of extreme abst and conc 
#for i in testtargets4:   #itterating through test targets
#    if i in list(abst['Unnamed: 0']):
#        abst.drop(abst.loc[abst['Unnamed: 0']==i].index, inplace=True)  #dropping rows in df that are in test
#    else:
#        conc.drop(conc.loc[conc['Unnamed: 0']==i].index, inplace=True)  #dropping rows in df that are in test
print('Train4 Fold: '+str(train2.shape))

(3698, 27608)


  test1 = test1.append(train.loc[train['Unnamed: 0'] == i], ignore_index = True)


(186, 27608)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  abst.drop(abst.loc[abst['Unnamed: 0']==i].index, inplace=True)  #dropping rows in df that are in test
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  conc.drop(conc.loc[conc['Unnamed: 0']==i].index, inplace=True)  #dropping rows in df that are in test


Train1 Fold: (3512, 27608)


  test2 = test2.append(train.loc[train['Unnamed: 0'] == i], ignore_index = True)


(186, 27608)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  abst.drop(abst.loc[abst['Unnamed: 0']==i].index, inplace=True)  #dropping rows in df that are in test
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  conc.drop(conc.loc[conc['Unnamed: 0']==i].index, inplace=True)  #dropping rows in df that are in test


Train2 Fold: (3512, 27608)


  test3 = test3.append(train.loc[train['Unnamed: 0'] == i], ignore_index = True)


(184, 27608)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  abst.drop(abst.loc[abst['Unnamed: 0']==i].index, inplace=True)  #dropping rows in df that are in test
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  conc.drop(conc.loc[conc['Unnamed: 0']==i].index, inplace=True)  #dropping rows in df that are in test


Train3 Fold: (3512, 27608)


  test4 = test4.append(train.loc[train['Unnamed: 0'] == i], ignore_index = True)


(184, 27608)
Train4 Fold: (3512, 27608)


In [9]:
# choosing the tetsfold for this run
#testfold = '1'
#testfold = '2'
#testfold = '3'
testfold = '4'

#train = train1
#train = train2
#train = train3
train = train4

#test = test1
#test = test2
#test = test3
test = test4

#shuffle and clean train and test
train = train.drop(['Unnamed: 0'], axis=1) #drop targets for regression 
test = test.drop(['Unnamed: 0'], axis=1) #drop targets for regression
train = train.sample(frac=1).reset_index(drop=True) #shuffeling rows
test = test.sample(frac=1).reset_index(drop=True) #shuffeling rows
train = train.to_numpy()
test = test.to_numpy()
print(train.shape)
print(test.shape)

with open("/compLing/students/hiwi-theses/projects/aylin.wahl/M32_noNAfinaltypes_LinElasticNet_Stats.txt", "a") as f:
    print('New Version: 1-Fold - Extreme Set',file=f)
    print('NORMnoNAtypes_'+wordclass+'_'+dataset+'_'+'1/'+str(nth)+': (trainamount,X-1=featureamount) '+str(train.shape),file =f)
    print('NORMnoNAtypes_'+wordclass+'_'+dataset+'_'+'1/'+str(nth)+': (testamount,X-1=featureamount) '+str(test.shape),file=f)
    print('FoldNumber: ' + testfold,file=f)

(3514, 27607)
(184, 27607)


In [10]:
# regression: test on 1/n of dataframe and training on rest, spearmancollection for those predictions
 
X = train[:, 1:]    #assoc vectors: predictors
y = train[:, 0]     #CS: response variable
Xtest = test[:, 1:] #assoc vectors: predictors
ytest = test[:, 0]  #CS: response variable

# ElasticNet REGRESSION
alphas = [1e-4]        # arange(1e-6, 1e-4, 5e-5) 
l1ratio = [0.25]    # 0 means L2 peanlty, 1 means L1 penalty, between mix
tol = [1e-2]     # optimization tolerance (with 1e-4 not converging)
params = [{ 'alpha' : alphas,
            'l1_ratio' : l1ratio,
            'tol' : tol
        }]
model = ElasticNet(copy_X = True, max_iter = 1000, selection ='random', random_state = 1)  #define model
# grid searching through best hyperparameters, cv is set to do no crossvalidation, refit does fit the model with the best parameters so model can be used for prediction afterwards
cv = GridSearchCV(model, param_grid=params, cv=[(slice(None), slice(None))],scoring='neg_mean_absolute_error', return_train_score= False,refit=True)
cv.fit(X,y)

# Predicting test fold
#setting up output dictionary
out = {'CS HUMAN': [],'CS MODEL': []}
out = defaultdict(list)
#itterating through vectors
v = 0
while v<= (test.shape[0]-1):
    vector = Xtest[v,:]
    pred = round(float(cv.predict([vector])),2)
    out['CS MODEL'].append(pred)             #adding predicted value
    out['CS HUMAN'].append(ytest[v])         #adding human CS
    v += 1

# Correaltion measures on all predicted CS values in 'out'
#correlation measures of human and model concreteness scores
corr,pval = spearmanr(out['CS HUMAN'],out['CS MODEL'])
corr1,pval1 = kendalltau(out['CS HUMAN'],out['CS MODEL'])
#print(out)
print('SPEARMAN CORRELATION: '+ str(corr))

with open("/compLing/students/hiwi-theses/projects/aylin.wahl/M32_noNAfinaltypes_LinElasticNet_Stats.txt", "a") as f:
#if close to -1/1 both scores are pos/neg correlating, if p-value smaller 0.05 correl statistically significant
    print('FOLD NUMBER' + testfold, file =f)
    print('Scorer: '+str(cv.scorer_),file=f)
    print('Number of Splits: '+str(cv.n_splits_),file=f)
    print('Number of Features: '+str(cv.n_features_in_),file=f)
    print('Best Scores: ' + str(cv.best_score_), file=f)
    print('Best Params: ' + str(cv.best_params_), file=f)
    print('Seconds for Best fitting: '+str(cv.refit_time_),file=f)
    print('Spearman correaltion: ' + str(corr), file=f)
    print('Spearman p-value: ' + str(pval), file=f)
    print('Kendall correlation: ' + str(corr1), file=f)
    print('Kendall p-value: ' + str(pval1), file=f)
    print('MeanAbsoluteError on testdata: '+str(mean_absolute_error(out['CS HUMAN'],out['CS MODEL'])),file=f)
    print('R2 on testdata: ' + str(r2_score(out['CS HUMAN'],out['CS MODEL'])), file=f)
    print('Score R2 on traindata: '+str(cv.score(X,y)),file=f)
    print('Best Estimator: '+ str(cv.best_estimator_),file=f)
    print('Best Parametersettings: ' +str(cv.cv_results_['params'][cv.best_index_]),file=f)
    print("--------------------------------------------------------------------",file=f)

    
with open("/compLing/students/hiwi-theses/projects/aylin.wahl/CVRESULTSM32_noNAfinaltypes_LinElasticNet.txt", "a") as f:
    print('1-fold ElasticNet, Linear, '+wordclass+', '+str(nth),file=f)
    print('alphas: '+str(alphas)+', l1-ratio: '+str(l1ratio)+', tol: '+str(tol),file=f)
    print(cv.cv_results_,file=f)
    print('----------------------------------------------------------------------------------------------------------',file=f)

print('FINISHED :)')

SPEARMAN CORRELATION: 0.6826358847590879
FINISHED :)
