# Logistic Regression predicting Concreteness Label

- ElasticNet penalization
- performance measures: F1, recall, precision


## K-fold + Fold-wise: BALANCED Test on one fold, Train on the rest, with spearman correlation per fold averaged¶

balanced: same amount concr and abstr in a fold

In [None]:
# import modules

import pandas as pd
import numpy as np

from numpy import arange
from collections import defaultdict
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_absolute_error,r2_score, classification_report
from scipy.stats import spearmanr, kendalltau
from statistics import mean, stdev
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import precision_recall_fscore_support as score
import matplotlib.pyplot as plt

In [None]:
# get splitted categorical datset BALANCED with each fold having same amount concrete and abstract

# make CS categorical 1=concrete, 0=abstract
dataset = 'filtered'
wordclass = 'nouns'
#wordclass = 'verbs'
#wordclass = 'adj'

df = '/compLing/students/hiwi-theses/data/aylin.wahl/MUDCATData/M32_CosDisTable/NORMnoNAtypes_'+wordclass+'_'+dataset+'_CosDisTable32.csv'
#df = '/compLing/students/hiwi-theses/data/aylin.wahl/MUDCATData/M3_CosDisTable/NORMnoNAtypes_'+wordclass+'_'+dataset+'_CosDisTable.csv'
#df = '/compLing/students/hiwi-theses/data/aylin.wahl/MUDCATData/original_data/targets_Brysbaert_' +dataset+'/'+'NORMnoNAtypes_'+wordclass+'_'+dataset+'_AssocTable.csv'
#df = '/compLing/students/hiwi-theses/data/aylin.wahl/MUDCATData/M4_POStable/NORMnoNAtypes_'+wordclass+'_filtered_POStable.csv'

df = pd.read_csv(df)
df = df.drop(['Unnamed: 0'], axis=1)
#df = df.to_numpy()

df['CS'].values[df['CS'].values < df['CS'].median()] = 0  #abstract categorical
df['CS'].values[df['CS'].values >= df['CS'].median()] = 1 #concrete categorical

#split dataset into N folds

foldN = 10

# separating concrete and abstract to take same percentage of concrete and abstarct into folds
conc = df.loc[df['CS'] == 1]
conc.reset_index(inplace = True, drop = True)
abst = df.loc[df['CS'] == 0]
abst.reset_index(inplace = True, drop = True)

#name folds in kfold column
from sklearn import model_selection
kf = model_selection.KFold(n_splits=foldN)   # initiate the k-fold class from model_selection module
for fold, (trn_, val_) in enumerate(kf.split(X=abst)):  # fill the new kfold column for abstract
    abst.loc[val_, 'kfold'] = fold
for fold, (trn_, val_) in enumerate(kf.split(X=conc)):  # fill the new kfold column for concrete
    conc.loc[val_, 'kfold'] = fold
df = pd.concat([abst,conc])   # conc and abst back into df woth kfold column
df = df.sample(frac=1).reset_index(drop=True) #shuffeling rows of df dataframe

#split dataset df0 up to df(foldN-1)
folds = []
for num in range(0,foldN):
    globals()['df'+str(num)] = df[df['kfold'] == num]
    globals()['df'+str(num)] = globals()['df'+str(num)].drop(['kfold'],axis=1) #delete kfold column
    folds.append(globals()['df'+str(num)])  #list with the folddataframes


with open("/compLing/students/hiwi-theses/projects/aylin.wahl/M32_noNAfinaltypes_LogElasticNet_Stats.txt", "a") as f:
    print('LogisticBalanced: K-fold + Fold-wise',file=f)
    print('NORMnoNAtypes_'+wordclass+'_'+dataset+'_'+str(foldN)+': (fullamount,X-1=featureamount) '+str(df.shape),file =f)
    print('NORMnoNAtypes_'+wordclass+'_'+dataset+'_'+str(foldN)+': (foldamount,X-1=featureamount) '+str(df0.shape),file=f)


In [None]:
# loop: ElasticNet Regression training on foldN-1, prediction on leftout fold, spearmancorrelation for those predictions

#ElasticNet REGRESSION WITH GRIDSEARCH HYPERPARAMETERS

bestscore = []
bestparams = []
permatches = []
perconc = []
perabst = []
meanacc = []
classrepprecision = []
classreprecall = []
classrepf1 = []

for i in range(0,len(folds)):
    # getting train and test folds for this itteration
    print('TESTED FOLD INDEX:' + str(i))
    subfolds = folds.copy() #copy folds dataframes 
    del subfolds[i]         #delete testing dataframe
    train = pd.concat(subfolds)  #concat all dataframes except testing one for training
    train = train.to_numpy()
    test = folds[i].to_numpy()  #get testing fold
    X = train[:, 1:]    #assoc vectors: predictors
    y = train[:, 0]     #CS: response variable
    y = y.astype(int)
    Xtest = test[:, 1:] #assoc vectors: predictors
    ytest = test[:, 0]  #CS: response variable
    ytest = ytest.astype(int)
    
    # LOGISTIC REGRESSION with variable LASSO(l1) or RIDGE(l2) or ElasticNet(both) penalty
    c = [100] # high c means trust training data, low value means dont make very high values because will ot be helpful for new data (here lower C better beacuse more way more features than samples)
    l1ratio = [0]
    tol = [1e-02]
    params = [{'C': c,  
            'l1_ratio': l1ratio,
            'tol': tol
            }]
    #model = Lasso(copy_X = True, tol=1e-2, max_iter = 1000, selection ='random', random_state = 1)  #define model
    model = LogisticRegression(penalty= 'elasticnet', max_iter= 1000, solver= 'saga')   # or solver='liblinear' / penalty='l1'
                            #  l1_ratio= 1,penalty= 'elasticnet' : l1_ratio =1 =LASSO, l1_ratio =0 =RIDGE, l1_ratio =0-1 ElasticNet
                            # saga solver chosen becuase support of all penalties

    # grid searching through best hyperparameters, cv is set to do no crossvalidation, refit does fit the model with the best parameters so model can be used for prediction afterwards
    cv = GridSearchCV(model, param_grid = params,cv=[(slice(None), slice(None))],scoring='f1', return_train_score=False,refit=True)
    cv.fit(X,y)
    bestscore.append(cv.best_score_)
    bestparams.append(cv.best_params_)
    
    # Predicting the test fold
    ymodel = cv.predict(Xtest)
    print(ymodel)
    print(ytest)
    #performance measure 
    matchesN = 0
    lableabstN = 0
    lableconcN = 0
   # csm = out['CS MODEL']
   # print(csm)
   # print('Number of concrete predictions: '+str(sum(csm)))
    print('Number of concrete predictions: '+str(sum(ymodel)))
   # csh = out['CS HUMAN']
    for ind, v in enumerate(ymodel):
        if ymodel[ind] == ytest[ind]:
            matchesN += 1
            if v == 0:         #counting lables of matching predictions
                lableabstN += 1
            else:
                lableconcN += 1
    permatches.append(matchesN / len(ymodel))
    perconc.append(lableconcN / sum(ymodel))
    perabst.append(lableabstN / (len(ymodel)-sum(ymodel)))
    meanacc.append(cv.score(Xtest, ytest))
    precision,recall,fscore,support= score(ytest,ymodel,average='macro')
    classrepprecision.append(precision)
    classreprecall.append(recall)
    classrepf1.append(fscore)
    #classrep.append(classification_report(ytest, ymodel, output_dict=True))
    
with open("/compLing/students/hiwi-theses/projects/aylin.wahl/M32_noNAfinaltypes_LogElasticNet_Stats.txt", "a") as f:
#if close to -1/1 both scores are pos/neg correlating, if p-value smaller 0.05 correl statistically significant
    print('Best Estimator: '+str(cv.best_estimator_),file=f)
    print('Scorer: '+str(cv.scorer_),file=f)
    print('Number of Splits: '+str(cv.n_splits_),file=f)
    print('Number of Features: '+str(cv.n_features_in_),file=f)
    print('Best Scores: ' + str(bestscore), file=f)
    print('Best Scores Mean: ' + str(mean(bestscore)), file=f)
    print('Best Params: ' + str(bestparams), file=f)
    print('Seconds for Best fitting: '+str(cv.refit_time_),file=f)
    print('Params: '+str(cv.cv_results_['params']),file=f)
    #print('Mean_train_score: '+str(cv.cv_results_['mean_train_score']),file=f)
    
    print('Matches/All predictions: ' + str(permatches),file=f)
    print('Matches/All predictions Mean: ' + str(mean(permatches)),file=f)
    print('Matches/All predictions Stddev: ' + str(stdev(permatches)),file=f)
    print('Matching CONC/CONCtest: '+str(perconc),file=f)
    print('Mean Matching CONC/CONCtest: '+str(mean(perconc)),file=f)
    print('Stddev Matching CONC/CONCetst: '+str(stdev(perconc)),file=f)
    print('Matching ABST/ABSTtest: '+str(perabst),file=f)
    print('Mean Matching ABST/ABSTtest: '+str(mean(perabst)),file=f)
    print('Stddev Matching ABST/ABSTtest: '+str(stdev(perabst)),file=f)
    print('Mean Accuracy: ' + str(meanacc), file=f)
    print('Mean of Mean Accuracy: ' + str(mean(meanacc)), file=f) 
    print('Mean Accuracy Stddev: ' + str(stdev(meanacc)), file=f) 
    print('ClassifReportprecision Mean: ' + str(mean(classrepprecision)), file=f)
    print('ClassifReportprecision Stdev: ' + str(stdev(classrepprecision)), file=f)
    print('ClassifReportrecall Mean: ' + str(mean(classreprecall)), file=f)
    print('ClassifReportrecall Stdev: ' + str(stdev(classreprecall)), file=f)
    print('ClassifReportF1score Mean: ' + str(mean(classrepf1)), file=f)
    print('ClassifReportF1score Stdev: ' + str(stdev(classrepf1)), file=f)
    print("--------------------------------------------------------------------",file=f)
    
with open("/compLing/students/hiwi-theses/projects/aylin.wahl/CVRESULTSM32_noNAfinaltypes_LogElasticNet.txt", "a") as f:
    print('K-fold ElasticNet, Logistic, '+wordclass+', '+str(foldN),file=f)
    print('C: '+str(c)+', l1-ratio: '+str(l1ratio)+', tol: '+str(tol),file=f)
    print(cv.cv_results_,file=f)
    print('----------------------------------------------------------------------------------------------------------',file=f)

print('FINISHED :)')


In [None]:
blabla = pd.read_csv('/compLing/students/hiwi-theses/data/aylin.wahl/MUDCATData/M3_CosDisTable/NORMnoNAtypes_verbs_filtered_CosDisTable.csv')
print(blabla[: 20])

## New Version: 1-Fold - Extreme Set: Test on every Nth datapoint of the extreme dataset, Training on filtered dataset minus the extremes for testing, F1 on the predictions of the extreme testset

- penalization ElasticNet
- performance measures: F1, recall, precision

In [None]:
# import modules

import pandas as pd
import numpy as np

from numpy import arange
from collections import defaultdict
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_absolute_error,r2_score, classification_report
from scipy.stats import spearmanr, kendalltau
from statistics import mean, stdev
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import precision_recall_fscore_support as score
import matplotlib.pyplot as plt

In [None]:
# collect every nth out of extreme dataset as test

dataset = 'extreme'
wordclass = 'nouns'
#wordclass = 'verbs'
#wordclass = 'adj'

nth = 4

df = '/compLing/students/hiwi-theses/data/aylin.wahl/MUDCATData/original_data/targets_Brysbaert_' +dataset+'/'+'NORMnoNAtypes_'+wordclass+'_'+dataset+'_AssocTable.csv'
train = '/compLing/students/hiwi-theses/data/aylin.wahl/MUDCATData/M32_CosDisTable/NORMnoNAtypes_'+wordclass+'_filtered_CosDisTable32.csv'
#train = '/compLing/students/hiwi-theses/data/aylin.wahl/MUDCATData/M4_POStable/NORMnoNAtypes_'+wordclass+'_filtered_POStable.csv'
#train = '/compLing/students/hiwi-theses/data/aylin.wahl/MUDCATData/M3_CosDisTable/NORMnoNAtypes_'+wordclass+'_filtered_CosDisTable.csv'
#train = '/compLing/students/hiwi-theses/data/aylin.wahl/MUDCATData/original_data/targets_Brysbaert_filtered/NORMnoNAtypes_'+wordclass+'_filtered_AssocTable.csv'
train = pd.read_csv(train)
df = pd.read_csv(df)
train['CS'].values[train['CS'].values < 2.5] = 0  #abstract categorical
train['CS'].values[train['CS'].values >= 2.5] = 1 #concrete categorical
df['CS'].values[df['CS'].values < 2.5] = 0  #abstract categorical
df['CS'].values[df['CS'].values >= 2.5] = 1 #concrete categorical

#get every nth abst and conc target of extreme
abst = df.loc[df['CS'] == 0,:]
conc = df.loc[df['CS'] == 1,:]
target = []
target.append(abst.iloc[::nth, :] ) #select every nth row and add dataframe to list
target.append(conc.iloc[::nth, :] ) #select every nth row and add dataframe to list
target = pd.concat(target)
#target['CS'].values[target['CS'].values < 2.5] = 0  #abstract categorical
#target['CS'].values[target['CS'].values >= 2.5] = 1 #concrete categorical
testtargets = list(target['Unnamed: 0']) # get test targets

#select testtargets out of filtered dataframe to have same vectors
test = pd.DataFrame(columns=list(train.columns))
for i in testtargets:
    test = test.append(train.loc[train['Unnamed: 0'] == i], ignore_index = True)    
    
# delete testtargets out of filtered train dataframe
for i in testtargets:   #itterating through test targets
    train.drop(train.loc[train['Unnamed: 0']==i].index, inplace=True)  #dropping rows in df that are in test

#shuffle and clean/make categorical train and test
train = train.drop(['Unnamed: 0'], axis=1) #drop targets for regression 
test = test.drop(['Unnamed: 0'], axis=1) #drop targets for regression
train = train.sample(frac=1).reset_index(drop=True) #shuffeling rows
test = test.sample(frac=1).reset_index(drop=True) #shuffeling rows
print(train)
print(test)
train = train.to_numpy()
test = test.to_numpy()
print(train.shape)
print(test.shape)
print(train)
print(test)

with open("/compLing/students/hiwi-theses/projects/aylin.wahl/M32_noNAfinaltypes_LogElasticNet_Stats.txt", "a") as f:
    print('New Version: 1-Fold - Extreme Set',file=f)
    print('NORMnoNAtypes_'+wordclass+'_'+dataset+'_'+'1/'+str(nth)+': (trainamount,X-1=featureamount) '+str(train.shape),file =f)
    print('NORMnoNAtypes_'+wordclass+'_'+dataset+'_'+'1/'+str(nth)+': (testamount,X-1=featureamount) '+str(test.shape),file=f)


In [None]:
# regression: test on 1/n of dataframe and training on rest, spearmancollection for those predictions
 
X = train[:, 1:]    #assoc vectors: predictors
y = train[:, 0]     #CS: response variable
y = y.astype(int)
Xtest = test[:, 1:] #assoc vectors: predictors
ytest = test[:, 0]  #CS: response variable
ytest = ytest.astype(int)
    
# LOGISTIC REGRESSION with variable LASSO(l1) or RIDGE(l2) or ElasticNet(both) penalty
c = [100] # high c means trust training data, low value means dont make very high values because will ot be helpful for new data (here lower C better beacuse more way more features than samples)
l1ratio = [0]
tol = [1e-02]
params = [{'C': c,  
        'l1_ratio': l1ratio,
        'tol': tol
        }]
#model = Lasso(copy_X = True, tol=1e-2, max_iter = 1000, selection ='random', random_state = 1)  #define model
model = LogisticRegression(penalty= 'elasticnet', max_iter= 1000, solver= 'saga')   # or solver='liblinear' / penalty='l1'
    #  l1_ratio= 1,penalty= 'elasticnet' : l1_ratio =1 =LASSO, l1_ratio =0 =RIDGE, l1_ratio =0-1 ElasticNet
    # saga solver chosen becuase support of all penalties

    # grid searching through best hyperparameters, cv is set to do no crossvalidation, refit does fit the model with the best parameters so model can be used for prediction afterwards
cv = GridSearchCV(model, param_grid = params,cv=[(slice(None), slice(None))],scoring='f1', return_train_score=False,refit=True)
cv.fit(X,y)  

# Predicting the test fold
ymodel = cv.predict(Xtest)

#performance measure 
matchesN = 0
lableabstN = 0
lableconcN = 0
print('Number of concrete predictions: '+str(sum(ymodel)))
for ind, v in enumerate(ymodel):
    if ymodel[ind] == ytest[ind]:
        matchesN += 1
        if v == 0:         #counting lables of matching predictions
            lableabstN += 1
        else:
            lableconcN += 1
precision,recall,fscore,support= score(ytest,ymodel,average='macro')

with open("/compLing/students/hiwi-theses/projects/aylin.wahl/M32_noNAfinaltypes_LogElasticNet_Stats.txt", "a") as f:
#if close to -1/1 both scores are pos/neg correlating, if p-value smaller 0.05 correl statistically significant
    print('Best Estimator: '+str(cv.best_estimator_),file=f)
    print('Scorer: '+str(cv.scorer_),file=f)
    print('Number of Splits: '+str(cv.n_splits_),file=f)
    print('Number of Features: '+str(cv.n_features_in_),file=f)
    print('Best Scores: ' + str(cv.best_score_), file=f)
    print('Best Params: ' + str(cv.best_params_), file=f)
    print('Seconds for Best fitting: '+str(cv.refit_time_),file=f)
    print('Params: '+str(cv.cv_results_['params']),file=f)
    
    print('Matches/All predictions: ' + str(matchesN / len(ymodel)),file=f)
    print('Matching CONC/CONCtest: '+str(lableconcN / sum(ymodel)),file=f)
    print('Matching ABST/ABSTtest: '+str(lableabstN / (len(ymodel)-sum(ymodel))),file=f)
    print('Mean Accuracy: ' + str(cv.score(Xtest, ytest)), file=f)
    print('ClassifReportprecision Mean: ' + str(precision), file=f)
    print('ClassifReportrecall Mean: ' + str(recall), file=f)
    print('ClassifReportF1score Mean: ' + str(fscore), file=f)
    print('R2: ' + str(cv.score(X, y)), file=f) 
    print('ClassifReport: ' + str(classification_report(ytest, ymodel, output_dict=True)), file=f)
    print("--------------------------------------------------------------------",file=f)            
    
    
with open("/compLing/students/hiwi-theses/projects/aylin.wahl/CVRESULTSM32_noNAfinaltypes_LogElasticNet.txt", "a") as f:
    print('K-fold ElasticNet, Logistic, '+wordclass+', '+str(nth),file=f)
    print('C: '+str(c)+', l1-ratio: '+str(l1ratio)+', tol: '+str(tol),file=f)
    print(cv.cv_results_,file=f)
    print('----------------------------------------------------------------------------------------------------------',file=f)


print('FINISHED :)')


## New Version: 4-Fold - Extreme Set: Test on every Nth datapoint of the extreme dataset, Training on filtered dataset minus the extremes for testing, F1 on the predictions of the extreme testset

In [1]:
# import modules

import pandas as pd
import numpy as np

from numpy import arange
from collections import defaultdict
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_absolute_error,r2_score, classification_report
from scipy.stats import spearmanr, kendalltau
from statistics import mean, stdev
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import precision_recall_fscore_support as score
import matplotlib.pyplot as plt

In [2]:
# collect every nth out of extreme dataset as test

dataset = 'extreme'
wordclass = 'nouns'
#wordclass = 'verbs'
#wordclass = 'adj'

nth = 4


df = '/compLing/students/hiwi-theses/data/aylin.wahl/MUDCATData/original_data/targets_Brysbaert_' +dataset+'/'+'NORMnoNAtypes_'+wordclass+'_'+dataset+'_AssocTable.csv'
train = '/compLing/students/hiwi-theses/data/aylin.wahl/MUDCATData/M32_CosDisTable/NORMnoNAtypes_'+wordclass+'_filtered_CosDisTable32.csv'
#train = '/compLing/students/hiwi-theses/data/aylin.wahl/MUDCATData/M4_POStable/NORMnoNAtypes_'+wordclass+'_filtered_POStable.csv'
#train = '/compLing/students/hiwi-theses/data/aylin.wahl/MUDCATData/M3_CosDisTable/NORMnoNAtypes_'+wordclass+'_filtered_CosDisTable.csv'
#train = '/compLing/students/hiwi-theses/data/aylin.wahl/MUDCATData/original_data/targets_Brysbaert_filtered/NORMnoNAtypes_'+wordclass+'_filtered_AssocTable.csv'
train = pd.read_csv(train)
print(train.shape)
df = pd.read_csv(df)
train['CS'].values[train['CS'].values < 2.5] = 0  #abstract categorical
train['CS'].values[train['CS'].values >= 2.5] = 1 #concrete categorical
df['CS'].values[df['CS'].values < 2.5] = 0  #abstract categorical
df['CS'].values[df['CS'].values >= 2.5] = 1 #concrete categorical


#get every nth abst and conc target of extreme for FOLD1
abst = df.loc[df['CS'] == 0,:]
conc = df.loc[df['CS'] == 1,:]
target1 = []
target1.append(abst.iloc[::4, :] ) #select every nth row and add dataframe to list
target1.append(conc.iloc[::4, :] ) #select every nth row and add dataframe to list
target1 = pd.concat(target1)
testtargets1 = list(target1['Unnamed: 0']) # get test targets fold 1
#select testtargets out of filtered dataframe to have same vectors
test1 = pd.DataFrame(columns=list(train.columns))
for i in testtargets1:
    test1 = test1.append(train.loc[train['Unnamed: 0'] == i], ignore_index = True)
print(test1.shape)
# delete testtargets out of filtered train dataframe
train1 = train.copy()
for i in testtargets1:   #itterating through test targets
    train1.drop(train1.loc[train1['Unnamed: 0']==i].index, inplace=True)  #dropping rows in df that are in test
# delete testtargets out of extreme abst and conc 
for i in testtargets1:   #itterating through test targets
    if i in list(abst['Unnamed: 0']):
        abst.drop(abst.loc[abst['Unnamed: 0']==i].index, inplace=True)  #dropping rows in df that are in test
    else:
        conc.drop(conc.loc[conc['Unnamed: 0']==i].index, inplace=True)  #dropping rows in df that are in test
print('Train1 Fold: '+str(train1.shape))

#get every nth abst and conc target of extreme for FOLD2
target2 = []
target2.append(abst.iloc[::3, :] ) #select every nth row and add dataframe to list
target2.append(conc.iloc[::3, :] ) #select every nth row and add dataframe to list
target2 = pd.concat(target2)
testtargets2 = list(target2['Unnamed: 0']) # get test targets fold 1
#select testtargets out of filtered dataframe to have same vectors
test2 = pd.DataFrame(columns=list(train.columns))
for i in testtargets2:
    test2 = test2.append(train.loc[train['Unnamed: 0'] == i], ignore_index = True)
print(test2.shape)
# delete testtargets out of filtered train dataframe
train2 = train.copy()
for i in testtargets2:   #itterating through test targets
    train2.drop(train2.loc[train2['Unnamed: 0']==i].index, inplace=True)  #dropping rows in df that are in test
# delete testtargets out of extreme abst and conc 
for i in testtargets2:   #itterating through test targets
    if i in list(abst['Unnamed: 0']):
        abst.drop(abst.loc[abst['Unnamed: 0']==i].index, inplace=True)  #dropping rows in df that are in test
    else:
        conc.drop(conc.loc[conc['Unnamed: 0']==i].index, inplace=True)  #dropping rows in df that are in test
print('Train2 Fold: '+str(train2.shape))

#get every nth abst and conc target of extreme for FOLD3
target3 = []
target3.append(abst.iloc[::2, :] ) #select every nth row and add dataframe to list
target3.append(conc.iloc[::2, :] ) #select every nth row and add dataframe to list
target3 = pd.concat(target3)
testtargets3 = list(target3['Unnamed: 0']) # get test targets fold 1
#select testtargets out of filtered dataframe to have same vectors
test3 = pd.DataFrame(columns=list(train.columns))
for i in testtargets3:
    test3 = test3.append(train.loc[train['Unnamed: 0'] == i], ignore_index = True)
print(test3.shape)
# delete testtargets out of filtered train dataframe
train3 = train.copy()
for i in testtargets3:   #itterating through test targets
    train3.drop(train3.loc[train3['Unnamed: 0']==i].index, inplace=True)  #dropping rows in df that are in test
# delete testtargets out of extreme abst and conc 
for i in testtargets3:   #itterating through test targets
    if i in list(abst['Unnamed: 0']):
        abst.drop(abst.loc[abst['Unnamed: 0']==i].index, inplace=True)  #dropping rows in df that are in test
    else:
        conc.drop(conc.loc[conc['Unnamed: 0']==i].index, inplace=True)  #dropping rows in df that are in test
print('Train3 Fold: '+str(train2.shape))

#get every nth abst and conc target of extreme for FOLD4
target4 = []
target4.append(abst) #select every nth row and add dataframe to list
target4.append(conc) #select every nth row and add dataframe to list
target4 = pd.concat(target4)
testtargets4 = list(target4['Unnamed: 0']) # get test targets fold 1
#select testtargets out of filtered dataframe to have same vectors
test4 = pd.DataFrame(columns=list(train.columns))
for i in testtargets4:
    test4 = test4.append(train.loc[train['Unnamed: 0'] == i], ignore_index = True)
print(test4.shape)
# delete testtargets out of filtered train dataframe
train4 = train.copy()
for i in testtargets4:   #itterating through test targets
    train4.drop(train4.loc[train4['Unnamed: 0']==i].index, inplace=True)  #dropping rows in df that are in test
# delete testtargets out of extreme abst and conc 
#for i in testtargets4:   #itterating through test targets
#    if i in list(abst['Unnamed: 0']):
#        abst.drop(abst.loc[abst['Unnamed: 0']==i].index, inplace=True)  #dropping rows in df that are in test
#    else:
#        conc.drop(conc.loc[conc['Unnamed: 0']==i].index, inplace=True)  #dropping rows in df that are in test
print('Train4 Fold: '+str(train2.shape))

(3698, 27608)


  test1 = test1.append(train.loc[train['Unnamed: 0'] == i], ignore_index = True)


(186, 27608)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  abst.drop(abst.loc[abst['Unnamed: 0']==i].index, inplace=True)  #dropping rows in df that are in test
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  conc.drop(conc.loc[conc['Unnamed: 0']==i].index, inplace=True)  #dropping rows in df that are in test


Train1 Fold: (3512, 27608)


  test2 = test2.append(train.loc[train['Unnamed: 0'] == i], ignore_index = True)


(186, 27608)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  abst.drop(abst.loc[abst['Unnamed: 0']==i].index, inplace=True)  #dropping rows in df that are in test
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  conc.drop(conc.loc[conc['Unnamed: 0']==i].index, inplace=True)  #dropping rows in df that are in test


Train2 Fold: (3512, 27608)


  test3 = test3.append(train.loc[train['Unnamed: 0'] == i], ignore_index = True)


(184, 27608)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  abst.drop(abst.loc[abst['Unnamed: 0']==i].index, inplace=True)  #dropping rows in df that are in test
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  conc.drop(conc.loc[conc['Unnamed: 0']==i].index, inplace=True)  #dropping rows in df that are in test


Train3 Fold: (3512, 27608)


  test4 = test4.append(train.loc[train['Unnamed: 0'] == i], ignore_index = True)


(184, 27608)
Train4 Fold: (3512, 27608)


In [9]:
# choosing the tetsfold for this run
#testfold = '1'
#testfold = '2'
#testfold = '3'
testfold = '4'

#train = train1
#train = train2
#train = train3
train = train4

#test = test1
#test = test2
#test = test3
test = test4

#shuffle and clean train and test
train = train.drop(['Unnamed: 0'], axis=1) #drop targets for regression 
test = test.drop(['Unnamed: 0'], axis=1) #drop targets for regression
train = train.sample(frac=1).reset_index(drop=True) #shuffeling rows
test = test.sample(frac=1).reset_index(drop=True) #shuffeling rows
train = train.to_numpy()
test = test.to_numpy()
print(train.shape)
print(test.shape)

with open("/compLing/students/hiwi-theses/projects/aylin.wahl/M32_noNAfinaltypes_LogElasticNet_Stats.txt", "a") as f:
    print('New Version: 1-Fold - Extreme Set',file=f)
    print('NORMnoNAtypes_'+wordclass+'_'+dataset+'_'+'1/'+str(nth)+': (trainamount,X-1=featureamount) '+str(train.shape),file =f)
    print('NORMnoNAtypes_'+wordclass+'_'+dataset+'_'+'1/'+str(nth)+': (testamount,X-1=featureamount) '+str(test.shape),file=f)
    print('FoldNumber: ' + testfold,file=f)

(3514, 27607)
(184, 27607)


In [10]:
# regression: test on 1/n of dataframe and training on rest, spearmancollection for those predictions
 
X = train[:, 1:]    #assoc vectors: predictors
y = train[:, 0]     #CS: response variable
y = y.astype(int)
Xtest = test[:, 1:] #assoc vectors: predictors
ytest = test[:, 0]  #CS: response variable
ytest = ytest.astype(int)
    
# LOGISTIC REGRESSION with variable LASSO(l1) or RIDGE(l2) or ElasticNet(both) penalty
c = [100] # high c means trust training data, low value means dont make very high values because will ot be helpful for new data (here lower C better beacuse more way more features than samples)
l1ratio = [1]
tol = [1e-02]
params = [{'C': c,  
        'l1_ratio': l1ratio,
        'tol': tol
        }]
#model = Lasso(copy_X = True, tol=1e-2, max_iter = 1000, selection ='random', random_state = 1)  #define model
model = LogisticRegression(penalty= 'elasticnet', max_iter= 1000, solver= 'saga')   # or solver='liblinear' / penalty='l1'
    #  l1_ratio= 1,penalty= 'elasticnet' : l1_ratio =1 =LASSO, l1_ratio =0 =RIDGE, l1_ratio =0-1 ElasticNet
    # saga solver chosen becuase support of all penalties

    # grid searching through best hyperparameters, cv is set to do no crossvalidation, refit does fit the model with the best parameters so model can be used for prediction afterwards
cv = GridSearchCV(model, param_grid = params,cv=[(slice(None), slice(None))],scoring='f1', return_train_score=False,refit=True)
cv.fit(X,y)  

# Predicting the test fold
ymodel = cv.predict(Xtest)

#performance measure 
matchesN = 0
lableabstN = 0
lableconcN = 0
print('Number of concrete predictions: '+str(sum(ymodel)))
for ind, v in enumerate(ymodel):
    if ymodel[ind] == ytest[ind]:
        matchesN += 1
        if v == 0:         #counting lables of matching predictions
            lableabstN += 1
        else:
            lableconcN += 1
precision,recall,fscore,support= score(ytest,ymodel,average='macro')

with open("/compLing/students/hiwi-theses/projects/aylin.wahl/M32_noNAfinaltypes_LogElasticNet_Stats.txt", "a") as f:
#if close to -1/1 both scores are pos/neg correlating, if p-value smaller 0.05 correl statistically significant
    print('Best Estimator: '+str(cv.best_estimator_),file=f)
    print('Scorer: '+str(cv.scorer_),file=f)
    print('Number of Splits: '+str(cv.n_splits_),file=f)
    print('Number of Features: '+str(cv.n_features_in_),file=f)
    print('Best Scores: ' + str(cv.best_score_), file=f)
    print('Best Params: ' + str(cv.best_params_), file=f)
    print('Seconds for Best fitting: '+str(cv.refit_time_),file=f)
    print('Params: '+str(cv.cv_results_['params']),file=f)
    
    print('Matches/All predictions: ' + str(matchesN / len(ymodel)),file=f)
    print('Matching CONC/CONCtest: '+str(lableconcN / sum(ymodel)),file=f)
    print('Matching ABST/ABSTtest: '+str(lableabstN / (len(ymodel)-sum(ymodel))),file=f)
    print('Mean Accuracy: ' + str(cv.score(Xtest, ytest)), file=f)
    print('ClassifReportprecision Mean: ' + str(precision), file=f)
    print('ClassifReportrecall Mean: ' + str(recall), file=f)
    print('ClassifReportF1score Mean: ' + str(fscore), file=f)
    print('R2: ' + str(cv.score(X, y)), file=f) 
    print('ClassifReport: ' + str(classification_report(ytest, ymodel, output_dict=True)), file=f)
    print("--------------------------------------------------------------------",file=f)            
    
    
with open("/compLing/students/hiwi-theses/projects/aylin.wahl/CVRESULTSM32_noNAfinaltypes_LogElasticNet.txt", "a") as f:
    print('K-fold ElasticNet, Logistic, '+wordclass+', '+str(nth),file=f)
    print('C: '+str(c)+', l1-ratio: '+str(l1ratio)+', tol: '+str(tol),file=f)
    print(cv.cv_results_,file=f)
    print('----------------------------------------------------------------------------------------------------------',file=f)


print('FINISHED :)')


Number of concrete predictions: 131
FINISHED :)
