# Making pipelines 
Dataset A

In [1]:
import numpy as np
import pandas as pd
from sklearn import preprocessing

#import custom functions
import Configure_Dataset
import Impute
import Normalize
import Combine


In [2]:
A, y1, y2 = Configure_Dataset.configure('Competition1_raw_data.xlsx')
A.columns

Index(['C1', 'C2', 'C4', 'C7', '%_long_sentences', '%_long_words',
       '%_positive_words', '%_negative_words', '%_uncertain_words', 'C3'',
       'C5'', 'C6'', 'one_hot_Manufacturing', 'one_hot_Other',
       'one_hot_Public Services', 'one_hot_Wholesale  Trade'],
      dtype='object')

In [3]:
A = Impute.impute_mean(A)

In [4]:
A = Normalize.minmax_scale(A)
print(A)

[[0.05392393 1.         0.74996242 ... 0.         0.         0.        ]
 [0.11988445 0.         0.58374777 ... 0.         0.         0.        ]
 [0.03851709 1.         0.71721278 ... 0.         0.         0.        ]
 ...
 [0.04910929 0.         0.75525545 ... 0.         0.         0.        ]
 [0.03273953 1.         0.76253853 ... 1.         0.         0.        ]
 [0.13047665 1.         0.7327755  ... 0.         0.         0.        ]]


In [5]:
A = Combine.combine(A, y1, y2)

In [6]:
A.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,Y1,Y2
0,0.053924,1.0,0.749962,0.001671,0.775008,0.541683,0.291209,0.313837,0.232749,1.0,0.03599,0.111111,1.0,0.0,0.0,0.0,0,1
1,0.119884,0.0,0.583748,0.000843,0.780245,0.473428,0.147348,0.398734,0.230548,0.0,0.118042,0.0,1.0,0.0,0.0,0.0,1,0
2,0.038517,1.0,0.717213,0.000238,0.77064,0.721351,0.88907,0.183452,0.25244,0.0,0.031013,0.0,1.0,0.0,0.0,0.0,1,0
3,0.095811,1.0,0.714502,0.000275,0.653036,0.71815,0.719433,0.222305,0.332049,0.0,0.030315,0.0,1.0,0.0,0.0,0.0,1,1
4,0.033702,1.0,0.499347,0.020605,0.710854,0.405407,0.259462,0.351622,0.246559,1.0,0.034602,0.05,0.0,0.0,1.0,0.0,0,1


In [9]:
# lists for f1-score and AUC
f1_score_lst = []
auc_lst = []


#loop to calculate f1 and auc scores and present averages after 10 runs
for count in range (1,10):
    #Model building
    clf = LogisticRegression()
    X1_train, X1_test, y1_train, y1_test = train_test_split(predictors_y1, y1, test_size=0.2, random_state=123)
    clf.fit(X1_train, y1_train)

    y1_pred = clf.predict(X1_test)

    
    #10-fold cross validation
    kfold = model_selection.KFold(n_splits=10, random_state=7)
    scoring = 'accuracy'
    results = model_selection.cross_val_score(clf, X1_train, y1_train, cv=kfold, scoring=scoring)

    

    
    #calculate f1-score and AUC
    
    clf_roc_auc = roc_auc_score(y1_test, y1_pred)
    f1_score_lst.append(precision_recall_fscore_support(y1_test, y1_pred, average='weighted')[2])
    auc_lst.append(clf_roc_auc)


print('F1 {:.4f}; AUC {:.4f} '.format(np.mean(f1_score_lst),np.mean(auc_lst)))

#result=logit_model.fit()
confusion_matrix_y1 = confusion_matrix(y1_test, y1_pred)


#print(result.summary())
print('Accuracy of classifier on test set: {:.2f}'.format(clf.score(X1_test, y1_test)))

print("10-fold cross validation average accuracy of classifier: %.3f" % (results.mean()))

print('Confusion Matrix for Logistic Regression Classfier:')
print(confusion_matrix_y1)

print('Classification Report for Logistic Regression Classfier:')
print(classification_report(y1_test, y1_pred))


F1 0.5934; AUC 0.5953 
Accuracy of classifier on test set: 0.59
10-fold cross validation average accuracy of classifier: 0.604
Confusion Matrix for Logistic Regression Classfier:
[[42 32]
 [23 38]]
Classification Report for Logistic Regression Classfier:
             precision    recall  f1-score   support

          0       0.65      0.57      0.60        74
          1       0.54      0.62      0.58        61

avg / total       0.60      0.59      0.59       135



Below code are used to evaluate model toward `Y2`. It is very similar to the code above - key difference is that `Y2` is imbalanced - so I wrote some code (under `# Begin oversampling`) to deal with that.

In [11]:
# lists for f1-score and AUC
f1_score_lst = []
auc_lst = []


#loop to calculate f1 and auc scores and present averages after 10 runs
for count in range (1,10):
    #Model building
    clf1 = LogisticRegression()

    
    # Splitting data into testing and training
    X2_train, X2_test, y2_train, y2_test = train_test_split(predictors_y2, y2, test_size=0.2, random_state=123)
    
    # Begin oversampling
    oversample = pd.concat([X2_train,y2_train],axis=1)
    max_size = oversample['Y2'].value_counts().max()
    lst = [oversample]
    for class_index, group in oversample.groupby('Y2'):
        lst.append(group.sample(max_size-len(group), replace=True))
    X2_train = pd.concat(lst)
    y2_train=pd.DataFrame.copy(X2_train['Y2'])
    del X2_train['Y2']
    
    # fitting model on oversampled data
    clf1.fit(X2_train, y2_train)
    
    y2_pred = clf1.predict(X2_test)
    
    
    #10-fold cross validation
    kfold = model_selection.KFold(n_splits=10, random_state=123)
    scoring = 'accuracy'
    results = model_selection.cross_val_score(clf1, X2_train, y2_train, cv=kfold, scoring=scoring)
    
    #calculate f1-score and AUC
    
    clf1_roc_auc = roc_auc_score(y2_test, y2_pred)
    
    
    #calculate average f1-score and AUC
    f1_score_lst.append(precision_recall_fscore_support(y2_test, y2_pred, average='weighted')[2])
    auc_lst.append(clf1_roc_auc)
    
    
print('F1 {:.4f}; AUC {:.4f} '.format(np.mean(f1_score_lst),np.mean(auc_lst)))

confusion_matrix_y2 = confusion_matrix(y2_test, y2_pred)


print('Accuracy of classifier on test set: {:.3f}'.format(clf1.score(X2_test, y2_test)))

print("10-fold cross validation average accuracy of clf1: %.3f" % (results.mean()))

print('Confusion Matrix for Classfier:')
print(confusion_matrix_y2)

print('Classification Report for Classfier:')
print(classification_report(y2_test, y2_pred))


F1 0.6012; AUC 0.5705 
Accuracy of classifier on test set: 0.570
10-fold cross validation average accuracy of clf1: 0.555
Confusion Matrix for Classfier:
[[20 15]
 [43 57]]
Classification Report for Classfier:
             precision    recall  f1-score   support

          0       0.32      0.57      0.41        35
          1       0.79      0.57      0.66       100

avg / total       0.67      0.57      0.60       135

