In [1]:
import os
import pandas as pd
import numpy as np
import pickle
import time

import matplotlib.pyplot as plt
import seaborn as sns
import module1_Preprocessing as module1
import module2_FeatureExtraction as module2
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score, accuracy_score , roc_auc_score, confusion_matrix

# --- Notebook configuration ---------------------------------------------
# Seed reused by the scaler, the CV splitter and the seeded models below.
seed = 123

# Move to the parent directory so the relative paths below resolve from there.
# NOTE: the move is relative to the current working directory, so re-running
# this cell in a live kernel climbs one more level each time.
os.chdir('../')

# Relative locations; nothing is read from or written to disk here.
ResultDump = 'FinalResults'
LIWCPath = os.path.join('Processed', 'LIWC')

In [2]:
# Recursively collect every '.pickle' file under Processed/Kaggle50.
kaggle_root = os.path.join('Processed', 'Kaggle50')
file_list = [
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk(kaggle_root)
    for name in names
    if name.endswith('.pickle')
]
file_list

['Processed\\Kaggle50\\Kaggle-Filtered.pickle',
 'Processed\\Kaggle50\\Kaggle-Filtered_noNN.pickle',
 'Processed\\Kaggle50\\Kaggle-Filtered_noNNnoSW.pickle',
 'Processed\\Kaggle50\\Kaggle-Filtered_noSW.pickle',
 'Processed\\Kaggle50\\Kaggle.pickle',
 'Processed\\Kaggle50\\Kaggle_noNN.pickle',
 'Processed\\Kaggle50\\Kaggle_noNNnoSW.pickle',
 'Processed\\Kaggle50\\Kaggle_noSW.pickle']

### Define Models

In [3]:
from sklearn.naive_bayes import ComplementNB
# Complement Naive Bayes classifier built with the library defaults
# (the echoed repr below lists the effective parameters).
cnb = ComplementNB()
# Bare last expression: display the estimator's repr as the cell output.
cnb

ComplementNB(alpha=1.0, class_prior=None, fit_prior=True, norm=False)

In [4]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with balanced class weights.
# random_state uses the notebook-wide `seed` (config cell) rather than a
# repeated literal, so changing the seed in one place affects every model.
rf = RandomForestClassifier(
    n_estimators=100,
    random_state=seed,
    class_weight='balanced',
    n_jobs=10,  # number of parallel workers; tune to the host machine
)
rf

RandomForestClassifier(bootstrap=True, class_weight='balanced',
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, min_impurity_decrease=0.0,
                       min_impurity_split=None, min_samples_leaf=1,
                       min_samples_split=2, min_weight_fraction_leaf=0.0,
                       n_estimators=100, n_jobs=10, oob_score=False,
                       random_state=123, verbose=0, warm_start=False)

In [5]:
from sklearn.linear_model import LogisticRegression

# Logistic regression (liblinear solver) with balanced class weights.
# random_state uses the notebook-wide `seed` (config cell) rather than a
# repeated literal, consistent with the other seeded models.
lgr = LogisticRegression(
    random_state=seed,
    solver='liblinear',
    class_weight='balanced',
)
lgr

LogisticRegression(C=1.0, class_weight='balanced', dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=100, multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=123, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

In [6]:
from sklearn import svm

# Support vector classifier with balanced class weights.
svc = svm.SVC(
    C=1.0,
    gamma='auto',
    class_weight='balanced',
    probability=True,   # the training cell calls predict_proba on every model
    random_state=seed,
    verbose=True,
)
svc

SVC(C=1.0, cache_size=200, class_weight='balanced', coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
    max_iter=-1, probability=True, random_state=123, shrinking=True, tol=0.001,
    verbose=True)

In [7]:
# Import the class directly instead of `import lightgbm as lgb`: the original
# rebound the module alias `lgb` to the estimator, so re-running this cell in a
# live kernel raised AttributeError (an LGBMClassifier has no .LGBMClassifier).
from lightgbm import LGBMClassifier

# Gradient-boosted trees for the binary target, with balanced class weights.
# `lgb` stays the estimator's name because the training cell refers to it.
lgb = LGBMClassifier(
    objective='binary',
    metric='auc,binary_logloss',
    boosting_type='gbdt',
    class_weight='balanced',
    learning_rate=0.01,
    n_estimators=10000,  # upper bound; the training cell fits with early stopping
    random_state=seed,
    n_jobs=10,
)
lgb

LGBMClassifier(boosting_type='gbdt', class_weight='balanced',
               colsample_bytree=1.0, importance_type='split',
               learning_rate=0.01, max_depth=-1, metric='auc,binary_logloss',
               min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
               n_estimators=10000, n_jobs=10, num_leaves=31, objective='binary',
               random_state=123, reg_alpha=0.0, reg_lambda=0.0, silent=True,
               subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

### Training with 5-fold cross-validation, looping over the 5 models, every feature set and every input file

In [None]:
%%time

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer,TfidfTransformer
from sklearn.preprocessing import QuantileTransformer, quantile_transform

# Cross-validated benchmark.
# For every dataset pickle in `file_list`: build LIWC and n-gram feature sets,
# then fit and score each model on each stratified fold. One result row is
# recorded per (file, feature set, target, fold, model); rows are written to
# <ResultDump>/<file>.csv per input file and accumulated in `all_result_list`.

# Suffix of the importance dumps this cell writes next to the input pickles.
# They also end in '.pickle', so a rerun of the directory scan would pick them
# up as datasets; they are skipped explicitly below.
LGB_FEATURES_SUFFIX = '_lgbfeatures.pickle'

scaler = QuantileTransformer(random_state=seed)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)
result_list = []
all_result_list = []
result_fileDF = pd.DataFrame()
models = [('lgb', lgb), ('cnb', cnb), ('rf', rf), ('lgr', lgr), ('svc', svc)]

# Make sure the output folder exists before the first to_csv call.
os.makedirs(ResultDump, exist_ok=True)

for filepath in file_list:
    if filepath.endswith(LGB_FEATURES_SUFFIX):
        # Output of a previous run of this cell, not an input dataset.
        continue

    result_list = []
    file = os.path.split(filepath)[1].replace('.pickle', '')
    print('processing file:', file)

    # NOTE: pickle.load executes arbitrary code; only load trusted files.
    with open(filepath, 'rb') as handle:
        data = pickle.load(handle)

    label = data[['MBTI', 'EI', 'SN', 'TF', 'JP']]

    # Each row's token list becomes one space-separated string.
    data['tokens'] = data['tokens'].apply(lambda x: ' '.join(x))

    start_time = time.time()
    print('converting to tfidf')
    cCount_Vect = CountVectorizer(analyzer='char_wb', ngram_range=(2, 3), min_df=50, max_df=0.95, max_features=1500)
    # NOTE(review): this "word" count vectorizer is configured with
    # analyzer='char_wb' and ngram_range=(2,3), exactly like cCount_Vect and
    # unlike wTfidf_Vect below -- confirm this is intended.
    wCount_Vect = CountVectorizer(tokenizer=module2.dummy, preprocessor=module2.dummy, token_pattern=module2.dummy,
                                  analyzer='char_wb', ngram_range=(2, 3), min_df=50, max_df=0.95, max_features=1500)
    # Not used further in this cell; kept so the notebook namespace is unchanged.
    TFIDF_Trans = TfidfTransformer(use_idf=True, sublinear_tf=False)
    cTfidf_Vect = TfidfVectorizer(analyzer='char_wb', ngram_range=(2, 3), min_df=50, max_df=0.95, max_features=1500)
    wTfidf_Vect = TfidfVectorizer(tokenizer=module2.dummy, preprocessor=module2.dummy, token_pattern=module2.dummy,
                                  ngram_range=(1, 3), min_df=50, max_df=0.95, max_features=1500)

    char_tf, char_tf_top = module2.df2vector(data['tokens'], cCount_Vect, 1500)
    word_tf, word_tf_top = module2.df2vector(data['tokens'], wCount_Vect, 1500)

    char_tfidf, char_tfidf_top = module2.df2vector(data['tokens'], cTfidf_Vect, 1500)
    word_tfidf, word_tfidf_top = module2.df2vector(data['tokens'], wTfidf_Vect, 1500)
    print("--- %s seconds ---" % (time.time() - start_time))
    del data

    LIWC = module2.LIWC(LIWCPath, filepath)

    # NOTE(review): combo is stacked from the matrices BEFORE they are scaled,
    # so it is the only feature set that is not quantile-transformed -- confirm.
    combo = np.hstack([LIWC, word_tfidf, char_tfidf, word_tf, char_tf])
    char_tf_col = char_tf_top[0].tolist()
    word_tf_col = word_tf_top[0].tolist()
    char_tfidf_col = char_tfidf_top[0].tolist()
    word_tfidf_col = word_tfidf_top[0].tolist()
    LIWC_col = LIWC.columns.tolist()

    # NOTE(review): the scaler is fitted on all rows before the CV split, so
    # test-fold statistics influence the transform applied to the training fold.
    char_tf = scaler.fit_transform(char_tf)
    word_tf = scaler.fit_transform(word_tf)
    char_tfidf = scaler.fit_transform(char_tfidf)
    word_tfidf = scaler.fit_transform(word_tfidf)
    LIWC = scaler.fit_transform(LIWC)

    # Column names per feature set. The combo list is built by concatenation
    # (same order as the hstack above) so LIWC_col is never mutated; previously
    # `feature_col = LIWC_col` followed by .extend() polluted LIWC_col, and
    # char_tf / word_tf had no names at all (empty importance tables).
    feature_columns = {
        'combo': LIWC_col + word_tfidf_col + char_tfidf_col + word_tf_col + char_tf_col,
        'char_tf': char_tf_col,
        'word_tf': word_tf_col,
        'char_tfidf': char_tfidf_col,
        'word_tfidf': word_tfidf_col,
        'LIWC': LIWC_col,
    }

    features = [('combo', combo), ('char_tf', char_tf), ('word_tf', word_tf),
                ('char_tfidf', char_tfidf), ('word_tfidf', word_tfidf), ('LIWC', LIWC)]
    for feature_name, feature in features:
        X = feature
        feature_col = feature_columns[feature_name]

        for y_class in ['JP']:
            y = label[y_class]
            split = 0
            print('\n\n\nPerforming training and classification on:', file, feature_name, y_class)

            for train_index, test_index in skf.split(X, y):

                split += 1
                X_train, X_test = X[train_index, :], X[test_index, :]
                # skf.split yields positions, so index the Series positionally;
                # plain y[...] is label-based and breaks on a non-default index.
                y_train, y_test = y.iloc[train_index], y.iloc[test_index]

                print('\nNow on split#', split, feature_name, y_class)
                for model_name, model in models:
                    start_time = time.time()
                    if model_name == 'lgb':
                        # NOTE(review): the test fold doubles as the
                        # early-stopping eval set, so its scores are optimistic.
                        model.fit(X_train, y_train, eval_set=[(X_test, y_test)], eval_metric='f1_score',
                                  verbose=100, early_stopping_rounds=300)
                        y_pred = model.predict(X_test)
                        y_pred_proba = model.predict_proba(X_test)[:, 1]

                        feature_imp = pd.DataFrame(sorted(zip(model.feature_importances_, feature_col)),
                                                   columns=['Value', 'Feature'])

                        # The plot shows this fold's importances only; the PNG
                        # and the pickle below are overwritten on every fold.
                        fig = plt.figure(figsize=(20, 10))
                        sns.barplot(x="Value", y="Feature",
                                    data=feature_imp.sort_values(by="Value", ascending=False)[:20])
                        plt.title('LightGBM Features (avg over folds)')
                        plt.tight_layout()
                        # Save BEFORE show(): show() may release the figure,
                        # after which savefig writes an empty canvas.
                        plt.savefig(filepath.replace('.pickle', f'_{feature_name}_lgb_importances.png'), format='png')
                        plt.show()
                        plt.close(fig)  # avoid accumulating open figures in the loop
                        with open(filepath.replace('.pickle', LGB_FEATURES_SUFFIX), 'wb') as handle:
                            pickle.dump(feature_imp, handle, protocol=pickle.HIGHEST_PROTOCOL)
                    else:
                        model.fit(X_train, y_train)
                        y_pred = model.predict(X_test)
                        y_pred_proba = model.predict_proba(X_test)[:, 1]

                    f1 = f1_score(y_test, y_pred, average='macro')
                    accuracy = accuracy_score(y_test, y_pred)
                    cm = confusion_matrix(y_test, y_pred)
                    auc = roc_auc_score(y_test, y_pred_proba)
                    seconds = time.time() - start_time
                    print(model_name, accuracy, f1, auc)
                    result = {
                        'file': file,
                        'cv_split#': split,
                        'class': y_class,
                        'Feature': feature_name,
                        'model': model_name,
                        'CM': cm,
                        'Acc': accuracy,
                        'f1-macro': f1,
                        'auc': auc,
                        'time-s': seconds,
                    }
                    result_list.append(result)
                    print("--- %s seconds ---" % seconds)

    all_result_list.extend(result_list)
    result_fileDF = pd.DataFrame(result_list)
    # Was `ResultDummp` (undefined name -> NameError after the first file).
    result_fileDF.to_csv(os.path.join(ResultDump, file + '.csv'))
