In [1]:
from models import *

In [2]:
def import_data(embedding):
    """Load the CSV for one embedding and split it into train/test sets.

    Reads ``embedding_data/<embedding>test.csv`` with the first column as the
    index, prints the loaded shape, and hands the frame to
    ``split_train_test``.

    Parameters
    ----------
    embedding : str
        Prefix of the CSV file name (e.g. ``'word2vec'`` or ``'tfidf'``).

    Returns
    -------
    tuple
        ``(X_train, Y_train, X_test, Y_test)`` exactly as returned by
        ``split_train_test``.

    Raises
    ------
    FileNotFoundError
        If no CSV exists for ``embedding``. The hint about valid embedding
        names is printed first, then the original error is re-raised.
    """
    csv_path = "embedding_data/" + embedding + "test.csv"

    try:
        data = pd.read_csv(csv_path, index_col=0)
    except FileNotFoundError:
        # Re-raise after the hint: continuing would only fail later with an
        # unrelated "data is not defined" error that hides the real cause.
        print("Error loading embedding data. Possible embedding types are: word2vec and tfidf")
        raise

    print("Data loaded ", embedding)
    print("Dataset shape ", data.shape)

    print("Splitting data into train and test")
    X_train, Y_train, X_test, Y_test = split_train_test(data)

    return X_train, Y_train, X_test, Y_test

In [3]:
def fit_predict(model, X_train, Y_train, X_test, Y_test):
    """Run the classifier selected by ``model`` on the given train/test split.

    Each accepted key forwards the four data arguments unchanged to the
    matching model function; whatever that function returns is discarded.
    An unrecognised key is reported on stdout and otherwise ignored.

    Parameters
    ----------
    model : str
        One of ``'dummy'``, ``'naivebayes'``, ``'onevsrest'``, ``'onevsone'``,
        ``'randomforest'`` or ``'adaboost'``.
    X_train, Y_train, X_test, Y_test
        Train/test features and targets, passed through as-is.
        NOTE(review): presumably the objects produced by ``split_train_test``
        -- confirm against the model functions.

    Returns
    -------
    None
    """
    if model == 'dummy':
        Dummy(X_train, Y_train, X_test, Y_test)
    elif model == 'naivebayes':
        NaiveBayes(X_train, Y_train, X_test, Y_test)
    elif model == 'onevsrest':
        One_VS_Rest_SVM(X_train, Y_train, X_test, Y_test)
    elif model == 'onevsone':
        One_vs_One_SVM(X_train, Y_train, X_test, Y_test)
    elif model == 'randomforest':
        RandomForest(X_train, Y_train, X_test, Y_test)
    elif model == 'adaboost':
        AdaBoost(X_train, Y_train, X_test, Y_test)
    else:
        # Keep this list in sync with the branches above so the hint names
        # every key that is actually accepted.
        print("Error, potential models are: dummy, naivebayes, onevsrest, onevsone, randomforest and adaboost")

In [4]:
# Embedding variants to evaluate; each name is passed to import_data, which
# uses it as the CSV file-name prefix.
embeddings = ['word2vec','tfidf','normbow']
# Full model list including the 'dummy' option, kept for reference:
# models = ['dummy', 'naivebayes', 'onevsrest', 'onevsone', 'randomforest','adaboost']
# Models actually run below ('dummy' is left out).
models = ['naivebayes', 'onevsrest', 'onevsone', 'randomforest','adaboost']

# Load each embedding once, then run every selected model on that same split.
for embedding in embeddings:
    print(embedding.upper())
    X_train, Y_train, X_test, Y_test = import_data(embedding)
    for model in models:
        fit_predict(model, X_train, Y_train, X_test, Y_test)
    # Visual separator between embeddings in the cell output.
    print("\n---------------------\n")

WORD2VEC
Data loaded  word2vec
Dataset shape  (413, 301)
Splitting data into train and test
Training shape  (330, 301)
Test shape  (83, 301)
Training target distributions 
0    140
2    116
1     74
Name: classes, dtype: int64
Test target distributions 
0    31
1    27
2    25
Name: classes, dtype: int64

'Model': 'naive bayes', 'Embedding': 'TBC',
'Accuracy': 0.711,
'Precision (class 0)': 0.826,'Recall (class 0)': 0.613,

'Model': 'one vs rest svm', 'Embedding': 'TBC',
'Accuracy': 0.711,
'Precision (class 0)': 0.792,'Recall (class 0)': 0.613,

'Model': 'one vs one svm', 'Embedding': 'TBC',
'Accuracy': 0.687,
'Precision (class 0)': 0.792,'Recall (class 0)': 0.613,

'Model': 'random forest', 'Embedding': 'TBC',
'Accuracy': 0.675,
'Precision (class 0)': 0.767,'Recall (class 0)': 0.742,

'Model': 'ada boost', 'Embedding': 'TBC',
'Accuracy': 0.735,
'Precision (class 0)': 0.786,'Recall (class 0)': 0.710,

---------------------

TFIDF
Data loaded  tfidf
Dataset shape  (413, 3592)
Splitting d

In [23]:
import pandas as pd

# Summary table of hand-transcribed evaluation metrics: one row per
# (model, embedding) pair, in the order (Model, Embedding, Accuracy,
# Precision (class 0), Recall (class 0)).
# The naive bayes results are kept as comments and excluded from the table.
eval_df = pd.DataFrame(
    data=[
        # --- word2vec ---
        # ('naive bayes', 'word2vec', 0.711, 0.826, 0.613),
        ('one vs rest svm', 'word2vec', 0.711, 0.792, 0.613),
        ('one vs one svm', 'word2vec', 0.687, 0.792, 0.613),
        ('random forest', 'word2vec', 0.675, 0.767, 0.742),
        ('ada boost', 'word2vec', 0.735, 0.786, 0.710),
        # --- tfidf ---
        # ('naive bayes', 'tfidf', 0.639, 0.692, 0.581),
        ('one vs rest svm', 'tfidf', 0.807, 0.828, 0.774),
        ('one vs one svm', 'tfidf', 0.783, 0.815, 0.710),
        ('random forest', 'tfidf', 0.795, 0.800, 0.903),
        ('ada boost', 'tfidf', 0.759, 0.781, 0.806),
        # --- normbow ---
        # ('naive bayes', 'normbow', 0.639, 0.741, 0.645),
        ('one vs rest svm', 'normbow', 0.795, 0.821, 0.742),
        ('one vs one svm', 'normbow', 0.795, 0.828, 0.774),
        ('random forest', 'normbow', 0.795, 0.812, 0.839),
        ('ada boost', 'normbow', 0.723, 0.786, 0.710),
    ],
    columns=[
        'Model',
        'Embedding',
        'Accuracy',
        'Precision (class 0)',
        'Recall (class 0)',
    ],
)

# Display the table with a 'BuGn' background colour gradient. This is left as
# the cell's final bare expression so the notebook renders the result.
# NOTE(review): the author left a disabled `.sort_values(by=["Model"])` here;
# presumably it would have to be applied to eval_df before `.style` -- confirm.
eval_df.style.background_gradient(cmap='BuGn')

Unnamed: 0,Model,Embedding,Accuracy,Precision (class 0),Recall (class 0)
0,one vs rest svm,word2vec,0.711,0.792,0.613
1,one vs one svm,word2vec,0.687,0.792,0.613
2,random forest,word2vec,0.675,0.767,0.742
3,ada boost,word2vec,0.735,0.786,0.71
4,one vs rest svm,tfidf,0.807,0.828,0.774
5,one vs one svm,tfidf,0.783,0.815,0.71
6,random forest,tfidf,0.795,0.8,0.903
7,ada boost,tfidf,0.759,0.781,0.806
8,one vs rest svm,normbow,0.795,0.821,0.742
9,one vs one svm,normbow,0.795,0.828,0.774
