In [None]:
import logging
import os
import sys
# `datetime` and `pd` are used by later cells of this notebook. Import them
# explicitly instead of relying on `from models import *` to re-export them.
from datetime import datetime

import pandas as pd
import tqdm

# Main code is in ../models
SRC_DIR = os.path.abspath(os.path.join('..', 'models'))
if SRC_DIR not in sys.path:
    sys.path.append(SRC_DIR)

logging.basicConfig(level=logging.INFO)

# Star import kept so every name the later cells take from `models`
# (model functions, split_train_test, ...) stays available unchanged.
from models import *

In [None]:
def import_data(embedding):
    """Load the CSV of one embedding and split it into train and test sets.

    The file read is ``<SRC_DIR>/embedding_data/<embedding>test.csv``; its
    first column is used as the index.

    Parameters
    ----------
    embedding : str
        Prefix of the CSV file name (the notebook passes e.g. ``'word2vec'``).

    Returns
    -------
    tuple
        ``(X_train, Y_train, X_test, Y_test)`` as returned by
        ``split_train_test``.

    Raises
    ------
    Exception
        Any error raised while building the path or reading the CSV
        (e.g. ``FileNotFoundError``) is logged and then re-raised.
    """
    try:
        csv_path = os.path.join(SRC_DIR, "embedding_data", embedding + "test.csv")
        data = pd.read_csv(csv_path, index_col=0)
    except Exception:
        # Log the hint, then propagate the real error. Swallowing it here
        # would only surface later as an unrelated "data is not defined"
        # failure when the split is attempted.
        logging.error("Error loading embedding data. Possible embedding types are: word2vec and tfidf")
        raise

    logging.info(f"Data loaded {embedding}")
    logging.debug(f"Dataset shape {data.shape}")

    logging.info("Splitting data into train and test")
    X_train, Y_train, X_test, Y_test = split_train_test(data)

    return X_train, Y_train, X_test, Y_test

In [None]:
def fit_predict(model, X_train, Y_train, X_test, Y_test):
    """Run the model function registered under ``model`` on the given split.

    Parameters
    ----------
    model : str
        Key selecting the model function; one of ``'dummy'``,
        ``'naivebayes'``, ``'onevsrest'``, ``'onevsone'``, ``'randomforest'``
        or ``'adaboost'``.
    X_train, Y_train, X_test, Y_test
        Passed through, in this order, to the selected model function.

    Returns
    -------
    The value returned by the selected model function (the notebook treats it
    as a dict of evaluation metrics), or an empty dict when ``model`` is not a
    known key. In that case an error is logged and nothing is raised.
    """
    methods = {
        'dummy': Dummy,
        'naivebayes': NaiveBayes,
        'onevsrest': One_VS_Rest_SVM,
        'onevsone': One_vs_One_SVM,
        'randomforest': RandomForest,
        'adaboost': AdaBoost,
    }

    if model not in methods:
        # Unknown key: report the valid choices and hand back an empty result.
        logging.error(f"{model} unknown. Potential models are: {list(methods.keys())}")
        return {}

    return methods[model](X_train, Y_train, X_test, Y_test)

In [None]:
%%time

embeddings = ['word2vec','tfidf','normbow']
models = ['dummy', 'naivebayes', 'onevsrest', 'onevsone', 'randomforest','adaboost']

eval_list = []
for embedding in embeddings:
    logging.info("\n{0}\n".format(embedding.upper()))
    X_train, Y_train, X_test, Y_test = import_data(embedding)
    logging.info("\n{0}\n".format(20 * "-"))
    for model in models:
        t0 = datetime.now()
        logging.debug("\n{0} ({1})\n".format(model.upper(), embedding.upper()))
        eval_dict = fit_predict(model, X_train, Y_train, X_test, Y_test)        
        t1 = datetime.now()
        eval_dict["Embedding"] = embedding
        eval_dict["Train & test duration (s)"] = (t1 - t0).total_seconds()
#         eval_dict["Train & test duration"] = "{0:.2f} seconds".format((t1 - t0).total_seconds())
        eval_list.append(eval_dict)
        logging.info("\n{0}\n".format(20 * "-"))
    logging.info("\n{0}\n".format(20 * "="))

In [None]:
# One row per entry of eval_list (one entry per model/embedding run).
eval_df = pd.DataFrame(eval_list)

# Display the selected columns with colour gradients: green on the four score
# columns, red on the duration and cross-entropy loss columns.
(
    eval_df[[
        'Model',
        'Embedding',
        'Train & test duration (s)',
        'Accuracy',
        'Precision (class 0)',
        'Recall (class 0)',
        'ROC AUC (ovr)',
        'Cross-entropy loss',
    ]]
    .style
    .background_gradient(
        cmap='Greens',
        subset=['Accuracy', 'Precision (class 0)', 'Recall (class 0)', 'ROC AUC (ovr)'],
    )
    .background_gradient(
        cmap='Reds',
        subset=["Train & test duration (s)", "Cross-entropy loss"],
    )
)