In [1]:
import os

pre_processed_path = os.path.join('..', 'data', 'csvs', "pre_processed.csv")
# pre_normalized_path = os.path.join('..', 'data', 'csvs', "pre_normalized.csv")
news_validation_path = os.path.join('..', 'data', 'csvs', "news_validation.csv")
covid_validation_path = os.path.join('..', 'data', 'csvs', "covid.csv")

data_frame_paths = [pre_processed_path]
data_frame_names = ["pre_processed"]

validations_paths = [news_validation_path, covid_validation_path]

In [2]:
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

def add_vectorizer(codes):
    vectorizers = []
    vectorizers_names = []

    for code in codes:
        add_code = "_max" if code == 0 else ("_" + str(code))

        if code == 0:
            filtered_vocab = None
        else:
            filtered_vocab_path = os.path.join('..', 'data', 'vocabularies', "filtered_vocab_" + str(code) + ".csv")
            filtered_vocab = pd.read_csv(filtered_vocab_path).set_index('word').to_dict()['index']

        vectorizer = CountVectorizer(strip_accents="ascii", lowercase=True, vocabulary=filtered_vocab)
        vectorizer_norm = TfidfVectorizer(strip_accents="ascii", lowercase=True, vocabulary=filtered_vocab, use_idf = False, norm='l2')
        vectorizer_idf = TfidfVectorizer(strip_accents="ascii", lowercase=True, vocabulary=filtered_vocab)

        vectorizers += [vectorizer, vectorizer_norm, vectorizer_idf]
        vectorizers_names += ["CountVectorizer" + add_code, "TfidfVectorizer_norm" + add_code, "TfidfVectorizer_idf" + add_code]

    return vectorizers, vectorizers_names

In [4]:
vectorizers, vectorizers_names = add_vectorizer([0])

In [5]:
algs = []
algs_names = []

In [6]:
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB

Gaussain_NB = GaussianNB()
Bernoulli_NB = BernoulliNB()
Multinomial_NB = MultinomialNB()

algs += [Gaussain_NB, Bernoulli_NB, Multinomial_NB]
algs_names += ["Gaussain_NB", "Bernoulli_NB", "Multinomial_NB"]

In [7]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import ExtraTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier

Decision_Tree_Classifier = DecisionTreeClassifier()
Extra_Tree_Classifier = ExtraTreeClassifier()
Random_Forest_Classifier = RandomForestClassifier()
Extra_Trees_Classifier = ExtraTreesClassifier()

algs += [Decision_Tree_Classifier, Extra_Tree_Classifier, Random_Forest_Classifier, Extra_Trees_Classifier]
algs_names += ["Decision_Tree_Classifier", "Extra_Tree_Classifier", "Random_Forest_Classifier", "Extra_Trees_Classifier"]

In [8]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

knn = KNeighborsClassifier()
svc = SVC()

algs += [knn, svc]
algs_names += ["KNN", "SVC"]

In [9]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier

AdaBoost = AdaBoostClassifier()
GradientBoost = GradientBoostingClassifier()

algs += [AdaBoost, GradientBoost]
algs_names += ["AdaBoost", "GradientBoost"]

In [10]:
# from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

LGBM = LGBMClassifier()
CatBoost = CatBoostClassifier()

algs += [LGBM, CatBoost]
algs_names += ["LGBM", "CatBoost"]

In [11]:
from sklearn.neural_network import MLPClassifier

mlp = MLPClassifier(max_iter=1000)

algs += [mlp]
algs_names += ["MLP"]

In [12]:
import importlib
import sys

import pandas as pd
import train

sys.path.append( '.' )
importlib.reload(train)

df = train.get_result_data_frames(algs, algs_names, vectorizers, vectorizers_names, data_frame_paths, data_frame_names, validations_paths)
df = df.sort_values(by=['algorithm',"accuracy"], ascending=[True, False]).reset_index(drop=True)
df

Learning rate set to 0.020553
0:	learn: 0.6702545	total: 518ms	remaining: 8m 37s
1:	learn: 0.6497593	total: 818ms	remaining: 6m 47s
2:	learn: 0.6254244	total: 1.11s	remaining: 6m 10s
3:	learn: 0.6027634	total: 1.43s	remaining: 5m 57s
4:	learn: 0.5833463	total: 1.74s	remaining: 5m 45s
5:	learn: 0.5639826	total: 2.03s	remaining: 5m 36s
6:	learn: 0.5437388	total: 2.35s	remaining: 5m 32s
7:	learn: 0.5261560	total: 2.64s	remaining: 5m 27s
8:	learn: 0.5097533	total: 2.95s	remaining: 5m 24s
9:	learn: 0.4935026	total: 3.25s	remaining: 5m 21s
10:	learn: 0.4815549	total: 3.56s	remaining: 5m 20s
11:	learn: 0.4680263	total: 3.86s	remaining: 5m 17s
12:	learn: 0.4552471	total: 4.16s	remaining: 5m 15s
13:	learn: 0.4456582	total: 4.45s	remaining: 5m 13s
14:	learn: 0.4348680	total: 4.77s	remaining: 5m 13s
15:	learn: 0.4229787	total: 5.08s	remaining: 5m 12s
16:	learn: 0.4129972	total: 5.38s	remaining: 5m 10s
17:	learn: 0.4032268	total: 5.67s	remaining: 5m 9s
18:	learn: 0.3943526	total: 5.96s	remaining: 

Unnamed: 0,algorithm,dataset,vectorizer,accuracy,precision,recall,f1,custom_accuracy,covid_accuracy,train_time,predict_time
0,AdaBoost,pre_processed,TfidfVectorizer_norm_max,0.944,0.9472,0.9411,0.9442,0.6429,0.5667,438.69,18.18
1,AdaBoost,pre_processed,TfidfVectorizer_idf_max,0.9426,0.9446,0.9411,0.9429,0.5714,0.5333,439.32,18.25
2,AdaBoost,pre_processed,CountVectorizer_max,0.9398,0.937,0.9439,0.9404,0.6429,0.5,432.18,18.24
3,Bernoulli_NB,pre_processed,CountVectorizer_max,0.9157,0.8688,0.9807,0.9213,0.5714,0.5,11.59,1.63
4,Bernoulli_NB,pre_processed,TfidfVectorizer_norm_max,0.9157,0.8688,0.9807,0.9213,0.5714,0.5,3.39,1.36
5,Bernoulli_NB,pre_processed,TfidfVectorizer_idf_max,0.9157,0.8688,0.9807,0.9213,0.5714,0.5,3.46,1.4
6,CatBoost,pre_processed,TfidfVectorizer_idf_max,0.9579,0.9603,0.9558,0.958,0.5714,0.5333,721.47,34.93
7,CatBoost,pre_processed,CountVectorizer_max,0.9574,0.9543,0.9614,0.9578,0.5714,0.5,378.75,34.07
8,CatBoost,pre_processed,TfidfVectorizer_norm_max,0.9574,0.9594,0.9558,0.9576,0.5714,0.5333,716.87,35.22
9,Decision_Tree_Classifier,pre_processed,CountVectorizer_max,0.9028,0.9102,0.8951,0.9026,0.6429,0.5333,79.58,0.55


In [13]:
df.sort_values(by=["accuracy"], ascending=[False]).reset_index(drop=True)

Unnamed: 0,algorithm,dataset,vectorizer,accuracy,precision,recall,f1,custom_accuracy,covid_accuracy,train_time,predict_time
0,SVC,pre_processed,TfidfVectorizer_idf_max,0.9602,0.9554,0.966,0.9607,0.5714,0.5333,1591.13,1146.91
1,SVC,pre_processed,TfidfVectorizer_norm_max,0.9602,0.9571,0.9641,0.9606,0.5714,0.5333,923.85,742.23
2,SVC,pre_processed,CountVectorizer_max,0.9579,0.956,0.9604,0.9582,0.5714,0.5,394.49,258.14
3,CatBoost,pre_processed,TfidfVectorizer_idf_max,0.9579,0.9603,0.9558,0.958,0.5714,0.5333,721.47,34.93
4,LGBM,pre_processed,CountVectorizer_max,0.9579,0.9646,0.9512,0.9579,0.5714,0.5,6.5,0.64
5,CatBoost,pre_processed,CountVectorizer_max,0.9574,0.9543,0.9614,0.9578,0.5714,0.5,378.75,34.07
6,CatBoost,pre_processed,TfidfVectorizer_norm_max,0.9574,0.9594,0.9558,0.9576,0.5714,0.5333,716.87,35.22
7,Extra_Trees_Classifier,pre_processed,TfidfVectorizer_idf_max,0.9569,0.9486,0.9669,0.9576,0.5,0.5,140.12,0.75
8,LGBM,pre_processed,TfidfVectorizer_norm_max,0.9565,0.968,0.9448,0.9562,0.5714,0.5333,13.86,0.26
9,MLP,pre_processed,TfidfVectorizer_norm_max,0.9551,0.9705,0.9393,0.9547,0.6429,0.8,581.04,0.76


In [14]:
df.to_csv("results/pre_max.csv", index=False)