In [1]:
import os

# All CSV inputs sit under data/csvs, so the three paths are built in one pass.
# (The former "pre_normalized.csv" input is currently disabled.)
pre_processed_path, news_validation_path, covid_validation_path = (
    os.path.join('data', 'csvs', csv_name)
    for csv_name in ("pre_processed.csv", "news_validation.csv", "covid.csv")
)

# Datasets handed to the training run, with one label per path.
data_frame_names = ["pre_processed"]
data_frame_paths = [pre_processed_path]

# Extra CSVs passed to the training run as validation inputs.
validations_paths = [news_validation_path, covid_validation_path]

In [2]:
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

def add_vectorizer(codes):
    """Create a trio of text vectorizers for every vocabulary code.

    For each entry of ``codes`` three vectorizers are built, in this order:
    a ``CountVectorizer``, a ``TfidfVectorizer`` with IDF switched off and L2
    normalisation, and a ``TfidfVectorizer`` with its default settings.  All
    of them strip accents to ASCII and lowercase the text.

    Args:
        codes: Iterable of vocabulary codes.  A code equal to ``0`` means
            "no fixed vocabulary" and is labelled with the ``_max`` suffix.
            Any other code loads
            ``data/vocabularies/filtered_vocab_<code>.csv`` (which must have
            ``word`` and ``index`` columns) and uses the word -> index mapping
            as the fixed vocabulary; the label suffix is ``_<code>``.

    Returns:
        A ``(vectorizers, vectorizers_names)`` pair of equally long lists,
        three entries per code, in the order the codes were given.
    """
    common_options = {"strip_accents": "ascii", "lowercase": True}
    built = []
    labels = []

    for code in codes:
        if code == 0:
            suffix = "_max"
            vocab = None
        else:
            suffix = "_" + str(code)
            vocab_file = os.path.join('data', 'vocabularies', "filtered_vocab_" + str(code) + ".csv")
            vocab_table = pd.read_csv(vocab_file).set_index('word')
            vocab = vocab_table.to_dict()['index']

        # The same vocabulary object is handed to all three vectorizers.
        built.extend([
            CountVectorizer(vocabulary=vocab, **common_options),
            TfidfVectorizer(vocabulary=vocab, use_idf=False, norm='l2', **common_options),
            TfidfVectorizer(vocabulary=vocab, **common_options),
        ])
        labels.extend([
            "CountVectorizer" + suffix,
            "TfidfVectorizer_norm" + suffix,
            "TfidfVectorizer_idf" + suffix,
        ])

    return built, labels

In [3]:
# Code 0 asks for vectorizers without a fixed vocabulary; their names get the "_max" suffix.
vectorizers, vectorizers_names = add_vectorizer(codes=[0])

In [4]:
# Empty registries: the estimator objects and, at matching positions, their labels.
algs, algs_names = [], []

In [24]:
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB

# Naive Bayes family, all with default hyper-parameters.
# NOTE: the "Gaussain" spelling (sic) is kept because the label is passed on
# to the training run as the algorithm's name.
Gaussain_NB = GaussianNB()
Bernoulli_NB = BernoulliNB()
Multinomial_NB = MultinomialNB()

# Register by name: an existing entry is replaced rather than appended, so
# re-running this cell does not leave duplicate estimators in `algs`.
for _name, _alg in [
    ("Gaussain_NB", Gaussain_NB),
    ("Bernoulli_NB", Bernoulli_NB),
    ("Multinomial_NB", Multinomial_NB),
]:
    if _name in algs_names:
        algs[algs_names.index(_name)] = _alg
    else:
        algs.append(_alg)
        algs_names.append(_name)

In [25]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import ExtraTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier

# Tree-based classifiers, all with default hyper-parameters.
# NOTE(review): no random_state is passed, so these estimators are not seeded here.
Decision_Tree_Classifier = DecisionTreeClassifier()
Extra_Tree_Classifier = ExtraTreeClassifier()
Random_Forest_Classifier = RandomForestClassifier()
Extra_Trees_Classifier = ExtraTreesClassifier()

# Register by name: an existing entry is replaced rather than appended, so
# re-running this cell does not leave duplicate estimators in `algs`.
for _name, _alg in [
    ("Decision_Tree_Classifier", Decision_Tree_Classifier),
    ("Extra_Tree_Classifier", Extra_Tree_Classifier),
    ("Random_Forest_Classifier", Random_Forest_Classifier),
    ("Extra_Trees_Classifier", Extra_Trees_Classifier),
]:
    if _name in algs_names:
        algs[algs_names.index(_name)] = _alg
    else:
        algs.append(_alg)
        algs_names.append(_name)

In [26]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

# k-nearest-neighbours and support-vector classifiers with default hyper-parameters.
knn = KNeighborsClassifier()
svc = SVC()

# Register by name: an existing entry is replaced rather than appended, so
# re-running this cell does not leave duplicate estimators in `algs`.
for _name, _alg in [("KNN", knn), ("SVC", svc)]:
    if _name in algs_names:
        algs[algs_names.index(_name)] = _alg
    else:
        algs.append(_alg)
        algs_names.append(_name)

In [27]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier

# scikit-learn boosting ensembles with default hyper-parameters.
AdaBoost = AdaBoostClassifier()
GradientBoost = GradientBoostingClassifier()

# Register by name: an existing entry is replaced rather than appended, so
# re-running this cell does not leave duplicate estimators in `algs`.
for _name, _alg in [("AdaBoost", AdaBoost), ("GradientBoost", GradientBoost)]:
    if _name in algs_names:
        algs[algs_names.index(_name)] = _alg
    else:
        algs.append(_alg)
        algs_names.append(_name)

In [28]:
# from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

# Third-party gradient-boosting classifiers with default hyper-parameters.
LGBM = LGBMClassifier()
CatBoost = CatBoostClassifier()

# Register by name: an existing entry is replaced rather than appended, so
# re-running this cell does not leave duplicate estimators in `algs`.
for _name, _alg in [("LGBM", LGBM), ("CatBoost", CatBoost)]:
    if _name in algs_names:
        algs[algs_names.index(_name)] = _alg
    else:
        algs.append(_alg)
        algs_names.append(_name)

In [29]:
from sklearn.neural_network import MLPClassifier

# Multi-layer perceptron; the iteration cap is raised to 1000.
mlp = MLPClassifier(max_iter=1000)

# Register by name: an existing entry is replaced rather than appended, so
# re-running this cell does not leave a duplicate estimator in `algs`.
if "MLP" in algs_names:
    algs[algs_names.index("MLP")] = mlp
else:
    algs.append(mlp)
    algs_names.append("MLP")

In [None]:
import importlib
import sys

import pandas as pd

# Extend the module search path *before* importing the local `train` module
# (the append has no effect on an import that has already happened), and only
# once, so re-running this cell does not pile up duplicate entries.
if '.' not in sys.path:
    sys.path.append('.')

import train

# Re-execute train.py so edits made since the kernel first imported it are picked up.
importlib.reload(train)

# Run the training routine over every registered algorithm, vectorizer and
# dataset, passing the validation CSV paths along.
df = train.get_result_data_frames(algs, algs_names, vectorizers, vectorizers_names, data_frame_paths, data_frame_names, validations_paths)

# Group rows by algorithm (A-Z) and, within each algorithm, list the best accuracy first.
df = df.sort_values(by=['algorithm', "accuracy"], ascending=[True, False]).reset_index(drop=True)
df

In [31]:
# Overall ranking: highest accuracy first, with a fresh 0..n-1 index.
df.sort_values("accuracy", ascending=False).reset_index(drop=True)

Unnamed: 0,algorithm,dataset,vectorizer,accuracy,precision,recall,f1,custom_accuracy,covid_accuracy,train_time,predict_time
0,CatBoost,pre_processed,TfidfVectorizer_idf_4096,0.9611,0.9656,0.9568,0.9612,0.5000,0.5333,228.53,2.19
1,Random_Forest_Classifier,pre_processed,TfidfVectorizer_idf_4096,0.9611,0.9631,0.9595,0.9613,0.5000,0.5000,7.25,0.11
2,Extra_Trees_Classifier,pre_processed,CountVectorizer_4096,0.9606,0.9563,0.9660,0.9611,0.5000,0.5000,18.57,0.14
3,SVC,pre_processed,CountVectorizer_4096,0.9606,0.9605,0.9614,0.9609,0.5714,0.5000,12.35,11.74
4,Extra_Trees_Classifier,pre_processed,CountVectorizer_1024,0.9602,0.9613,0.9595,0.9604,0.5714,0.5000,6.23,0.09
...,...,...,...,...,...,...,...,...,...,...,...
121,KNN,pre_processed,CountVectorizer_1024,0.7491,0.6677,0.9982,0.8001,0.5000,0.5000,0.00,0.52
122,KNN,pre_processed,TfidfVectorizer_norm_1024,0.7343,0.9415,0.5032,0.6559,0.6429,0.4667,0.01,0.39
123,KNN,pre_processed,TfidfVectorizer_idf_4096,0.7282,0.8846,0.5290,0.6621,0.6429,0.6333,0.05,1.07
124,KNN,pre_processed,CountVectorizer_4096,0.7019,0.6281,0.9991,0.7713,0.5000,0.5000,0.01,1.26


In [32]:
# DataFrame.to_csv does not create missing folders, so make sure the output
# directory exists before writing the results table.
os.makedirs("results", exist_ok=True)
df.to_csv("results/pre_max.csv", index=False)