In [26]:
import os

# Dataset label -> CSV location. Insertion order matters: the labels and the
# paths are handed on as two parallel lists, so position i of one must match
# position i of the other.
_csv_by_dataset = {
    "pre_processed": os.path.join('data', 'csvs', "pre_processed.csv"),
    "pre_normalized": os.path.join('data', 'csvs', "pre_normalized.csv"),
}

pre_processed_path = _csv_by_dataset["pre_processed"]
pre_normalized_path = _csv_by_dataset["pre_normalized"]

data_frame_names = list(_csv_by_dataset)
data_frame_paths = list(_csv_by_dataset.values())

In [27]:
from preprocess import stopwords

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Text-cleaning options applied identically by all three vectorizers:
# strip accents to ASCII, lowercase, and drop the project's stop-word list.
_text_options = dict(strip_accents="ascii", lowercase=True, stop_words=stopwords)

# Raw term counts.
bag_of_words = CountVectorizer(**_text_options)
# Term frequencies scaled to unit L2 norm, with IDF weighting switched off.
bag_of_words_normalized = TfidfVectorizer(use_idf=False, norm='l2', **_text_options)
# TF-IDF with the library's default weighting settings.
bag_of_words_idf = TfidfVectorizer(**_text_options)

# Parallel lists: vectorizers_names[i] is the label for vectorizers[i].
vectorizers_names = ["CountVectorizer", "TfidfVectorizer_normalized", "TfidfVectorizer_idf"]
vectorizers = [bag_of_words, bag_of_words_normalized, bag_of_words_idf]

In [28]:
# Registry of classifiers to benchmark and their display labels (parallel
# lists). The cells below append to both; this cell resets them to empty.
algs, algs_names = [], []

In [29]:
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB

# Naive Bayes family, all with library-default hyperparameters.
# NOTE(review): "Gaussain" is a misspelling of "Gaussian". It is kept as-is
# because the label is emitted verbatim in the results ("algorithm" column)
# and the variable may be referenced elsewhere -- rename both together if at all.
Gaussain_NB, Bernoulli_NB, Multinomial_NB = GaussianNB(), BernoulliNB(), MultinomialNB()

# Register models and labels in the same order so the two lists stay aligned.
algs.extend([Gaussain_NB, Bernoulli_NB, Multinomial_NB])
algs_names.extend(["Gaussain_NB", "Bernoulli_NB", "Multinomial_NB"])

In [30]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import ExtraTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier

# Tree-based classifiers, otherwise using library-default hyperparameters.
#
# All four estimators draw random numbers while fitting (feature permutation
# at each split, random thresholds, bootstrap samples), so leaving
# random_state unset makes the reported scores change from run to run. A fixed
# seed keeps the estimators themselves repeatable under "Restart & Run All".
# NOTE(review): this does not cover any randomness inside
# get_result_data_frames (e.g. how the data is split), which is not visible
# from this notebook -- confirm that it is seeded as well.
_TREE_SEED = 42

Decision_Tree_Classifier = DecisionTreeClassifier(random_state=_TREE_SEED)
Extra_Tree_Classifier = ExtraTreeClassifier(random_state=_TREE_SEED)
Random_Forest_Classifier = RandomForestClassifier(random_state=_TREE_SEED)
Extra_Trees_Classifier = ExtraTreesClassifier(random_state=_TREE_SEED)

# Register models and labels in the same order so the two lists stay aligned.
algs += [Decision_Tree_Classifier, Extra_Tree_Classifier, Random_Forest_Classifier, Extra_Trees_Classifier]
algs_names += ["Decision_Tree_Classifier", "Extra_Tree_Classifier", "Random_Forest_Classifier", "Extra_Trees_Classifier"]

In [31]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

# Instance-based and kernel-based classifiers with library-default settings.
KNN, svc = KNeighborsClassifier(), SVC()

# Register models and labels in the same order; the lowercase `svc` instance
# is reported under the label "SVC".
algs.extend([KNN, svc])
algs_names.extend(["KNN", "SVC"])

In [32]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier

# Boosting ensembles from scikit-learn, otherwise using library defaults.
#
# Both estimators accept random_state and use it while fitting their tree
# base learners; left unset, repeated runs are not guaranteed to produce the
# same model. A fixed seed keeps the benchmark repeatable.
# NOTE(review): randomness inside get_result_data_frames (e.g. data
# splitting) is not visible from this notebook and is not covered here.
_BOOST_SEED = 42

AdaBoost = AdaBoostClassifier(random_state=_BOOST_SEED)
GradientBoost = GradientBoostingClassifier(random_state=_BOOST_SEED)

# Register models and labels in the same order so the two lists stay aligned.
algs += [AdaBoost, GradientBoost]
algs_names += ["AdaBoost", "GradientBoost"]

In [33]:
# from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

# Gradient-boosting implementations from third-party libraries, both
# constructed with their default settings.
LGBM, CatBoost = LGBMClassifier(), CatBoostClassifier()

# Register models and labels in the same order so the two lists stay aligned.
algs.extend([LGBM, CatBoost])
algs_names.extend(["LGBM", "CatBoost"])

In [34]:
import pandas as pd
from get_results import get_result_data_frames

# Run the benchmark over the registered algorithms, vectorizers and datasets,
# then order the rows by algorithm name (A-Z) and, within each algorithm, by
# accuracy from best to worst. The index is renumbered to match the new order.
# NOTE(review): assumes get_result_data_frames returns a single DataFrame with
# at least "algorithm" and "accuracy" columns -- confirm against get_results.
df = (
    get_result_data_frames(
        algs, algs_names,
        vectorizers, vectorizers_names,
        data_frame_paths, data_frame_names,
    )
    .sort_values(by=['algorithm', "accuracy"], ascending=[True, False])
    .reset_index(drop=True)
)
df

In [None]:
# Overall leaderboard: every result row ranked by accuracy, best first.
# Display only -- `df` itself is left unchanged.
(
    df
    .sort_values("accuracy", ascending=False)
    .reset_index(drop=True)
)

Unnamed: 0,algorithm,dataset,vectorizer,accuracy,precision,recall,f1,train_time,predict_time
0,Gaussain_NB,pre_normalized,CountVectorizer,0.7324,0.7325,0.7392,0.7358,4.4,2.37
1,Gaussain_NB,pre_normalized,TfidfVectorizer_idf,0.7157,0.7118,0.7224,0.7171,4.93,2.44
2,Gaussain_NB,pre_normalized,TfidfVectorizer_normalized,0.713,0.7072,0.7269,0.7169,4.16,2.44
3,Gaussain_NB,pre_processed,TfidfVectorizer_idf,0.6361,0.9551,0.2785,0.4313,6.85,3.98
4,Gaussain_NB,pre_processed,TfidfVectorizer_normalized,0.6338,0.9439,0.2816,0.4338,7.19,3.95
5,Gaussain_NB,pre_processed,CountVectorizer,0.6148,0.9291,0.2532,0.398,7.15,3.75


In [None]:
# Persist the results table (without the positional index column).
# to_csv does not create missing directories, and this cell runs only after
# the whole benchmark has finished, so make sure the target folder exists
# first rather than losing the run to an OSError on a fresh checkout.
# `os` is imported in the first cell of this notebook.
results_path = "results/models_results.csv"
os.makedirs(os.path.dirname(results_path), exist_ok=True)
df.to_csv(results_path, index=False)