In [6]:
import os

# Every CSV used by this notebook lives under data/csvs.
csv_dir = os.path.join('data', 'csvs')

pre_processed_path = os.path.join(csv_dir, "pre_processed.csv")
pre_normalized_path = os.path.join(csv_dir, "pre_normalized.csv")
news_validation_path = os.path.join(csv_dir, "news_validation.csv")

# Parallel lists: data_frame_names[i] is the label used for data_frame_paths[i].
data_frame_names = ["pre_processed", "pre_normalized"]
data_frame_paths = [pre_processed_path, pre_normalized_path]

In [7]:
from preprocess import stopwords

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Text preprocessing options shared by all three vectorizers.
_shared_text_options = dict(strip_accents="ascii", lowercase=True, stop_words=stopwords)

# Plain term counts.
bag_of_words = CountVectorizer(**_shared_text_options)
# Term frequencies with IDF weighting switched off and L2 normalisation.
bag_of_words_normalized = TfidfVectorizer(use_idf=False, norm='l2', **_shared_text_options)
# TF-IDF using the vectorizer's remaining defaults.
bag_of_words_idf = TfidfVectorizer(**_shared_text_options)

# Parallel lists: vectorizers_names[i] is the label reported for vectorizers[i].
vectorizers_names = ["CountVectorizer", "TfidfVectorizer_normalized", "TfidfVectorizer_idf"]
vectorizers = [bag_of_words, bag_of_words_normalized, bag_of_words_idf]

In [8]:
# Fresh, independent registries; the cells below append estimators and their
# labels to these two lists in matching order.
algs, algs_names = [], []

In [None]:
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB

# One instance of each Naive Bayes variant, all with library defaults.
# NOTE: the spelling "Gaussain" (sic) is kept on purpose; it is the label that
# appears in the recorded results table.
Gaussain_NB, Bernoulli_NB, Multinomial_NB = GaussianNB(), BernoulliNB(), MultinomialNB()

# Register the estimators and their labels in the same order.
algs.extend([Gaussain_NB, Bernoulli_NB, Multinomial_NB])
algs_names.extend(["Gaussain_NB", "Bernoulli_NB", "Multinomial_NB"])

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import ExtraTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier

# Tree-based estimators are stochastic (random feature/threshold selection,
# bootstrapping). Without a fixed seed every run of the notebook produces a
# slightly different comparison table, so the seed is pinned here.
# This only fixes estimator-level randomness; anything random inside
# get_results (e.g. data splitting) is outside this cell's control.
RANDOM_STATE = 42

Decision_Tree_Classifier = DecisionTreeClassifier(random_state=RANDOM_STATE)
Extra_Tree_Classifier = ExtraTreeClassifier(random_state=RANDOM_STATE)
Random_Forest_Classifier = RandomForestClassifier(random_state=RANDOM_STATE)
Extra_Trees_Classifier = ExtraTreesClassifier(random_state=RANDOM_STATE)

# Register the estimators and their labels in the same order.
algs += [Decision_Tree_Classifier, Extra_Tree_Classifier, Random_Forest_Classifier, Extra_Trees_Classifier]
algs_names += ["Decision_Tree_Classifier", "Extra_Tree_Classifier", "Random_Forest_Classifier", "Extra_Trees_Classifier"]

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

# k-nearest-neighbours and a support vector classifier, both with library defaults.
KNN, svc = KNeighborsClassifier(), SVC()

# Register the estimators and their labels in the same order.
algs.extend([KNN, svc])
algs_names.extend(["KNN", "SVC"])

In [None]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier

# Both boosting estimators use randomness while fitting; pin the seed so the
# reported metrics do not drift between runs of the notebook.
# This only fixes estimator-level randomness; anything random inside
# get_results is outside this cell's control.
RANDOM_STATE = 42

AdaBoost = AdaBoostClassifier(random_state=RANDOM_STATE)
GradientBoost = GradientBoostingClassifier(random_state=RANDOM_STATE)

# Register the estimators and their labels in the same order.
algs += [AdaBoost, GradientBoost]
algs_names += ["AdaBoost", "GradientBoost"]

In [9]:
# from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

# Gradient-boosting implementations from LightGBM and CatBoost, default settings.
LGBM, CatBoost = LGBMClassifier(), CatBoostClassifier()

# Register the estimators and their labels in the same order.
algs.extend([LGBM, CatBoost])
algs_names.extend(["LGBM", "CatBoost"])

In [10]:
import importlib
import sys

import pandas as pd

# The notebook directory must be on sys.path *before* get_results is imported
# for the entry to have any effect on that import. The membership guard keeps
# re-runs of this cell from appending '.' again and again.
if '.' not in sys.path:
    sys.path.append('.')

import get_results

# Re-execute the module so edits made to get_results since the first import
# are picked up without restarting the kernel.
importlib.reload(get_results)

# Evaluate every registered algorithm / vectorizer / dataset combination.
df = get_results.get_result_data_frames(algs, algs_names, vectorizers, vectorizers_names, data_frame_paths, data_frame_names, news_validation_path)
# Group rows by algorithm name, highest accuracy first within each algorithm.
df = df.sort_values(by=['algorithm', "accuracy"], ascending=[True, False]).reset_index(drop=True)
df

In [None]:
# Overall leaderboard: every row ranked by accuracy, best first, with a fresh index.
df.sort_values("accuracy", ascending=False).reset_index(drop=True)

Unnamed: 0,algorithm,dataset,vectorizer,accuracy,precision,recall,f1,train_time,predict_time
0,CatBoost,pre_processed,TfidfVectorizer_normalized,0.9662,0.9575,0.9731,0.9652,774.72,35.03
1,LGBM,pre_processed,TfidfVectorizer_normalized,0.9630,0.9651,0.9615,0.9633,17.94,0.41
2,LGBM,pre_processed,TfidfVectorizer_idf,0.9630,0.9638,0.9620,0.9629,14.83,0.28
3,LGBM,pre_processed,CountVectorizer,0.9620,0.9577,0.9666,0.9621,12.57,0.98
4,CatBoost,pre_processed,CountVectorizer,0.9606,0.9520,0.9690,0.9604,590.08,53.07
...,...,...,...,...,...,...,...,...,...
73,KNN,pre_processed,CountVectorizer,0.6366,0.5769,0.9991,0.7314,0.00,25.07
74,Gaussain_NB,pre_processed,TfidfVectorizer_normalized,0.6167,0.9512,0.2511,0.3974,10.67,5.31
75,Multinomial_NB,pre_processed,TfidfVectorizer_idf,0.6153,1.0000,0.2255,0.3681,1.42,0.49
76,Gaussain_NB,pre_processed,TfidfVectorizer_idf,0.6120,0.9446,0.2493,0.3945,6.28,3.33


In [None]:
# Optional export, disabled by default: uncommenting the line below writes
# `df` to results/models_results.csv without the index column.
# df.to_csv("results/models_results.csv", index=False)