In [1]:
import os

pre_processed_path = os.path.join('data', 'csvs', "pre_processed.csv")
pre_normalized_path = os.path.join('data', 'csvs', "pre_normalized.csv")
news_validation_path = os.path.join('data', 'csvs', "news_validation.csv")

data_frame_paths = [pre_processed_path, pre_normalized_path]
data_frame_names = ["pre_processed", "pre_normalized"]

In [2]:
from preprocess import stopwords

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

bag_of_words = CountVectorizer(strip_accents="ascii", lowercase=True, stop_words = stopwords)
bag_of_words_normalized = TfidfVectorizer(strip_accents="ascii", lowercase=True, stop_words = stopwords, use_idf = False, norm='l2')
bag_of_words_idf = TfidfVectorizer(strip_accents="ascii", lowercase=True, stop_words = stopwords)

vectorizers = [bag_of_words, bag_of_words_normalized, bag_of_words_idf]
vectorizers_names = ["CountVectorizer", "TfidfVectorizer_normalized", "TfidfVectorizer_idf"]

In [3]:
algs = []
algs_names = []

In [4]:
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB

Gaussain_NB = GaussianNB()
Bernoulli_NB = BernoulliNB()
Multinomial_NB = MultinomialNB()

algs += [Gaussain_NB, Bernoulli_NB, Multinomial_NB]
algs_names += ["Gaussain_NB", "Bernoulli_NB", "Multinomial_NB"]

In [5]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import ExtraTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier

Decision_Tree_Classifier = DecisionTreeClassifier()
Extra_Tree_Classifier = ExtraTreeClassifier()
Random_Forest_Classifier = RandomForestClassifier()
Extra_Trees_Classifier = ExtraTreesClassifier()

algs += [Decision_Tree_Classifier, Extra_Tree_Classifier, Random_Forest_Classifier, Extra_Trees_Classifier]
algs_names += ["Decision_Tree_Classifier", "Extra_Tree_Classifier", "Random_Forest_Classifier", "Extra_Trees_Classifier"]

In [6]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

KNN = KNeighborsClassifier()
svc = SVC()

algs += [KNN, svc]
algs_names += ["KNN", "SVC"]

In [7]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier

AdaBoost = AdaBoostClassifier()
GradientBoost = GradientBoostingClassifier()

algs += [AdaBoost, GradientBoost]
algs_names += ["AdaBoost", "GradientBoost"]

In [8]:
# from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

LGBM = LGBMClassifier()
CatBoost = CatBoostClassifier()

algs += [LGBM, CatBoost]
algs_names += ["LGBM", "CatBoost"]

In [9]:
import importlib
import sys

import pandas as pd
import get_results

sys.path.append( '.' )
importlib.reload(get_results)

df = get_results.get_result_data_frames(algs, algs_names, vectorizers, vectorizers_names, data_frame_paths, data_frame_names, news_validation_path)
df = df.sort_values(by=['algorithm',"accuracy"], ascending=[True, False]).reset_index(drop=True)
df

Learning rate set to 0.020553
0:	learn: 0.6706658	total: 572ms	remaining: 9m 31s
1:	learn: 0.6498472	total: 901ms	remaining: 7m 29s
2:	learn: 0.6290009	total: 1.24s	remaining: 6m 50s
3:	learn: 0.6068992	total: 1.56s	remaining: 6m 27s
4:	learn: 0.5851575	total: 1.89s	remaining: 6m 16s
5:	learn: 0.5668940	total: 2.24s	remaining: 6m 10s
6:	learn: 0.5512990	total: 2.56s	remaining: 6m 3s
7:	learn: 0.5359346	total: 2.89s	remaining: 5m 58s
8:	learn: 0.5189201	total: 3.22s	remaining: 5m 55s
9:	learn: 0.5057053	total: 3.58s	remaining: 5m 54s
10:	learn: 0.4906879	total: 3.93s	remaining: 5m 53s
11:	learn: 0.4767109	total: 4.27s	remaining: 5m 51s
12:	learn: 0.4631301	total: 4.59s	remaining: 5m 48s
13:	learn: 0.4516623	total: 4.94s	remaining: 5m 48s
14:	learn: 0.4394390	total: 5.29s	remaining: 5m 47s
15:	learn: 0.4292774	total: 5.64s	remaining: 5m 46s
16:	learn: 0.4181766	total: 5.98s	remaining: 5m 45s
17:	learn: 0.4081778	total: 6.32s	remaining: 5m 44s
18:	learn: 0.3990135	total: 6.65s	remaining: 

Unnamed: 0,algorithm,dataset,vectorizer,accuracy,precision,recall,f1,validation_accuracy,train_time,predict_time
0,AdaBoost,pre_processed,CountVectorizer,0.9444,0.9448,0.9448,0.9448,0.5833,671.54,22.74
1,AdaBoost,pre_processed,TfidfVectorizer_idf,0.9403,0.9435,0.9374,0.9405,0.5000,427.41,18.12
2,AdaBoost,pre_processed,TfidfVectorizer_normalized,0.9398,0.9443,0.9356,0.9399,0.6667,431.03,18.11
3,AdaBoost,pre_normalized,TfidfVectorizer_normalized,0.8398,0.8270,0.8620,0.8441,0.7500,211.98,10.91
4,AdaBoost,pre_normalized,CountVectorizer,0.8366,0.8449,0.8270,0.8359,0.8333,209.87,10.83
...,...,...,...,...,...,...,...,...,...,...
73,SVC,pre_processed,TfidfVectorizer_idf,0.9588,0.9520,0.9669,0.9594,0.5833,1615.46,1151.44
74,SVC,pre_processed,CountVectorizer,0.9583,0.9569,0.9604,0.9587,0.5000,435.41,307.20
75,SVC,pre_normalized,TfidfVectorizer_normalized,0.9037,0.9111,0.8960,0.9035,0.9167,1531.28,1095.53
76,SVC,pre_normalized,TfidfVectorizer_idf,0.9028,0.9102,0.8951,0.9026,0.8333,1673.35,1292.31


In [10]:
df.sort_values(by=["accuracy"], ascending=[False]).reset_index(drop=True)

Unnamed: 0,algorithm,dataset,vectorizer,accuracy,precision,recall,f1,validation_accuracy,train_time,predict_time
0,SVC,pre_processed,TfidfVectorizer_normalized,0.9620,0.9564,0.9687,0.9625,0.5833,979.00,760.00
1,CatBoost,pre_processed,CountVectorizer,0.9597,0.9604,0.9595,0.9600,0.5000,429.98,35.29
2,SVC,pre_processed,TfidfVectorizer_idf,0.9588,0.9520,0.9669,0.9594,0.5833,1615.46,1151.44
3,SVC,pre_processed,CountVectorizer,0.9583,0.9569,0.9604,0.9587,0.5000,435.41,307.20
4,CatBoost,pre_processed,TfidfVectorizer_normalized,0.9569,0.9585,0.9558,0.9572,0.5000,710.60,36.26
...,...,...,...,...,...,...,...,...,...,...
73,Gaussain_NB,pre_processed,CountVectorizer,0.6181,0.9456,0.2557,0.4026,0.5000,8.89,3.46
74,Gaussain_NB,pre_processed,TfidfVectorizer_idf,0.6139,0.9470,0.2466,0.3912,0.5000,5.78,3.26
75,KNN,pre_normalized,CountVectorizer,0.6060,0.5667,0.9227,0.7021,0.5000,0.20,10.26
76,Multinomial_NB,pre_processed,TfidfVectorizer_idf,0.6023,0.9914,0.2116,0.3487,0.5000,1.36,0.46


In [11]:
df.to_csv("results/models_results.csv", index=False)