In [14]:
import os

# Labels of the training datasets to benchmark; each label is also the
# stem of its CSV file under data/csvs.
data_frame_names = ["pre_processed", "pre_normalized"]
data_frame_paths = [os.path.join('data', 'csvs', label + ".csv") for label in data_frame_names]
pre_processed_path, pre_normalized_path = data_frame_paths

# Separate CSV handed to get_results as the validation input.
news_validation_path = os.path.join('data', 'csvs', "news_validation.csv")

In [15]:
from preprocess import stopwords

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Text pre-processing options shared by every vectorizer below.
# NOTE(review): accents are stripped from the documents; if `stopwords`
# contains accented entries they may no longer match the tokens - confirm.
shared_text_options = dict(strip_accents="ascii", lowercase=True, stop_words=stopwords)

# Raw term counts.
bag_of_words = CountVectorizer(**shared_text_options)
# Term frequencies scaled to unit L2 norm, with IDF weighting switched off.
bag_of_words_normalized = TfidfVectorizer(use_idf=False, norm='l2', **shared_text_options)
# TF-IDF with the library's default weighting options.
bag_of_words_idf = TfidfVectorizer(**shared_text_options)

# Parallel lists: vectorizers_names[i] is the label reported for vectorizers[i].
vectorizers_names = ["CountVectorizer", "TfidfVectorizer_normalized", "TfidfVectorizer_idf"]
vectorizers = [bag_of_words, bag_of_words_normalized, bag_of_words_idf]

In [16]:
# Start from an empty model registry. The classifier cells below append to
# these two parallel lists, so re-run this cell before re-running any of them.
algs, algs_names = [], []

In [4]:
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB

# Naive Bayes family with default hyper-parameters.
# The "Gaussain" spelling is kept on purpose: the label is emitted into the
# results table, so renaming it would change the recorded output.
Gaussain_NB, Bernoulli_NB, Multinomial_NB = GaussianNB(), BernoulliNB(), MultinomialNB()

# Appends to the shared registry; running this cell twice without resetting
# `algs` / `algs_names` registers these models twice.
algs.extend([Gaussain_NB, Bernoulli_NB, Multinomial_NB])
algs_names.extend(["Gaussain_NB", "Bernoulli_NB", "Multinomial_NB"])

In [5]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import ExtraTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier

# Single-tree and tree-ensemble classifiers with default hyper-parameters.
Decision_Tree_Classifier, Extra_Tree_Classifier = DecisionTreeClassifier(), ExtraTreeClassifier()
Random_Forest_Classifier, Extra_Trees_Classifier = RandomForestClassifier(), ExtraTreesClassifier()

# Appends to the shared registry; running this cell twice without resetting
# `algs` / `algs_names` registers these models twice.
algs.extend([
    Decision_Tree_Classifier,
    Extra_Tree_Classifier,
    Random_Forest_Classifier,
    Extra_Trees_Classifier,
])
algs_names.extend([
    "Decision_Tree_Classifier",
    "Extra_Tree_Classifier",
    "Random_Forest_Classifier",
    "Extra_Trees_Classifier",
])

In [6]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

# Nearest-neighbour and support-vector classifiers with default hyper-parameters.
knn, svc = KNeighborsClassifier(), SVC()

# Appends to the shared registry; running this cell twice without resetting
# `algs` / `algs_names` registers these models twice.
algs.extend([knn, svc])
algs_names.extend(["KNN", "SVC"])

In [7]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier

# scikit-learn boosting ensembles with default hyper-parameters.
AdaBoost, GradientBoost = AdaBoostClassifier(), GradientBoostingClassifier()

# Appends to the shared registry; running this cell twice without resetting
# `algs` / `algs_names` registers these models twice.
algs.extend([AdaBoost, GradientBoost])
algs_names.extend(["AdaBoost", "GradientBoost"])

In [8]:
# from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

# Third-party gradient-boosting classifiers with default hyper-parameters.
LGBM, CatBoost = LGBMClassifier(), CatBoostClassifier()

# Appends to the shared registry; running this cell twice without resetting
# `algs` / `algs_names` registers these models twice.
algs.extend([LGBM, CatBoost])
algs_names.extend(["LGBM", "CatBoost"])

In [None]:
from sklearn.neural_network import MLPClassifier

# Multi-layer perceptron with default hyper-parameters.
mlp = MLPClassifier()

# Appends to the shared registry; running this cell twice without resetting
# `algs` / `algs_names` registers the model twice.
algs.append(mlp)
algs_names.append("MLP")

In [18]:
import importlib
import sys

import pandas as pd

# The notebook directory must be on sys.path *before* the local module is
# imported for the entry to have any effect on that import. The membership
# check keeps sys.path from growing by one entry every time the cell is re-run.
if '.' not in sys.path:
    sys.path.append('.')

import get_results

# Pick up edits made to get_results.py since the kernel first imported it.
importlib.reload(get_results)

# Evaluate every registered model with every vectorizer on every dataset.
# `algs` / `algs_names`, `vectorizers` / `vectorizers_names` and
# `data_frame_paths` / `data_frame_names` are parallel lists built in the
# cells above; `news_validation_path` is the separate validation CSV.
results_unsorted = get_results.get_result_data_frames(
    algs,
    algs_names,
    vectorizers,
    vectorizers_names,
    data_frame_paths,
    data_frame_names,
    news_validation_path,
)

# Group rows by algorithm (A-Z) and, within each algorithm, list the
# highest-accuracy configuration first; renumber the rows afterwards.
df = (
    results_unsorted
    .sort_values(by=['algorithm', "accuracy"], ascending=[True, False])
    .reset_index(drop=True)
)
df

LGBM CountVectorizer pre_processed
['fake' 'fake' 'fake' 'fake' 'fake' 'fake' 'fake' 'fake' 'fake' 'fake'
 'fake' 'fake' 'fake' 'true']
LGBM TfidfVectorizer_normalized pre_processed
['fake' 'fake' 'fake' 'fake' 'fake' 'fake' 'true' 'fake' 'fake' 'fake'
 'fake' 'fake' 'fake' 'true']
LGBM TfidfVectorizer_idf pre_processed
['fake' 'fake' 'fake' 'fake' 'fake' 'fake' 'fake' 'fake' 'fake' 'fake'
 'fake' 'fake' 'fake' 'true']
LGBM CountVectorizer pre_normalized
['fake' 'fake' 'fake' 'true' 'fake' 'fake' 'fake' 'true' 'true' 'true'
 'true' 'true' 'true' 'true']
LGBM TfidfVectorizer_normalized pre_normalized
['fake' 'fake' 'fake' 'true' 'fake' 'fake' 'fake' 'true' 'true' 'true'
 'true' 'true' 'true' 'true']
LGBM TfidfVectorizer_idf pre_normalized
['fake' 'fake' 'fake' 'true' 'fake' 'fake' 'fake' 'true' 'true' 'true'
 'true' 'true' 'true' 'true']


Unnamed: 0,algorithm,dataset,vectorizer,accuracy,precision,recall,f1,validation_accuracy,train_time,predict_time
0,LGBM,pre_processed,CountVectorizer,0.9565,0.9645,0.9485,0.9564,0.5714,8.84,0.86
1,LGBM,pre_processed,TfidfVectorizer_normalized,0.9523,0.965,0.9393,0.952,0.5,16.44,0.29
2,LGBM,pre_processed,TfidfVectorizer_idf,0.9514,0.9658,0.9365,0.951,0.5714,15.17,0.28
3,LGBM,pre_normalized,TfidfVectorizer_idf,0.8926,0.8992,0.8859,0.8925,0.9286,5.34,0.17
4,LGBM,pre_normalized,CountVectorizer,0.8898,0.8884,0.8933,0.8908,0.9286,3.4,0.51
5,LGBM,pre_normalized,TfidfVectorizer_normalized,0.8861,0.8883,0.885,0.8866,0.9286,5.72,0.18


In [10]:
# Global ranking: every (algorithm, dataset, vectorizer) row ordered from the
# highest accuracy to the lowest, with a fresh 0..n-1 index.
df.sort_values("accuracy", ascending=False).reset_index(drop=True)

Unnamed: 0,algorithm,dataset,vectorizer,accuracy,precision,recall,f1,validation_accuracy,train_time,predict_time
0,SVC,pre_processed,TfidfVectorizer_normalized,0.9620,0.9564,0.9687,0.9625,0.5833,979.00,760.00
1,CatBoost,pre_processed,CountVectorizer,0.9597,0.9604,0.9595,0.9600,0.5000,429.98,35.29
2,SVC,pre_processed,TfidfVectorizer_idf,0.9588,0.9520,0.9669,0.9594,0.5833,1615.46,1151.44
3,SVC,pre_processed,CountVectorizer,0.9583,0.9569,0.9604,0.9587,0.5000,435.41,307.20
4,CatBoost,pre_processed,TfidfVectorizer_normalized,0.9569,0.9585,0.9558,0.9572,0.5000,710.60,36.26
...,...,...,...,...,...,...,...,...,...,...
73,Gaussain_NB,pre_processed,CountVectorizer,0.6181,0.9456,0.2557,0.4026,0.5000,8.89,3.46
74,Gaussain_NB,pre_processed,TfidfVectorizer_idf,0.6139,0.9470,0.2466,0.3912,0.5000,5.78,3.26
75,KNN,pre_normalized,CountVectorizer,0.6060,0.5667,0.9227,0.7021,0.5000,0.20,10.26
76,Multinomial_NB,pre_processed,TfidfVectorizer_idf,0.6023,0.9914,0.2116,0.3487,0.5000,1.36,0.46


In [11]:
# Uncomment to persist the results table to disk:
# df.to_csv("results/models_results.csv", index=False)