In [4]:
from preprocess import stopwords

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [5]:
import os

pre_file_path = os.path.join('pre_normalized', 'news.csv')
pre_df = pd.read_csv(pre_file_path)
pre_df = pre_df[["text","label"]]
pre_df.head()

Unnamed: 0,text,label
0,autor bestseller cita 5 expressoes indicam int...,fake
1,juiz df confirma indicios expresidente petista...,fake
2,senhora 60 anos corre atras eduardo cunha aero...,fake
3,russia ameaca derrubar avioes americanos siria...,fake
4,governo podera demitir 18 mil servidores banco...,fake


In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer

bag_of_words = TfidfVectorizer(strip_accents="ascii", lowercase=True, stop_words = stopwords)

In [7]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(pre_df["text"], pre_df["label"], shuffle= True, test_size=0.2)

bag_of_words.fit(X_train)

X_train = bag_of_words.transform(X_train).toarray()
X_test = bag_of_words.transform(X_test).toarray()

In [8]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import ExtraTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier

DTC = DecisionTreeClassifier()
ETC = ExtraTreeClassifier()
RFC = RandomForestClassifier()
RETC = ExtraTreesClassifier()


algs_vector = [DTC, ETC, RFC, RETC]
algs_names = ["DTC", "ETC", "RFC", "RETC"]

In [9]:
from collections import defaultdict
scores = defaultdict(list)

from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

for algo in algs_vector:
    algo.fit(X_train, y_train)
    y_predict = algo.predict(X_test)

    scores["accuracy"].append(accuracy_score(y_test, y_predict))
    scores["precision"].append(precision_score(y_test, y_predict, pos_label = 'fake'))
    scores["recall"].append(recall_score(y_test, y_predict, pos_label = 'fake'))
    scores["f1_score"].append(f1_score(y_test, y_predict, pos_label = 'fake'))

results_df = pd.DataFrame(scores, index = algs_names)
results_df

Unnamed: 0,accuracy,precision,recall,f1_score
DTC,0.752778,0.759831,0.745179,0.752434
ETC,0.677083,0.676113,0.690083,0.683027
RFC,0.879861,0.871141,0.893939,0.882393
RETC,0.892361,0.875164,0.917355,0.895763


In [11]:
# results_df.index.name = "Model"
# results_df.to_csv("results/Forests.csv",index=True)

In [13]:
from sklearn.model_selection import GridSearchCV

params = {'bootstrap': [True, False],
        'max_depth': [5, 10, 20, 40, 70, 80, 90, 100, None],
        'min_samples_leaf': [1, 2, 4],
        'min_samples_split': [2, 5, 10],
        'n_estimators': [200, 500, 1000, 2000]}

grid = GridSearchCV(RandomForestClassifier(), param_grid=params, n_jobs=4, cv=5, verbose=2)
grid.fit(X_train,y_train)
grid.best_params_

KeyboardInterrupt: 

In [None]:
# best_acc_model_param = grid.best_params_
best_param = {'alpha': 0.01, 'binarize': 0.0, 'class_prior': None, 'fit_prior': True}
best_model = RandomForestClassifier(**best_param)

#{'alpha': 0.01, 'binarize': 0.0, 'class_prior': None, 'fit_prior': True}

In [None]:
best_model.fit(X_train, y_train)
y_predict = best_model.predict(X_test)

acc = accuracy_score(y_test, y_predict)
prec = precision_score(y_test, y_predict, pos_label = 'fake')
recall = recall_score(y_test, y_predict, pos_label = 'fake')
f1 = f1_score(y_test, y_predict, pos_label = 'fake')

results_df.loc["Tuned RFC"] = [acc, prec, recall, f1]
results_df