In [3]:
# Read pre_normalized data

import pandas as pd
import os

# Load the pre-normalized dataset from data/csvs/pre_normalized.csv
pre_normalized_path = os.path.join(
    os.path.join('data', 'csvs'),
    "pre_normalized.csv",
)

df = pd.read_csv(filepath_or_buffer=pre_normalized_path)


In [36]:
# Create a TfidfVectorizer restricted to the filtered vocab
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer


# Location of the filtered vocabulary CSV
filtered_vocab_path = os.path.join('data', 'csvs', "filtered_vocab.csv")

# Build the {word: index} mapping that is passed as the vectorizer's vocabulary
vocab_table = pd.read_csv(filtered_vocab_path)
filtered_vocab = vocab_table.set_index('word')['index'].to_dict()

# TF-IDF vectorizer with a fixed vocabulary; input is lowercased and accents are stripped
vectorizer = TfidfVectorizer(
    vocabulary=filtered_vocab,
    lowercase=True,
    strip_accents="ascii",
)

In [37]:
# Targets come straight from the 'label' column
y = df['label']

# Fit the vectorizer on the 'text' column and materialize the result as a dense array
X = (
    vectorizer
    .fit_transform(df['text'])
    .toarray()
)

In [38]:
# Train a LGBMClassifier on the TF-IDF vectorized data
from lightgbm import LGBMClassifier

# Gradient-boosted classifier with library-default hyperparameters
lgbm = LGBMClassifier()

# Train on the full feature matrix; kept as the last expression so the
# fitted estimator is still displayed as the cell output
lgbm.fit(X=X, y=y)


In [39]:
from sklearn.metrics import accuracy_score

# Load the news validation set (data/csvs/news_validation.csv)
custom_data_path = os.path.join('data', 'csvs', "news_validation.csv")
custom_df = pd.read_csv(custom_data_path)

# Ground-truth labels, and features produced by the already-fitted vectorizer
custom_y = custom_df['label']
custom_X = vectorizer.transform(custom_df['text']).toarray()

# Score the LGBM model on the validation set
custom_y_pred = lgbm.predict(custom_X)
custom_accuracy = accuracy_score(custom_y, custom_y_pred)
print(custom_accuracy)

0.8571428571428571


In [41]:
# Load the COVID dataset (data/csvs/covid.csv)
covid_data_path = os.path.join('data', 'csvs', "covid.csv")
covid_df = pd.read_csv(covid_data_path)

# Ground-truth labels, and features produced by the already-fitted vectorizer
covid_y = covid_df['label']
covid_X = vectorizer.transform(covid_df['text']).toarray()

# Score the LGBM model on the COVID data
covid_y_pred = lgbm.predict(covid_X)
covid_accuracy = accuracy_score(covid_y, covid_y_pred)
print(covid_accuracy)


0.6666666666666666


In [43]:
# Fit a MLPClassifier on the TF-IDF vectorized data
from sklearn.neural_network import MLPClassifier

# Create a MLPClassifier.
# random_state is fixed because MLP weight initialization and mini-batch
# shuffling are random; without a seed the accuracies printed below change
# from one run to the next.
mlp = MLPClassifier(max_iter=1000, random_state=42)

# Fit the classifier to the same training matrix used for the LGBM model
mlp.fit(X, y)

# Predict the labels of the custom (news validation) data.
# NOTE: this reuses the name custom_y_pred, replacing the LGBM predictions.
custom_y_pred = mlp.predict(custom_X)
print(accuracy_score(custom_y, custom_y_pred))

# Predict the labels of the covid data.
# NOTE: this reuses the name covid_y_pred, replacing the LGBM predictions.
covid_y_pred = mlp.predict(covid_X)
print(accuracy_score(covid_y, covid_y_pred))

0.8571428571428571
0.7


In [130]:
# Send HTTP request to the API endpoint https://nilc-fakenews.herokuapp.com/ajax/check_web/ with the field text set to the text of the article and the field model set to 'unigramas' use the cookie xr3HZGxMUGv5GL8CZ2e2ZhOZXew7iaHPzgdeEhmVDupCyHeh5gD6EaHyrhy2A6aZ with origin https://nilc-fakenews.herokuapp.com/

import requests

def pad_article(article, min_words=100):
    """Repeat ``article`` until it contains at least ``min_words`` words.

    Copies of the original text are joined with a blank line. A text that
    already has enough words is returned unchanged.

    Args:
        article: The text to pad. Words are counted with ``str.split()``.
        min_words: Minimum number of whitespace-separated words required in
            the result. Defaults to 100.

    Returns:
        The padded text. Empty or whitespace-only input is returned as-is,
        because repeating it can never reach the word threshold (the previous
        implementation looped forever in that case).
    """
    words_per_copy = len(article.split())
    if words_per_copy == 0:
        # Nothing to repeat: no number of copies would add a word.
        return article

    # Each copy contributes exactly words_per_copy words (the separator is
    # pure whitespace), so the number of copies is a ceiling division.
    copies = max(1, -(-min_words // words_per_copy))
    return "\n\n".join([article] * copies)

# Send every article of the local COVID text dataset to the NILC fake-news
# web classifier, once per remote model, and collect the verdicts in
# fake_check_predictions_df (columns: text, label, model, prediction).

# Maps the API's 'result' value onto the folder names used as labels
convert_label = {'REAL': 'true', 'FAKE': 'fake'}

# Shuffle the data
#fake_check_df = df.sample(frac=1).head(100)
fake_check_df = covid_df

# Remote model identifiers and the display names stored in the result frame
models = ['unigramas', 'pos']
models_name = ['Palavras do texto', 'Classes gramaticais']

# The CSRF token is sent both as a cookie and as a header, so it is defined
# once. It can be overridden through the NILC_CSRF_TOKEN environment variable
# instead of editing the notebook; the fallback is the value used originally.
csrf_token = os.environ.get(
    'NILC_CSRF_TOKEN',
    'xr3HZGxMUGv5GL8CZ2e2ZhOZXew7iaHPzgdeEhmVDupCyHeh5gD6EaHyrhy2A6aZ')

# NOTE(review): the request goes to http:// while Origin/Referer say https://
# -- kept as in the original; confirm which scheme the service expects.
api_url = 'http://nilc-fakenews.herokuapp.com/ajax/check_web/'
api_headers = {
    'Origin': 'https://nilc-fakenews.herokuapp.com/',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/111.0',
    'Referer': 'https://nilc-fakenews.herokuapp.com/',
    'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
    'Host': 'nilc-fakenews.herokuapp.com',
    'X-CSRFToken': csrf_token,
    'X-Requested-With': 'XMLHttpRequest'}

result_columns = ['text', 'label', 'model', 'prediction']
dataset_dir = os.path.join('data', 'text_files', 'covid_dataset')

# Rows are accumulated in a list and turned into a DataFrame once at the end;
# concatenating a one-row frame per request is quadratic in the row count.
prediction_rows = []

# for each file in the data/text_files/covid_dataset folder
for model, model_name in zip(models, models_name):
    for folder in ['fake', 'true']:
        folder_path = os.path.join(dataset_dir, folder)
        # sorted() makes the row order reproducible (os.listdir order is arbitrary)
        for file in sorted(os.listdir(folder_path)):
            with open(os.path.join(folder_path, file), 'r', encoding='utf-8') as f:
                article = pad_article(f.read())
            # The file is closed before the (slow) network call is made.
            label = folder
            response = requests.post(api_url,
                                     data={'text': article, 'model': model},
                                     cookies={'csrftoken': csrf_token},
                                     headers=api_headers,
                                     timeout=60)  # never hang forever on a dead endpoint
            # Fail loudly on HTTP errors instead of on a confusing JSON/KeyError
            response.raise_for_status()

            pred = convert_label[response.json()['result']]
            prediction_rows.append([article, label, model_name, pred])

fake_check_predictions_df = pd.DataFrame(prediction_rows, columns=result_columns)


In [131]:
# Display the collected NILC predictions (one row per article/model pair)
fake_check_predictions_df

Unnamed: 0,text,label,model,prediction
0,Olá eu sou médico sanitarista e a minha especi...,fake,Palavras do texto,fake
1,"Isso aí, galera do bem. Já no hospital aqui, s...",fake,Palavras do texto,true
2,ESLOVÊNIA ESCANDALIZADA - Ontem estourou um gr...,fake,Palavras do texto,true
3,Presidente da OMS não recomenda Carnaval em 20...,fake,Palavras do texto,true
4,Uma comparação de relatórios oficiais do gover...,fake,Palavras do texto,fake
5,Bolsas de sangue de não vacinado versus sangue...,fake,Palavras do texto,fake
6,os testes PCR de Covid-19 são estudos da Unive...,fake,Palavras do texto,true
7,Diretor geral da OMS Tedros Adhanom diz que nã...,fake,Palavras do texto,true
8,"Tedros Adhanom Ghebreyesus, o atual diretor da...",fake,Palavras do texto,fake
9,"Oi, pessoal. Bom dia. Meu nome é Mônica Travas...",fake,Palavras do texto,fake


In [132]:
# Report, for each NILC model, how often its prediction matches the label
for model in models_name:
    # Select this model's rows once and reuse them for both columns
    model_rows = fake_check_predictions_df[fake_check_predictions_df['model'] == model]
    model_accuracy = accuracy_score(model_rows['label'], model_rows['prediction'])
    print(f'[{model}] Accuracy of the model: ', model_accuracy)


[Palavras do texto] Accuracy of the model:  0.7666666666666667
[Classes gramaticais] Accuracy of the model:  0.6333333333333333
