# Fake news detekcija

## Učitavanje, analiza  i obrada podataka

In [256]:
import pandas as pd

In [257]:
data = pd.read_csv("train.csv")
data.head(10)

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1
5,5,Jackie Mason: Hollywood Would Love Trump if He...,Daniel Nussbaum,"In these trying times, Jackie Mason is the Voi...",0
6,6,Life: Life Of Luxury: Elton John’s 6 Favorite ...,,Ever wonder how Britain’s most iconic pop pian...,1
7,7,Benoît Hamon Wins French Socialist Party’s Pre...,Alissa J. Rubin,"PARIS — France chose an idealistic, traditi...",0
8,8,Excerpts From a Draft Script for Donald Trump’...,,Donald J. Trump is scheduled to make a highly ...,0
9,9,"A Back-Channel Plan for Ukraine and Russia, Co...",Megan Twohey and Scott Shane,A week before Michael T. Flynn resigned as nat...,0


In [258]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20800 entries, 0 to 20799
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      20800 non-null  int64 
 1   title   20242 non-null  object
 2   author  18843 non-null  object
 3   text    20761 non-null  object
 4   label   20800 non-null  int64 
dtypes: int64(2), object(3)
memory usage: 812.6+ KB


Broj novinskih članaka u ovom podatkovnom skupu: 

In [259]:
data.shape[0]

20800

U nekim stupcima nedostaju neke vrijednosti. Izvođenjem iduće naredbe se može vidjeti koliko vrijednosti nedostaje za pojedinu značajku.

In [260]:
data.isnull().sum()

id           0
title      558
author    1957
text        39
label        0
dtype: int64

Za predviđanje lažnih vijesti koristit će se naslov i sadržaj novinskog članka.



In [261]:
data = data.drop(["id", "author"], axis=1)

Nakon micanja pojedinih značajki, za daljnu obradu podatkovnog skupa, NaN vrijednosti su zamijenjene znakovnim nizom duljine nula.

In [262]:
data.fillna("")

Unnamed: 0,title,text,label
0,House Dem Aide: We Didn’t Even See Comey’s Let...,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Ever get the feeling your life circles the rou...,0
2,Why the Truth Might Get You Fired,"Why the Truth Might Get You Fired October 29, ...",1
3,15 Civilians Killed In Single US Airstrike Hav...,Videos 15 Civilians Killed In Single US Airstr...,1
4,Iranian woman jailed for fictional unpublished...,Print \nAn Iranian woman has been sentenced to...,1
...,...,...,...
20795,Rapper T.I.: Trump a ’Poster Child For White S...,Rapper T. I. unloaded on black celebrities who...,0
20796,"N.F.L. Playoffs: Schedule, Matchups and Odds -...",When the Green Bay Packers lost to the Washing...,0
20797,Macy’s Is Said to Receive Takeover Approach by...,The Macy’s of today grew from the union of sev...,0
20798,"NATO, Russia To Hold Parallel Exercises In Bal...","NATO, Russia To Hold Parallel Exercises In Bal...",1


In [263]:
data["title_and_text"] = data["title"] + " " + data["text"]
D = data["title_and_text"].tolist()
label = data["label"].tolist()

Uvidom u tekst pojedinih članaka vidljivo je da je potrebno napraviti neke izmjene na njemu prije nego što se isti preda modelu za treniranje i validaciju. 
Neke od tih promjena su pretvaranje velikih slova u mala, micanje interpunkcijskih znakova, micanje drugih posebnih znakova, itd.

In [264]:
import re
import string

In [265]:
def clean_text(text: str) -> str:
    
    # mala slova
    text = str(text).lower()
    
    # micanje brojeva
    text = re.sub('\w*\d\w*', '', text)    
        
    # micanje #...
    text = re.sub("#\S+", " ", text)
    
    # micanje @...
    text = re.sub("@\S+", " ", text)
    
    # micanje teksta oblika [...]
    text = re.sub('\[.*?\]', '', text)
    
    # micanje npr. HTML tagova
    text = re.sub('<.*?>+', '', text)
     
    # micanje URL-a
    text = re.sub('https*\S+', '', text)
    
    # micanje \n
    text = re.sub('\n', '', text)
    
    # micanje interpunkcijskih znakova
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    
    # micanje vise praznina
    text = re.sub('\s{2,}', " ", text)
    
    # micanje npr. emotikona
    text = text.encode('ascii', 'ignore').decode()
    
    # micanje svega sto nije rijec
    text = re.sub("\\W"," ",text)
   
    text = text.strip()
    
    return text

In [266]:
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [267]:
def preprocess(text: str) -> str:
    
    stop = stopwords.words('english')
    tokens = word_tokenize(text)
    stemmer = SnowballStemmer('english')
    words = [stemmer.stem(w) for w in tokens if w not in stop]
 
    return " ".join(words)

In [268]:
corpus = [preprocess(clean_text(w)) for w in D]

## Pretvorba podataka

In [270]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [271]:
tf_idf_vec = TfidfVectorizer(max_features = 2000, ngram_range=(1, 2))
doc_vec = tf_idf_vec.fit_transform(corpus).toarray() 

## Unakrsna provjera, treniranje i validacija modela

In [272]:
from sklearn.model_selection import train_test_split,KFold, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.metrics import fbeta_score, make_scorer
from numpy import mean, std
import numpy as np

In [273]:
cv_inner = KFold(n_splits=3, shuffle=True, random_state=42)
cv_outer = KFold(n_splits=5, shuffle=True, random_state=42)
ftwo_scorer = make_scorer(fbeta_score, beta=2)
X = doc_vec
y = np.array(label)

In [274]:
rf = RandomForestClassifier(random_state=42)

prev_f2_rf = 0
best_rf: RandomForestClassifier
outer_results_rf = list()

for train_index, test_index in cv_outer.split(X):
    X_train, X_test = X[train_index, :], X[test_index, :]
    y_train, y_test = y[train_index], y[test_index]
    search_space = dict()
    search_space['n_estimators'] = range(10, 16)
    search_space['max_depth'] = range(10, 16)
    search = GridSearchCV(rf, search_space, scoring=ftwo_scorer, cv=cv_inner, refit=True, n_jobs=-1)
    result = search.fit(X_train, y_train)
    best = result.best_estimator_
    pred = best.predict(X_test)
    f2 = fbeta_score(y_test, pred, beta=2)
    if prev_f2_rf < f2:
        prev_f2_rf = f2
        best_rf = best
    outer_results_rf.append(f2)
    
print(f"Random forest F2 = {mean(outer_results_rf)}, std = {std(outer_results_rf)}")

Random forest F2 = 0.9454151352502104, std = 0.00879809972801967


In [275]:
svm = LinearSVC(random_state=42)

prev_f2_svm = 0
best_rf: SVC
outer_results_svm = list()

for train_index, test_index in cv_outer.split(X):
    X_train, X_test = X[train_index, :], X[test_index, :]
    y_train, y_test = y[train_index], y[test_index]
    search_space = dict()
    search_space['C'] = range(10, 21)
    search = GridSearchCV(svm, search_space, scoring=ftwo_scorer, cv=cv_inner, refit=True, n_jobs=-1)
    result = search.fit(X_train, y_train)
    best = result.best_estimator_
    pred = best.predict(X_test)
    f2 = fbeta_score(y_test, pred, beta=2)
    if prev_f2_svm < f2:
        prev_f2_svm = f2
        best_svm = best
    outer_results_svm.append(f2)
    
print(f"Linear SVM F2 = {mean(outer_results_svm)}, std = {std(outer_results_svm)}")

Linear SVM F2 = 0.9433570988644391, std = 0.002589976986445193


In [276]:
lr = LogisticRegression(solver='lbfgs', max_iter=250)

prev_f2_lr = 0
best_lr: LogisticRegression
outer_results_lr = list()

for train_index, test_index in cv_outer.split(X):
    X_train, X_test = X[train_index, :], X[test_index, :]
    y_train, y_test = y[train_index], y[test_index]
    search_space = dict()
    search_space['C'] = range(10, 21)
    search = GridSearchCV(lr, search_space, scoring=ftwo_scorer, cv=cv_inner, refit=True, n_jobs=-1)
    result = search.fit(X_train, y_train)
    best = result.best_estimator_
    pred = best.predict(X_test)
    f2 = fbeta_score(y_test, pred, beta=2)
    if prev_f2_lr < f2:
        prev_f2_lr = f2
        best_lr = best
    outer_results_lr.append(f2)
    
print(f"Logistic regression F2 = {mean(outer_results_lr)}, std = {std(outer_results_lr)}")

Logistic regression F2 = 0.9580345074766179, std = 0.003172438194031856


In [277]:
mnb = MultinomialNB()

prev_f2_mnb= 0
best_mnb: MultinomialNB
outer_results_mnb = list()

for train_index, test_index in cv_outer.split(X):
    X_train, X_test = X[train_index, :], X[test_index, :]
    y_train, y_test = y[train_index], y[test_index]
    search_space = dict()
    search_space['alpha'] = range(10, 21)
    search = GridSearchCV(mnb, search_space, scoring=ftwo_scorer, cv=cv_inner, refit=True, n_jobs=-1)
    result = search.fit(X_train, y_train)
    best = result.best_estimator_
    pred = best.predict(X_test)
    f2 = fbeta_score(y_test, pred, beta=2)
    if prev_f2_mnb < f2:
        prev_f2_mnb = f2
        best_mnb = best
    outer_results_mnb.append(f2)
    
print(f"Multinomial Naive Bayes F2 = {mean(outer_results_mnb)}, std = {std(outer_results_mnb)}")

Multinomial Naive Bayes F2 = 0.8384531365498269, std = 0.028294127933092748


In [278]:
pac = PassiveAggressiveClassifier()

prev_f2_pac = 0
best_pac: PassiveAggressiveClassifier
outer_results_pac = list()

for train_index, test_index in cv_outer.split(X):
    X_train, X_test = X[train_index, :], X[test_index, :]
    y_train, y_test = y[train_index], y[test_index]
    search_space = dict()
    search_space['C'] = range(10, 21)
    search = GridSearchCV(pac, search_space, scoring=ftwo_scorer, cv=cv_inner, refit=True, n_jobs=-1)
    result = search.fit(X_train, y_train)
    best = result.best_estimator_
    pred = best.predict(X_test)
    f2 = fbeta_score(y_test, pred, beta=2)
    if prev_f2_pac < f2:
        prev_f2_pac = f2
        best_pac = best
    outer_results_pac.append(f2)
    
print(f"Passive Aggressive Classifier F2 = {mean(outer_results_pac)}, std = {std(outer_results_pac)}")

Passive Aggressive Classifier F2 = 0.9286890796010671, std = 0.0037521735877221614


## Predikcija 

In [279]:
test_data = pd.read_csv("test.csv")
id = test_data["id"].tolist()
test_data.fillna("")
test_data = test_data.drop(["id", "author"], axis=1)
test_data["title_and_text"] = test_data["title"] + " " + test_data["text"]
D_test_data = test_data["title_and_text"].tolist()

In [280]:
test_corpus = [preprocess(clean_text(w)) for w in D_test_data]

In [281]:
tf_idf_vec_test = TfidfVectorizer(max_features = 2000, ngram_range=(1, 2))
doc_vec_test = tf_idf_vec_test.fit_transform(test_corpus).toarray() 

In [289]:
models = [best_rf, best_svm, best_lr, best_mnb, best_pac]
predictions = ['rf_prediction', 'svm_prediction', 'lr_prediction', 'mnb_prediction', 'pac_prediction']

In [291]:
for model, name in zip(models, predictions):
    pd.DataFrame(list(zip(id, model.predict(doc_vec_test))), 
                 columns =['id', 'label']).to_csv(name + '.csv', index=False)