# Fake news

## Parameters

In [1]:
# Directory that holds the input CSV; the pickled tokenizer and model are also
# written to and read from it. An empty string makes every path relative to the
# current working directory.
BASE_DIR = ''
# Name of the CSV file loaded by the import cell.
FILE_NAME = 'fake_news.csv'

## Import

In [2]:
import os
import pandas as pd
# Read the CSV (first column becomes the index) and discard exact duplicate rows.
df = (
    pd.read_csv(os.path.join(BASE_DIR, FILE_NAME), index_col=0)
    .drop_duplicates()
)
# Show one random row as a quick sanity check.
df.sample()

Unnamed: 0_level_0,title,author,text,label
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
5370,"Politicians, NATO Officials Furious as Spain P...",,"Politicians, NATO Officials Furious as Spain P...",1


In [3]:
# Discard the author column, then drop any row that still has a missing value.
df = (
    df
    .drop(columns='author')
    .dropna()
)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 20134 entries, 0 to 20799
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   title   20134 non-null  object
 1   text    20134 non-null  object
 2   label   20134 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 629.2+ KB


In [4]:
# Number of whitespace-separated words in each title.
# The summary below shows that the title length has outliers.
df['len'] = df['title'].str.split().map(len)
df['len'].describe(percentiles=[0.0, 0.2, 0.4, 0.6, 0.8])

count    20134.000000
mean        12.418595
std          4.093237
min          1.000000
0%           1.000000
20%          9.000000
40%         11.000000
50%         13.000000
60%         14.000000
80%         16.000000
max         72.000000
Name: len, dtype: float64

In [5]:
import numpy as np

# Trim title-length outliers: keep only rows strictly between the 2nd and 98th
# percentiles of the word count (rows equal to either cutoff are dropped too).
len_low, len_high = np.percentile(df['len'], [2, 98])
keep = (df['len'] > len_low) & (df['len'] < len_high)
df = df[keep].copy()
df['len'].describe([i/10 for i in range(0,10,2)])

count    19122.000000
mean        12.301224
std          3.363644
min          5.000000
0%           5.000000
20%          9.000000
40%         12.000000
50%         13.000000
60%         14.000000
80%         15.000000
max         19.000000
Name: len, dtype: float64

## BERT

In [6]:
import transformers as ppb

#Tutorial for BERT is here:
#https://jalammar.github.io/a-visual-guide-to-using-bert-for-the-first-time/
#Only the tokenizer is loaded here: the model-class and model-loading lines are
#left commented out, so the cells shown use token ids and never BERT embeddings.
#model_class = ppb.DistilBertModel
tokenizer_class = ppb.DistilBertTokenizer
pretrained_weights = 'distilbert-base-uncased'

#Want BERT instead of distilBERT? Uncomment the following line:
#model_class, tokenizer_class, pretrained_weights = ppb.BertModel, ppb.BertTokenizer, 'bert-base-uncased'

#Load the pretrained tokenizer (the matching model load stays disabled below)
tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
#model = model_class.from_pretrained(pretrained_weights)

In [7]:
# Cap every title at its first 512 whitespace-separated words (chosen because of
# BERT's maximum sequence length), then encode it to token ids with the
# special tokens included.
tokenized = df['title'].map(
    lambda title: tokenizer.encode(' '.join(title.split()[:512]),
                                   add_special_tokens=True)
)
tokenized[:4]

id
0    [101, 2160, 17183, 14895, 1024, 2057, 2134, 15...
1    [101, 13259, 1024, 18520, 7207, 1010, 2502, 24...
2    [101, 2339, 1996, 3606, 2453, 2131, 2017, 5045...
3    [101, 2321, 9272, 2730, 1999, 2309, 2149, 1436...
Name: title, dtype: object

In [8]:
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences

#Every sentence should have the same length
max_seq_len = max(map(len,tokenized))
# Pad all rows in a single pad_sequences call (trailing zeros), then insert a
# singleton middle axis so the array keeps its (n_rows, 1, max_seq_len) layout,
# which the train/test split cell unpacks with x[0].
padded = pad_sequences(list(tokenized),maxlen=max_seq_len,padding='post')[:, np.newaxis, :]
# .iloc[0] reads the first remaining row by position; tokenized[0] would look up
# the index label 0 and raise KeyError if that row had been filtered out.
print(len(tokenized.iloc[0]),padded.shape)

INFO:tensorflow:Enabling eager execution
INFO:tensorflow:Enabling v2 tensorshape
INFO:tensorflow:Enabling resource variables
INFO:tensorflow:Enabling tensor equality
INFO:tensorflow:Enabling control flow v2
26 (19122, 1, 109)


## Model

In [9]:
from sklearn.model_selection import train_test_split

# Hold out 23% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    padded,
    df['label'].values,
    train_size=0.77,
    random_state=22,
)
print(X_test.shape)
# Drop the singleton middle axis: each sample becomes one 1-D row of token ids.
X_train = list(X_train[:, 0])
X_test = list(X_test[:, 0])
print(np.array(X_test).shape)

(4399, 1, 109)
(4399, 109)


In [10]:
from sklearn.pipeline import Pipeline
from xgboost.sklearn import XGBClassifier
from sklearn.preprocessing import MinMaxScaler

# Two-stage pipeline: min-max scale each padded token-id column, then classify
# with an XGBoost model left at its default hyper-parameters.
mm_x, xgb = MinMaxScaler(), XGBClassifier()

pipeline_steps = [
    ('scaler', mm_x),
    ('model', xgb),
]
model_xgb = Pipeline(steps=pipeline_steps)

## Training

In [11]:
# Fit the scaler + XGBoost pipeline on the training split. As the last
# expression of the cell, the fitted Pipeline is also displayed.
model_xgb.fit(X_train,y_train)





Pipeline(steps=[('scaler', MinMaxScaler()),
                ('model',
                 XGBClassifier(base_score=0.5, booster='gbtree',
                               colsample_bylevel=1, colsample_bynode=1,
                               colsample_bytree=1, gamma=0, gpu_id=-1,
                               importance_type='gain',
                               interaction_constraints='',
                               learning_rate=0.300000012, max_delta_step=0,
                               max_depth=6, min_child_weight=1, missing=nan,
                               monotone_constraints='()', n_estimators=100,
                               n_jobs=8, num_parallel_tree=1, random_state=0,
                               reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
                               subsample=1, tree_method='exact',
                               validate_parameters=1, verbosity=None))])

In [12]:
# Accuracy of the fitted pipeline on the training and held-out splits.
train_score = model_xgb.score(X_train, y_train)
test_score = model_xgb.score(X_test, y_test)
# Two positional arguments on purpose: print's default separator puts the
# single space after the newline, exactly as in the original output.
print(f"Train score: {train_score:.1%}\n",
      f"Test score: {test_score:.1%}")

Train score: 97.8%
 Test score: 91.0%


## Save the model

In [13]:
import pickle

# Persist the tokenizer first, then the fitted model pipeline, each as its own
# pickle file inside BASE_DIR.
for pickle_name, artifact in (('fake_tokenizer.pickle', tokenizer),
                              ('fake_model.pickle', model_xgb)):
    with open(os.path.join(BASE_DIR, pickle_name), 'wb') as f:
        pickle.dump(artifact, f)

## Predict

In [14]:
import pickle

# Reload the tokenizer and the model pipeline from the pickle files in BASE_DIR.
# NOTE: pickle.load can execute arbitrary code, so only load pickle files that
# you created yourself or otherwise trust.
with open(os.path.join(BASE_DIR,'fake_tokenizer.pickle'), 'rb') as f:
    tokenizer = pickle.load(f)
    
# The pipeline is bound to `model`, a different name from the training-time `model_xgb`.
with open(os.path.join(BASE_DIR,'fake_model.pickle'), 'rb') as f:
    model = pickle.load(f)

In [15]:
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences 

def fake_predict(text,fake_model,fitted_tokenizer):
    """Print and return the predicted probability that a headline is fake.

    Args:
        text: Raw headline string.
        fake_model: Fitted estimator exposing ``n_features_in_`` and
            ``predict_proba`` (in this notebook, the unpickled pipeline).
        fitted_tokenizer: Tokenizer exposing
            ``encode(text, add_special_tokens=True)``.

    Returns:
        The second column of ``predict_proba`` for ``text``, i.e. the
        probability of label 1, which this notebook treats as "fake".
    """
    # Same preprocessing as at training time: at most the first 512 words.
    truncated = ' '.join(text.split()[:512])
    # Use the tokenizer and model that were passed in, not notebook globals, so
    # the function works with whatever artifacts the caller supplies.
    token_ids = fitted_tokenizer.encode(truncated,add_special_tokens=True)
    # pad_sequences already returns one row per input sequence, so the single
    # headline becomes a (1, n_features) batch padded with trailing zeros to the
    # feature width the model was fitted on.
    features = pad_sequences([token_ids],maxlen=fake_model.n_features_in_,padding='post')
    proba = fake_model.predict_proba(features)[0][1]
    print(f"{'{:.1%}'.format(proba)} prob this new is fake.\n")
    return proba

In [16]:
import random

# Spot-check the reloaded model on randomly chosen titles from df. df is the
# full filtered dataset, so a drawn row may come from the training split.
n = len(df)
aux = []
for _ in range(11):
    # randrange(n) yields 0..n-1. randint(0, n) includes n itself, which is one
    # past the last valid position and makes df.iloc raise IndexError.
    i = random.randrange(n)
    row = df.iloc[i]
    text = row['title']
    label = row['label']==1
    print(('FAKE: 'if label else 'REAL: ')+' '.join(text.split()[:10])+'...')
    aux.append((text,label,fake_predict(text,model,tokenizer)))

# One row per sampled title: the text, its true label and the predicted probability.
pred = pd.DataFrame(aux,columns=['title','fake','prediction'])

FAKE: A very important question about the Scunthorpe hospital computer hack...
93.2% prob this new is fake.

FAKE: AIG Quadruples Limits for Terrorism Insurance to $1 Billion...
88.0% prob this new is fake.

FAKE: Here’s What Happened When a Hillary Supporting MIT Professor Decided...
98.7% prob this new is fake.

REAL: Can Raf Simons Reinvent Calvin Klein? - The New York...
5.9% prob this new is fake.

REAL: Falluja Restaurant Is Reborn in Baghdad, Offering Nostalgia With Its...
1.1% prob this new is fake.

REAL: HSBC Bank Executives Face Charges in $3.5 Billion Currency Case...
0.6% prob this new is fake.

REAL: NYT: ’Bill Maher and Milo Yiannopoulos Find Common Ground’ -...
0.3% prob this new is fake.

REAL: Hillary Clinton Surprises by Attending Pride Parade in New York...
2.5% prob this new is fake.

REAL: Parisian Women Face Constant Harassment By Migrants...
98.7% prob this new is fake.

FAKE: Why Time Magazine’s Joe Klein Is So Wrong About Hillary...
90.8% prob this new is fake

In [17]:
pred

Unnamed: 0,title,fake,prediction
0,A very important question about the Scunthorpe...,True,0.932015
1,AIG Quadruples Limits for Terrorism Insurance ...,True,0.880457
2,Here’s What Happened When a Hillary Supporting...,True,0.986988
3,Can Raf Simons Reinvent Calvin Klein? - The Ne...,False,0.059162
4,"Falluja Restaurant Is Reborn in Baghdad, Offer...",False,0.01149
5,HSBC Bank Executives Face Charges in $3.5 Bill...,False,0.006097
6,NYT: ’Bill Maher and Milo Yiannopoulos Find Co...,False,0.002501
7,Hillary Clinton Surprises by Attending Pride P...,False,0.025457
8,Parisian Women Face Constant Harassment By Mig...,False,0.986693
9,Why Time Magazine’s Joe Klein Is So Wrong Abou...,True,0.907936
