In [1]:
import pandas as pd
import numpy as np
import string
import re
import plotly.graph_objects as go

from plotly.subplots import make_subplots
from collections import defaultdict, Counter
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score, log_loss

# ML
from catboost import CatBoostClassifier

In [2]:
# Single place to repoint the notebook at the dataset; both CSVs are read
# from this folder.
DATA_DIR = 'D:/Data/Fake and real news'

data_true = pd.read_csv(f'{DATA_DIR}/True.csv')
data_fake = pd.read_csv(f'{DATA_DIR}/Fake.csv')

In [3]:
# Preview the frame loaded from True.csv.
data_true

Unnamed: 0,title,text,subject,date
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017"
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017"
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017"
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017"
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017"
...,...,...,...,...
21412,'Fully committed' NATO backs new U.S. approach...,BRUSSELS (Reuters) - NATO allies on Tuesday we...,worldnews,"August 22, 2017"
21413,LexisNexis withdrew two products from Chinese ...,"LONDON (Reuters) - LexisNexis, a provider of l...",worldnews,"August 22, 2017"
21414,Minsk cultural hub becomes haven from authorities,MINSK (Reuters) - In the shadow of disused Sov...,worldnews,"August 22, 2017"
21415,Vatican upbeat on possibility of Pope Francis ...,MOSCOW (Reuters) - Vatican Secretary of State ...,worldnews,"August 22, 2017"


In [4]:
# Preview the frame loaded from Fake.csv.
data_fake

Unnamed: 0,title,text,subject,date
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017"
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017"
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017"
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017"
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017"
...,...,...,...,...
23476,McPain: John McCain Furious That Iran Treated ...,21st Century Wire says As 21WIRE reported earl...,Middle-east,"January 16, 2016"
23477,JUSTICE? Yahoo Settles E-mail Privacy Class-ac...,21st Century Wire says It s a familiar theme. ...,Middle-east,"January 16, 2016"
23478,Sunnistan: US and Allied ‘Safe Zone’ Plan to T...,Patrick Henningsen 21st Century WireRemember ...,Middle-east,"January 15, 2016"
23479,How to Blow $700 Million: Al Jazeera America F...,21st Century Wire says Al Jazeera America will...,Middle-east,"January 14, 2016"


In [5]:
# Add labels: 1.0 for rows from True.csv, 0.0 for rows from Fake.csv
data_true['target'] = np.ones(data_true.shape[0])
data_fake['target'] = np.zeros(data_fake.shape[0])

# Create one big dataframe
dataframes = [data_true, data_fake]
data = pd.concat(dataframes, ignore_index=True)

# Shuffle new dataframe. The seed is fixed because the positional train/test
# split further down depends on this row order; without it every kernel
# restart yields a different split and different scores.
data = data.sample(frac=1, random_state=42).reset_index(drop=True)

# Pull out only text and target features. The explicit copy makes `data` an
# independent frame, so the later `data['text'] = ...` assignments do not
# trigger SettingWithCopyWarning.
data = data[['text', 'target']].copy()

In [6]:
# Show the combined, shuffled frame (text and target columns only).
data

Unnamed: 0,text,target
0,"WINSTON-SALEM, N.C. (Reuters) - North Carolina...",1.0
1,WASHINGTON (Reuters) - U.S. President Barack O...,1.0
2,"WASHINGTON (Reuters) - Donald Trump, leading t...",1.0
3,BERLIN (Reuters) - German police on Tuesday de...,1.0
4,WASHINGTON (Reuters) - President Barack Obama ...,1.0
...,...,...
44893,"In September 2015, the Sterling Heights, MI Ci...",0.0
44894,"Just like Obamacare was pushed through, the Ir...",0.0
44895,A television reporter has been fired after she...,0.0
44896,WASHINGTON (Reuters) - The U.S. Senate Judicia...,1.0


<h1>Exploratory Data Analysis</h1>

In [7]:
# Class balance: number of rows per target value, drawn as a pie chart.
to_plot = data.value_counts('target')

fig = go.Figure(
    data=[
        go.Pie(
            labels=to_plot.index.map({0: 'Fake', 1: 'True'}),
            values=to_plot.values,
            textinfo='label+percent',
        )
    ]
)

fig.update_layout(template='plotly_dark')

We are lucky: our dataset is fairly balanced, which makes the modelling work easier. Now we will check a couple of common attributes of text data, such as the number of characters and the number of words in each news article.

In [8]:
# Character count per article, one histogram row per class.
news_len_true = data.loc[data['target'] == 1, 'text'].str.len()
news_len_false = data.loc[data['target'] == 0, 'text'].str.len()

fig = make_subplots(rows=2, cols=1, subplot_titles=('True news', 'Fake news'))

panels = [('True news', news_len_true), ('Fake news', news_len_false)]
for row, (name, lengths) in enumerate(panels, start=1):
    fig.add_trace(go.Histogram(x=lengths, name=name, nbinsx=500), row=row, col=1)

fig.update_layout(template='plotly_dark', title_text='Number of characters in news')

In [9]:
fig = make_subplots(rows=2, cols=1, subplot_titles=('True news', 'Fake news'))

# Whitespace-split every article and count the resulting tokens.
# `.str.len()` measures the length of each token list directly.
word_len_true = data[data['target']==1]['text'].str.split().str.len()

fig.add_trace(
    go.Histogram(x=word_len_true, name='True news'),
    row=1,
    col=1
)

word_len_false = data[data['target']==0]['text'].str.split().str.len()

fig.add_trace(
    go.Histogram(x=word_len_false, name='Fake news'),
    row=2,
    col=1
)

# Title previously read 'Number of words in words'.
fig.update_layout(
    template='plotly_dark',
    title_text='Number of words in news'
)

<h2>Common stopwords</h2>

In [10]:
def words_list(target):
    """Return every whitespace-separated token from the rows of the global
    `data` frame whose 'target' column equals `target`, as one flat list in
    row order."""
    token_lists = data[data['target']==target]['text'].str.split()
    return [token for tokens in token_lists for token in tokens]

In [11]:
words_list_true = words_list(1)

# Keep the stopwords in a set: the membership test below runs once per token,
# and a list would be scanned linearly every time.
stop = set(stopwords.words('english'))

# Count only the tokens that are stopwords (exact, case-sensitive match).
dic = Counter(word for word in words_list_true if word in stop)

# Ten most frequent stopwords as (word, count) pairs, highest count first.
top = sorted(dic.items(), key=lambda x: x[1], reverse=True)[:10]

In [12]:
# Unpack the (word, count) pairs into parallel sequences for the bar chart.
x, y = zip(*top)

fig = go.Figure(data=[go.Bar(x=x, y=y)])

fig.update_layout(template='plotly_dark', title_text='Common stopwords in true news')

In [13]:
# Tokens of the fake-news rows (target == 0). Previously stored under the
# misleading name `words_list_true`.
words_list_fake = words_list(0)

# Keep the stopwords in a set: the membership test below runs once per token,
# and a list would be scanned linearly every time.
stop = set(stopwords.words('english'))

# Count only the tokens that are stopwords (exact, case-sensitive match).
dic = Counter(word for word in words_list_fake if word in stop)

# Ten most frequent stopwords as (word, count) pairs, highest count first.
top = sorted(dic.items(), key=lambda x: x[1], reverse=True)[:10]

In [14]:
# Unpack the (word, count) pairs into parallel sequences for the bar chart.
x, y = zip(*top)

fig = go.Figure(data=[go.Bar(x=x, y=y)])

fig.update_layout(template='plotly_dark', title_text='Common stopwords in fake news')

<h2>Punctuations</h2>

In [15]:
# Count punctuation per character instead of per whitespace-separated token.
# Testing a whole token with `token in string.punctuation` is a substring
# check, so it only matched free-standing symbols (e.g. a lone "-") and
# missed punctuation attached to words ("said," or "U.S.").
special = string.punctuation
char_counts = Counter(''.join(data.loc[data['target'] == 1, 'text']))

# Keep only punctuation characters that actually occur.
dic = {p: char_counts[p] for p in special if char_counts[p] > 0}

# Guard the unpacking: zip(*...) on an empty mapping cannot be unpacked.
x, y = zip(*dic.items()) if dic else ((), ())

In [16]:
# Bar chart of the punctuation counts held in x (symbols) and y (counts).
fig = go.Figure(data=[go.Bar(x=x, y=y)])

fig.update_layout(title_text='Punctuations for true news', template='plotly_dark')

In [17]:
# Count punctuation per character instead of per whitespace-separated token.
# Testing a whole token with `token in string.punctuation` is a substring
# check, so it only matched free-standing symbols (e.g. a lone "-") and
# missed punctuation attached to words ("said," or "U.S.").
special = string.punctuation
char_counts = Counter(''.join(data.loc[data['target'] == 0, 'text']))

# Keep only punctuation characters that actually occur.
dic = {p: char_counts[p] for p in special if char_counts[p] > 0}

# Guard the unpacking: zip(*...) on an empty mapping cannot be unpacked.
x, y = zip(*dic.items()) if dic else ((), ())

In [18]:
# Bar chart of the punctuation counts held in x (symbols) and y (counts).
fig = go.Figure(data=[go.Bar(x=x, y=y)])

fig.update_layout(title_text='Punctuations for fake news', template='plotly_dark')

<h2>Common words</h2>

In [19]:
counter = Counter(words_list(1))
most_common = counter.most_common()

# From the 50 most frequent tokens keep those that are not stopwords.
# Tokens are lower-cased for the check because `stop` holds lowercase words;
# a case-sensitive test let capitalised stopwords such as "The" through.
non_stop = [
    (word, count)
    for word, count in most_common[:50]
    if word.lower() not in stop
]

x = [word for word, _ in non_stop]
y = [count for _, count in non_stop]

In [20]:
# Bar chart of the frequent non-stopword tokens held in x and their counts in y.
fig = go.Figure(data=[go.Bar(x=x, y=y)])

fig.update_layout(title_text='Common words true news', template='plotly_dark')

In [21]:
counter = Counter(words_list(0))
most_common = counter.most_common()

# From the 70 most frequent tokens keep those that are not stopwords.
# Tokens are lower-cased for the check because `stop` holds lowercase words;
# a case-sensitive test let capitalised stopwords such as "The" through.
non_stop = [
    (word, count)
    for word, count in most_common[:70]
    if word.lower() not in stop
]

x = [word for word, _ in non_stop]
y = [count for _, count in non_stop]

In [22]:
# Bar chart of the frequent non-stopword tokens held in x and their counts in y.
fig = go.Figure(data=[go.Bar(x=x, y=y)])

fig.update_layout(title_text='Common words for fake news', template='plotly_dark')

<h1>Text Preprocessing</h1>

<h2>Removing urls</h2>

In [23]:
def remove_URL(text):
    """Return `text` with every http://, https:// or www. link deleted.

    A link runs from its prefix up to the next whitespace character.
    """
    url_pattern = r'https?://\S+|www\.\S+'
    return re.sub(url_pattern, r'', text)

In [24]:
# Strip links from the text of every row
data['text'] = data['text'].apply(remove_URL)

<h2>Removing Emojis</h2>

In [25]:
# Reference : https://gist.github.com/slowkow/7a7f61f495e3dbb7e3d767f97bd7304b
def remove_emoji(text):
    """Return `text` with every character in the ranges below deleted."""
    code_point_ranges = (
        "\U0001F600-\U0001F64F",  # emoticons
        "\U0001F300-\U0001F5FF",  # symbols & pictographs
        "\U0001F680-\U0001F6FF",  # transport & map symbols
        "\U0001F1E0-\U0001F1FF",  # flags (iOS)
        "\U00002702-\U000027B0",
        # NOTE: this last range is very wide (U+24C2 .. U+1F251) and so also
        # covers many non-emoji code points, e.g. CJK ideographs.
        "\U000024C2-\U0001F251",
    )
    character_class = "[" + "".join(code_point_ranges) + "]+"
    return re.sub(character_class, r'', text, flags=re.UNICODE)

In [26]:
# Strip emoji characters from the text of every row
data['text'] = data['text'].apply(remove_emoji)

<h2>Removing punctuation</h2>

In [27]:
def remove_punctuations(text):
    """Return `text` with every character of string.punctuation deleted.

    Only the ASCII symbols in string.punctuation are removed; nothing is
    inserted in their place, so "A-B" becomes "AB".
    """
    # Translation table mapping each punctuation code point to None (= delete).
    deletions = dict.fromkeys(map(ord, string.punctuation))
    return text.translate(deletions)

In [28]:
# Strip ASCII punctuation from the text of every row
data['text'] = data['text'].apply(remove_punctuations)

In [29]:
# Inspect the frame after URL, emoji and punctuation removal.
data

Unnamed: 0,text,target
0,WINSTONSALEM NC Reuters North Carolina will p...,1.0
1,WASHINGTON Reuters US President Barack Obama’...,1.0
2,WASHINGTON Reuters Donald Trump leading the p...,1.0
3,BERLIN Reuters German police on Tuesday detai...,1.0
4,WASHINGTON Reuters President Barack Obama and...,1.0
...,...,...
44893,In September 2015 the Sterling Heights MI City...,0.0
44894,Just like Obamacare was pushed through the Ira...,0.0
44895,A television reporter has been fired after she...,0.0
44896,WASHINGTON Reuters The US Senate Judiciary Co...,1.0


<h1>Prediction Model Creation</h1>

In [30]:
# Positional split: first 40000 rows for training, next 1000 rows for testing.
data_train = data.iloc[:40000]
data_test = data.iloc[40000:41000]

In [31]:
# Show the held-out slice (positions 40000 to 40999 of `data`).
data_test

Unnamed: 0,text,target
40000,Donald Trump at this point is known for spewin...,0.0
40001,SEOUL Reuters South Korea expects more provoc...,1.0
40002,WASHINGTON Reuters Aides to Presidentelect Do...,1.0
40003,In 1991 when old white men turned on Anita Hil...,0.0
40004,AUSTIN Texas Reuters Former Democratic US Vic...,1.0
...,...,...
40995,GENEVA Reuters North Korea on Tuesday rejecte...,1.0
40996,UNITED NATIONS Reuters The United States on W...,1.0
40997,PHNOM PENH Reuters Cambodian Prime Minister H...,1.0
40998,You can t say that both parties are the same w...,0.0


In [32]:
# Train on a 5000-row random subset of data_train. The seed keeps the subset
# (and therefore the fitted vocabulary and scores) the same on every run;
# reset_index(drop=True) renumbers the rows 0..4999 without keeping the old
# index as a column.
reduced_data = data_train.sample(5000, random_state=42).reset_index(drop=True)

In [33]:
# Fit TF-IDF on the training subset only, then reuse the fitted vectorizer
# for the test rows.
vectorizer = TfidfVectorizer()
train_matrix = vectorizer.fit_transform(reduced_data['text'])
test_matrix = vectorizer.transform(data_test['text'])

# Dense frames with one column per vocabulary term.
data_train_vector = pd.DataFrame(train_matrix.toarray())
data_test_vector = pd.DataFrame(test_matrix.toarray())

# Attach the labels; both frames share the 0..n-1 index, so rows line up.
data_train_vector['target'] = reduced_data['target']

In [34]:
# Separate the label column from the TF-IDF feature columns.
y = data_train_vector['target']
X = data_train_vector.drop(columns=['target'])

In [35]:
cat_model = CatBoostClassifier()

# Buffers filled by the cross-validation loop: one slot per training row for
# the out-of-fold predictions, one slot per test row for the summed votes.
log_pred, test_pred = np.zeros(X.shape[0]), np.zeros(data_test_vector.shape[0])

In [36]:
n_splits = 5
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

# Start from clean buffers so re-running this cell does not keep adding votes
# onto the totals left by a previous run.
log_pred = np.zeros(len(X))
test_pred = np.zeros(len(data_test_vector))

for fold_, (train_index, val_index) in enumerate(skf.split(X, y)):
    print('Fold: ', fold_)
    X_train, y_train = X.iloc[train_index], y.iloc[train_index]
    X_val, y_val = X.iloc[val_index], y.iloc[val_index]

    model = cat_model.fit(
        X_train,
        y_train,
        eval_set = [(X_train, y_train), (X_val, y_val)],
        early_stopping_rounds = 10,
        verbose = False,
    )

    # Out-of-fold predictions for the rows held out in this fold.
    temp_pred = model.predict(X_val)
    log_pred[val_index] = temp_pred

    print(f'F1-score: {f1_score(y_val, temp_pred)}')

    # Each fold's model casts one 0/1 vote per test row.
    temp_test = model.predict(data_test_vector)
    test_pred += temp_test

# Majority vote across folds. The threshold is derived from n_splits instead
# of the previous hard-coded 3, which was only correct for exactly 5 folds.
final_pred = (test_pred > n_splits / 2).astype(float)

print(f'Overall F1-score: {f1_score(y, log_pred)}')

Fold:  0
F1-score: 0.9875259875259876
Fold:  1
F1-score: 0.9947753396029259
Fold:  2
F1-score: 0.9896049896049895
Fold:  3
F1-score: 0.9905362776025235
Fold:  4
F1-score: 0.9926854754440961
Overall F1-score: 0.9910210899979119


In [40]:
# F1 of the fold-majority predictions against the held-out labels.
test_f1 = f1_score(data_test['target'], final_pred)
print('Performance on test data (F1-score): {}'.format(test_f1))

Performance on test data (F1-score): 0.9956896551724138
