In [None]:
import pandas as pd
import numpy as np
import string
import re
import plotly.graph_objects as go

from plotly.subplots import make_subplots
from collections import defaultdict, Counter
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score, log_loss

# ML
from catboost import CatBoostClassifier

In [None]:
# Folder that holds True.csv and Fake.csv; change this one value to point at
# the local copy of the dataset.
DATA_DIR = 'D:/Data/Fake and real news'

data_true = pd.read_csv(f'{DATA_DIR}/True.csv')
data_fake = pd.read_csv(f'{DATA_DIR}/Fake.csv')

In [None]:
# Preview the first rows only instead of rendering the whole frame.
data_true.head()

In [None]:
# Preview the first rows only instead of rendering the whole frame.
data_fake.head()

In [None]:
# Add labels: 1.0 marks true news, 0.0 marks fake news.
data_true['target'] = np.ones(data_true.shape[0])
data_fake['target'] = np.zeros(data_fake.shape[0])

# Stack both sources, shuffle them and keep only the text and target columns.
# The shuffle is seeded because later cells split this frame by row position:
# without a fixed seed the train/test rows would change on every run.
# The trailing copy() makes `data` an independent frame, so the column
# assignments done during preprocessing do not act on a slice of another frame.
data = (
    pd.concat([data_true, data_fake], ignore_index=True)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
    .loc[:, ['text', 'target']]
    .copy()
)

In [None]:
# Preview the first rows only instead of rendering the whole frame.
data.head()

<h1>Exploratory Data Analysis</h1>

In [None]:
# Class balance: number of rows per target value, shown as a pie chart.
to_plot = data.value_counts('target')

pie_trace = go.Pie(
    labels=to_plot.index.map({0: 'Fake', 1: 'True'}),
    values=to_plot.values,
    textinfo='label+percent',
)

fig = go.Figure(data=[pie_trace])
fig.update_layout(template='plotly_dark')

We are lucky: our dataset is fairly balanced, which will make our work easier. Now we will check a couple of common attributes of text data, such as the number of words and the number of characters in each news article.

In [None]:
# Character count of every article, computed separately for each class.
news_len_true = data.loc[data['target'] == 1, 'text'].str.len()
news_len_false = data.loc[data['target'] == 0, 'text'].str.len()

# One histogram per class, stacked vertically (true news on top).
fig = make_subplots(rows=2, cols=1, subplot_titles=('True news', 'Fake news'))

panels = [('True news', news_len_true), ('Fake news', news_len_false)]
for row_number, (panel_name, lengths) in enumerate(panels, start=1):
    fig.add_trace(
        go.Histogram(x=lengths, name=panel_name, nbinsx=500),
        row=row_number,
        col=1,
    )

fig.update_layout(title_text='Number of characters in news', template='plotly_dark')

In [None]:
# Word count of every article: split on whitespace, then take the length of
# each token list with the vectorised .str accessor.
word_len_true = data.loc[data['target'] == 1, 'text'].str.split().str.len()
word_len_false = data.loc[data['target'] == 0, 'text'].str.split().str.len()

# One histogram per class, stacked vertically (true news on top).
fig = make_subplots(rows=2, cols=1, subplot_titles=('True news', 'Fake news'))

fig.add_trace(
    go.Histogram(x=word_len_true, name='True news'),
    row=1,
    col=1
)

fig.add_trace(
    go.Histogram(x=word_len_false, name='Fake news'),
    row=2,
    col=1
)

fig.update_layout(
    template='plotly_dark',
    # Title previously read "Number of words in words".
    title_text='Number of words in news'
)

<h2>Common stopwords</h2>

In [None]:
def words_list(target, frame=None):
    """Return every whitespace-separated token of the articles of one class.

    Parameters
    ----------
    target
        Value of the ``target`` column that selects the rows to use.
    frame : DataFrame, optional
        Frame with ``text`` and ``target`` columns. When omitted, the
        notebook-level ``data`` frame is used, so existing one-argument
        calls keep working.

    Returns
    -------
    list
        Tokens of all selected texts, in row order. Empty when no row matches.
    """
    if frame is None:
        frame = data

    # Missing texts cannot be split into tokens, so they are skipped.
    texts = frame.loc[frame['target'] == target, 'text'].dropna()
    return [word for tokens in texts.str.split() for word in tokens]

In [None]:
words_list_true = words_list(1)
stop = stopwords.words('english')

# Set copy of the stopwords for constant-time membership tests over the
# token list; `stop` itself is left unchanged for the cells that reuse it.
stop_lookup = set(stop)

# Frequency of each stopword in the true news, then the ten most frequent.
dic = Counter(word for word in words_list_true if word in stop_lookup)
top = dic.most_common(10)

In [None]:
# Unpack the (stopword, count) pairs into the two axes of the bar chart.
x, y = zip(*top)

fig = go.Figure(data=[go.Bar(x=x, y=y)])
fig.update_layout(title_text='Common stopwords in true news', template='plotly_dark')

In [None]:
# Tokens of the fake news (target 0); named accordingly so the true-news
# token list is not silently replaced.
words_list_fake = words_list(0)
stop = stopwords.words('english')

# Set copy of the stopwords for constant-time membership tests over the
# token list; `stop` itself is left unchanged for the cells that reuse it.
stop_lookup = set(stop)

# Frequency of each stopword in the fake news, then the ten most frequent.
dic = Counter(word for word in words_list_fake if word in stop_lookup)
top = dic.most_common(10)

In [None]:
# Unpack the (stopword, count) pairs into the two axes of the bar chart.
x, y = zip(*top)

fig = go.Figure(data=[go.Bar(x=x, y=y)])
fig.update_layout(title_text='Common stopwords in fake news', template='plotly_dark')

<h2>Punctuations</h2>

In [None]:
# All whitespace-separated tokens of the true news.
punctuations = words_list(1)

# Count punctuation character by character. Testing a whole token against
# the string.punctuation string is a substring check, so it only matched
# tokens made purely of punctuation and missed marks attached to words
# (for example the full stop in "said.").
special = set(string.punctuation)
dic = Counter(ch for token in punctuations for ch in token if ch in special)

# Most frequent symbol first; fall back to empty axes when nothing was found
# so that the unpacking cannot fail.
x, y = zip(*dic.most_common()) if dic else ((), ())

In [None]:
# Bar chart of the punctuation counts prepared in the previous cell
# (x holds the symbols, y their counts).
fig = go.Figure(data=[go.Bar(x=x, y=y)])
fig.update_layout(template='plotly_dark', title_text='Punctuations for true news')

In [None]:
# All whitespace-separated tokens of the fake news.
punctuations = words_list(0)

# Count punctuation character by character. Testing a whole token against
# the string.punctuation string is a substring check, so it only matched
# tokens made purely of punctuation and missed marks attached to words
# (for example the full stop in "said.").
special = set(string.punctuation)
dic = Counter(ch for token in punctuations for ch in token if ch in special)

# Most frequent symbol first; fall back to empty axes when nothing was found
# so that the unpacking cannot fail.
x, y = zip(*dic.most_common()) if dic else ((), ())

In [None]:
# Bar chart of the punctuation counts prepared in the previous cell
# (x holds the symbols, y their counts).
fig = go.Figure(data=[go.Bar(x=x, y=y)])
fig.update_layout(template='plotly_dark', title_text='Punctuations for fake news')

<h2>Common words</h2>

In [None]:
# Token frequencies for the true news, most frequent first.
counter = Counter(words_list(1))
most_common = counter.most_common()

# Take the 50 most frequent tokens and drop the ones listed in `stop`;
# x receives the surviving words and y their counts.
kept = [(word, count) for word, count in most_common[:50] if word not in stop]
x = [word for word, _ in kept]
y = [count for _, count in kept]

In [None]:
# Bar chart of the frequent non-stopword tokens prepared in the previous cell.
fig = go.Figure(data=[go.Bar(x=x, y=y)])
fig.update_layout(template='plotly_dark', title_text='Common words true news')

In [None]:
# Token frequencies for the fake news, most frequent first.
counter = Counter(words_list(0))
most_common = counter.most_common()

# Take the 70 most frequent tokens and drop the ones listed in `stop`;
# x receives the surviving words and y their counts.
kept = [(word, count) for word, count in most_common[:70] if word not in stop]
x = [word for word, _ in kept]
y = [count for _, count in kept]

In [None]:
# Bar chart of the frequent non-stopword tokens prepared in the previous cell.
fig = go.Figure(data=[go.Bar(x=x, y=y)])
fig.update_layout(template='plotly_dark', title_text='Common words for fake news')

<h1>Text Preprocessing</h1>

<h2>Removing urls</h2>

In [None]:
def remove_URL(text):
    """Return ``text`` with links removed.

    A link is anything that starts with ``http://``, ``https://`` or ``www.``
    and runs up to the next whitespace character.
    """
    return re.sub(r'https?://\S+|www\.\S+', r'', text)

In [None]:
# Strip links from the text of every row
data['text'] = data['text'].map(remove_URL)

<h2>Removing Emojis</h2>

In [None]:
# Reference : https://gist.github.com/slowkow/7a7f61f495e3dbb7e3d767f97bd7304b
def remove_emoji(text):
    """Return ``text`` with every run of characters from the ranges below removed.

    Note: the last range spans U+24C2 to U+1F251, so it also removes the
    non-emoji characters that fall between those code points (CJK
    ideographs, for instance).
    """
    emoji_ranges = (
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u"\U00002702-\U000027B0"
        u"\U000024C2-\U0001F251"
    )
    return re.sub("[" + emoji_ranges + "]+", r'', text, flags=re.UNICODE)

In [None]:
# Strip emojis from the text of every row
data['text'] = data['text'].map(remove_emoji)

<h2>Removing punctuations</h2>

In [None]:
def remove_punctuations(text):
    """Return ``text`` with every character of ``string.punctuation`` deleted."""
    # Mapping a code point to None tells str.translate to drop that character.
    deletions = {ord(symbol): None for symbol in string.punctuation}
    return text.translate(deletions)

In [None]:
# Strip punctuation from the text of every row
data['text'] = data['text'].map(remove_punctuations)

In [None]:
# Preview the cleaned text: first rows only instead of the whole frame.
data.head()

<h1>Prediction Model Creation</h1>

In [None]:
# Positional split of the shuffled frame: the first 40000 rows are the
# training pool, the following 1000 rows are held out for testing.
data_train = data.iloc[:40000]
data_test = data.iloc[40000:41000]

In [None]:
# Preview the first rows only instead of rendering the whole frame.
data_test.head()

In [None]:
# Train on a random 5000-row subset of the training pool. The seed makes the
# subset (and therefore the fitted vocabulary and scores) repeatable, and
# reset_index(drop=True) renumbers the rows without keeping the old index.
reduced_data = data_train.sample(5000, random_state=42).reset_index(drop=True)

In [None]:
# Learn the TF-IDF vocabulary on the training subset only, then reuse it for
# the held-out test rows.
vectorizer = TfidfVectorizer()
train_matrix = vectorizer.fit_transform(reduced_data['text'])
test_matrix = vectorizer.transform(data_test['text'])

# Convert both matrices to dense frames; the training frame also carries
# the label column.
data_train_vector = pd.DataFrame(train_matrix.toarray())
data_train_vector['target'] = reduced_data['target']
data_test_vector = pd.DataFrame(test_matrix.toarray())

In [None]:
# Separate the label column from the TF-IDF feature columns.
y = data_train_vector['target']
X = data_train_vector.drop(columns=['target'])

In [None]:
# Classifier with default settings.
cat_model = CatBoostClassifier()

# Accumulators filled by the cross-validation loop: one slot per training row
# for the validation predictions, one slot per test row for the summed
# fold predictions.
log_pred = np.zeros(X.shape[0])
test_pred = np.zeros(data_test_vector.shape[0])

In [None]:
N_SPLITS = 5
# A test row is labelled 1 when a strict majority of the fold models predict 1
# (3 of 5 with the current setting). Derived from N_SPLITS so the two cannot
# drift apart.
MAJORITY_VOTES = N_SPLITS // 2 + 1

skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=42)

# Start from clean accumulators so that re-running this cell does not add a
# second round of votes on top of the previous one.
log_pred = np.zeros(len(X))
test_pred = np.zeros(len(data_test_vector))

for fold_, (train_index, val_index) in enumerate(skf.split(X, y)):
    print('Fold: ', fold_)

    # Materialise each fold slice once and reuse it for fitting and evaluation.
    X_train, y_train = X.iloc[train_index], y.iloc[train_index]
    X_val, y_val = X.iloc[val_index], y.iloc[val_index]

    model = cat_model.fit(
        X_train,
        y_train,
        eval_set = [(X_train, y_train), (X_val, y_val)],
        early_stopping_rounds = 10,
        verbose = False,
    )

    # Predictions for the rows this fold did not train on.
    temp_pred = model.predict(X_val)
    log_pred[val_index] = temp_pred

    print(f'F1-score: {f1_score(y_val, temp_pred)}')

    # Each fold adds its 0/1 prediction for every test row.
    temp_test = model.predict(data_test_vector)
    test_pred += temp_test

# Majority vote over the folds, stored as 0.0 / 1.0.
final_pred = (test_pred >= MAJORITY_VOTES).astype(float)

print(f'Overall F1-score: {f1_score(y, log_pred)}')

In [None]:
# F1-score of the majority-vote predictions on the held-out test rows.
test_f1 = f1_score(data_test['target'], final_pred)
print('Performance on test data (F1-score): {}'.format(test_f1))