# Natural Language Processing for the Fake News Challenge

## Main Imports

In [2]:
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sn
import torch

from joblib import dump, load
from sklearn import metrics
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics import confusion_matrix, accuracy_score, roc_auc_score, roc_curve, auc, f1_score
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import FeatureUnion, make_pipeline, make_union
from sklearn.svm import SVC

In [3]:
!pip install transformers
import transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/d8/b2/57495b5309f09fa501866e225c84532d1fd89536ea62406b2181933fb418/transformers-4.5.1-py3-none-any.whl (2.1MB)
[K     |████████████████████████████████| 2.1MB 14.1MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/08/cd/342e584ee544d044fb573ae697404ce22ede086c9e87ce5960772084cad0/sacremoses-0.0.44.tar.gz (862kB)
[K     |████████████████████████████████| 870kB 57.8MB/s 
Collecting tokenizers<0.11,>=0.10.1
[?25l  Downloading https://files.pythonhosted.org/packages/ae/04/5b870f26a858552025a62f1649c20d29d2672c02ff3c3fb4c688ca46467a/tokenizers-0.10.2-cp37-cp37m-manylinux2010_x86_64.whl (3.3MB)
[K     |████████████████████████████████| 3.3MB 55.8MB/s 
Building wheels for collected packages: sacremoses
  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone
  Created wheel for sacremoses: filename=sacremoses-0.0.44-cp37-none-any.whl size=886084 sha256=9690a9ba47

## Set up the data for preprocessing
### Load and sample the data

In [4]:
# Relative locations of the two training CSV files
data_folder_name = 'data'
train_body_filename = 'train_bodies.csv'
train_stance_filename = 'train_stances.csv'

train_body_path = '/'.join((data_folder_name, train_body_filename))
train_stance_path = '/'.join((data_folder_name, train_stance_filename))

# Read the article bodies and the headline/stance table into dataframes
body_data, stance_data = (
    pd.read_csv(csv_path) for csv_path in (train_body_path, train_stance_path)
)

In [5]:
# Preview 10 randomly sampled rows of the article-body table
body_data.sample(10)

Unnamed: 0,Body ID,articleBody
596,899,A nun who complained of stomach pains shocked ...
1493,2247,"SANAA, Dec 6 (Reuters) - U.S. journalist Luke ..."
1514,2273,When Tim Cook unveiled the Apple Watch back in...
1136,1720,A South American nun suddenly started experien...
54,81,"In this week's edition of lies, fakes, pranks ..."
710,1066,We still don’t know the exact date the Apple W...
1064,1595,The Apple Watch Sport may start at a mere $349...
1178,1784,A photo of a woman sitting in front of what ap...
1515,2274,Jordan’s King Abdullah announced he was cuttin...
809,1215,"""We needed the best actor on the board in a ce..."


### Merge the data based on the Body ID to get one dataframe containing the corresponding Headlines and Article Bodies

In [6]:
# Join the article bodies to their headline/stance rows on the shared 'Body ID' key
total_data = pd.merge(body_data, stance_data, on='Body ID')

# randomly drop rows to improve performance on SVM
# The rows to drop are drawn with a fixed random_state so that the same subset is
# removed on every run (an unseeded draw would change the dataset, and therefore
# every downstream score, each time the cell is executed).
remove_n = 0
drop_indices = total_data.sample(n=remove_n, random_state=0).index
total_data = total_data.drop(drop_indices)

# add relevance column: 'unrelated' stays as is, every other stance becomes 'related'
total_data['Relevance'] = np.where(total_data['Stance'] == 'unrelated', 'unrelated', 'related')
total_data.sample(10)

Unnamed: 0,Body ID,articleBody,Headline,Stance,Relevance
11979,720,DNA tests have confirmed that a daughter and a...,"Confusion swirls, details murky in arrest of I...",discuss,related
15406,932,India is going through some serious public sec...,That Was Fast: Christian Bale Bails on Steve J...,unrelated,unrelated
13927,830,"Suresh Kumar, 40, was set upon after locals in...",Christian Bale In Talks To Play Steve Jobs In ...,unrelated,unrelated
14560,869,KLAS-TV in Las Vegas is reporting that Jose Ca...,Report—Jose Canseco accidentally shot at his h...,discuss,related
20062,1195,The Islamist group Boko Haram has denied claim...,Six months after abducting Nigerian schoolgirl...,discuss,related
49532,2523,THE hunt is on to find the owner of a dog who ...,Angry mob hacks off alleged rapist's genitals ...,unrelated,unrelated
32472,1804,"Seven girls from a Bosnian school, all aged be...",Fence-jumper ran through much of main floor of...,unrelated,unrelated
335,11,We’ve all seen the traditional depictions of G...,Schoolboy almost killed by electric shock clai...,unrelated,unrelated
9956,614,Batman as Steve Jobs? Sure. How about that guy...,Turkish Leader Says U.S. Airdrop Aided ISIS Mi...,unrelated,unrelated
47594,2431,The Apple Watch is on track for a February rel...,Apple Watch delayed for February in limited qu...,discuss,related


### Splitting the dataset into train and validation sets

In [7]:
# Model inputs are the two text columns; the label is the 'Relevance' column
input_columns = total_data[['Headline', 'articleBody']]
target = 'Relevance'

# Hold out 20% of the rows for validation, using a fixed seed for the shuffle
X_train, X_val, y_train, y_val = train_test_split(
    input_columns,
    total_data[target].values,
    test_size=0.20,
    random_state=0,
)

# check the size of our datasets
print('Size of training set:', X_train.shape)
print('Size of validation set:', X_val.shape)

X_train.sample(10)

Size of training set: (39977, 2)
Size of validation set: (9995, 2)


Unnamed: 0,Headline,articleBody
39702,"Is North Korea, world's most secret state, rea...",Islamic State militants have released a graphi...
23952,ESPN to save NFL's image with all-male domesti...,ESPN continues to try to thread the needle bet...
21071,"Managua explosion not a meteorite, NASA suggests","MANAGUA, Nicaragua — Nicaragua’s government sa..."
2219,Someone painted a graffiti dick on a $2.5 mill...,Some might say that no matter how rich you are...
20847,Armed U.S. drones spotted flying over Syria in...,In a sprawling Facebook post and subsequent in...
33896,Sources: Guns N' Roses Frontman Axl Rose Found...,A dog was found abandoned at a Scottish train ...
3199,Officials shoot down congressman's claims ISIS...,Variety is reporting that Seth Rogen's got the...
12574,"Prankster Gives Homeless Man $100, Secretly Fo...",A Texas National Guard soldier scans the Mexic...
21889,ISIS beheads American photo-journalist James W...,The missing American freelance photo-journalis...
33257,Big Bank Hank of The Sugarhill Gang is dead at 57,Legendary Sugarhill Gang rapper Big Bank Hank ...


## TF-IDF Feature Extraction
### Custom Transformer for Pipeline

In [8]:
# custom transformer for column extraction
class ColumnExtractor(TransformerMixin, BaseEstimator):
    """Select a single column of the input so it can be fed to a text vectoriser.

    Inherits from BaseEstimator (in addition to TransformerMixin) so that the
    transformer exposes get_params / set_params; scikit-learn relies on these for
    cloning, grid search over pipeline steps and estimator display.

    Parameters
    ----------
    column : hashable
        Key used to index the input in ``transform`` (``X[column]``).
    """

    def __init__(self, column):
        self.column = column

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns ``self`` unchanged."""
        return self

    def transform(self, X, y=None):
        """Return ``X[self.column]``."""
        return X[self.column]

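### Pipelines and Feature Union of Headline and Article Body
- Fit the TF-IDF preprocessor on the training set, then use the fitted preprocessor to transform both the training and validation sets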

In [None]:
# feature extraction with TF-IDF

def _tfidf_column_pipe(column_name):
    ''' Build a pipeline that extracts one text column and TF-IDF vectorises it '''
    return make_pipeline(
        ColumnExtractor(column_name),
        TfidfVectorizer(decode_error='ignore', min_df=4, max_df=0.5, max_features=200, lowercase=True)
    )

# create pipes and perform TF-IDF on Headline and Body columns
# (both columns use an identically configured vectoriser)
headline_pipe = _tfidf_column_pipe('Headline')
body_pipe = _tfidf_column_pipe('articleBody')

# combine headline and body transformers with a feature union and weight equally
preprocessor = FeatureUnion(
    transformer_list=[('headline', headline_pipe), ('body', body_pipe)],
    transformer_weights={'headline': 0.5, 'body': 0.5},
)

# fit on the training split, apply the fitted transform to the validation split,
# and print the resulting matrix shapes
train = preprocessor.fit_transform(X_train)
val = preprocessor.transform(X_val)
print('Train size: ', train.shape)
print('Val size: ', val.shape)

Train size:  (39977, 400)
Val size:  (9995, 400)


## Tokenisation and Embedding with DistilBERT

### Load the model and tokeniser

In [9]:
# DistilBERT encoder / tokeniser classes and the checkpoint they are both loaded from
model_class = transformers.DistilBertModel
tokenizer_class = transformers.DistilBertTokenizer
pretrained_weights = 'distilbert-base-uncased'

# Instantiate the tokeniser and the model from the same pretrained checkpoint
tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
model = model_class.from_pretrained(pretrained_weights)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=28.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=466062.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=442.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=267967963.0, style=ProgressStyle(descri…




In [10]:
# Tokenise the first 100 training rows as (headline, body) text pairs.
# Sequences are padded with padding='max_length', truncated when too long,
# and returned as PyTorch tensors.
headlines_batch = list(X_train['Headline'][:100])
bodies_batch = list(X_train['articleBody'][:100])
tokenized = tokenizer(
    headlines_batch,
    bodies_batch,
    padding='max_length',
    truncation=True,
    return_tensors="pt",
)

In [11]:
tokenized_text = tokenized['input_ids']

with torch.no_grad():
    # Pass the full tokeniser output (input ids AND attention mask). The batch is
    # padded with padding='max_length', so calling the model with the ids alone would
    # let every real token attend to the padding positions and distort the embeddings.
    embeddings = model(**tokenized)


def get_cls(x):
    ''' Return the hidden state at sequence position 0 for every item in the batch '''
    # x[0] is the first element of the model output; [:, 0, :] keeps the batch and
    # feature axes and selects the first token of each sequence.
    # No .squeeze(): a batch of one must stay 2-D (1, hidden) to be usable as a
    # feature matrix downstream.
    return x[0][:, 0, :]


train = get_cls(embeddings)


## Evaluation Metrics

### Confusion Matrix

In [12]:
# Create the confusion matrix
def plot_confusion_matrix(y_test, y_pred):
    ''' Plot the confusion matrix for the target labels and predictions

    y_test : array-like of true labels
    y_pred : array-like of predicted labels, same length as y_test

    Rows of the heatmap are true labels and columns are predicted labels; both
    axes are annotated with the actual class values rather than 0..n-1.
    '''
    # Fix the class order explicitly so the tick labels are guaranteed to line up
    # with the rows / columns of the matrix
    labels = np.unique(np.concatenate((np.asarray(y_test).ravel(),
                                       np.asarray(y_pred).ravel())))
    cm = confusion_matrix(y_test, y_pred, labels=labels)

    # Create a dataframe with the confusion matrix values, indexed by class name
    df_cm = pd.DataFrame(cm, index=labels, columns=labels)

    # Plot the confusion matrix
    sn.set(font_scale=1.4) #for label size
    ax = sn.heatmap(df_cm, annot=True, fmt='.0f', cmap="YlGnBu", annot_kws={"size": 10})# font size
    ax.set_xlabel('Predicted label')
    ax.set_ylabel('True label')
    plt.show()

### ROC Curve

In [13]:
# ROC Curve
def plot_roc_curve(y_test, y_pred):
    ''' Plot the ROC curve for the target labels and predictions'''
    # Calculate the points in the ROC curve, treating label 1 as the positive class
    fpr, tpr, _ = roc_curve(y_test, y_pred, pos_label=1)
    roc_auc = auc(fpr, tpr)

    # Square axes placed in the left half of a 12x12 figure
    plt.figure(figsize=(12, 12))
    ax = plt.subplot(121)
    ax.set_aspect(1)

    ax.set_title('Receiver Operating Characteristic')
    ax.plot(fpr, tpr, 'b', label='AUC = %0.2f' % roc_auc)
    ax.legend(loc='lower right')
    # plot no skill (the diagonal reference line)
    ax.plot([0, 1], [0, 1], 'r--')
    ax.set_xlim([0, 1])
    ax.set_ylim([0, 1])
    ax.set_ylabel('True Positive Rate')
    ax.set_xlabel('False Positive Rate')
    plt.show()

## Training Models

### Naive Bayes

In [None]:
# create the model, train it on the train dataset and print the scores
# The classifier gets its own name so that it does not overwrite `model`, which
# holds the DistilBERT encoder that the embedding cell calls.
# NOTE(review): this cell expects `train` / `val` to be the TF-IDF feature matrices
# (full training / validation sets) — confirm they have not been rebound since.
nb_model = MultinomialNB() # as implemented in sklearn
nb_model.fit(train, y_train)
print("train score:", nb_model.score(train, y_train))
print("validation score:", nb_model.score(val, y_val))

train score: 0.7339970483027741
validation score: 0.7237618809404702


### Support Vector Machine Classifier

In [14]:
# Grid-search an RBF-kernel SVM on the current `train` feature matrix.
# NOTE(review): in the recorded run `train` appears to hold the DistilBERT embeddings
# of the first 100 training rows rather than the TF-IDF features — confirm which
# feature set is intended here.

start = time.time()
# Define the parameters to tune
parameters = {
    'C': [1, 10],
    'gamma': [1, 'auto', 'scale']
}
# Tune hyperparameters using Grid Search and a SVM model.
# The labels are sliced to the number of rows in `train` instead of a hard-coded 100,
# so features and labels stay aligned if the size of the feature subset changes.
model = GridSearchCV(SVC(kernel='rbf'), parameters, cv=5, n_jobs=-1).fit(train, y_train[:train.shape[0]])
end = time.time()
print(end - start, "seconds")
# Persist the fitted search object to disk
dump(model, 'new_SVM.joblib')

11.477129220962524 seconds


['new_SVM.joblib']

In [None]:
# NOTE(review): `val` must contain validation features built the same way as the
# features `model` was fitted on. The recorded run of this cell stops with
# "NameError: name 'val' is not defined" — confirm which cell is meant to create it.
y_pred = model.predict(val)

print(metrics.classification_report(y_val, y_pred))
plot_confusion_matrix(y_val, y_pred)

# convert to 0s and 1s for plotting ROC curve
roc_y_val = [np.int64(1) if i == 'related' else np.int64(0) for i in y_val]
roc_y_pred = [np.int64(1) if i == 'related' else np.int64(0) for i in y_pred]

# A ROC curve needs a continuous score per sample; hard 0/1 predictions collapse it
# to a single operating point. For a binary SVC a positive decision value means
# classes_[1], so flip the sign when 'related' is not that class.
roc_y_score = model.decision_function(val)
if model.classes_[1] != 'related':
    roc_y_score = -roc_y_score

plot_roc_curve(roc_y_val, roc_y_score)

NameError: name 'val' is not defined