# Natural Language Processing for the Fake News Challenge

## Main Imports

In [10]:
import pandas as pd
import numpy as np

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split
from sklearn.pipeline import FeatureUnion, make_pipeline, make_union
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

## Set up the data for preprocessing
### Load and sample the data

In [2]:
# Names of the data folder and the two training CSV files it contains.
data_folder_name = 'data'
train_body_filename = 'train_bodies.csv'
train_stance_filename = 'train_stances.csv'

# Build the relative paths "<folder>/<file>" for each CSV.
train_body_path = f"{data_folder_name}/{train_body_filename}"
train_stance_path = f"{data_folder_name}/{train_stance_filename}"

# Read the article bodies first, then the headline/stance rows.
body_data, stance_data = map(pd.read_csv, (train_body_path, train_stance_path))

In [3]:
# Peek at ten randomly chosen article bodies.
body_data.sample(n=10)

Unnamed: 0,Body ID,articleBody
1641,2470,The mystery surrounding North Korean dictator ...
724,1085,Axl Rose is NOT dead. He’s the victim of a new...
1581,2375,This post was widely shared over the internet ...
1048,1572,Multiple users of anonymous web browser Tor ha...
1523,2284,At least now I know who to direct my sun rage ...
830,1252,A second patient with Ebola symptoms walked in...
1412,2132,The video was one of those viral sensations th...
950,1422,The tweed-wearing germ-hater smacked two women...
783,1175,A spokesman for the Iraqi health ministry deni...
1579,2372,Mayor de Blasio and NYPD Commissioner Bill Bra...


### Merge the data based on the Body ID to get one dataframe containing the corresponding Headlines and Article Bodies

In [4]:
# Join each headline/stance row to its article text through the shared 'Body ID' key.
total_data = body_data.merge(stance_data, on='Body ID')
# Show ten random rows of the merged frame.
total_data.sample(n=10)

Unnamed: 0,Body ID,articleBody,Headline,Stance
23218,1347,"The Apple Watch might be water resistant, acco...",Another American hostage at risk by Islamic state,unrelated
43108,2248,"SEVEN girls, aged 13 to 15, have fallen pregna...",CNN audio: Is this the moment Michael Brown wa...,unrelated
43024,2242,A Texas plumber is getting death threats after...,Texas Truck Winds Up in Syria With Islamic Mil...,agree
26938,1528,It would have been pretty embarrassing for Bat...,Judd Nelson rebuffs Internet rumors that he di...,unrelated
2577,139,Judd Nelson rebuffs Internet rumors that he di...,Video shows ISIL beheading of photojournalist ...,unrelated
29915,1670,Although the majority of Austrian resorts are ...,US drones hunt Isis leader in Syria,unrelated
28403,1588,Boko Haram has denied claims by Nigeria's gove...,Nigeria and Boko Haram 'agree ceasefire and gi...,discuss
21648,1268,"Vice Media CEO and co-founder Shane Smith, who...",A Man Says Comcast Called His Boss And Got Him...,unrelated
25281,1435,Apple may be planning to hold a special event ...,Video Messaging App Says Audio Recording Of Mi...,unrelated
19888,1179,Supporters of the Islamic State of Iraq and Sy...,Nicaragua says meteorite probable cause of bla...,unrelated


### Splitting the dataset into train and validation sets

In [5]:
# The label to predict, and the two text columns used as model inputs.
target = 'Stance'
input_columns = total_data[['Headline', 'articleBody']]

# Hold out 20% of the rows for validation; the fixed seed keeps the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    input_columns,
    total_data[target].values,
    test_size=0.20,
    random_state=0,
)

# check the size of our datasets
print('Size of training set:', X_train.shape)
print('Size of validation set:', X_val.shape)

X_train.sample(n=10)

Size of training set: (39977, 2)
Size of validation set: (9995, 2)


Unnamed: 0,Headline,articleBody
40787,Iraqi media says ISIS militants have contracte...,The Islamic State (Isis) operating in Iraq doe...
1980,Obama Denounces James Foley's Execution: 'Toda...,Afghanistan veteran Sam Arnold uploaded this s...
10502,Did Josh Paler Lin Stage His Viral Video About...,"When Apple unveiled the Apple Watch, the unvei..."
758,Six months after abducting Nigerian schoolgirl...,"Abdel-Majed Abdel Bary, who went by the rap na..."
11159,Pope says dogs can go to heaven: ‘Paradise is ...,Brian Williams took a moment during Monday’s e...
22692,Batmobile Stolen From 'Batman V Superman: Dawn...,"Earlier today, the Detroit media reported on r..."
48236,Tropical spider 'burrowed under man's skin thr...,There are Christmas miracles after all.\n\nFor...
41308,Wife Of ISIS Leader Abu Bakr al-Baghdadi Detai...,Lebanese authorities are holding a daughter an...
12147,CNN Plays Alleged Audiotape of Michael Brown S...,Forget sweater weather and crisp autumn leaves...
18333,Christian Bale In Talks To Play Steve Jobs In ...,$1.61\nThat's how much WalMart reported pulled...


## TF-IDF Feature Extraction
### Custom Transformer for Pipeline

In [6]:
# custom transformer for column extraction
class ColumnExtractor(TransformerMixin, BaseEstimator):
    """Select one column from a DataFrame-like input inside a scikit-learn pipeline.

    The class derives from ``BaseEstimator`` as well as ``TransformerMixin`` so
    that it exposes ``get_params``/``set_params``. scikit-learn relies on those
    methods to clone estimators, which happens whenever a pipeline containing
    this transformer is cross-validated or grid-searched.

    Parameters
    ----------
    column : hashable
        Key passed to ``X[...]`` in ``transform`` to pick the column.
    """

    def __init__(self, column):
        # Stored unchanged under the same name as the argument so that
        # get_params()/clone() can rebuild an identical instance.
        self.column = column

    def fit(self, X, y=None):
        """Do nothing and return ``self``; the transformer has no state to learn."""
        return self

    def transform(self, X, y=None):
        """Return ``X[self.column]``; ``y`` is accepted but not used."""
        return X[self.column]

### Pipelines and Feature Union of Headline and Article Body
- Fit the combined feature union on the training split and transform it into a single sparse TF-IDF matrix

In [7]:
# feature extraction with TF-IDF

def _tfidf_column_pipe(column):
    """Build a pipeline that extracts `column` and vectorizes its text with TF-IDF."""
    return make_pipeline(
        ColumnExtractor(column),
        # min_df=2 / max_df=0.5: drop terms seen in fewer than two documents
        # or in more than half of them.
        TfidfVectorizer(decode_error='ignore', min_df=2, max_df=0.5, lowercase=True),
    )


# one TF-IDF pipe per text column
headline_pipe = _tfidf_column_pipe('Headline')
body_pipe = _tfidf_column_pipe('articleBody')

# stack the headline and body features side by side, each scaled by the same weight
preprocessor = FeatureUnion(
    transformer_list=[('headline', headline_pipe), ('body', body_pipe)],
    transformer_weights={'headline': 0.5, 'body': 0.5},
)

# learn the vocabularies on the training split and vectorize it, then show the result
train = preprocessor.fit_transform(X_train)
print(train)
print('Train size: ', train.shape)

  (0, 262)	0.144999360637973
  (0, 460)	0.1340446020897992
  (0, 947)	0.13540073141762096
  (0, 1182)	0.05935997104518523
  (0, 1347)	0.07334590459758498
  (0, 1585)	0.15483289244386472
  (0, 1629)	0.15186498240697285
  (0, 1642)	0.09482691473950242
  (0, 1811)	0.13470962383149607
  (0, 1976)	0.08494580673689808
  (0, 2053)	0.08508717929217971
  (0, 2670)	0.09748456257087873
  (0, 2784)	0.07027612583925776
  (0, 2895)	0.13372127219819605
  (0, 3037)	0.12446556262654324
  (0, 3084)	0.07768083204733288
  (0, 3098)	0.1340446020897992
  (0, 3247)	0.10953533999819696
  (0, 3327)	0.1064176466979934
  (0, 3663)	0.01810957629851757
  (0, 3720)	0.016896447039576698
  (0, 3978)	0.027436972223690282
  (0, 4047)	0.008304083055796328
  (0, 4052)	0.04256387531523555
  (0, 4148)	0.02091800564388568
  :	:
  (39976, 24011)	0.030753736060680485
  (39976, 24180)	0.01120058401794071
  (39976, 24402)	0.02114441184406385
  (39976, 24498)	0.010716303510234795
  (39976, 24603)	0.03763997985264709
  (39976, 24

## Support Vector Machine Classifier

In [None]:
# training an SVM on TF-IDF features
# Candidate values for the SVC hyperparameters C and gamma
parameters = {
    'C': [1.0, 10],
    'gamma': [1, 'auto', 'scale'],
}
# Tune hyperparameters using grid search (5-fold cross-validation, all CPU cores)
# over an RBF-kernel SVM, then keep the fitted search object as the model
model = GridSearchCV(
    estimator=SVC(kernel='rbf'),
    param_grid=parameters,
    cv=5,
    n_jobs=-1,
).fit(train, y_train)