In [53]:
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.pipeline import FeatureUnion, make_pipeline, make_union

In [54]:
# load the data

# Both training CSVs sit in one folder; paths are relative to the working directory.
data_folder_name = 'data'
train_body_filename = 'train_bodies.csv'
train_stance_filename = 'train_stances.csv'

train_body_path = '/'.join([data_folder_name, train_body_filename])
train_stance_path = '/'.join([data_folder_name, train_stance_filename])

# Read the article bodies and the headline/stance table into separate frames.
body_data, stance_data = (
    pd.read_csv(csv_path) for csv_path in (train_body_path, train_stance_path)
)

In [55]:
# Preview ten randomly chosen rows of the article-body table.
body_data.sample(n=10)

Unnamed: 0,Body ID,articleBody
999,1500,Tonight — finally! — ESPN is going to have an ...
217,350,If the past few weeks of nonstop media onslaug...
937,1404,"TOPEKA, Kan. (WIBW)- A Kansas City patient is ..."
1330,2002,BAGHDAD — Three hundred Iraqi soldiers died in...
1094,1647,The TV presenter was filming a programme in th...
1255,1889,Islamic State of Iraq and Syria leader Abu Bak...
794,1190,The artist whose real name was Henry Jackson p...
343,529,"An 85-year-old woman in Waco, Texas has been a..."
419,633,Knightscope co-founder Stacy Stephens said rum...
401,609,WONKET EXCLUSIVE MUST CREDIT WONKetTE! So you ...


In [56]:
# Join the body table with the stance table on their shared 'Body ID' key
# (default inner join), then preview ten random rows of the combined frame.
total_data = body_data.merge(stance_data, on='Body ID')
total_data.sample(n=10)

Unnamed: 0,Body ID,articleBody,Headline,Stance
17394,1020,YouTube prankster Josh Paler Lin decided to gi...,Rat problem worsens at One World Trade Center ...,unrelated
18913,1120,Pentagon investigating claims but admits one l...,Ann Arbor pizza delivery driver surprised with...,unrelated
1494,77,Ahmed Abdi Godane is the spiritual leader of a...,Met police denies reports of Banksy arrest,unrelated
42459,2210,"Texas Turkey Farm Contaminated With Ebola, Ove...",Kim Jong Un hospitalized with two broken ankle...,unrelated
18425,1084,The Pentagon said Friday that it had confirmed...,"Leader of Qaeda-Linked Somali Group Is Dead, U...",discuss
17320,1019,Christian Bale will star as late Apple CEO Ste...,Christian Bale in Talks to Play Steve Jobs in ...,discuss
28548,1594,(Update: The story has been updated to reflect...,Rare meteorite impact causes blast in Nicaragu...,disagree
13053,776,SIS has developed a new weapon in Iraq designe...,New Audio Reveals Pause in Gunfire When Michae...,unrelated
36540,1948,Militants from Islamic State (Isis) claimed to...,NEWS/ You'll Never Guess How a Homeless Man Sp...,unrelated
12570,742,A Texas National Guard soldier scans the Mexic...,The famous “Dog whisperer” Cesar Millan died o...,unrelated


In [57]:
# custom transformer for column extraction
class ColumnExtractor(TransformerMixin, BaseEstimator):
    """Stateless transformer that returns ``X[column]`` from its input.

    Intended as the first step of a pipeline so that a downstream step
    (e.g. a text vectorizer) receives a single column instead of the whole
    table.

    Inheriting from ``BaseEstimator`` (in addition to ``TransformerMixin``)
    supplies ``get_params``/``set_params``; without them scikit-learn cannot
    clone a pipeline containing this step, which breaks cross-validation and
    grid search.

    Parameters
    ----------
    column : hashable
        Key used to index the input in ``transform`` (``X[column]``).
    """

    def __init__(self, column):
        # Stored under the same name as the constructor argument, as
        # BaseEstimator.get_params requires.
        self.column = column

    def fit(self, X, y=None):
        """Nothing to learn; return ``self`` so calls can be chained."""
        return self

    def transform(self, X, y=None):
        """Return ``X[self.column]``; ``y`` is accepted but ignored."""
        return X[self.column]

In [58]:
# feature extraction with TF-IDF

# create pipes and perform TF-IDF on Headline and Body columns
# Each pipe first pulls one text column out of the frame, then vectorizes it.
headline_pipe = make_pipeline(
    ColumnExtractor('Headline'),
    TfidfVectorizer(decode_error='ignore', lowercase=True)
)

body_pipe = make_pipeline(
    ColumnExtractor('articleBody'),
    TfidfVectorizer(decode_error='ignore', lowercase=True)
)

# combine headline and body transformers with a feature union and weight equally
# The union concatenates the two TF-IDF matrices column-wise (headline columns
# first, body columns second) and multiplies each part by its weight.
# NOTE: the headline pipeline is `headline_pipe`; the previous reference to
# `head_pipe` was an undefined name and raised NameError on a fresh kernel.
preprocessor = FeatureUnion(
    transformer_list=[
        ('headline', headline_pipe),
        ('body', body_pipe),
    ],
    transformer_weights={
        'headline': 0.5,
        'body': 0.5,
    },
)

# fit transform and print data
train = preprocessor.fit_transform(total_data)
print(train)
print('Train size: ', train.shape)

  (0, 141)	0.09165769362230133
  (0, 270)	0.09015667682672754
  (0, 916)	0.13870610316615226
  (0, 1011)	0.22110919720496758
  (0, 1321)	0.15583176491758655
  (0, 1735)	0.22110919720496758
  (0, 1823)	0.15067648505879452
  (0, 2093)	0.13870610316615226
  (0, 2654)	0.11533605649744558
  (0, 2717)	0.13268058655372672
  (0, 3202)	0.13899580731607086
  (0, 3481)	0.05260667790482452
  (0, 3598)	0.018430174342005782
  (0, 3738)	0.029215363121401664
  (0, 3882)	0.027661527336946132
  (0, 4051)	0.03418640916556418
  (0, 4397)	0.033504180984306464
  (0, 4536)	0.018785087954458726
  (0, 4558)	0.04811889269643358
  (0, 4641)	0.011519260799340054
  (0, 4676)	0.027643831375161545
  (0, 4808)	0.04040893638385707
  (0, 4841)	0.019147304112527935
  (0, 4970)	0.02084294156311771
  (0, 4971)	0.022954378880178043
  :	:
  (49971, 21765)	0.03856187058390559
  (49971, 22527)	0.023417517592363383
  (49971, 22530)	0.0421605773645041
  (49971, 22533)	0.033737151627779974
  (49971, 22655)	0.10373928211390186
  

In [59]:
# sampling words from dictionary
# The previous code read the vocabulary from `tfidf`, a name that is never
# defined in this notebook. Collect it instead from the fitted vectorizers
# inside `preprocessor`, in transformer_list order, so that `dictionary[i]`
# is the term behind column i of `train`.
# NOTE(review): the original `tfidf` may have been a single vectorizer
# (headline only or body only) -- confirm which vocabulary was intended.
vocabulary_parts = []
for _, pipe in preprocessor.transformer_list:
    vectorizer = pipe.steps[-1][1]  # last pipeline step is the TfidfVectorizer
    # Newer scikit-learn releases expose get_feature_names_out(); older ones
    # only have get_feature_names().
    if hasattr(vectorizer, 'get_feature_names_out'):
        terms = vectorizer.get_feature_names_out()
    else:
        terms = vectorizer.get_feature_names()
    vocabulary_parts.append(np.asarray(terms))

dictionary = np.concatenate(vocabulary_parts)
# Draw 20 random positions (with replacement) and show the matching terms.
print(dictionary[np.random.randint(0, len(dictionary), size=20)])

['10am' 'tears' 'erase' 'col' 'rolled' 'excommunicated' 'keys' 'minister'
 'spaces' 'glad' 'reel' 'hollow' 'chatting' 'crystal' 'recalls' 'refuses'
 'feels' 'additionally' 'tyrant' 'digs']
