In [1]:
import pandas as pd
import spacy
import re

In [2]:
# Load the small English spaCy pipeline with the NER and parser components
# disabled; the cleaning step further down only reads lemmas and stop-word flags.
nlp = spacy.load(
    'en_core_web_sm',
    disable=['ner', 'parser'],
)

In [3]:
# Read the raw article bodies; the preview below shows the columns
# 'Body ID' and 'articleBody'.
df1 = pd.read_csv('train_bodies.csv')

In [4]:
# Preview the first rows of the raw article bodies.
df1.head()

Unnamed: 0,Body ID,articleBody
0,0,A small meteorite crashed into a wooded area i...
1,4,Last week we hinted at what was to come as Ebo...
2,5,(NEWSER) – Wonder how long a Quarter Pounder w...
3,6,"Posting photos of a gun-toting child online, I..."
4,7,At least 25 suspected Boko Haram insurgents we...


In [5]:
# Run the spaCy pipeline over every article body. nlp.pipe streams the texts
# through the pipeline in batches, which is faster than one nlp() call per row
# and yields the same Doc objects in the same order.
doc_token_list = list(nlp.pipe(df1['articleBody']))
# Show the first processed document as a sanity check.
doc_token_list[0]

A small meteorite crashed into a wooded area in Nicaragua's capital of Managua overnight, the government said Sunday. Residents reported hearing a mysterious boom that left a 16-foot deep crater near the city's airport, the Associated Press reports. 

Government spokeswoman Rosario Murillo said a committee formed by the government to study the event determined it was a "relatively small" meteorite that "appears to have come off an asteroid that was passing close to Earth." House-sized asteroid 2014 RC, which measured 60 feet in diameter, skimmed the Earth this weekend, ABC News reports. 
Murillo said Nicaragua will ask international experts to help local scientists in understanding what happened.

The crater left by the meteorite had a radius of 39 feet and a depth of 16 feet,  said Humberto Saballos, a volcanologist with the Nicaraguan Institute of Territorial Studies who was on the committee. He said it is still not clear if the meteorite disintegrated or was buried.

Humberto Garcia

In [6]:
# Raw text of the article at row label 0; the repr exposes the embedded
# '\r\n' line breaks that the Doc display above renders as blank lines.
df1.loc[0, 'articleBody']

'A small meteorite crashed into a wooded area in Nicaragua\'s capital of Managua overnight, the government said Sunday. Residents reported hearing a mysterious boom that left a 16-foot deep crater near the city\'s airport, the Associated Press reports. \r\n\r\nGovernment spokeswoman Rosario Murillo said a committee formed by the government to study the event determined it was a "relatively small" meteorite that "appears to have come off an asteroid that was passing close to Earth." House-sized asteroid 2014 RC, which measured 60 feet in diameter, skimmed the Earth this weekend, ABC News reports. \r\nMurillo said Nicaragua will ask international experts to help local scientists in understanding what happened.\r\n\r\nThe crater left by the meteorite had a radius of 39 feet and a depth of 16 feet,  said Humberto Saballos, a volcanologist with the Nicaraguan Institute of Territorial Studies who was on the committee. He said it is still not clear if the meteorite disintegrated or was buried

In [7]:
def cleaning(doc, min_tokens=2):
    """Lemmatise a tokenised document and drop its stop words.

    Parameters
    ----------
    doc : iterable
        Iterable of tokens exposing ``lemma_`` and ``is_stop`` attributes
        (for example a spaCy ``Doc``).
    min_tokens : int, default 2
        Minimum number of non-stop-word lemmas a document must retain to be
        kept. The default reproduces the original hard-coded threshold.

    Returns
    -------
    str or None
        The remaining lemmas joined by single spaces, or ``None`` when fewer
        than ``min_tokens`` lemmas are left. Callers that put the result into
        a DataFrame column should expect missing values for such documents.
    """
    lemmas = [token.lemma_ for token in doc if not token.is_stop]
    if len(lemmas) < min_tokens:
        # Made explicit: the original fell off the end of the function here.
        return None
    return ' '.join(lemmas)

In [8]:
# Replace every run of characters other than ASCII letters and apostrophes
# with a single space, then lower-case the text.
# Built as a list rather than a one-shot generator: a generator would be
# exhausted after its first pass through nlp.pipe, so re-running the consuming
# cell on its own would silently produce an empty result.
brief_cleaning = [re.sub("[^A-Za-z']+", ' ', str(row)).lower() for row in df1['articleBody']]


In [9]:
# Stream the pre-cleaned bodies through spaCy and lemmatise each resulting Doc.
cleaned_txt = list(map(cleaning, nlp.pipe(brief_cleaning)))

In [10]:
# Inspect the lemmatised, stop-word-free version of the first article body.
cleaned_txt[0]

'small meteorite crash wooded area nicaragua capital managua overnight government say sunday resident report hear mysterious boom leave foot deep crater near city airport associated press report government spokeswoman rosario murillo say committee form government study event determine relatively small meteorite appear come asteroid pass close earth house size asteroid rc measure foot diameter skim earth weekend abc news reports murillo say nicaragua ask international expert help local scientist understanding happen crater leave meteorite radius foot depth foot say humberto saballo volcanologist nicaraguan institute territorial study committee say clear meteorite disintegrate bury humberto garcia astronomy center national autonomous university nicaragua say meteorite relate asteroid forecast pass planet saturday night study ice rock say wilfrie strauch adviser institute territorial study say strange report streak light ask photo local resident report hear loud boom saturday night say st

In [11]:
# Pair each 'Body ID' from the raw frame with its cleaned article text.
df1_cleaned = pd.DataFrame({
    'Body ID': df1['Body ID'],
    'articleBody_cleaned': cleaned_txt,
})
df1_cleaned.head()

Unnamed: 0,Body ID,articleBody_cleaned
0,0,small meteorite crash wooded area nicaragua ca...
1,4,week hint come ebola fear spread america today...
2,5,newser wonder long quarter pounder cheese au...
3,6,post photo gun tote child online isis supporte...
4,7,suspect boko haram insurgent kill clash soldie...


In [12]:
# Keep only runs of two or more word characters and re-join them with spaces;
# this drops single-character tokens and stray apostrophes.
# The pattern is a raw string: '\w' in a normal literal is an invalid escape
# sequence and triggers a warning on current Python versions.
df1_cleaned['articleBody_cleaned'] = df1_cleaned['articleBody_cleaned'].str.findall(r'\w{2,}').str.join(' ')
df1_cleaned.head()

Unnamed: 0,Body ID,articleBody_cleaned
0,0,small meteorite crash wooded area nicaragua ca...
1,4,week hint come ebola fear spread america today...
2,5,newser wonder long quarter pounder cheese aust...
3,6,post photo gun tote child online isis supporte...
4,7,suspect boko haram insurgent kill clash soldie...


In [13]:
# Write the cleaned article bodies to CSV, omitting the DataFrame index.
df1_cleaned.to_csv('lemma_train_bodies.csv', index=False)

In [17]:
# Read the headline/stance pairs; the preview below shows the columns
# 'Headline', 'Body ID' and 'Stance'.
df2 = pd.read_csv('train_stances.csv')
df2.head()

Unnamed: 0,Headline,Body ID,Stance
0,Police find mass graves with at least '15 bodi...,712,unrelated
1,Hundreds of Palestinians flee floods in Gaza a...,158,agree
2,"Christian Bale passes on role of Steve Jobs, a...",137,unrelated
3,HBO and Apple in Talks for $15/Month Apple TV ...,1034,unrelated
4,Spider burrowed through tourist's stomach and ...,1923,disagree


In [15]:
# Run the spaCy pipeline over every headline. nlp.pipe processes the texts in
# batches, which is faster than one nlp() call per row and yields the same Doc
# objects in the same order.
doc_token_list2 = list(nlp.pipe(df2['Headline']))

In [16]:
# Show the first twenty processed headlines.
doc_token_list2[:20]

[Police find mass graves with at least '15 bodies' near Mexico town where 43 students disappeared after police clash,
 Hundreds of Palestinians flee floods in Gaza as Israel opens dams,
 Christian Bale passes on role of Steve Jobs, actor reportedly felt he wasn't right for part,
 HBO and Apple in Talks for $15/Month Apple TV Streaming Service Launching in April,
 Spider burrowed through tourist's stomach and up into his chest,
 'Nasa Confirms Earth Will Experience 6 Days of Total Darkness in December' Fake News Story Goes Viral,
 Accused Boston Marathon Bomber Severely Injured In Prison, May Never Walk Or Talk Again,
 Identity of ISIS terrorist known as 'Jihadi John' reportedly revealed,
 Banksy 'Arrested & Real Identity Revealed' Is The Same Hoax From Last Year,
 British Aid Worker Confirmed Murdered By ISIS,
 Gateway Pundit,
 Woman detained in Lebanon is not al-Baghdadi's wife, Iraq says,
 Kidnapped Nigerian schoolgirls: Government claims ceasefire deal with Boko Haram that will brin

In [18]:
# Replace every run of characters other than ASCII letters and apostrophes
# with a single space, then lower-case the headline.
# Built as a list rather than a one-shot generator: a generator would be
# exhausted after its first pass through nlp.pipe, so re-running the consuming
# cell on its own would silently produce an empty result.
brief_cleaning2 = [re.sub("[^A-Za-z']+", ' ', str(row)).lower() for row in df2['Headline']]

In [19]:
# Lemmatise the pre-cleaned headlines. A large batch size is used for these
# short texts.
# The former n_threads=-1 argument was removed: spaCy deprecated it in v2.1
# and Language.pipe no longer accepts it in v3.
cleaned_txt2 = [cleaning(doc) for doc in nlp.pipe(brief_cleaning2, batch_size=5000)]

In [20]:
# Inspect the first ten cleaned headlines.
cleaned_txt2[:10]

["police find mass grave ' body ' near mexico town student disappear police clash",
 'hundred palestinians flee flood gaza israel open dam',
 'christian bale pass role steve jobs actor reportedly feel right',
 'hbo apple talk month apple tv streaming service launch april',
 'spider burrow tourist stomach chest',
 "' nasa confirm earth experience day total darkness december ' fake news story go viral",
 'accuse boston marathon bomber severely injure prison walk talk',
 "identity isis terrorist know ' jihadi john ' reportedly reveal",
 "banksy ' arrest real identity reveal ' hoax year",
 'british aid worker confirm murder isis']

In [21]:
# Assemble the cleaned stance table: lemmatised headline text alongside the
# 'Body ID' and 'Stance' columns copied unchanged from df2.
df2_cleaned = pd.DataFrame({
    'Headline_cleaned': cleaned_txt2,
    'Body ID': df2['Body ID'],
    'Stance': df2['Stance'],
})
df2_cleaned.head()

Unnamed: 0,Headline_cleaned,Body ID,Stance
0,police find mass grave ' body ' near mexico to...,712,unrelated
1,hundred palestinians flee flood gaza israel op...,158,agree
2,christian bale pass role steve jobs actor repo...,137,unrelated
3,hbo apple talk month apple tv streaming servic...,1034,unrelated
4,spider burrow tourist stomach chest,1923,disagree


In [None]:
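# Disabled: optional export of the cleaned headlines as they stand before the
# short-token filter applied in the next cell.
# df2_cleaned.to_csv('cleaned_train_stances3.csv', index=False)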

In [23]:
# Keep only runs of two or more word characters and re-join them with spaces;
# this removes the stray apostrophes and single-character tokens visible in
# the cleaned headlines above.
# The pattern is a raw string: '\w' in a normal literal is an invalid escape
# sequence and triggers a warning on current Python versions.
df2_cleaned['Headline_cleaned'] = df2_cleaned['Headline_cleaned'].str.findall(r'\w{2,}').str.join(' ')

In [24]:
# Preview the headlines after the short-token filter.
df2_cleaned.head()

Unnamed: 0,Headline_cleaned,Body ID,Stance
0,police find mass grave body near mexico town s...,712,unrelated
1,hundred palestinians flee flood gaza israel op...,158,agree
2,christian bale pass role steve jobs actor repo...,137,unrelated
3,hbo apple talk month apple tv streaming servic...,1034,unrelated
4,spider burrow tourist stomach chest,1923,disagree


In [25]:
# NOTE(review): this repeats the filter already applied in cell In [23]. The
# operation is idempotent, so the cell is redundant and can be deleted.
# The pattern is a raw string: '\w' in a normal literal is an invalid escape
# sequence and triggers a warning on current Python versions.
df2_cleaned['Headline_cleaned'] = df2_cleaned['Headline_cleaned'].str.findall(r'\w{2,}').str.join(' ')

In [26]:
# Preview the headlines again; the output matches the previous preview.
df2_cleaned.head()

Unnamed: 0,Headline_cleaned,Body ID,Stance
0,police find mass grave body near mexico town s...,712,unrelated
1,hundred palestinians flee flood gaza israel op...,158,agree
2,christian bale pass role steve jobs actor repo...,137,unrelated
3,hbo apple talk month apple tv streaming servic...,1034,unrelated
4,spider burrow tourist stomach chest,1923,disagree


In [28]:
# Write the cleaned headlines with their 'Body ID' and 'Stance' to CSV,
# omitting the DataFrame index.
df2_cleaned.to_csv('lemma_train_stances.csv', index=False)