In [32]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from textblob import TextBlob
#stopwords são palavras que não adicionam muito significado a sentença
import nltk
from nltk.corpus import stopwords
nltk.download('punkt')
nltk.download('stopwords')
stop = stopwords.words('english')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\andre\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\andre\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
data = pd.read_csv("data.csv")
data.head()

Unnamed: 0,URLs,Headline,Body,Label
0,http://www.bbc.com/news/world-us-canada-414191...,Four ways Bob Corker skewered Donald Trump,Image copyright Getty Images\nOn Sunday mornin...,1
1,https://www.reuters.com/article/us-filmfestiva...,Linklater's war veteran comedy speaks to moder...,"LONDON (Reuters) - “Last Flag Flying”, a comed...",1
2,https://www.nytimes.com/2017/10/09/us/politics...,Trump’s Fight With Corker Jeopardizes His Legi...,The feud broke into public view last week when...,1
3,https://www.reuters.com/article/us-mexico-oil-...,Egypt's Cheiron wins tie-up with Pemex for Mex...,MEXICO CITY (Reuters) - Egypt’s Cheiron Holdin...,1
4,http://www.cnn.com/videos/cnnmoney/2017/10/08/...,Jason Aldean opens 'SNL' with Vegas tribute,"Country singer Jason Aldean, who was performin...",1


In [3]:
# Número de dados e features
data.shape

(4009, 4)

In [4]:
data['Body'] = data['Body'].astype(str)
data

Unnamed: 0,URLs,Headline,Body,Label
0,http://www.bbc.com/news/world-us-canada-414191...,Four ways Bob Corker skewered Donald Trump,Image copyright Getty Images\nOn Sunday mornin...,1
1,https://www.reuters.com/article/us-filmfestiva...,Linklater's war veteran comedy speaks to moder...,"LONDON (Reuters) - “Last Flag Flying”, a comed...",1
2,https://www.nytimes.com/2017/10/09/us/politics...,Trump’s Fight With Corker Jeopardizes His Legi...,The feud broke into public view last week when...,1
3,https://www.reuters.com/article/us-mexico-oil-...,Egypt's Cheiron wins tie-up with Pemex for Mex...,MEXICO CITY (Reuters) - Egypt’s Cheiron Holdin...,1
4,http://www.cnn.com/videos/cnnmoney/2017/10/08/...,Jason Aldean opens 'SNL' with Vegas tribute,"Country singer Jason Aldean, who was performin...",1
5,http://beforeitsnews.com/sports/2017/09/jetnat...,JetNation FanDuel League; Week 4,JetNation FanDuel League; Week 4\n% of readers...,0
6,https://www.nytimes.com/2017/10/10/us/politics...,Kansas Tried a Tax Plan Similar to Trump’s. It...,"In 2012, Kansas lawmakers, led by Gov. Sam Bro...",1
7,https://www.reuters.com/article/us-india-cenba...,"India RBI chief: growth important, but not at ...",The Reserve Bank of India (RBI) Governor Urjit...,1
8,https://www.reuters.com/article/us-climatechan...,EPA chief to sign rule on Clean Power Plan exi...,"Scott Pruitt, Administrator of the U.S. Enviro...",1
9,https://www.reuters.com/article/us-air-berlin-...,Talks on sale of Air Berlin planes to easyJet ...,FILE PHOTO - An Air Berlin sign is seen at an ...,1


# 1. Brincando com os dados e gerando novas features

In [5]:
# Numero de 'palavras' nos Headlines
data['word_count'] = data['Headline'].apply(lambda x: len(str(x).split(" ")))
data.head()

Unnamed: 0,URLs,Headline,Body,Label,word_count
0,http://www.bbc.com/news/world-us-canada-414191...,Four ways Bob Corker skewered Donald Trump,Image copyright Getty Images\nOn Sunday mornin...,1,7
1,https://www.reuters.com/article/us-filmfestiva...,Linklater's war veteran comedy speaks to moder...,"LONDON (Reuters) - “Last Flag Flying”, a comed...",1,10
2,https://www.nytimes.com/2017/10/09/us/politics...,Trump’s Fight With Corker Jeopardizes His Legi...,The feud broke into public view last week when...,1,8
3,https://www.reuters.com/article/us-mexico-oil-...,Egypt's Cheiron wins tie-up with Pemex for Mex...,MEXICO CITY (Reuters) - Egypt’s Cheiron Holdin...,1,11
4,http://www.cnn.com/videos/cnnmoney/2017/10/08/...,Jason Aldean opens 'SNL' with Vegas tribute,"Country singer Jason Aldean, who was performin...",1,7


In [6]:
# Numero de 'caracteres' nos Headlines
data['char_count'] = data['Headline'].str.len() ## this also includes spaces
data.head()

Unnamed: 0,URLs,Headline,Body,Label,word_count,char_count
0,http://www.bbc.com/news/world-us-canada-414191...,Four ways Bob Corker skewered Donald Trump,Image copyright Getty Images\nOn Sunday mornin...,1,7,42
1,https://www.reuters.com/article/us-filmfestiva...,Linklater's war veteran comedy speaks to moder...,"LONDON (Reuters) - “Last Flag Flying”, a comed...",1,10,66
2,https://www.nytimes.com/2017/10/09/us/politics...,Trump’s Fight With Corker Jeopardizes His Legi...,The feud broke into public view last week when...,1,8,60
3,https://www.reuters.com/article/us-mexico-oil-...,Egypt's Cheiron wins tie-up with Pemex for Mex...,MEXICO CITY (Reuters) - Egypt’s Cheiron Holdin...,1,11,68
4,http://www.cnn.com/videos/cnnmoney/2017/10/08/...,Jason Aldean opens 'SNL' with Vegas tribute,"Country singer Jason Aldean, who was performin...",1,7,43


In [7]:
# Média de palavras nos Headlines
def avg_word(sentence):
  words = sentence.split()
  return (sum(len(word) for word in words)/len(words))

data['avg_word'] = data['Headline'].apply(lambda x: avg_word(x))
data.head()

Unnamed: 0,URLs,Headline,Body,Label,word_count,char_count,avg_word
0,http://www.bbc.com/news/world-us-canada-414191...,Four ways Bob Corker skewered Donald Trump,Image copyright Getty Images\nOn Sunday mornin...,1,7,42,5.142857
1,https://www.reuters.com/article/us-filmfestiva...,Linklater's war veteran comedy speaks to moder...,"LONDON (Reuters) - “Last Flag Flying”, a comed...",1,10,66,5.7
2,https://www.nytimes.com/2017/10/09/us/politics...,Trump’s Fight With Corker Jeopardizes His Legi...,The feud broke into public view last week when...,1,8,60,6.625
3,https://www.reuters.com/article/us-mexico-oil-...,Egypt's Cheiron wins tie-up with Pemex for Mex...,MEXICO CITY (Reuters) - Egypt’s Cheiron Holdin...,1,11,68,5.272727
4,http://www.cnn.com/videos/cnnmoney/2017/10/08/...,Jason Aldean opens 'SNL' with Vegas tribute,"Country singer Jason Aldean, who was performin...",1,7,43,5.285714


In [8]:
# Numero de 'stopwords' nos Headlines
data['stopwords'] = data['Headline'].apply(lambda x: len([x for x in x.split() if x in stop]))
data.head()

Unnamed: 0,URLs,Headline,Body,Label,word_count,char_count,avg_word,stopwords
0,http://www.bbc.com/news/world-us-canada-414191...,Four ways Bob Corker skewered Donald Trump,Image copyright Getty Images\nOn Sunday mornin...,1,7,42,5.142857,0
1,https://www.reuters.com/article/us-filmfestiva...,Linklater's war veteran comedy speaks to moder...,"LONDON (Reuters) - “Last Flag Flying”, a comed...",1,10,66,5.7,1
2,https://www.nytimes.com/2017/10/09/us/politics...,Trump’s Fight With Corker Jeopardizes His Legi...,The feud broke into public view last week when...,1,8,60,6.625,0
3,https://www.reuters.com/article/us-mexico-oil-...,Egypt's Cheiron wins tie-up with Pemex for Mex...,MEXICO CITY (Reuters) - Egypt’s Cheiron Holdin...,1,11,68,5.272727,2
4,http://www.cnn.com/videos/cnnmoney/2017/10/08/...,Jason Aldean opens 'SNL' with Vegas tribute,"Country singer Jason Aldean, who was performin...",1,7,43,5.285714,1


In [9]:
data['stopwordsBody'] = data['Body'].apply(lambda x: len([x for x in str(x).split() if str(x) in stop]))
data.head()

Unnamed: 0,URLs,Headline,Body,Label,word_count,char_count,avg_word,stopwords,stopwordsBody
0,http://www.bbc.com/news/world-us-canada-414191...,Four ways Bob Corker skewered Donald Trump,Image copyright Getty Images\nOn Sunday mornin...,1,7,42,5.142857,0,403
1,https://www.reuters.com/article/us-filmfestiva...,Linklater's war veteran comedy speaks to moder...,"LONDON (Reuters) - “Last Flag Flying”, a comed...",1,10,66,5.7,1,201
2,https://www.nytimes.com/2017/10/09/us/politics...,Trump’s Fight With Corker Jeopardizes His Legi...,The feud broke into public view last week when...,1,8,60,6.625,0,382
3,https://www.reuters.com/article/us-mexico-oil-...,Egypt's Cheiron wins tie-up with Pemex for Mex...,MEXICO CITY (Reuters) - Egypt’s Cheiron Holdin...,1,11,68,5.272727,2,29
4,http://www.cnn.com/videos/cnnmoney/2017/10/08/...,Jason Aldean opens 'SNL' with Vegas tribute,"Country singer Jason Aldean, who was performin...",1,7,43,5.285714,1,13


In [10]:
# Numero de 'hastags' nos Headlines
data['hastags'] = data['Headline'].apply(lambda x: len([x for x in x.split() if x.startswith('#')]))
data[data['hastags'] > 0] # vendo quais noticias possuem hashtags

Unnamed: 0,URLs,Headline,Body,Label,word_count,char_count,avg_word,stopwords,stopwordsBody,hastags
223,http://beforeitsnews.com/sports/2017/10/108-hb...,"10/8: HBD Tom, Donie, Danny, Catfish & Spanky;...",A Potato Battery Can Light up a Room for Over ...,0,12,73,5.166667,0,8,1
245,http://beforeitsnews.com/sports/2017/09/929-fr...,"9/29 From the 50s Forward: HBD Ken, Teke Takes...",Warning Something Big Is About to Happen in Am...,0,20,114,4.75,1,5,1
373,http://beforeitsnews.com/sports/2017/09/around...,"Around the #ACC Blogosphere For September 21, ...","Around the #ACC Blogosphere For September 21, ...",0,8,50,5.375,1,69,1
415,http://beforeitsnews.com/sports/2017/09/around...,"Around the #ACC Blogosphere For September 21, ...","Around the #ACC Blogosphere For September 21, ...",0,8,50,5.375,1,69,1
561,http://beforeitsnews.com/u-s-politics/2017/09/...,NFL Fans Are Setting Fire To Their Jerseys In ...,Warning Something Big Is About to Happen in Am...,0,12,68,4.75,0,5,1
756,http://beforeitsnews.com/sports/2017/09/926-tr...,"9/26 TRS/PNC Era: Frankie #200, HBD Daniel, Sm...","No Getting Around it, The War Is Coming! Trump...",0,11,70,5.454545,0,1,1
1057,http://beforeitsnews.com/sports/2017/09/929-fr...,"9/29 From the 50s Forward: HBD Ken, Teke Takes...",A Potato Battery Can Light up a Room for Over ...,0,20,114,4.75,1,8,1
1095,http://beforeitsnews.com/sports/2017/10/notdon...,#notdoneyet,#notdoneyet\n(Before It's News)\nSource: John ...,0,1,11,11.0,0,163,1
1104,http://beforeitsnews.com/sports/2017/09/929-fr...,"9/29 From the 50s Forward: HBD Ken, Teke Takes...",An Embattled Pharmaceutical Company That Sells...,0,20,114,4.75,1,2,1
1225,http://beforeitsnews.com/sports/2017/10/notdon...,#notdoneyet,#notdoneyet\n(Before It's News)\nSource: John ...,0,1,11,11.0,0,163,1


In [11]:
# Numero de 'numerics' nos Headlines
data['numerics'] = data['Headline'].apply(lambda x: len([x for x in x.split() if x.isdigit()]))
data[data['numerics'] > 0] 

Unnamed: 0,URLs,Headline,Body,Label,word_count,char_count,avg_word,stopwords,stopwordsBody,hastags,numerics
5,http://beforeitsnews.com/sports/2017/09/jetnat...,JetNation FanDuel League; Week 4,JetNation FanDuel League; Week 4\n% of readers...,0,5,32,5.600000,0,39,0,1
11,http://beforeitsnews.com/sports/2017/10/2017-f...,2017 Fantasy Football Team Defense Rankings - ...,2017 Fantasy Football Team Defense Rankings – ...,0,9,52,4.888889,0,7,0,2
13,https://www.reuters.com/article/us-deloitte-cy...,Deloitte cyber attack affected up to 350 clien...,FILE PHOTO: The Deloitte Company logo is seen ...,1,9,58,5.555556,2,34,0,1
30,http://beforeitsnews.com/sports/2017/10/cycleb...,CycleBar Westgate West Hosts Benefit Cycle Rid...,CycleBar Westgate West Hosts Benefit Cycle Rid...,0,16,99,5.250000,2,118,0,1
38,http://beforeitsnews.com/sports/2017/10/101-ex...,10/1 Expo Park-Forbes Field Era: Deacon Whips ...,A Potato Battery Can Light up a Room for Over ...,0,20,106,4.350000,0,8,0,1
41,http://beforeitsnews.com/sports/2017/09/2017-f...,2017 Fantasy Football Kicker Rankings - Week 4,Warning Something Big Is About to Happen in Am...,0,8,46,4.875000,0,5,0,2
43,http://beforeitsnews.com/sports/2017/10/2017-f...,2017 Fantasy Football Running Back Rankings - ...,2017 Fantasy Football Running Back Rankings – ...,0,9,52,4.888889,0,7,0,2
55,http://dailybuzzlive.com/2000-simpsons-predict...,In 2000 The Simpsons Predicted The Donald Trum...,I came across this video and I have to say I’m...,0,9,58,5.555556,0,147,0,1
73,http://www.cnn.com/2017/10/09/asia/rohingya-my...,12 Rohingya refugees killed after boat capsize...,"Story highlights At least 519,000 Rohingya ref...",1,11,78,6.181818,3,48,0,1
85,http://beforeitsnews.com/sports/2017/10/gateke...,Gatekeeper Invades The 312 Podcastto Preview t...,Gatekeeper Invades The 312 Podcastto Preview t...,0,9,66,6.444444,1,21,0,1


In [12]:
# Numero de uppers nos Headlines
data['upper'] = data['Headline'].apply(lambda x: len([x for x in x.split() if x.isupper()]))
data.head()

Unnamed: 0,URLs,Headline,Body,Label,word_count,char_count,avg_word,stopwords,stopwordsBody,hastags,numerics,upper
0,http://www.bbc.com/news/world-us-canada-414191...,Four ways Bob Corker skewered Donald Trump,Image copyright Getty Images\nOn Sunday mornin...,1,7,42,5.142857,0,403,0,0,0
1,https://www.reuters.com/article/us-filmfestiva...,Linklater's war veteran comedy speaks to moder...,"LONDON (Reuters) - “Last Flag Flying”, a comed...",1,10,66,5.7,1,201,0,0,0
2,https://www.nytimes.com/2017/10/09/us/politics...,Trump’s Fight With Corker Jeopardizes His Legi...,The feud broke into public view last week when...,1,8,60,6.625,0,382,0,0,0
3,https://www.reuters.com/article/us-mexico-oil-...,Egypt's Cheiron wins tie-up with Pemex for Mex...,MEXICO CITY (Reuters) - Egypt’s Cheiron Holdin...,1,11,68,5.272727,2,29,0,0,0
4,http://www.cnn.com/videos/cnnmoney/2017/10/08/...,Jason Aldean opens 'SNL' with Vegas tribute,"Country singer Jason Aldean, who was performin...",1,7,43,5.285714,1,13,0,0,1


In [13]:
# 10 Palavras mais frequentes
freqHeadline = pd.Series(' '.join(data['Headline']).split()).value_counts()[:10]
freqBody = pd.Series(' '.join(data['Body']).split()).value_counts()[:10]
rareHead = pd.Series(' '.join(data['Headline']).split()).value_counts()[-10:]
rareBody = pd.Series(' '.join(data['Body']).split()).value_counts()[-10:]


In [14]:
freqHeadline

to       718
in       510
the      423
of       407
The      378
for      328
Trump    294
on       281
-        262
and      257
dtype: int64

In [15]:
freqBody

the     96245
to      50454
of      44772
and     44364
a       41501
in      35620
that    18821
is      17770
for     17750
on      17692
dtype: int64

In [16]:
rareHead

surrender    1
Hargan       1
snake        1
Drones:      1
tech,        1
albums       1
Interpol     1
Scout        1
grass        1
Agree?       1
dtype: int64

In [17]:
rareBody

coughs,                 1
Ukraine”                1
swapped                 1
“Marijuana              1
diagnoses,              1
turnover)               1
Counter-demonstrator    1
skiing,                 1
tastes.                 1
Hwasong-14,             1
dtype: int64

# 2. Pré-processamento básico do dado

In [18]:
# Deixar tudo em lower case
data['Headline'] = data['Headline'].apply(lambda x: " ".join(x.lower() for x in x.split()))
data['Body'] = data['Body'].apply(lambda x: " ".join(x.lower() for x in str(x).split()))
data.head()

Unnamed: 0,URLs,Headline,Body,Label,word_count,char_count,avg_word,stopwords,stopwordsBody,hastags,numerics,upper
0,http://www.bbc.com/news/world-us-canada-414191...,four ways bob corker skewered donald trump,image copyright getty images on sunday morning...,1,7,42,5.142857,0,403,0,0,0
1,https://www.reuters.com/article/us-filmfestiva...,linklater's war veteran comedy speaks to moder...,"london (reuters) - “last flag flying”, a comed...",1,10,66,5.7,1,201,0,0,0
2,https://www.nytimes.com/2017/10/09/us/politics...,trump’s fight with corker jeopardizes his legi...,the feud broke into public view last week when...,1,8,60,6.625,0,382,0,0,0
3,https://www.reuters.com/article/us-mexico-oil-...,egypt's cheiron wins tie-up with pemex for mex...,mexico city (reuters) - egypt’s cheiron holdin...,1,11,68,5.272727,2,29,0,0,0
4,http://www.cnn.com/videos/cnnmoney/2017/10/08/...,jason aldean opens 'snl' with vegas tribute,"country singer jason aldean, who was performin...",1,7,43,5.285714,1,13,0,0,1


In [19]:
# Removendo pontuação
data['Headline'] = data['Headline'].str.replace('[^\w\s]','')
data['Body'] = data['Body'].str.replace('[^\w\s]','')
data.head()

Unnamed: 0,URLs,Headline,Body,Label,word_count,char_count,avg_word,stopwords,stopwordsBody,hastags,numerics,upper
0,http://www.bbc.com/news/world-us-canada-414191...,four ways bob corker skewered donald trump,image copyright getty images on sunday morning...,1,7,42,5.142857,0,403,0,0,0
1,https://www.reuters.com/article/us-filmfestiva...,linklaters war veteran comedy speaks to modern...,london reuters last flag flying a comedydrama...,1,10,66,5.7,1,201,0,0,0
2,https://www.nytimes.com/2017/10/09/us/politics...,trumps fight with corker jeopardizes his legis...,the feud broke into public view last week when...,1,8,60,6.625,0,382,0,0,0
3,https://www.reuters.com/article/us-mexico-oil-...,egypts cheiron wins tieup with pemex for mexic...,mexico city reuters egypts cheiron holdings l...,1,11,68,5.272727,2,29,0,0,0
4,http://www.cnn.com/videos/cnnmoney/2017/10/08/...,jason aldean opens snl with vegas tribute,country singer jason aldean who was performing...,1,7,43,5.285714,1,13,0,0,1


In [20]:
data['Headline'] = data['Headline'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))
data['Headline'] = data['Headline'].apply(lambda x: " ".join(x for x in str(x).split() if str(x) not in stop))
data.head()

Unnamed: 0,URLs,Headline,Body,Label,word_count,char_count,avg_word,stopwords,stopwordsBody,hastags,numerics,upper
0,http://www.bbc.com/news/world-us-canada-414191...,four ways bob corker skewered donald trump,image copyright getty images on sunday morning...,1,7,42,5.142857,0,403,0,0,0
1,https://www.reuters.com/article/us-filmfestiva...,linklaters war veteran comedy speaks modern am...,london reuters last flag flying a comedydrama...,1,10,66,5.7,1,201,0,0,0
2,https://www.nytimes.com/2017/10/09/us/politics...,trumps fight corker jeopardizes legislative ag...,the feud broke into public view last week when...,1,8,60,6.625,0,382,0,0,0
3,https://www.reuters.com/article/us-mexico-oil-...,egypts cheiron wins tieup pemex mexican onshor...,mexico city reuters egypts cheiron holdings l...,1,11,68,5.272727,2,29,0,0,0
4,http://www.cnn.com/videos/cnnmoney/2017/10/08/...,jason aldean opens snl vegas tribute,country singer jason aldean who was performing...,1,7,43,5.285714,1,13,0,0,1


In [21]:
# 10 Palavras mais frequentes - Atualizando
freqHeadline = pd.Series(' '.join(data['Headline']).split()).value_counts()[:10]
freqBody = pd.Series(' '.join(data['Body']).split()).value_counts()[:10]
rareHead = pd.Series(' '.join(data['Headline']).split()).value_counts()[-10:]
rareBody = pd.Series(' '.join(data['Body']).split()).value_counts()[-10:]

In [22]:
# Remoção das palavras raras e frequentes
freqHeadline = list(freqHeadline.index)
freqBody = list(freqBody.index)
data['Headline'] = data['Headline'].apply(lambda x: " ".join(x for x in x.split() if x not in freqHeadline))
data['Body'] = data['Body'].apply(lambda x: " ".join(x for x in x.split() if x not in freqBody))
data['Headline'] = data['Headline'].apply(lambda x: " ".join(x for x in x.split() if x not in rareHead))
data['Body'] = data['Body'].apply(lambda x: " ".join(x for x in x.split() if x not in rareBody))
data.head()

Unnamed: 0,URLs,Headline,Body,Label,word_count,char_count,avg_word,stopwords,stopwordsBody,hastags,numerics,upper
0,http://www.bbc.com/news/world-us-canada-414191...,four ways bob corker skewered donald,image copyright getty images sunday morning do...,1,7,42,5.142857,0,403,0,0,0
1,https://www.reuters.com/article/us-filmfestiva...,linklaters war veteran comedy speaks modern am...,london reuters last flag flying comedydrama ab...,1,10,66,5.7,1,201,0,0,0
2,https://www.nytimes.com/2017/10/09/us/politics...,trumps fight corker jeopardizes legislative ag...,feud broke into public view last week when mr ...,1,8,60,6.625,0,382,0,0,0
3,https://www.reuters.com/article/us-mexico-oil-...,egypts cheiron wins tieup pemex mexican onshor...,mexico city reuters egypts cheiron holdings li...,1,11,68,5.272727,2,29,0,0,0
4,http://www.cnn.com/videos/cnnmoney/2017/10/08/...,jason aldean opens snl tribute,country singer jason aldean who was performing...,1,7,43,5.285714,1,13,0,0,1


In [34]:
# Correção ortografica dos dados
data['Headline'] = data['Headline'].apply(lambda x: str(TextBlob(x).correct()))
data['Body'] = data['Body'].apply(lambda x: str(TextBlob(x).correct()))
data.head()

KeyboardInterrupt: 

In [35]:
# Tokenize os dados
TextBlob(data['Headline'][1]).words

WordList(['linklaters', 'war', 'veteran', 'comedy', 'speaks', 'modern', 'america', 'star'])

In [41]:
# Stemming refers to the removal of suffices, like “ing”, “ly”, “s”, etc
#from nltk.stem import PorterStemmer
#st = PorterStemmer()
#data['Headline'] = data['Headline'].apply(lambda x: " ".join([st.stem(word) for word in x.split()]))
#data['Body'] = data['Body'].apply(lambda x: " ".join([st.stem(word) for word in x.split()]))
#data.head()

In [44]:
# Lemmatization is a more effective option than stemming because it converts the word into its root word, 
# rather than just stripping the suffices. 
# It makes use of the vocabulary and does a morphological analysis to obtain the root word

from textblob import Word
import nltk
nltk.download('wordnet')
data['Headline'] = data['Headline'].apply(lambda x: " ".join([Word(word).lemmatize() for word in x.split()]))
data['Body'] = data['Body'].apply(lambda x: " ".join([Word(word).lemmatize() for word in x.split()]))
data.head()

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\andre\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\wordnet.zip.


Unnamed: 0,URLs,Headline,Body,Label,word_count,char_count,avg_word,stopwords,stopwordsBody,hastags,numerics,upper
0,http://www.bbc.com/news/world-us-canada-414191...,four way bob corker skewer donald,imag copyright getti imag sunday morn donald t...,1,7,42,5.142857,0,403,0,0,0
1,https://www.reuters.com/article/us-filmfestiva...,linklat war veteran comedi speak modern americ...,london reuter last flag fli comedydrama about ...,1,10,66,5.7,1,201,0,0,0
2,https://www.nytimes.com/2017/10/09/us/politics...,trump fight corker jeopard legisl agenda,feud broke into public view last week when mr ...,1,8,60,6.625,0,382,0,0,0
3,https://www.reuters.com/article/us-mexico-oil-...,egypt cheiron win tieup pemex mexican onshor o...,mexico citi reuter egypt cheiron hold limit wo...,1,11,68,5.272727,2,29,0,0,0
4,http://www.cnn.com/videos/cnnmoney/2017/10/08/...,jason aldean open snl tribut,countri singer jason aldean who wa perform a l...,1,7,43,5.285714,1,13,0,0,1


# Técnicas avançadas de processamento