In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


In [7]:
# Import nltk
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from bs4 import BeautifulSoup
import re

# Remove HTML
def remove_html(text):
    """Return the text content of ``text`` with its HTML markup removed.

    ``text`` is parsed by BeautifulSoup with the ``lxml`` parser and the
    result of ``get_text()`` on the parsed document is returned.
    """
    return BeautifulSoup(text, 'lxml').get_text()

# Remove URL using re
def remove_urls(text):
    """Delete every URL from ``text`` and return the remaining string.

    A URL is recognised as an optional ``http``/``https`` scheme, the
    literal ``://`` and every following character up to the next whitespace.

    Parameters
    ----------
    text : str
        Text that may contain URLs.

    Returns
    -------
    str
        ``text`` with each URL replaced by the empty string. Whitespace that
        surrounded a URL is left untouched.
    """
    # Consume up to the next whitespace instead of whitelisting characters:
    # the former whitelist lacked '-', '#', '~', ':' and similar, so URLs such
    # as "http://my-site.com/a-b" were only partly removed, and a trailing '/'
    # was left behind. Those leftovers would later be picked up as tokens.
    return re.sub(r'(?:https?)?://\S+', '', text)

# Remove punctuation and Tokenization
# Shared tokenizer: every run of word characters (\w+) becomes one token, so
# punctuation and whitespace never appear in the token list.
tokenizer = RegexpTokenizer(pattern=r'\w+')

# Remove stop words
def remove_stopwords(text):
    """Return the tokens of ``text`` that are not English stop words.

    Parameters
    ----------
    text : iterable of str
        Tokens to filter. The argument is iterated element by element, so a
        raw string would be filtered one character at a time; pass a list of
        tokens instead.

    Returns
    -------
    list of str
        The tokens of ``text``, in their original order, that do not appear
        in NLTK's English stop-word list.
    """
    # Fetch the stop-word list once per call and keep it in a set. The list
    # used to be requested from NLTK again for every single token and then
    # scanned linearly.
    english_stopwords = set(stopwords.words("english"))
    return [token for token in text if token not in english_stopwords]

# Lemmatizer
lemmatizer = WordNetLemmatizer()


def word_lemmatizer(text):
    """Lemmatize each item of ``text`` and return the results as a list.

    Every item is passed on its own to ``lemmatizer.lemmatize``; the order of
    the items is preserved.
    """
    return list(map(lemmatizer.lemmatize, text))

# Instantiate Stemmer
stemmer = PorterStemmer()


def word_stemmer(text):
    """Stem each item of ``text`` and return the stems as one string.

    The stems are joined with single spaces, so the result is a string
    rather than a list.
    """
    stems = (stemmer.stem(token) for token in text)
    return " ".join(stems)

# Remove punctuation
import string
def remove_punctuation(text):
    """Join the items of ``text`` with single spaces, skipping punctuation.

    An item is skipped when it occurs inside ``string.punctuation``; every
    other item is kept in its original order. The result is a string.
    """
    kept = []
    for item in text:
        if item in string.punctuation:
            continue
        kept.append(item)
    return " ".join(kept)

In [2]:
# Let displayed frames show up to 1000 characters per cell so long text
# columns are not truncated.
pd.options.display.max_colwidth = 1000

In [17]:
# Read the Kaiser Health News file (it has no header row, so columns are
# numbered) and discard the columns at positions 3 through 7.
Kaiser = pd.read_csv(
    'KaiserHealthNews.csv',
    encoding='unicode_escape',
    header=None,
)
Kaiser = Kaiser.drop(columns=Kaiser.columns[3:8])
Kaiser.head()

Unnamed: 0,0,1,2
0,5.86267e+17,Thu Apr 09 20:37:18 +0000 2015,Tougher Vaccine Law In Calif. Clears First Hurdle http://khne.ws/1GQdDro http://pbs.twimg.com/media/CCLWazbUEAAmvjH.jpg
1,5.8626e+17,Thu Apr 09 20:11:29 +0000 2015,A new sort of extracurricular activity: seeing patients. http://khne.ws/1ceRuaZ http://pbs.twimg.com/media/CCLQgiFUIAAC7nJ.jpg
2,5.86239e+17,Thu Apr 09 18:46:24 +0000 2015,Houston firefighters have another tool at their disposal when answering calls: digital doctors http://khne.ws/1NfQjsp http://pbs.twimg.com/media/CCK9CYoUsAE4PP8.jpg
3,5.86182e+17,Thu Apr 09 15:00:25 +0000 2015,Will #Montana expand its #Medicaid program? Some moderates there have revived the idea: http://khne.ws/1Do7UrB http://pbs.twimg.com/media/CCKJT7DUAAABcJj.jpg
4,5.86159e+17,Thu Apr 09 13:30:44 +0000 2015,Rand Paul's campaign first day: abortion and the budget: http://khne.ws/1y8aJwC


In [18]:
# Build Kaiser_df from columns 0, 1 and 2 of Kaiser under descriptive names.
# The frame is constructed here rather than assigned into an already existing
# Kaiser_df, so the cell works on a fresh kernel (no NameError) and re-running
# it always yields the same three columns.
Kaiser_df = pd.DataFrame({
    "ReviewID": Kaiser[0],
    "ReviewTime": Kaiser[1],
    "Review": Kaiser[2],
})
Kaiser_df.head()

Unnamed: 0,ReviewID,ReviewTime,Review
0,5.86267e+17,Thu Apr 09 20:37:18 +0000 2015,Tougher Vaccine Law In Calif. Clears First Hurdle http://khne.ws/1GQdDro http://pbs.twimg.com/media/CCLWazbUEAAmvjH.jpg
1,5.8626e+17,Thu Apr 09 20:11:29 +0000 2015,A new sort of extracurricular activity: seeing patients. http://khne.ws/1ceRuaZ http://pbs.twimg.com/media/CCLQgiFUIAAC7nJ.jpg
2,5.86239e+17,Thu Apr 09 18:46:24 +0000 2015,Houston firefighters have another tool at their disposal when answering calls: digital doctors http://khne.ws/1NfQjsp http://pbs.twimg.com/media/CCK9CYoUsAE4PP8.jpg
3,5.86182e+17,Thu Apr 09 15:00:25 +0000 2015,Will #Montana expand its #Medicaid program? Some moderates there have revived the idea: http://khne.ws/1Do7UrB http://pbs.twimg.com/media/CCKJT7DUAAABcJj.jpg
4,5.86159e+17,Thu Apr 09 13:30:44 +0000 2015,Rand Paul's campaign first day: abortion and the budget: http://khne.ws/1y8aJwC


In [21]:
# Strip URLs from every Review entry (entries are still plain strings here).
Kaiser_df["Review"] = Kaiser_df["Review"].apply(remove_urls)
Kaiser_df["Review"].head()

0                                                 Tougher Vaccine Law In Calif. Clears First Hurdle  
1                                          A new sort of extracurricular activity: seeing patients.  
2    Houston firefighters have another tool at their disposal when answering calls: digital doctors  
3           Will #Montana expand its #Medicaid program? Some moderates there have revived the idea:  
4                                           Rand Paul's campaign first day: abortion and the budget: 
Name: Review, dtype: object

In [22]:
# Lower-case each Review string and split it into word tokens; after this
# cell the column holds lists instead of strings.
Kaiser_df["Review"] = Kaiser_df["Review"].apply(
    lambda review: tokenizer.tokenize(review.lower())
)
Kaiser_df["Review"].head()

0                                                      [tougher, vaccine, law, in, calif, clears, first, hurdle]
1                                                [a, new, sort, of, extracurricular, activity, seeing, patients]
2    [houston, firefighters, have, another, tool, at, their, disposal, when, answering, calls, digital, doctors]
3              [will, montana, expand, its, medicaid, program, some, moderates, there, have, revived, the, idea]
4                                              [rand, paul, s, campaign, first, day, abortion, and, the, budget]
Name: Review, dtype: object

In [23]:
# Filter English stop words out of each token list.
Kaiser_df["Review"] = Kaiser_df["Review"].apply(remove_stopwords)
Kaiser_df["Review"].head()

0                                   [tougher, vaccine, law, calif, clears, first, hurdle]
1                                [new, sort, extracurricular, activity, seeing, patients]
2    [houston, firefighters, another, tool, disposal, answering, calls, digital, doctors]
3                          [montana, expand, medicaid, program, moderates, revived, idea]
4                                    [rand, paul, campaign, first, day, abortion, budget]
Name: Review, dtype: object

In [24]:
# Lemmatize every token of each Review entry.
Kaiser_df["Review"] = Kaiser_df["Review"].apply(word_lemmatizer)
Kaiser_df["Review"].head()

0                                 [tougher, vaccine, law, calif, clear, first, hurdle]
1                              [new, sort, extracurricular, activity, seeing, patient]
2    [houston, firefighter, another, tool, disposal, answering, call, digital, doctor]
3                        [montana, expand, medicaid, program, moderate, revived, idea]
4                                 [rand, paul, campaign, first, day, abortion, budget]
Name: Review, dtype: object

In [16]:
# Stemming is disabled: both statements below are commented out, so the
# Review column is not changed by this cell.
# NOTE(review): any stemmed output displayed for this cell presumably comes
# from a run made before the lines were commented out - confirm or clear it.
#Kaiser_df["Review"] = Kaiser_df["Review"].apply(lambda x: word_stemmer(x))
#Kaiser_df["Review"].head()

0                      tougher vaccin law calif clear first hurdl
1                      new sort extracurricular activ see patient
2    houston firefight anoth tool dispos answer call digit doctor
3                montana expand medicaid program moder reviv idea
4                       rand paul campaign first day abort budget
Name: Review, dtype: object

In [25]:
# Drop punctuation items and join each token list back into a single
# space-separated string.
Kaiser_df["Review"] = Kaiser_df["Review"].apply(remove_punctuation)
Kaiser_df["Review"].head()

0                               tougher vaccine law calif clear first hurdle
1                           new sort extracurricular activity seeing patient
2    houston firefighter another tool disposal answering call digital doctor
3                      montana expand medicaid program moderate revived idea
4                               rand paul campaign first day abortion budget
Name: Review, dtype: object

In [26]:
# Read the Reuters health file (it has no header row, so columns are
# numbered) and discard the columns at positions 3 and 4.
reuters = pd.read_csv("reuters_health.csv", header=None)
reuters = reuters.drop(columns=reuters.columns[3:5])
reuters.head()

Unnamed: 0,0,1,2
0,5.86283e+17,Thu Apr 09 21:40:16 +0000 2015,Los Angeles closes 500 medical marijuana shops
1,5.86279e+17,Thu Apr 09 21:24:27 +0000 2015,U.S. cuts poultry export forecast as deadly bird flu spreads http://reut.rs/1abCAQY
2,5.86273e+17,Thu Apr 09 21:04:15 +0000 2015,Fears over Roundup herbicide residues prompt private testing http://reut.rs/1DpfmCU
3,5.86255e+17,Thu Apr 09 19:49:50 +0000 2015,Liberia watchdog says some Ebola funds unaccounted for http://reut.rs/1Dp1vfF
4,5.86244e+17,Thu Apr 09 19:07:10 +0000 2015,Diabetes devices may interfere with avalanche beacon signals http://reut.rs/1DoTDLl


In [36]:
# Build reuters_df from columns 0, 1 and 2 of reuters under descriptive names.
# The frame is constructed here rather than assigned into an already existing
# reuters_df, so the cell works on a fresh kernel (no NameError) and re-running
# it always yields the same three columns.
reuters_df = pd.DataFrame({
    "ReviewID": reuters[0],
    "ReviewTime": reuters[1],
    "Review": reuters[2],
})
reuters_df.head()

Unnamed: 0,ReviewID,ReviewTime,Review
0,5.86283e+17,Thu Apr 09 21:40:16 +0000 2015,Los Angeles closes 500 medical marijuana shops
1,5.86279e+17,Thu Apr 09 21:24:27 +0000 2015,U.S. cuts poultry export forecast as deadly bird flu spreads http://reut.rs/1abCAQY
2,5.86273e+17,Thu Apr 09 21:04:15 +0000 2015,Fears over Roundup herbicide residues prompt private testing http://reut.rs/1DpfmCU
3,5.86255e+17,Thu Apr 09 19:49:50 +0000 2015,Liberia watchdog says some Ebola funds unaccounted for http://reut.rs/1Dp1vfF
4,5.86244e+17,Thu Apr 09 19:07:10 +0000 2015,Diabetes devices may interfere with avalanche beacon signals http://reut.rs/1DoTDLl


In [30]:
# Strip URLs from every Review entry (entries are still plain strings here).
reuters_df["Review"] = reuters_df["Review"].apply(remove_urls)
reuters_df["Review"].head()

0                   Los Angeles closes 500 medical marijuana shops
1    U.S. cuts poultry export forecast as deadly bird flu spreads 
2    Fears over Roundup herbicide residues prompt private testing 
3          Liberia watchdog says some Ebola funds unaccounted for 
4    Diabetes devices may interfere with avalanche beacon signals 
Name: Review, dtype: object

In [31]:
# Lower-case each Review string and split it into word tokens; after this
# cell the column holds lists instead of strings.
reuters_df["Review"] = reuters_df["Review"].apply(
    lambda review: tokenizer.tokenize(review.lower())
)
reuters_df["Review"].head()

0                     [los, angeles, closes, 500, medical, marijuana, shops]
1    [u, s, cuts, poultry, export, forecast, as, deadly, bird, flu, spreads]
2      [fears, over, roundup, herbicide, residues, prompt, private, testing]
3            [liberia, watchdog, says, some, ebola, funds, unaccounted, for]
4      [diabetes, devices, may, interfere, with, avalanche, beacon, signals]
Name: Review, dtype: object

In [37]:
# Filter English stop words out of each token list.
# Run this only after the tokenization cell: applied to raw strings,
# remove_stopwords iterates over individual characters instead of words.
reuters_df["Review"] = reuters_df["Review"].apply(remove_stopwords)
reuters_df["Review"].head()

0                                                                                        [L,  , A, n, g, e, l, e,  , c, l, e,  , 5, 0, 0,  , e, c, l,  , r, j, u, n,  , h, p]
1       [U, ., S, .,  , c, u,  , p, u, l, r,  , e, x, p, r,  , f, r, e, c,  ,  , e, l,  , b, r,  , f, l, u,  , p, r, e,  , h, p, :, /, /, r, e, u, ., r, /, 1, b, C, A, Q, Y]
2    [F, e, r,  , v, e, r,  , R, u, n, u, p,  , h, e, r, b, c, e,  , r, e, u, e,  , p, r, p,  , p, r, v, e,  , e, n, g,  , h, p, :, /, /, r, e, u, ., r, /, 1, D, p, f, C, U]
3                      [L, b, e, r,  , w, c, h, g,  ,  , e,  , E, b, l,  , f, u, n,  , u, n, c, c, u, n, e,  , f, r,  , h, p, :, /, /, r, e, u, ., r, /, 1, D, p, 1, v, f, F]
4       [D, b, e, e,  , e, v, c, e,  ,  , n, e, r, f, e, r, e,  , w, h,  , v, l, n, c, h, e,  , b, e, c, n,  , g, n, l,  , h, p, :, /, /, r, e, u, ., r, /, 1, D, T, D, L, l]
Name: Review, dtype: object

In [33]:
# Lemmatize every token of each Review entry.
reuters_df["Review"] = reuters_df["Review"].apply(word_lemmatizer)
reuters_df["Review"].head()

0              [los, angeles, close, 500, medical, marijuana, shop]
1    [u, cut, poultry, export, forecast, deadly, bird, flu, spread]
2     [fear, roundup, herbicide, residue, prompt, private, testing]
3                [liberia, watchdog, say, ebola, fund, unaccounted]
4     [diabetes, device, may, interfere, avalanche, beacon, signal]
Name: Review, dtype: object

In [39]:
# Stemming is disabled: both statements below are commented out, so the
# Review column is not changed by this cell.
# NOTE(review): any stemmed output displayed for this cell presumably comes
# from a run made before the lines were commented out - confirm or clear it.
#reuters_df["Review"] = reuters_df["Review"].apply(lambda x: word_stemmer(x))
#reuters_df["Review"].head()

0                 lo angel close 500 medic marijuana shop
1    u cut poultri export forecast deadli bird flu spread
2         fear roundup herbicid residu prompt privat test
3               liberia watchdog say ebola fund unaccount
4        diabet devic may interfer avalanch beacon signal
Name: Review, dtype: object

In [34]:
# Drop punctuation items and join each token list back into a single
# space-separated string.
reuters_df["Review"] = reuters_df["Review"].apply(remove_punctuation)
reuters_df["Review"].head()

0             los angeles close 500 medical marijuana shop
1     u cut poultry export forecast deadly bird flu spread
2    fear roundup herbicide residue prompt private testing
3              liberia watchdog say ebola fund unaccounted
4    diabetes device may interfere avalanche beacon signal
Name: Review, dtype: object

In [35]:
# Write both cleaned frames to CSV files without the row index column.
Kaiser_df.to_csv(path_or_buf="Kaiser_df.csv", index=False)
reuters_df.to_csv(path_or_buf="reuters_df.csv", index=False)