# Step 2: Clean the Data

**Project Description**: This project aims to analyze which words in news headlines generate the most engagement. Headlines are from the r/news subreddit. 

In [23]:
import pandas as pd
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
nltk.download('omw-1.4') #Download OpenMultilingualWordnet
wnl = nltk.WordNetLemmatizer()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\reyni\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\reyni\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Before we can proceed with our analysis

In [24]:
titles = pd.read_csv('titles.csv').drop(['Unnamed: 0'], axis=1) # read from csv and drop extra index column
titles

Unnamed: 0,title,score,num_comments,date
0,U-Haul to Implement Nicotine-Free Hiring Polic...,134,518,2020-01-02 18:01:47+00:00
1,FDA issues ban on some flavored vaping products,28,43,2020-01-02 18:29:42+00:00
2,Family kicked off plane because daughter threw...,30,114,2020-01-02 18:54:03+00:00
3,Kansas police apologize for faked story of exp...,745,298,2020-01-02 19:02:08+00:00
4,Lebanese lawyers want Ghosn prosecuted over Is...,2,2,2020-01-02 19:03:04+00:00
...,...,...,...,...
9366,Louisville police move to fire officers in Bre...,2,1,2020-12-30 05:44:28+00:00
9367,A Black jazz musician's son was falsely accuse...,2,39,2020-12-30 17:44:14+00:00
9368,Apple and TikTok remove app used to arrange pa...,2,28,2020-12-30 19:32:36+00:00
9369,Most prolific serial killer in US history dies...,2,4,2020-12-31 01:35:25+00:00


In [25]:
def cleaner(document):
    document = document.lower() #To lower case
    document = re.sub(r'<[^>]*>', ' ', document) #Remove HTML
    document = re.sub(r'[^\w\s]','', document) #Remove non-alphanumeric characters
    document =  re.sub(r'[^a-zA-Z ]''+','',document) #remove numbers
    return document

titles['title'] = titles['title'].apply(cleaner)

In [26]:
titles['split t'] = titles['title'].str.split()
titles

Unnamed: 0,title,score,num_comments,date,split t
0,uhaul to implement nicotinefree hiring policy ...,134,518,2020-01-02 18:01:47+00:00,"[uhaul, to, implement, nicotinefree, hiring, p..."
1,fda issues ban on some flavored vaping products,28,43,2020-01-02 18:29:42+00:00,"[fda, issues, ban, on, some, flavored, vaping,..."
2,family kicked off plane because daughter threw...,30,114,2020-01-02 18:54:03+00:00,"[family, kicked, off, plane, because, daughter..."
3,kansas police apologize for faked story of exp...,745,298,2020-01-02 19:02:08+00:00,"[kansas, police, apologize, for, faked, story,..."
4,lebanese lawyers want ghosn prosecuted over is...,2,2,2020-01-02 19:03:04+00:00,"[lebanese, lawyers, want, ghosn, prosecuted, o..."
...,...,...,...,...,...
9366,louisville police move to fire officers in bre...,2,1,2020-12-30 05:44:28+00:00,"[louisville, police, move, to, fire, officers,..."
9367,a black jazz musicians son was falsely accused...,2,39,2020-12-30 17:44:14+00:00,"[a, black, jazz, musicians, son, was, falsely,..."
9368,apple and tiktok remove app used to arrange pa...,2,28,2020-12-30 19:32:36+00:00,"[apple, and, tiktok, remove, app, used, to, ar..."
9369,most prolific serial killer in us history dies...,2,4,2020-12-31 01:35:25+00:00,"[most, prolific, serial, killer, in, us, histo..."


In [27]:
titles['tokenized_title'] = titles.apply(lambda row: nltk.word_tokenize(row['title']), axis=1)
i = titles['tokenized_title']
i

0       [uhaul, to, implement, nicotinefree, hiring, p...
1       [fda, issues, ban, on, some, flavored, vaping,...
2       [family, kicked, off, plane, because, daughter...
3       [kansas, police, apologize, for, faked, story,...
4       [lebanese, lawyers, want, ghosn, prosecuted, o...
                              ...                        
9366    [louisville, police, move, to, fire, officers,...
9367    [a, black, jazz, musicians, son, was, falsely,...
9368    [apple, and, tiktok, remove, app, used, to, ar...
9369    [most, prolific, serial, killer, in, us, histo...
9370    [employees, infected, in, covid, outbreak, at,...
Name: tokenized_title, Length: 9371, dtype: object

In [28]:
f = titles['title'][87]
f

'we are not safe indias muslims tell of wave of police brutality'

In [29]:
g = nltk.tokenize.word_tokenize(f)
g

['we',
 'are',
 'not',
 'safe',
 'indias',
 'muslims',
 'tell',
 'of',
 'wave',
 'of',
 'police',
 'brutality']

In [30]:
stop = stopwords.words('english')
h = [i for i in g if i not in stop]
h

['safe', 'indias', 'muslims', 'tell', 'wave', 'police', 'brutality']

In [31]:
k = [nltk.WordNetLemmatizer().lemmatize(i) for i in h]
k

['safe', 'india', 'muslim', 'tell', 'wave', 'police', 'brutality']

In [49]:
wnl = nltk.WordNetLemmatizer()
titles['lemma_titles'] = titles['title'].apply(lambda word: wnl.lemmatize(word, pos='v'))
titles['lemma_titles'][546]

'beer and bagels please new york rats evolve to mirror human habits'

In [50]:
titles['title'][546]

'beer and bagels please new york rats evolve to mirror human habits'

In [55]:
stop = stopwords.words('english')
titles['title_stop'] = titles['lemma_titles'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))
titles['title_stop'][546]

'beer bagels please new york rats evolve mirror human habits'

In [56]:
titles

Unnamed: 0,title,score,num_comments,date,split t,tokenized_title,lemma_titles,title_stop
0,uhaul to implement nicotinefree hiring policy ...,134,518,2020-01-02 18:01:47+00:00,"[uhaul, to, implement, nicotinefree, hiring, p...","[uhaul, to, implement, nicotinefree, hiring, p...",uhaul to implement nicotinefree hiring policy ...,uhaul implement nicotinefree hiring policy states
1,fda issues ban on some flavored vaping products,28,43,2020-01-02 18:29:42+00:00,"[fda, issues, ban, on, some, flavored, vaping,...","[fda, issues, ban, on, some, flavored, vaping,...",fda issues ban on some flavored vaping products,fda issues ban flavored vaping products
2,family kicked off plane because daughter threw...,30,114,2020-01-02 18:54:03+00:00,"[family, kicked, off, plane, because, daughter...","[family, kicked, off, plane, because, daughter...",family kicked off plane because daughter threw...,family kicked plane daughter threw take forced...
3,kansas police apologize for faked story of exp...,745,298,2020-01-02 19:02:08+00:00,"[kansas, police, apologize, for, faked, story,...","[kansas, police, apologize, for, faked, story,...",kansas police apologize for faked story of exp...,kansas police apologize faked story expletive cup
4,lebanese lawyers want ghosn prosecuted over is...,2,2,2020-01-02 19:03:04+00:00,"[lebanese, lawyers, want, ghosn, prosecuted, o...","[lebanese, lawyers, want, ghosn, prosecuted, o...",lebanese lawyers want ghosn prosecuted over is...,lebanese lawyers want ghosn prosecuted israel ...
...,...,...,...,...,...,...,...,...
9366,louisville police move to fire officers in bre...,2,1,2020-12-30 05:44:28+00:00,"[louisville, police, move, to, fire, officers,...","[louisville, police, move, to, fire, officers,...",louisville police move to fire officers in bre...,louisville police move fire officers breonna t...
9367,a black jazz musicians son was falsely accused...,2,39,2020-12-30 17:44:14+00:00,"[a, black, jazz, musicians, son, was, falsely,...","[a, black, jazz, musicians, son, was, falsely,...",a black jazz musicians son was falsely accused...,black jazz musicians son falsely accused steal...
9368,apple and tiktok remove app used to arrange pa...,2,28,2020-12-30 19:32:36+00:00,"[apple, and, tiktok, remove, app, used, to, ar...","[apple, and, tiktok, remove, app, used, to, ar...",apple and tiktok remove app used to arrange pa...,apple tiktok remove app used arrange parties c...
9369,most prolific serial killer in us history dies...,2,4,2020-12-31 01:35:25+00:00,"[most, prolific, serial, killer, in, us, histo...","[most, prolific, serial, killer, in, us, histo...",most prolific serial killer in us history dies...,prolific serial killer us history dies california
