# Twitter Cleaning

In [64]:
import pandas as pd
import re

from string import punctuation
from nltk import word_tokenize
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

In [65]:
df = pd.read_csv('../Data/twitter.csv')

df.head()

Unnamed: 0,tweet,location,user_name,time_stamp,num_retweets,num_likes,tweet_word_count
0,"Come on over, the weather is great! \n\n(just ...",Los Angeles,steveendow,2019-10-29 22:46:18,0,0,14
1,"California blackouts of the past. \n\nIn 1941,...",Los Angeles,SaulKQED,2019-10-29 22:41:11,0,0,45
2,blackout miggs coming soon pic.twitter.com/J3D...,Los Angeles,_roxyroxx,2019-10-29 22:34:24,1,1,5
3,"California Blackouts Hit Cellphone Service, Fr...",Los Angeles,mattsheffield,2019-10-29 22:33:32,2,0,10
4,"Breaking News: PG&E failed to notify 23,000 of...",Los Angeles,REDRealEstateLA,2019-10-29 22:11:29,0,0,17


In [66]:
df.shape

(7438, 7)

In [67]:
# remove duplicate tweets
df = df.drop_duplicates(subset=['tweet', 'location', 'user_name', 'time_stamp'], keep='last').reset_index(drop=True)
df.shape

(7218, 7)

___
## remove links

In [68]:
# remove any twitter pic urls
df['tweet'] = [re.sub(r'pic.twitter.com\S+', '', post).strip() for post in df['tweet']]

In [69]:
# remove any http urls
df['tweet'] = [re.sub(r'http\S+', '', post).strip() for post in df['tweet']]

In [70]:
[tweet for tweet in df['tweet']]

['Come on over, the weather is great! \n\n(just ignore the fires and massive blackouts)',
 'California blackouts of the past. \n\nIn 1941, after the attack on Pearl Harbor, "Los Angeles Times" reporters and editors work in a dimly lit newsroom.  Authorities imposed  blackouts on the West Coast for fear of a possible Japanese assault. (Photo: Los Angeles Times Archives)',
 'blackout miggs coming soon',
 'California Blackouts Hit Cellphone Service, Fraying a Lifeline',
 'Breaking News: PG&E failed to notify 23,000 of blackouts; CPUC launches probe of utility power outages:',
 "The Yankees' TV blackout lasted one season. The Dodgers' TV blackout has lasted six seasons, and counting. \xa0…",
 'Here’s how to best prepare if your area will have a planned power outage, how to protect a child from smoke inhalation and what to unpack if you need to evacuate: \xa0…',
 'Just landed in San Francisco. Power is out at the @142Throckmorton but the show will go on!  This is going to be amazing.  Don’t

___
## Tokenizing

In [71]:
# instatiate the tokenizer
tknr = RegexpTokenizer(r'\w+')

# start with empty lists
tokens = []

# fill the list with tokenized versions of each post title
for post in df['tweet']:
    tokens.append(" ".join(tknr.tokenize(post.lower())))

df['tweet'] = tokens

df = df.reset_index(drop=True)


In [72]:
df.head()

Unnamed: 0,tweet,location,user_name,time_stamp,num_retweets,num_likes,tweet_word_count
0,come on over the weather is great just ignore ...,Los Angeles,steveendow,2019-10-29 22:46:18,0,0,14
1,california blackouts of the past in 1941 after...,Los Angeles,SaulKQED,2019-10-29 22:41:11,0,0,45
2,blackout miggs coming soon,Los Angeles,_roxyroxx,2019-10-29 22:34:24,1,1,5
3,california blackouts hit cellphone service fra...,Los Angeles,mattsheffield,2019-10-29 22:33:32,2,0,10
4,breaking news pg e failed to notify 23 000 of ...,Los Angeles,REDRealEstateLA,2019-10-29 22:11:29,0,0,17


In [73]:
[tweet for tweet in df['tweet']]

['come on over the weather is great just ignore the fires and massive blackouts',
 'california blackouts of the past in 1941 after the attack on pearl harbor los angeles times reporters and editors work in a dimly lit newsroom authorities imposed blackouts on the west coast for fear of a possible japanese assault photo los angeles times archives',
 'blackout miggs coming soon',
 'california blackouts hit cellphone service fraying a lifeline',
 'breaking news pg e failed to notify 23 000 of blackouts cpuc launches probe of utility power outages',
 'the yankees tv blackout lasted one season the dodgers tv blackout has lasted six seasons and counting',
 'here s how to best prepare if your area will have a planned power outage how to protect a child from smoke inhalation and what to unpack if you need to evacuate',
 'just landed in san francisco power is out at the 142throckmorton but the show will go on this is going to be amazing don t mias this',
 'california s blackouts are part of a f

___
## Lemmatizing

In [74]:
# Instatiate the lemmatizer
lemm = WordNetLemmatizer()

In [75]:
# start with an empty list
lemm_tweet = []

# Lemmatize the words in each post and add them to the list
for post in df['tweet']:
    lemm_tweet.append(' '.join([lemm.lemmatize(w) for w in word_tokenize(post)]))


df['tweet'] = lemm_tweet

In [76]:
df.head()

Unnamed: 0,tweet,location,user_name,time_stamp,num_retweets,num_likes,tweet_word_count
0,come on over the weather is great just ignore ...,Los Angeles,steveendow,2019-10-29 22:46:18,0,0,14
1,california blackout of the past in 1941 after ...,Los Angeles,SaulKQED,2019-10-29 22:41:11,0,0,45
2,blackout miggs coming soon,Los Angeles,_roxyroxx,2019-10-29 22:34:24,1,1,5
3,california blackout hit cellphone service fray...,Los Angeles,mattsheffield,2019-10-29 22:33:32,2,0,10
4,breaking news pg e failed to notify 23 000 of ...,Los Angeles,REDRealEstateLA,2019-10-29 22:11:29,0,0,17


In [77]:
[tweet for tweet in df['tweet']]

['come on over the weather is great just ignore the fire and massive blackout',
 'california blackout of the past in 1941 after the attack on pearl harbor los angeles time reporter and editor work in a dimly lit newsroom authority imposed blackout on the west coast for fear of a possible japanese assault photo los angeles time archive',
 'blackout miggs coming soon',
 'california blackout hit cellphone service fraying a lifeline',
 'breaking news pg e failed to notify 23 000 of blackout cpuc launch probe of utility power outage',
 'the yankee tv blackout lasted one season the dodger tv blackout ha lasted six season and counting',
 'here s how to best prepare if your area will have a planned power outage how to protect a child from smoke inhalation and what to unpack if you need to evacuate',
 'just landed in san francisco power is out at the 142throckmorton but the show will go on this is going to be amazing don t mias this',
 'california s blackout are part of a far bigger problem',
 