# Twitter Cleaning

In [None]:
import pandas as pd
import re

from string import punctuation
from nltk import word_tokenize
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.corpus import stopwords

In [None]:
# Load the raw tweet export into a DataFrame and preview the first rows.
raw_csv_path = '../Data/twitter_states_2014_19.csv'
df = pd.read_csv(raw_csv_path)

df.head()

In [None]:
# Peek at one raw tweet (index label 4) to see what the text looks like.
df.loc[4, 'tweet']

In [None]:
# (rows, columns) of the data as currently loaded
df.shape

In [None]:
# Two rows are duplicates when their text, location, author and timestamp
# all match; of each duplicate group only the last occurrence is kept.
duplicate_key = ['tweet', 'location', 'user_name', 'time_stamp']
deduplicated = df.drop_duplicates(subset=duplicate_key, keep='last')
df = deduplicated.reset_index(drop=True)
df.shape

___
## remove links

In [None]:
# Remove any Twitter picture links. The dots are escaped so that only the
# literal host "pic.twitter.com" is matched; an unescaped "." would match
# any character in those positions.
pic_link = re.compile(r'pic\.twitter\.com\S+')
df['tweet'] = [pic_link.sub('', post).strip() for post in df['tweet']]

In [None]:
# Strip every remaining link: "http" plus the run of non-whitespace
# characters that follows it, then trim surrounding whitespace.
http_link = re.compile(r'http\S+')
df['tweet'] = [http_link.sub('', text).strip() for text in df['tweet']]

In [None]:
# Display every tweet as a plain list to eyeball the link removal.
list(df['tweet'])

___
## Tokenizing

In [None]:
# Instantiate a tokenizer that keeps only runs of ASCII letters, digits
# and '&'; every other character acts as a separator and is dropped.
tknr = RegexpTokenizer(r'[a-zA-Z&0-9]+')

# Lower-case each tweet, tokenize it, and join the tokens back together
# with single spaces so the column still holds one string per tweet.
tokens = [" ".join(tknr.tokenize(text.lower())) for text in df['tweet']]

df['tweet'] = tokens

df = df.reset_index(drop=True)


In [None]:
# preview the first rows after tokenizing
df.head()

In [None]:
# Display every tokenized tweet as a plain list for a visual check.
list(df['tweet'])

___
## Stemming

In [None]:
# Instantiate the Porter stemmer
port = PorterStemmer()

In [None]:
# Stem every whitespace-separated word of every tweet with the Porter
# stemmer and re-join the stems with single spaces. The result goes into
# a new 'port_tweet' column; the 'tweet' column is left unchanged.
port_tweet = [
    ' '.join(port.stem(word) for word in text.split())
    for text in df['tweet']
]

df['port_tweet'] = port_tweet

In [None]:
# preview the first rows, now including the stemmed column
df.head()

In [None]:
# Display every stemmed tweet as a plain list for a visual check.
list(df['port_tweet'])

___
# check for nulls

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
# Discard every row that has a missing value in any column, then
# renumber the index from 0 and report the remaining (rows, columns).
df = df.dropna()
df = df.reset_index(drop=True)
df.shape

___
# formatting date and time

In [None]:
# inspect the raw timestamp column before reformatting it
df['time_stamp']

In [None]:
# Build a "<date> <hour>:00:00 | <location>" label for every row: the
# timestamp is cut down to its hour and joined with that row's location.
# zip() pairs the two columns row by row, so the result does not depend on
# df carrying a default 0..n-1 index.
time_location = []
for stamp, place in zip(df['time_stamp'], df['location']):
    stamp_parts = stamp.split()  # [date, clock, ...], split on whitespace
    hour = stamp_parts[1].split(':')[0]
    time_location.append(f"{stamp_parts[0]} {hour}:00:00 | {place}")

df['time_location'] = time_location

In [None]:
# preview the first rows, now including the time_location column
df.head()

In [None]:
# Save the cleaned Twitter data; index=False keeps the row index out of the file.
# NOTE(review): a tweet that cleaning reduced to an empty string is written
# as an empty field, which pandas.read_csv loads back as NaN by default --
# confirm downstream code handles that.
df.to_csv('../Data/twitter_states_cleaned_2014_19.csv', index=False)