**Checklist**

- check for dupes in ingestion? 
    - not needed, no dupes, and text dupes should be checked after cleaning
- use TextBlob as yet another way to perform basic sentiment analysis?
    - yes, TODO, but maybe not during cleanup
- create an incremental cleanup module?
    - overkill, just reprocess everything because of dupes;
      the script takes 8 secs for 20k tweets, so that's ~7 min for a million if it scales linearly
- improve script?
    - add logs?

In [1]:
import re
import os
import json
import time

import string
import datetime
import urlextract
import pandas as pd

from html import unescape
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

In [2]:
# load data
def load_data(filepath=None):
    """Load every raw tweet CSV in a directory into a single DataFrame.

    Parameters
    ----------
    filepath : str or path-like, optional
        Directory holding the raw CSV files. Defaults to ``data/raw/tweets``
        relative to the current working directory.

    Returns
    -------
    pandas.DataFrame
        Rows of all files concatenated in file-name order, with a fresh
        0..n-1 index.

    Raises
    ------
    ValueError
        If the directory contains no files to load.
    """
    if filepath is None:
        filepath = os.path.join("data", "raw", "tweets")

    # os.listdir() returns entries in arbitrary order; sort them so the row
    # order (and any later keep='first' de-duplication) is reproducible.
    # Sub-directories are skipped because read_csv cannot open them.
    paths = [
        os.path.join(filepath, name)
        for name in sorted(os.listdir(filepath))
        if os.path.isfile(os.path.join(filepath, name))
    ]
    if not paths:
        raise ValueError("no files found in {!r}".format(filepath))

    return pd.concat([pd.read_csv(path) for path in paths], ignore_index=True)

In [3]:
# read and concatenate every raw tweet file into one frame
df = load_data()

In [4]:
# sanity check: shape of the rows whose ID occurs more than once
# (keep=False flags every copy of a repeated value, not just the later ones)
ids = df["ID"]
df.loc[ids.duplicated(keep=False)].shape

(0, 5)

In [5]:
# sanity check: shape of the rows whose Text occurs more than once
# (keep=False flags every copy of a repeated value, not just the later ones)
txt = df["Text"]
df.loc[txt.duplicated(keep=False)].shape

(5882, 5)

In [6]:
# tweets per user, most active users first
grouped = (
    df.groupby('User')[['ID']]
      .count()
      .sort_values('ID', ascending=False)
)

# only the users with more than one tweet
grouped.loc[grouped['ID'] > 1].head()

Unnamed: 0_level_0,ID
User,Unnamed: 1_level_1
Iouisogolden,4
OfValimaiyana,4
MydoghatesMe2,4
cosby_by,3
diialexandria,3


#### Adding Retweet Column

In [7]:
def is_retweet(col):
    """Return 1 if a tweet's text starts with the retweet marker ``RT``, else 0.

    Parameters
    ----------
    col : str
        Raw text of a single tweet (one value, not a whole column).

    Returns
    -------
    int
        1 when the text begins with ``RT``; 0 otherwise, including for an
        empty string or a non-string value such as NaN.
    """
    # Missing / non-text values cannot be retweets; answer 0 instead of raising.
    if not isinstance(col, str):
        return 0
    # A prefix test is all the original anchored regex did; an empty string
    # now yields 0 rather than falling through and returning None.
    return int(col.startswith('RT'))
        
def map_is_retweet(col):
    """Apply ``is_retweet`` to each element of ``col`` and return the flags as a list."""
    return [is_retweet(text) for text in col]

In [8]:
# flag retweets on the raw text: the cleanup step further down lower-cases
# the text and drops the 'rt' token, so this must run before it
df['Retweet'] = map_is_retweet(df['Text'].values)

## Cleanup

In [9]:
# modified and improved version
# Stop words: NLTK's set(stopwords.words('english')) removes too many words,
# so use the list of 25 semantically non-selective words (Reuters-RCV1
# dataset), plus 'rt' for retweets.
_STOP_WORDS = frozenset([
    'a', 'an', 'and', 'are', 'as', 'at', 'be', 'by', 'for', 'from',
    'has', 'he', 'in', 'is', 'it', 'its', 'of', 'on', 'that', 'the',
    'to', 'was', 'were', 'will', 'with', 'rt',
])

# User references (including the username), hashtag signs and assorted
# symbols / mis-encoded characters. Written as one single-line pattern: the
# earlier backslash-newline "continuations" inside the raw string put the
# newline and the indentation into the regex itself.
_MENTIONS_AND_SYMBOLS = re.compile(r'@\w+|[#¥â«»ÑÐ¼½¾!?¿\x82-\x8e°µ´º¹³]')

# Anything outside 7-bit ASCII (emojis, accented letters, ...).
_NON_ASCII = re.compile(r'[^\x00-\x7F]+')

# Translation table that deletes every character in string.punctuation.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def cleanup_tweet(tweet):
    """Normalise one tweet's text for downstream text analysis.

    Steps, in order: lower-case, strip URLs, unescape HTML entities, drop
    @mentions and stray symbols, drop punctuation, drop non-ASCII characters
    (emojis etc.), tokenize, remove stop words, and Porter-stem the rest.

    Relies on the notebook-level ``url_extractor`` (a
    ``urlextract.URLExtract`` instance), which must exist before the first
    call.

    Parameters
    ----------
    tweet : str
        Raw tweet text.

    Returns
    -------
    str
        Space-joined stemmed tokens; ``""`` when nothing survives cleaning
        or when ``tweet`` is not a string (e.g. NaN for a missing value).
    """
    # Missing / non-text values have nothing to clean.
    if not isinstance(tweet, str):
        return ""

    # make all lower case
    tweet = tweet.lower()

    # Remove URLs (urlextract is slower than a regex but finds more of them).
    # Longest first: a short URL can be a prefix of a longer one, and
    # replacing it first would leave fragments of the longer URL behind.
    # Sorting also makes the result independent of set iteration order.
    for url in sorted(set(url_extractor.find_urls(tweet)), key=len, reverse=True):
        tweet = tweet.replace(url, "")

    # unescape HTML entities like &quot
    tweet = unescape(tweet)

    # Mentions must go before punctuation, otherwise '@user' would merely
    # lose its '@' and the username would stay in the text.
    tweet = _MENTIONS_AND_SYMBOLS.sub('', tweet)

    # remove punctuations
    tweet = tweet.translate(_PUNCTUATION_TABLE)

    # Remove emojis and every other non-ASCII character. After this the text
    # is pure ASCII, so no separate ASCII-decodability check is needed.
    tweet = _NON_ASCII.sub('', tweet).strip()

    # remove stopwords
    tokens = [w for w in word_tokenize(tweet) if w not in _STOP_WORDS]

    stemmer = PorterStemmer()
    return " ".join(stemmer.stem(w) for w in tokens)

In [10]:
# Clean every tweet, overwriting the 'Text' column in place.
# cleanup_tweet() reads the notebook-level `url_extractor`, so build it first.
# A list comprehension is kept on purpose: map was measured to be no faster.
url_extractor = urlextract.URLExtract()
df['Text'] = [cleanup_tweet(raw_text) for raw_text in df['Text'].tolist()]

In [11]:
# create a subset with cols of interest
sub_df = df[['ID','Retweet','Text','Polarity']].copy()

# dedupe (text duplicates): keep the first row for each cleaned text.
# Filter with the duplicate mask itself rather than via the dupes' IDs, so a
# repeated ID can never cause the kept first occurrence to be dropped as well.
is_text_dupe = sub_df['Text'].duplicated(keep='first')
dupes = sub_df[is_text_dupe]

# .copy() makes final_df an independent frame, so later column assignments
# do not trigger SettingWithCopyWarning
final_df = sub_df[~is_text_dupe].copy()

In [12]:
# preview the first rows of the de-duplicated frame
final_df.head()

Unnamed: 0,ID,Retweet,Text,Polarity
0,1302406168791470081,1,total inspir bykianamaiart peach so i tri make...,-1
1,1302406168766078976,0,thank you,-1
2,1302406168170696706,0,some yall retweet realli have me look you side...,-1
3,1302406168120365056,0,thank you jen just miss old me,-1
4,1302406167956602880,0,my favorit part bad boy gangster mascot eunhyu...,-1


In [19]:
# share of retweets among the de-duplicated tweets, in percent (2 decimals)
retweet_count = sum(final_df['Retweet'])
pct_retweets = round(100 * retweet_count / len(final_df), 2)

In [20]:
# display the retweet percentage computed in the previous cell
pct_retweets

37.02