**Checklist**

- check for dupes in ingestion? 
    - not needed: there are no duplicate IDs, and duplicate text should be checked after cleaning
- use TextBlob as yet another way to perform basic sentiment analysis?
    - yes, TODO, but maybe not during cleanup
- create an incremental cleanup module?
    - overkill; just reprocess everything because of dupes:
      the script takes 8 s for 20k tweets, i.e. ~7 min (400 s) for a million if it scales linearly
- improve script?
    - add logs?

In [1]:
import os
import re
import json
import time

import string
import datetime
import urlextract
import pandas as pd

from html import unescape
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

In [2]:
def load_todays_data(filepath=None, today_prefix=None):
    """Load and concatenate the raw tweet CSV files for one day.

    Parameters
    ----------
    filepath : str, optional
        Directory that holds the raw CSV files. Defaults to
        ``../data/1_raw/tweets`` (relative to the working directory).
    today_prefix : str, optional
        Only files whose name starts with this text are read. Defaults to
        today's local date formatted as ``YYYYMMDD``.

    Returns
    -------
    pandas.DataFrame
        All matching files stacked in file-name order, with a fresh
        0..n-1 index.

    Raises
    ------
    ValueError
        If no file in ``filepath`` starts with ``today_prefix``.
    """
    if filepath is None:
        filepath = os.path.join("..", "data", "1_raw", "tweets")
    if today_prefix is None:
        today_prefix = datetime.datetime.now().strftime("%Y%m%d")

    # os.listdir returns names in arbitrary order; sorting makes the row
    # order of the result reproducible from run to run.
    matching = sorted(
        name for name in os.listdir(filepath) if name.startswith(today_prefix)
    )
    if not matching:
        # pd.concat([]) would raise a ValueError that does not say what is missing.
        raise ValueError(
            "no files starting with %r found in %r" % (today_prefix, filepath)
        )

    frames = [pd.read_csv(os.path.join(filepath, name)) for name in matching]
    return pd.concat(frames, ignore_index=True)

In [3]:
# Load every raw tweet CSV whose file name starts with today's date (YYYYMMDD).
df = load_todays_data()

In [4]:
# Size check: (rows, columns) of the loaded frame.
df.shape

(21600, 5)

In [5]:
# load data
# NOTE: disabled earlier loader, kept for reference only. It reads every file
# in data/raw/tweets without any date-prefix filter; nothing in this cell runs.
#def load_data():
#    filepath = os.path.join("data","raw","tweets") 
#
#    dfm = []
#    for f in os.listdir(filepath):
#        dfm.append(pd.read_csv(os.path.join(filepath,f)))
#        
#    df = pd.concat(dfm)
#    df = df.reset_index(drop=True)
#    
#    return df

In [6]:
# Count the rows whose ID occurs more than once (every occurrence is selected).
ids = df["ID"]
df[ids.duplicated(keep=False)].shape

(0, 5)

In [7]:
# test for duplicated Text
txt = df["Text"]
df[txt.isin(txt[txt.duplicated()])].shape

(5507, 5)

In [8]:
# look at users with more than 1 tweet?
grouped = df[['User', 'ID']].groupby('User').count().sort_values('ID', ascending=False)

grouped[grouped['ID']>1].head()

Unnamed: 0_level_0,ID
User,Unnamed: 1_level_1
AlluArjun373788,9
CParambarai,4
HabboRubbishBin,3
Lady_Minna,3
tobesotee,3


#### Adding Retweet Column

In [9]:
def is_retweet(col):
    """Flag whether a single tweet text starts with the retweet marker ``RT``.

    Parameters
    ----------
    col : str
        Text of one tweet. (The parameter name is kept for backward
        compatibility; it is a single value, not a whole column.)

    Returns
    -------
    int
        1 if the text starts with ``"RT"``, otherwise 0. Empty strings and
        non-string values (e.g. NaN for a missing text) yield 0.
    """
    # A missing text is not a string; treat it as "not a retweet" rather than crash.
    if not isinstance(col, str):
        return 0
    # Always return an int: the previous loop-based version fell through and
    # returned None for an empty string.
    return 1 if col.startswith('RT') else 0
        
def map_is_retweet(col):
    """Apply ``is_retweet`` to each text in ``col`` and return the flags as a list."""
    return [is_retweet(text) for text in col]

In [10]:
# Flag column built by applying is_retweet (checks for a leading "RT") to every tweet text.
df['Retweet'] = map_is_retweet(df['Text'].values)

## Cleanup

In [11]:
def cleanup_tweet(tweet):
    """Clean one tweet and return three progressively more processed versions.

    Steps, in order:
        1. make lower case
        2. remove URLs (found with the module-level ``url_extractor``)
        3. unescape HTML entities
        4. remove @user references and a fixed set of stray symbols
        5. remove ASCII punctuation
        6. remove every non-ASCII character (emojis included) and strip
           surrounding whitespace
        7. tokenize and drop every ``rt`` token
        8. filter stop words from the tokens
        9. stem the filtered tokens

    Parameters
    ----------
    tweet : str
        Raw tweet text.

    Returns
    -------
    tuple of (str, str, str)
        Space-joined tokens after steps 7, 8 and 9 respectively
        (tokenized, filtered, stemmed).

    Notes
    -----
    ``url_extractor`` is read from module level and must provide a
    ``find_urls(text)`` method; it has to be defined before this function
    is called.
    """
    # 1
    tweet = tweet.lower()

    # 2
    # Longest URL first: iterating a set has no fixed order, and when one URL
    # is contained in another, removing the short one first would leave a
    # fragment of the long one behind. The secondary key makes ties stable.
    urls = sorted(set(url_extractor.find_urls(tweet)), key=lambda u: (-len(u), u))
    for url in urls:
        tweet = tweet.replace(url, "")

    # 3
    tweet = unescape(tweet)

    # 4
    # Built from adjacent literals instead of backslash-newline continuations:
    # inside a raw string the backslash and the newline are both kept, which
    # silently added "newline + indentation" alternatives to the pattern.
    pattern = (
        r'@\w+'
        r'|[#¥â«»ÑÐ¼½¾!?¿\x82-\x8e°µ´º¹³]'
    )
    tweet = re.sub(pattern, '', tweet)

    # 5
    tweet = tweet.translate(str.maketrans('', '', string.punctuation))

    # 6
    # After this substitution the text is pure ASCII, so no separate
    # "is it ASCII?" check is needed (the former check could never fail, and
    # its branch returned a bare string instead of the documented 3-tuple).
    tweet = re.sub(r'[^\x00-\x7F]+', '', tweet).strip()

    # 7 tokenized only (every 'rt' token is dropped, wherever it occurs)
    tweet_tokens = [token for token in word_tokenize(tweet) if token != 'rt']

    # 8 tokenized + filtered
    # NLTK's set(stopwords.words('english')) removes too many words
    # using list of 25 semantically non-selective words (Reuters-RCV1 dataset)
    stop_words = frozenset((
        'a', 'an', 'and', 'are', 'as', 'at', 'be', 'by', 'for', 'from',
        'has', 'he', 'in', 'is', 'it', 'its', 'of', 'on', 'that', 'the',
        'to', 'was', 'were', 'will', 'with',
    ))
    filtered_tokens = [token for token in tweet_tokens if token not in stop_words]

    # 9 tokenized + filtered + stemmed
    ps = PorterStemmer()
    filtered_stemmed_tokens = [ps.stem(token) for token in filtered_tokens]

    return (
        " ".join(tweet_tokens),
        " ".join(filtered_tokens),
        " ".join(filtered_stemmed_tokens),
    )

In [12]:
# URL extractor that cleanup_tweet reads from module level (built once, not per tweet).
url_extractor = urlextract.URLExtract()
# One cleanup_tweet result per tweet, in row order.
tuples = list(map(cleanup_tweet, df['Text']))

In [13]:
# Spread the cleaned versions into one column each: position i of every
# result goes to the i-th column name.
for position, column in enumerate(('tokenized', 'filtered', 'stemmed')):
    df.loc[:, column] = [cleaned[position] for cleaned in tuples]

In [14]:
# First rows, to eyeball the new Retweet / tokenized / filtered / stemmed columns.
df.head()

Unnamed: 0,ID,Timestamp,User,Text,Polarity,Retweet,tokenized,filtered,stemmed
0,1305170993015644160,2020-09-13 15:46:28,SassySnipez,Thinking about cutting my hair but idk 🤔 #NewP...,-1,0,thinking about cutting my hair but idk newprof...,thinking about cutting my hair but idk newprof...,think about cut my hair but idk newprofilep
1,1305170992952565760,2020-09-13 15:46:28,nunubestboy,RT @milkypmh: thank you for working hard for t...,-1,1,thank you for working hard for this debut ever...,thank you working hard this debut everyone so ...,thank you work hard thi debut everyon so excit...
2,1305170992352825344,2020-09-13 15:46:28,irisskhryss,RT @chicheesticks: ya'll actually finished the...,-1,1,yall actually finished the tommy video,yall actually finished tommy video,yall actual finish tommi video
3,1305170992151621638,2020-09-13 15:46:28,shalexusss,"RT @PrincessTaaaty: I need a date night 🥺, whe...",-1,1,i need a date night where we talk eat and vibe,i need date night where we talk eat vibe,i need date night where we talk eat vibe
4,1305170991841116170,2020-09-13 15:46:28,codepinkanime,@RyTanaka2 @lucysupremacy @KawasPhattyCake @0I...,-1,0,i love you so much,i love you so much,i love you so much


In [15]:
# Last rows, same visual check at the other end of the frame.
df.tail()

Unnamed: 0,ID,Timestamp,User,Text,Polarity,Retweet,tokenized,filtered,stemmed
21595,1305306124665712640,2020-09-14 00:43:26,_MiaMiaMia_,@_BrittneyJanee IT'S JUST SUGAR OMG 😂,1,0,its just sugar omg,just sugar omg,just sugar omg
21596,1305306124586045441,2020-09-14 00:43:26,MotorCityDemon,"RT @e_seduisante: Clear Skin , Pretty Face 🍷❤️...",1,1,clear skin pretty face,clear skin pretty face,clear skin pretti face
21597,1305306124581777409,2020-09-14 00:43:26,MisGuidedGenXer,RT @Ebonyteach: I’m in tears! CLAUDINE! Curtis...,1,1,im in tears claudine curtis mayfield,im tears claudine curtis mayfield,im tear claudin curti mayfield
21598,1305306124531318784,2020-09-14 00:43:26,andrejohnson174,I see y’all hearting them RT if you need a ble...,1,0,i see yall hearting them if you need a blessing,i see yall hearting them if you need blessing,i see yall heart them if you need bless
21599,1305306124485263360,2020-09-14 00:43:26,Im_TheMove,"RT @SHAAANEE_: idc how much i like you, i’ll l...",1,1,idc how much i like you ill leave you alone,idc how much i like you ill leave you alone,idc how much i like you ill leav you alon


In [18]:
# Rows whose cleaned text already appeared earlier in the frame
# (the first occurrence of each text is not counted as a duplicate).
dupes = df.loc[df['tokenized'].duplicated()]

In [19]:
# share of rows that are duplicates (a fraction between 0 and 1, not a percentage)
len(dupes) / len(df)

0.23555555555555555

In [20]:
# share of the duplicates that are retweets (a fraction between 0 and 1)
len(dupes.loc[dupes['Retweet'] == 1]) / len(dupes)

0.8966194968553459

In [19]:
# NOTE: disabled draft, nothing in this cell runs. It dedupes on the raw
# 'Text' column, not on the cleaned 'tokenized' column.
## create a subset with cols of interest
#sub_df = df[['ID','Retweet','Text','Polarity']].copy()
#
## dedupe (text duplicates)
#dupes = sub_df[sub_df['Text'].duplicated(keep='first')]
#
#final_df = sub_df[~sub_df.ID.isin(list(dupes['ID']))]

In [20]:
#final_df.head()