**Checklist**

- check for dupes in ingestion? 
    - not needed: there are no duplicate IDs, and duplicate text should be checked after cleaning
- use TextBlob as yet another way to perform basic sentiment analysis?
    - yes, TODO, but maybe not during cleanup
- create an incremental cleanup module?
    - overkill; just reprocess everything because of dupes.
      The script takes 8 secs for 20k tweets, so that's ~7 min for a million if it scales linearly
- improve script?
    - add logs?

In [6]:
import os
import re
import json
import time

import string
import datetime
import urlextract
import pandas as pd

from html import unescape
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

In [7]:
def load_todays_data(filepath=None):
    """Load every raw tweet CSV whose file name starts with today's date.

    Parameters
    ----------
    filepath : str, optional
        Directory to scan. Defaults to ``../data/1_raw/tweets`` relative to
        the current working directory.

    Returns
    -------
    pandas.DataFrame
        All matching files concatenated in file-name order, with a fresh
        0..n-1 index.

    Raises
    ------
    ValueError
        If no file in the directory starts with today's ``YYYYMMDD`` prefix.
    """
    if filepath is None:
        filepath = os.path.join("..", "data", "1_raw", "tweets")

    # "Today" is taken from the local clock at call time.
    today_prefix = datetime.datetime.now().strftime("%Y%m%d")

    # os.listdir returns entries in arbitrary order; sorting keeps the row
    # order of the result reproducible between runs.
    todays_files = sorted(
        name for name in os.listdir(filepath)
        if name.startswith(today_prefix)
        and os.path.isfile(os.path.join(filepath, name))
    )
    if not todays_files:
        raise ValueError(
            "No files starting with %r found in %r" % (today_prefix, filepath)
        )

    frames = [pd.read_csv(os.path.join(filepath, name)) for name in todays_files]
    return pd.concat(frames, ignore_index=True)

In [9]:
# df = load_todays_data()  # alternative: uncomment to load only today's files instead of everything

In [10]:
def load_all_data(filepath=None):
    """Load every raw tweet CSV in the data directory into one DataFrame.

    Parameters
    ----------
    filepath : str, optional
        Directory to scan. Defaults to ``../data/1_raw/tweets`` relative to
        the current working directory.

    Returns
    -------
    pandas.DataFrame
        All files concatenated in file-name order, with a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If the directory contains no readable data files.
    """
    if filepath is None:
        filepath = os.path.join("..", "data", "1_raw", "tweets")

    # Skip sub-directories and hidden entries (e.g. .ipynb_checkpoints,
    # .DS_Store): passing them to read_csv would abort the whole load.
    # Sorting makes the row order reproducible, since os.listdir order is
    # arbitrary.
    data_files = sorted(
        name for name in os.listdir(filepath)
        if not name.startswith(".")
        and os.path.isfile(os.path.join(filepath, name))
    )
    if not data_files:
        raise ValueError("No data files found in %r" % (filepath,))

    frames = [pd.read_csv(os.path.join(filepath, name)) for name in data_files]
    return pd.concat(frames, ignore_index=True)

In [11]:
# Load every raw tweet file into one frame; the cells below all operate on `df`.
df = load_all_data()

In [12]:
# (rows, columns) of the combined raw frame
df.shape

(372400, 5)

In [13]:
# Shape of the sub-frame of rows whose ID appears more than once
# (every occurrence is included, not only the repeats).
ids = df["ID"]
df.loc[ids.duplicated(keep=False)].shape

(0, 5)

In [14]:
# Shape of the sub-frame of rows whose raw Text appears more than once
# (every occurrence is included, not only the repeats).
txt = df["Text"]
df.loc[txt.duplicated(keep=False)].shape

(104665, 5)

In [15]:
# Number of rows whose raw Text is unique across the frame. Computed from `df`
# instead of retyping the two counts printed above, so it cannot go stale when
# the data changes.
len(df) - int(df["Text"].duplicated(keep=False).sum())

267735

In [16]:
# Tweets per user, most active first; then peek at users with more than one tweet.
grouped = (
    df[['User', 'ID']]
    .groupby('User')
    .count()
    .sort_values('ID', ascending=False)
)

grouped.loc[grouped['ID'] > 1].head()

Unnamed: 0_level_0,ID
User,Unnamed: 1_level_1
pinkyfaye,25
GetVidBot,23
Bot_Otters,21
KenanWaters,21
DeadPoolzNutz,19


#### Adding Retweet Column

In [17]:
def is_retweet(col):
    """Return 1 if a tweet's text starts with ``RT``, otherwise 0.

    Parameters
    ----------
    col : str
        The text of a single tweet (despite the name, not a whole column).

    Returns
    -------
    int
        1 when the text begins with the characters ``RT``, else 0. Empty
        strings and non-string values (e.g. NaN for missing text) yield 0.
    """
    # Guard against missing values: anything that is not text cannot be a retweet.
    if not isinstance(col, str):
        return 0
    # re.match anchors at the start of the string, so this is a prefix test.
    return 1 if re.match(r'^RT', col) is not None else 0
        
def map_is_retweet(col):
    """Apply ``is_retweet`` to every element of ``col`` and return the results as a list."""
    return [is_retweet(text) for text in col]

In [18]:
# Add a Retweet flag column derived from the start of each tweet's raw Text.
df['Retweet'] = map_is_retweet(df['Text'].values)

## Cleanup

In [19]:
# Step 4: @user mentions plus a fixed set of stray symbols. Written as one
# alternation with a character class so that no line-continuation whitespace
# ends up inside the pattern.
_MENTION_OR_SYMBOL = re.compile(r'@\w+|[#!?¥â«»ÑÐ¼½¾¿\x82-\x8e°µ´º¹³]')

# Step 6: any run of characters outside the 7-bit ASCII range.
_NON_ASCII = re.compile(r'[^\x00-\x7F]+')

# Step 5: translation table that deletes every ASCII punctuation character.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)

# Step 8: tokens dropped because they only mark a retweet.
_RETWEET_TOKENS = frozenset(['rt'])

# Step 9: NLTK's set(stopwords.words('english')) removes too many words, so
# use the list of 25 semantically non-selective words (Reuters-RCV1 dataset).
_STOP_WORDS = frozenset([
    'a', 'an', 'and', 'are', 'as', 'at', 'be', 'by', 'for', 'from',
    'has', 'he', 'in', 'is', 'it', 'its', 'of', 'on', 'that', 'the',
    'to', 'was', 'were', 'will', 'with',
])

# Returned when there is nothing to clean; same shape as a normal result.
_EMPTY_CLEANUP = ("", "", "")


def cleanup_tweet(tweet):
    """Cleans up a tweet with the following steps:
        1. make lower case
        2. remove URLs
        3. unescape HTML entities
        4. remove user references (including username) or hashtags, etc.
        5. remove punctuation
        6. remove emojis (all non-ASCII characters are dropped)
        7. (subsumed by 6: after it the text is always pure ASCII)
        8. tokenize
        9. filter stop words from tokens
        10. stem filtered tokens

    Relies on a module-level ``url_extractor`` (a ``urlextract.URLExtract``
    instance) that must exist before the first call.

    Parameters
    ----------
    tweet : str
        Raw tweet text. Non-string values (e.g. NaN for a missing field)
        are treated as empty text.

    Returns
    -------
    tuple of (str, str, str)
        Space-joined tokens for versions 8 through 10: tokenized,
        tokenized + stop-word filtered, and filtered + stemmed.
    """
    # Missing text cannot be lower-cased; return an empty result with the
    # same 3-tuple shape so callers can always index [0], [1], [2].
    if not isinstance(tweet, str):
        return _EMPTY_CLEANUP

    # 1
    tweet = tweet.lower()

    # 2
    # URLExtract is slower than a hand-written regex but finds more URLs.
    # Longest first, so a URL that is a prefix of another one cannot leave
    # the tail of the longer URL behind.
    for url in sorted(set(url_extractor.find_urls(tweet)), key=len, reverse=True):
        tweet = tweet.replace(url, "")

    # 3
    tweet = unescape(tweet)

    # 4
    tweet = _MENTION_OR_SYMBOL.sub('', tweet)

    # 5
    tweet = tweet.translate(_PUNCTUATION_TABLE)

    # 6 (and 7): strip everything outside ASCII, then surrounding whitespace
    tweet = _NON_ASCII.sub('', tweet).strip()

    # 8 tokenized only; every 'rt' token is dropped, not just a leading one
    tweet_tokens = [token for token in word_tokenize(tweet)
                    if token not in _RETWEET_TOKENS]

    # 9 tokenized + filtered
    filtered_tokens = [token for token in tweet_tokens
                       if token not in _STOP_WORDS]

    # 10 tokenized + filtered + stemmed
    ps = PorterStemmer()
    filtered_stemmed_tokens = [ps.stem(token) for token in filtered_tokens]

    v8 = " ".join(tweet_tokens)
    v9 = " ".join(filtered_tokens)
    v10 = " ".join(filtered_stemmed_tokens)

    return (v8, v9, v10)

In [20]:
# cleanup_tweet looks this extractor up by name at call time, so it has to be
# created before the tweets are processed.
url_extractor = urlextract.URLExtract()
tuples = list(map(cleanup_tweet, df['Text']))

In [21]:
# Spread the (tokenized, filtered, stemmed) triples into one column each.
df.loc[:, 'tokenized'] = [versions[0] for versions in tuples]
df.loc[:, 'filtered'] = [versions[1] for versions in tuples]
df.loc[:, 'stemmed'] = [versions[2] for versions in tuples]

In [22]:
# Spot-check the first rows: raw Text next to the three cleaned columns.
df.head()

Unnamed: 0,ID,Timestamp,User,Text,Polarity,Retweet,tokenized,filtered,stemmed
0,1302406168791470081,2020-09-06 00:40:02,asarinanamis,RT @Ayshiun: Totally inspired by@/kianamaiart'...,-1,1,totally inspired bykianamaiarts peach so i tri...,totally inspired bykianamaiarts peach so i tri...,total inspir bykianamaiart peach so i tri make...
1,1302406168766078976,2020-09-06 00:40:02,xleave_thecity,@thiinkinaboutit thank you 🥺,-1,0,thank you,thank you,thank you
2,1302406168170696706,2020-09-06 00:40:02,MsTam_Tam,some of yall retweets really be having me look...,-1,0,some of yall retweets really be having me look...,some yall retweets really having me look you s...,some yall retweet realli have me look you side...
3,1302406168120365056,2020-09-06 00:40:02,_thebdawkk,@jeenbeen__ Thank you Jen 🥺 just miss the old ...,-1,0,thank you jen just miss the old me,thank you jen just miss old me,thank you jen just miss old me
4,1302406167956602880,2020-09-06 00:40:02,4ranghae1015,"@SJofficial My favorite part is the BAD boy, g...",-1,0,my favorite part is the bad boy gangsters masc...,my favorite part bad boy gangsters mascot eunh...,my favorit part bad boy gangster mascot eunhyu...


In [23]:
# Spot-check the last rows as well.
df.tail()

Unnamed: 0,ID,Timestamp,User,Text,Polarity,Retweet,tokenized,filtered,stemmed
372395,1307544799126237186,2020-09-20 04:59:07,kingshxxbiay,RT @arjuyna: 25rep 25rt 25like gue bikin react...,1,1,25rep 25rt 25like gue bikin reaction mv the st...,25rep 25rt 25like gue bikin reaction mv steale...,25rep 25rt 25like gue bikin reaction mv steale...
372396,1307544799000449026,2020-09-20 04:59:07,llnyall,RT @YouKnowMeLilyA: Today on Twitter I found m...,1,1,today on twitter i found my sudanese husbands,today twitter i found my sudanese husbands,today twitter i found my sudanes husband
372397,1307544798866276352,2020-09-20 04:59:07,JiyaramPatel,RT @1FIRSTGroups: 1000+ FOLLOWERS Instantly !!...,1,1,1000 followers instantly retweet and drop your...,1000 followers instantly retweet drop your handle,1000 follow instantli retweet drop your handl
372398,1307544798828679168,2020-09-20 04:59:07,alefromplenty,RT @DuecePop: Tonight I’m djayin @TheSqueezebo...,1,1,tonight im djayin mask required we serving foo...,tonight im djayin mask required we serving foo...,tonight im djayin mask requir we serv food dri...
372399,1307544798459437057,2020-09-20 04:59:07,sanamraut16,RT @KatrinaTigress: 17 YEARS OF KATRINA KAIF\n...,1,1,17 years of katrina kaif queen katrinakaif cut...,17 years katrina kaif queen katrinakaif cutie ...,17 year katrina kaif queen katrinakaif cuti da...


In [24]:
# Rows whose tokenized text repeats an earlier row (the first occurrence is kept out).
dupes = df.loc[df.duplicated(subset='tokenized', keep='first')]

In [25]:
len(dupes) / len(df)  # fraction (0-1) of all rows that are dupes

0.297078410311493

In [26]:
len(dupes.loc[dupes['Retweet'] == 1]) / len(dupes)  # fraction (0-1) of dupes that are retweets

0.8210101959650011

In [27]:
# Shape of the sub-frame of rows whose tokenized text appears more than once
# (every occurrence is included, not only the repeats).
txt = df["tokenized"]
df.loc[txt.duplicated(keep=False)].shape

(133237, 9)

In [28]:
# Rows whose `txt` value occurs exactly once in the frame
len(df) - len(df.loc[txt.duplicated(keep=False)])

239163

In [31]:
# Non-null counts per column for each Polarity label, over the whole frame
df.groupby('Polarity').count()

Unnamed: 0_level_0,ID,Timestamp,User,Text,Retweet,tokenized,filtered,stemmed
Polarity,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
-1,186200,186200,186200,186200,186200,186200,186200,186200
1,186200,186200,186200,186200,186200,186200,186200,186200


In [35]:
# Same per-Polarity counts, restricted to rows whose `txt` value occurs more
# than once: the negative class has more of them.
df.loc[txt.duplicated(keep=False)].groupby('Polarity').count()

Unnamed: 0_level_0,ID,Timestamp,User,Text,Retweet,tokenized,filtered,stemmed
Polarity,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
-1,73202,73202,73202,73202,73202,73202,73202,73202
1,60035,60035,60035,60035,60035,60035,60035,60035
