**Checklist**

- check for dupes in ingestion? 
    - not needed, no dupes, and text dupes should be checked after cleaning
- use TextBlob as yet another way to perform basic sentiment analysis?
    - yes, TODO, but maybe not during cleanup
- create an incremental cleanup module?
    - overkill, just reprocess everything because of dupes;
      script takes 8 secs for 20k tweets, that's ~7 min for a million if linear
- improve script?
    - add logs?

In [1]:
import os
import re
import json
import time

import string
import datetime
import urlextract
import pandas as pd

from html import unescape
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

In [2]:
def load_todays_data(filepath=None, today_prefix=None):
    """Load and combine the raw tweet CSV files whose names start with a date prefix.

    Args:
        filepath: Directory holding the raw CSV files. Defaults to
            ``../data/1_raw/tweets`` relative to the working directory.
        today_prefix: Filename prefix used to select files. Defaults to
            today's local date formatted as ``YYYYMMDD``.

    Returns:
        One DataFrame containing the rows of every matching file, in
        filename order, with a fresh 0..n-1 index.

    Raises:
        ValueError: If no file in ``filepath`` starts with ``today_prefix``.
    """
    if filepath is None:
        filepath = os.path.join("..", "data", "1_raw", "tweets")
    if today_prefix is None:
        today_prefix = datetime.datetime.now().strftime("%Y%m%d")

    # os.listdir() returns entries in arbitrary order; sort so the row order
    # (and anything downstream that keeps the "first" duplicate) is reproducible.
    matching = sorted(
        name for name in os.listdir(filepath) if name.startswith(today_prefix)
    )
    if not matching:
        # pd.concat([]) would fail with an opaque "No objects to concatenate".
        raise ValueError(
            f"no files starting with {today_prefix!r} found in {filepath!r}"
        )

    frames = [pd.read_csv(os.path.join(filepath, name)) for name in matching]
    return pd.concat(frames, ignore_index=True)

In [3]:
# Combine every raw tweet file whose name starts with today's date into one DataFrame
df = load_todays_data()

In [4]:
# (rows, columns) of the combined raw data
df.shape

(44000, 5)

In [5]:
# IDs are expected to be unique: select every row whose ID occurs more than once
ids = df["ID"]
df[ids.duplicated(keep=False)].shape

(0, 5)

In [6]:
# Same check on the raw text: select every row whose Text occurs more than once
txt = df["Text"]
df[txt.duplicated(keep=False)].shape

(12217, 5)

In [7]:
# Number of tweets per user, most active users first
grouped = (
    df.groupby('User')[['ID']]
    .count()
    .sort_values(by='ID', ascending=False)
)

# Show the top users that tweeted more than once
grouped.loc[grouped['ID'] > 1].head()

Unnamed: 0_level_0,ID
User,Unnamed: 1_level_1
Navii52989964,7
wonpirimirie,6
ClaudieSkies,6
Dzs18887618,6
GetVidBot,5


#### Adding Retweet Column

In [8]:
def is_retweet(col):
    """Flag whether a single tweet text is a retweet.

    Args:
        col: The raw (not yet lower-cased) text of one tweet. The parameter
            name is kept for backward compatibility.

    Returns:
        1 if the text starts with the marker ``RT`` as a whole word
        (e.g. ``"RT @user: ..."``), otherwise 0. Empty strings and
        non-string values such as NaN yield 0.
    """
    # Missing text is read from CSV as a float NaN, which has no prefix to test.
    if not isinstance(col, str):
        return 0
    # \b keeps words that merely begin with "RT" (e.g. "RTX") from matching.
    return 1 if re.match(r'RT\b', col) else 0
        
def map_is_retweet(col):
    """Apply ``is_retweet`` to each text in ``col`` and return the flags as a list."""
    return [is_retweet(text) for text in col]

In [9]:
# Flag each row using is_retweet() on the raw tweet text
df['Retweet'] = map_is_retweet(df['Text'].values)

## Cleanup

In [10]:
# User mentions (including the username) plus stray symbols / mojibake
# characters. Kept on a single line on purpose: a backslash-newline inside a
# raw string is NOT a line continuation, it puts the newline and the following
# indentation into the regex.
_MENTION_AND_SYMBOL_RE = re.compile(r'@\w+|[#¥â«»ÑÐ¼½¾!?¿\x82-\x8e°µ´º¹³]')
# Anything outside 7-bit ASCII (emojis, accented letters, ...)
_NON_ASCII_RE = re.compile(r'[^\x00-\x7F]+')
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)
# NLTK's stopwords.words('english') removes too many words; use the list of
# 25 semantically non-selective words (Reuters-RCV1 dataset) instead.
_STOP_WORDS = frozenset([
    'a', 'an', 'and', 'are', 'as', 'at', 'be', 'by', 'for', 'from',
    'has', 'he', 'in', 'is', 'it', 'its', 'of', 'on', 'that', 'the',
    'to', 'was', 'were', 'will', 'with',
])


def cleanup_tweet(tweet, extractor=None):
    """Clean up a tweet and return three progressively reduced versions.

    Steps:
        1. make lower case
        2. remove URLs
        3. unescape HTML entities
        4. remove user references (including username), hashtag signs, etc.
        5. remove punctuation
        6. remove emojis and every other non-ASCII character
        7. tokenize and drop the retweet prefix token ``rt``
        8. filter stop words from the tokens
        9. stem the filtered tokens

    After step 6 the text is pure ASCII, so no separate "is this ASCII
    decodable" check is needed.

    Args:
        tweet: Raw tweet text.
        extractor: Object with a ``find_urls(text)`` method. Defaults to the
            module-level ``url_extractor``.

    Returns:
        A 3-tuple of space-joined strings ``(tokenized, filtered, stemmed)``
        corresponding to steps 7, 8 and 9. Non-string input (e.g. NaN for a
        missing text) yields ``("", "", "")``.
    """
    if not isinstance(tweet, str):
        return ("", "", "")
    if extractor is None:
        # Resolved at call time, so the global only has to exist before the
        # first call, not before this definition.
        extractor = url_extractor

    # 1
    tweet = tweet.lower()

    # 2 -- remove longer URLs first so that a URL which is a prefix of
    # another one cannot leave a fragment of the longer URL behind
    for url in sorted(set(extractor.find_urls(tweet)), key=len, reverse=True):
        tweet = tweet.replace(url, "")

    # 3
    tweet = unescape(tweet)

    # 4
    tweet = _MENTION_AND_SYMBOL_RE.sub('', tweet)

    # 5
    tweet = tweet.translate(_PUNCTUATION_TABLE)

    # 6
    tweet = _NON_ASCII_RE.sub('', tweet).strip()

    # 7 tokenized only (text is lower case by now, so the prefix is 'rt')
    tweet_tokens = [token for token in word_tokenize(tweet) if token != 'rt']

    # 8 tokenized + filtered
    filtered_tokens = [
        token for token in tweet_tokens if token not in _STOP_WORDS
    ]

    # 9 tokenized + filtered + stemmed
    stemmer = PorterStemmer()
    filtered_stemmed_tokens = [stemmer.stem(token) for token in filtered_tokens]

    return (
        " ".join(tweet_tokens),
        " ".join(filtered_tokens),
        " ".join(filtered_stemmed_tokens),
    )

In [11]:
# One shared URL extractor, looked up by cleanup_tweet() as a global
url_extractor = urlextract.URLExtract()
# (tokenized, filtered, stemmed) for every tweet, in row order
tuples = [cleanup_tweet(text) for text in df['Text']]

In [12]:
# Spread the cleaned versions into one column each, in tuple order
for position, column in enumerate(('tokenized', 'filtered', 'stemmed')):
    df.loc[:, column] = [cleaned[position] for cleaned in tuples]

In [13]:
# Preview the first rows, including the Retweet flag and the cleaned-text columns
df.head()

Unnamed: 0,ID,Timestamp,User,Text,Polarity,Retweet,tokenized,filtered,stemmed
0,1305509246335934467,2020-09-14 14:10:34,Exquisite_Lex,wait ? why are teachers putting kids in groups...,-1,0,wait why are teachers putting kids in groups l...,wait why teachers putting kids groups like thi...,wait whi teacher put kid group like thi whole ...
1,1305509246130413569,2020-09-14 14:10:34,FabFay3,Damn I want some fried rice kitchen 😫,-1,0,damn i want some fried rice kitchen,damn i want some fried rice kitchen,damn i want some fri rice kitchen
2,1305509246067331079,2020-09-14 14:10:34,chim0nacx,@THARAFR0NG THIS ONE!!! NEW BEING NEW AND TAY ...,-1,0,this one new being new and tay being tay huhu,this one new being new tay being tay huhu,thi one new be new tay be tay huhu
3,1305509245920645120,2020-09-14 14:10:34,twoheartskiwi,@whoishannahhh you make all of our days so muc...,-1,0,you make all of our days so much brighter,you make all our days so much brighter,you make all our day so much brighter
4,1305509245811486720,2020-09-14 14:10:34,BelmilJennyrose,RT @borahaejeonjk: can we talk about how jungk...,-1,1,can we talk about how jungkook always makes su...,can we talk about how jungkook always makes su...,can we talk about how jungkook alway make sure...


In [14]:
# Preview the last rows as well
df.tail()

Unnamed: 0,ID,Timestamp,User,Text,Polarity,Retweet,tokenized,filtered,stemmed
43995,1305708822795706368,2020-09-15 03:23:36,jindalji5206,RT @__TANU_: Waiting for new post😒😒...\nHaye m...,1,1,waiting for new post haye meri mottojaldi se e...,waiting new post haye meri mottojaldi se ek st...,wait new post hay meri mottojaldi se ek stun p...
43996,1305708822791507968,2020-09-15 03:23:36,CodingMark,@dezmondOliver @seanmmitchell @Kristennetten I...,1,0,it is quite surprising to me but the competiti...,quite surprising me but competitive price utility,quit surpris me but competit price util
43997,1305708822691020801,2020-09-15 03:23:36,zziraaah,RT @lmlowk3y: Muslim architecture 😍 https://t....,1,1,muslim architecture,muslim architecture,muslim architectur
43998,1305708822577598464,2020-09-15 03:23:36,teresa08238743,@DanRather Good report Dan Rather😊,1,0,good report dan rather,good report dan rather,good report dan rather
43999,1305708822535827456,2020-09-15 03:23:36,Ari_brielle,RT @ItssArielle_: teachers are the foundation ...,1,1,teachers are the foundation of our society sho...,teachers foundation our society shoutout my te...,teacher foundat our societi shoutout my teache...


In [15]:
# Rows whose cleaned (tokenized) text repeats an earlier row; the first occurrence is kept out
dupes = df.loc[df.duplicated(subset='tokenized', keep='first')]

In [16]:
len(dupes) / len(df)  # fraction of rows that are dupes

0.26981818181818185

In [17]:
int(dupes['Retweet'].eq(1).sum()) / len(dupes)  # fraction of dupes that are retweets

0.8828335579514824