**Checklist**

- check for dupes in ingestion? 
    - not needed: there are no duplicate IDs, and text dupes should be checked after cleaning
- use TextBlob as yet another way to perform basic sentiment analysis?
    - yes, TODO, but maybe not during cleanup
- create an incremental cleanup module?
    - overkill, just reprocess everything because of dupes;
      the script takes 8 secs for 20k tweets, i.e. ~400 secs (~7 min) for a million if it scales linearly
- improve script?
    - add logs?

In [1]:
import re
import os
import json
import time

import string
import datetime
import urlextract
import pandas as pd

from html import unescape
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

In [2]:
def load_todays_data():
    """Load every raw tweet CSV whose file name starts with today's date.

    Files are read from ``data/raw/tweets`` (relative to the working
    directory) and are selected when their name begins with the current
    local date formatted as ``YYYYMMDD``.

    Returns:
        pandas.DataFrame: all matching files concatenated, with a fresh
        0..n-1 index.

    Raises:
        ValueError: if no file for today's date exists in the directory.
        FileNotFoundError: if the directory itself is missing (raised by
            ``os.listdir``).
    """
    filepath = os.path.join("data", "raw", "tweets")
    today_prefix = datetime.datetime.now().strftime("%Y%m%d")

    # os.listdir returns names in arbitrary order; sorting keeps the row
    # order of the result reproducible between runs and machines.
    todays_files = sorted(
        name for name in os.listdir(filepath) if name.startswith(today_prefix)
    )
    if not todays_files:
        # pd.concat([]) would also raise ValueError, but with a message that
        # says nothing about the missing files.
        raise ValueError(
            "no tweet files starting with %r found in %r" % (today_prefix, filepath)
        )

    frames = [pd.read_csv(os.path.join(filepath, name)) for name in todays_files]
    return pd.concat(frames, ignore_index=True)

In [3]:
df = load_todays_data()

In [4]:
df.shape

(22400, 5)

In [5]:
# load data
#def load_data():
#    filepath = os.path.join("data","raw","tweets") 
#
#    dfm = []
#    for f in os.listdir(filepath):
#        dfm.append(pd.read_csv(os.path.join(filepath,f)))
#        
#    df = pd.concat(dfm)
#    df = df.reset_index(drop=True)
#    
#    return df

In [6]:
# sanity check: rows whose ID occurs more than once (every copy is counted)
ids = df["ID"]
df[ids.duplicated(keep=False)].shape

(0, 5)

In [7]:
# sanity check: rows whose Text occurs more than once (every copy is counted)
txt = df["Text"]
df[txt.duplicated(keep=False)].shape

(5603, 5)

In [8]:
# tweets per user, most active first; show users with more than one tweet
grouped = (
    df.groupby('User')[['ID']]
      .count()
      .sort_values('ID', ascending=False)
)

grouped.loc[grouped['ID'] > 1].head()

Unnamed: 0_level_0,ID
User,Unnamed: 1_level_1
johntykishore9,5
DeadPoolzNutz,5
Bot_Otters,5
elitejohnsonn,4
fgarrazo,4


#### Adding Retweet Column

In [9]:
def is_retweet(col):
    """Flag whether a tweet's text starts with the retweet marker ``RT``.

    Args:
        col: the raw text of a single tweet (despite the name, one value,
            not a whole column).

    Returns:
        int: 1 if the text begins with the standalone token ``RT``
        (e.g. ``"RT @user: ..."``), otherwise 0. Empty strings and
        non-string values (such as NaN from an empty CSV field) yield 0.
    """
    if not isinstance(col, str):
        return 0
    # \b keeps ordinary words that merely start with "RT" (e.g. "RTX")
    # from being counted as retweets.
    return 1 if re.match(r'RT\b', col) else 0
        
def map_is_retweet(col):
    """Apply ``is_retweet`` to every text in ``col`` and return the flags as a list."""
    return [is_retweet(text) for text in col]

In [10]:
df['Retweet'] = map_is_retweet(df['Text'].values)

## Cleanup

In [11]:
# User mentions (including the username) and stray symbols removed in step 4.
# Kept as a single-line character class: the earlier multi-line raw string
# used backslash-newline "continuations", which a raw literal keeps verbatim,
# so the regex contained escaped newlines followed by indentation spaces.
# The non-ASCII members are also covered by the ASCII-only filter of step 6.
_MENTION_AND_SYMBOL_RE = re.compile(r'@\w+|[#!?¥â«»ÑÐ¼½¾¿°µ´º¹³\x82-\x8e]')
_NON_ASCII_RE = re.compile(r'[^\x00-\x7F]+')
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)

# NLTK's set(stopwords.words('english')) removes too many words; use the
# list of 25 semantically non-selective words (Reuters-RCV1 dataset).
_STOP_WORDS = frozenset([
    'a', 'an', 'and', 'are', 'as', 'at', 'be', 'by', 'for', 'from',
    'has', 'he', 'in', 'is', 'it', 'its', 'of', 'on', 'that', 'the',
    'to', 'was', 'were', 'will', 'with',
])
_RETWEET_TOKENS = frozenset(['rt'])


def cleanup_tweet(tweet, extractor=None):
    """Cleans up a tweet with the following steps:
        1. make lower case
        2. remove URLs
        3. unescape HTML entities
        4. remove user references (including username) or hashtags, etc.
        5. remove punctuation
        6. remove emojis and any other non-ASCII characters
        7. tokenize (dropping 'rt' tokens)
        8. filter stop words from tokens
        9. stem filtered tokens

    Args:
        tweet: raw tweet text.
        extractor: optional object with a ``find_urls(text)`` method. When
            omitted, the module-level ``url_extractor`` is used, so that
            name must be defined before the first call.

    Returns:
        A 3-tuple of space-joined strings: (tokenized, tokenized + filtered,
        tokenized + filtered + stemmed). Non-string input (e.g. NaN from an
        empty CSV field) yields three empty strings.
    """
    if not isinstance(tweet, str):
        return ("", "", "")
    if extractor is None:
        extractor = url_extractor

    # 1
    tweet = tweet.lower()

    # 2 - longest URLs first, so a URL that is a prefix of another one
    # cannot leave a fragment of the longer URL behind
    for url in sorted(set(extractor.find_urls(tweet)), key=len, reverse=True):
        tweet = tweet.replace(url, "")

    # 3
    tweet = unescape(tweet)

    # 4
    tweet = _MENTION_AND_SYMBOL_RE.sub('', tweet)

    # 5 - punctuation is deleted, not replaced by a space ("it's" -> "its")
    tweet = tweet.translate(_PUNCTUATION_TABLE)

    # 6 - after this the text is pure ASCII, so no separate ASCII check
    # is needed
    tweet = _NON_ASCII_RE.sub('', tweet).strip()

    # 7 tokenized only; every 'rt' token is dropped, wherever it appears
    tweet_tokens = [token for token in word_tokenize(tweet)
                    if token not in _RETWEET_TOKENS]

    # 8 tokenized + filtered
    filtered_tokens = [token for token in tweet_tokens
                       if token not in _STOP_WORDS]

    # 9 tokenized + filtered + stemmed
    ps = PorterStemmer()
    filtered_stemmed_tokens = [ps.stem(token) for token in filtered_tokens]

    return (
        " ".join(tweet_tokens),
        " ".join(filtered_tokens),
        " ".join(filtered_stemmed_tokens),
    )

In [12]:
# cleanup_tweet looks up `url_extractor` as a global, so build it once here
url_extractor = urlextract.URLExtract()
tuples = list(map(cleanup_tweet, df['Text']))

In [13]:
# spread the (tokenized, filtered, stemmed) triples over three columns
for position, column in enumerate(('tokenized', 'filtered', 'stemmed')):
    df.loc[:, column] = [cleaned[position] for cleaned in tuples]

In [14]:
df.head()

Unnamed: 0,ID,Timestamp,User,Text,Polarity,Retweet,tokenized,filtered,stemmed
0,1304836132090400768,2020-09-12 17:35:51,lorenabarbasso,RT @archivekarla: one year of liar mv 🤥✨ https...,-1,1,one year of liar mv,one year liar mv,one year liar mv
1,1304836131117129729,2020-09-12 17:35:50,zakisamo_,I guess I just miss being babied 🥺,-1,0,i guess i just miss being babied,i guess i just miss being babied,i guess i just miss be babi
2,1304836130857201664,2020-09-12 17:35:50,BroskeyTha,The Godfather has spoken... masambe grootman 😤...,-1,0,the godfather has spoken masambe grootman,godfather spoken masambe grootman,godfath spoken masamb grootman
3,1304836130475569152,2020-09-12 17:35:50,smolbeanseungmo,@4hyuckno @bottbabyjeno becauseee you guys see...,-1,0,becauseee you guys seemed busy,becauseee you guys seemed busy,becausee you guy seem busi
4,1304836130420989958,2020-09-12 17:35:50,Mansoor18214871,RT @Mansoor18214871: A phenomenal dancer.\nOur...,-1,1,a phenomenal dancer our sush we love you miss ...,phenomenal dancer our sush we love you miss yo...,phenomen dancer our sush we love you miss you ...


In [15]:
df.tail()

Unnamed: 0,ID,Timestamp,User,Text,Polarity,Retweet,tokenized,filtered,stemmed
22395,1305001402981179392,2020-09-13 04:32:34,tinybangtanarmy,RT @taesoothe: BTS with DNA was the first idol...,1,1,bts with dna was the first idol group in histo...,bts dna first idol group history debuted hot10...,bt dna first idol group histori debut hot100 f...
22396,1305001402851155968,2020-09-13 04:32:34,JagadeeshNTR14,RT @worldNTRfans: Painting @tarak9999 🥰😘\n\nAr...,1,1,painting artist pavan komarambheemntr,painting artist pavan komarambheemntr,paint artist pavan komarambheemntr
22397,1305001402800971776,2020-09-13 04:32:34,figu236016351,RT @LayariaNetwork: Cool 😂😂 https://t.co/Wp9KK...,1,1,cool,cool,cool
22398,1305001402771537921,2020-09-13 04:32:34,HareeshThala,RT @ArJunMohanan13: Smashed 160K Tweets 💥❤️\n\...,1,1,smashed 160k tweets moving to 200k 1yrofajithf...,smashed 160k tweets moving 200k 1yrofajithfans...,smash 160k tweet move 200k 1yrofajithfanspride...
22399,1305001402578530309,2020-09-13 04:32:34,KeishaAngeril,@taeddy_bear__ I'm just saying it if it's a Pr...,1,0,im just saying it if its a problem sorry,im just saying if problem sorry,im just say if problem sorri


In [16]:
# every repeat of a Text value after its first occurrence
dupes = df.loc[df['Text'].duplicated()]

In [17]:
len(dupes) / len(df)  # fraction of rows that are text dupes

0.20004464285714285

In [18]:
len(dupes[dupes['Retweet'] == 1]) / len(dupes)  # fraction of dupes that are retweets

1.0

In [19]:
## create a subset with cols of interest
#sub_df = df[['ID','Retweet','Text','Polarity']].copy()
#
## dedupe (text duplicates)
#dupes = sub_df[sub_df['Text'].duplicated(keep='first')]
#
#final_df = sub_df[~sub_df.ID.isin(list(dupes['ID']))]

In [20]:
#final_df.head()