In [1]:
import re
import os
import json
import time

import string
import datetime
import urlextract
import pandas as pd

from html import unescape
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

TODO:

- check for dupes in ingestion
- use textBlob as yet another way to perform basic sentiment analysis


In [2]:
# load data
filepath = os.path.join("data","raw","tweets") 

dfm = []
for f in os.listdir(filepath):
    dfm.append(pd.read_csv(os.path.join(filepath,f)))

In [3]:
df = pd.concat(dfm)
df = df.reset_index(drop=True)

In [4]:
# test for dupes
ids = df["ID"]
df[ids.isin(ids[ids.duplicated()])]

Unnamed: 0,ID,Timestamp,User,Text,Polarity


In [5]:
# another, similar test
df[df.duplicated(keep=False)]

Unnamed: 0,ID,Timestamp,User,Text,Polarity


In [6]:
# look at users with more than 1 tweet
grouped = df[['User', 'ID']].groupby('User').count().sort_values('ID', ascending=False)

In [7]:
grouped[grouped['ID']>1].head()

Unnamed: 0_level_0,ID
User,Unnamed: 1_level_1
Stephaniedare3,2
alvynzaddy,2
machonejack,2
unique_friendss,2
Ms_nennie,2


In [8]:
df[df['User']=='JanelyMosqueda_']

Unnamed: 0,ID,Timestamp,User,Text,Polarity
3862,1303190716311625728,2020-09-08 04:37:33,JanelyMosqueda_,@meeyuhxo awe 🥺 Girl you got the waterworks go...,-1
5557,1303432470390542345,2020-09-08 20:38:12,JanelyMosqueda_,@_espinxsaaa Thank you 🥰❤️❤️,1


#### Adding Retweet Column

In [9]:
def is_retweet(col):
    if re.match(r'^RT', col) is not None:
        return 1
    else:
        return 0

In [10]:
for i in range(len(df)):
    df.loc[i, 'Retweet'] = is_retweet(df.loc[i, 'Text'])

In [11]:
df.tail()

Unnamed: 0,ID,Timestamp,User,Text,Polarity,Retweet
7995,1303492884138663936,2020-09-09 00:38:15,graceland1911,RT @Giveaway_Contst: I'll Send $25 to Someone ...,1,1.0
7996,1303492883979395076,2020-09-09 00:38:15,AlanOLeary20,RT @LFC: 𝑭𝒐𝒖𝒓 𝒅𝒂𝒚𝒔 𝒕𝒐 𝒈𝒐 🙌😄 https://t.co/icGDa...,1,1.0
7997,1303492883970834433,2020-09-09 00:38:15,lovexokml,RT @ruoi_05: Ohm .. Are you a groom? 🤭🤭💕💕😍😍 @...,1,1.0
7998,1303492883908104192,2020-09-09 00:38:15,Helen09Porter,@PicklesBottom @charliedecheeky 😘 luv yooz Pic...,1,0.0
7999,1303492883782283265,2020-09-09 00:38:15,_Samanthald,@Kuwnatalie Whoaaa sis😍,1,0.0


## Cleanup

In [12]:
# modified and improved version
def cleanup_tweet(tweet):

    # make all lower case
    tweet = tweet.lower()

    # remove URLs
    # URL_pattern = r'\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*'
    # tweet = re.sub(URL_pattern, '', tweet, flags=re.MULTILINE)
    # better, albeit slower, version
    urls = list(set(url_extractor.find_urls(tweet)))
    if len(urls) > 0:
        for url in urls:
            tweet = tweet.replace(url, "")

    # unescape HTML entities like &quot
    tweet = unescape(tweet)
    
    # remove user references (including username) or hashtags, etc.     
    pattern = r'\@\w+|\#|\¥|\â|\«|\»|\Ñ|\Ð|\¼|\½|\¾|\!|\?|\¿\
                |\x82|\x83|\x84|\x85|\x86|\x87|\x88|\x89|\
                |\x8a|\x8b|\x8c|\x8d|\x8e|\°|\µ|\´|\º|\¹|\³'
    
    tweet = re.sub(pattern, '', tweet)

    # remove punctuations
    tweet = tweet.translate(str.maketrans('', '', string.punctuation))
    
    # remove emojis
    tweet = re.sub(r'[^\x00-\x7F]+', '', tweet).strip()
    
    # final test, if after utf-8 encoding it's not ascii decodable, discard
    def is_ascii(text):
        try:
            text.encode(encoding='utf-8').decode('ascii')
        except UnicodeDecodeError:
            return False
        else:
            return True
    
    if is_ascii(tweet) == False:
        return " "
    else:
        pass
    
    # remove stopwords
    # NLTK's set(stopwords.words('english')) removes too many words
    # using list of 25 semantically non-selective words (Reuters-RCV1 dataset)
    stop_words = ['a','an','and','are','as','at','be','by','for','from',
                  'has','he','in','is','it','its','of','on','that','the',
                  'to','was','were','will','with','rt'] # plus rt for retweet

    tweet_tokens = word_tokenize(tweet)
    filtered_words = [w for w in tweet_tokens if not w in stop_words]

    ps = PorterStemmer()
    stemmed_words = [ps.stem(w) for w in filtered_words]

    return " ".join(stemmed_words)

In [13]:
# make a copy and clean
df_clean = df.copy()

url_extractor = urlextract.URLExtract()
df_clean['Text'] = [cleanup_tweet(tweet) for tweet in df.loc[:,'Text']]

In [14]:
final_df = df_clean[['ID','Retweet','Text','Polarity']].copy()

In [15]:
# create directory for saving processed data
filepath = os.path.join("data","processed","tweets")
    
if not os.path.exists(filepath):
    os.makedirs(filepath)

In [16]:
# save final_df
today_prefix = datetime.datetime.now().strftime("%Y%m%d")
filename = ''.join([today_prefix, "_tweets.csv"])
final_df.to_csv(os.path.join(filepath, filename), index=False)