In [14]:
import re
import os
import json
import time

import string
import datetime
import urlextract
import pandas as pd

from html import unescape
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

TODO:

- check for dupes, include deduping in ingestion
- cleanup tweet text
- create feature that identifies a retweet (RT)
- use TextBlob as an additional way to perform basic sentiment analysis


In [3]:
# load data
filepath = os.path.join("data", "raw", "tweets")

# os.listdir returns entries in arbitrary order, so sort the names to make the
# row order of the combined frame reproducible between runs. Dot-files and
# sub-directories (e.g. .DS_Store, .ipynb_checkpoints) are skipped because
# they are not tweet dumps and would make read_csv fail.
tweet_files = sorted(
    name
    for name in os.listdir(filepath)
    if not name.startswith(".") and os.path.isfile(os.path.join(filepath, name))
)

# One DataFrame per raw file; they are concatenated in a later cell.
dfm = [pd.read_csv(os.path.join(filepath, name)) for name in tweet_files]

In [4]:
# Stack the per-file frames and renumber the rows 0..n-1 in one step.
df = pd.concat(dfm, ignore_index=True)

In [5]:
# Peek at the first five rows of the combined frame.
df.head(n=5)

Unnamed: 0,ID,Timestamp,User,Text,Polarity
0,1302406168791470081,2020-09-06 00:40:02,asarinanamis,RT @Ayshiun: Totally inspired by@/kianamaiart'...,-1
1,1302406168766078976,2020-09-06 00:40:02,xleave_thecity,@thiinkinaboutit thank you 🥺,-1
2,1302406168170696706,2020-09-06 00:40:02,MsTam_Tam,some of yall retweets really be having me look...,-1
3,1302406168120365056,2020-09-06 00:40:02,_thebdawkk,@jeenbeen__ Thank you Jen 🥺 just miss the old ...,-1
4,1302406167956602880,2020-09-06 00:40:02,4ranghae1015,"@SJofficial My favorite part is the BAD boy, g...",-1


In [6]:
# Peek at the last five rows of the combined frame.
df.tail(n=5)

Unnamed: 0,ID,Timestamp,User,Text,Polarity
3995,1303190740311461891,2020-09-08 04:37:39,bokedone,@ClausPelz eant him full naked 1🥰😘😍🤩🤩🤩🤩🤩🤩🤩pls 🍆🍆🍑,1
3996,1303190740185550848,2020-09-08 04:37:39,joealexpitt,RT @onhi: Happy Tuesday! 😊\n\n https://t.co/u0...,1
3997,1303190740164644864,2020-09-08 04:37:39,TeresaASchmidt1,@tyler_hynes prime example of ur incredible ta...,1
3998,1303190740130869248,2020-09-08 04:37:39,jeonginsmilkbag,tw sexual assault. pls if you are not de...,1
3999,1303190740009279488,2020-09-08 04:37:39,artidesai555,@ramesh_ydp Good morning 😊,1


In [7]:
# Show every row whose tweet ID occurs more than once (all occurrences kept).
ids = df["ID"]
df[ids.duplicated(keep=False)]

Unnamed: 0,ID,Timestamp,User,Text,Polarity


In [8]:
# Show rows that are exact copies of another row across all columns.
full_row_dupes = df.duplicated(keep=False)
df.loc[full_row_dupes]

Unnamed: 0,ID,Timestamp,User,Text,Polarity


No duplicates were found in the current sample, so simply keep collecting more tweets. Implement deduplication in the ingestion step once duplicates start to appear.

## Cleanup

In [9]:
# Tweet normalisation: lower-case, strip URLs / mentions / punctuation /
# non-ASCII symbols, drop a small stop-word list and stem what is left.

# NLTK's stopwords.words('english') removes too many words, so use the list of
# 25 semantically non-selective words (Reuters-RCV1 dataset) plus 'rt', the
# retweet marker.
_STOP_WORDS = frozenset([
    'a', 'an', 'and', 'are', 'as', 'at', 'be', 'by', 'for', 'from',
    'has', 'he', 'in', 'is', 'it', 'its', 'of', 'on', 'that', 'the',
    'to', 'was', 'were', 'will', 'with', 'rt',
])

# User references, including the user name.  Only mentions need their own
# pattern: '#', '!' and '?' are covered by the punctuation table below and
# every non-ASCII symbol (accented letters, fractions, emojis, ...) by
# _NON_ASCII_RE.  Keeping the pattern on a single line also avoids embedding a
# backslash-newline (plus indentation) into the regex, which a raw string
# literal would treat as literal characters to match.
_MENTION_RE = re.compile(r'@\w+')
_NON_ASCII_RE = re.compile(r'[^\x00-\x7F]+')
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)

# Helper objects that are built on first use and then reused between calls.
_LAZY_TOOLS = {}


def _default_url_extractor():
    """Return a shared urlextract.URLExtract instance, creating it on first use."""
    if 'url_extractor' not in _LAZY_TOOLS:
        _LAZY_TOOLS['url_extractor'] = urlextract.URLExtract()
    return _LAZY_TOOLS['url_extractor']


def _default_stemmer():
    """Return a shared PorterStemmer instance, creating it on first use."""
    if 'stemmer' not in _LAZY_TOOLS:
        _LAZY_TOOLS['stemmer'] = PorterStemmer()
    return _LAZY_TOOLS['stemmer']


def cleanup_tweet(tweet, extractor=None):
    """Normalise the text of one tweet into a string of stemmed tokens.

    Steps, in order: lower-case; remove URLs; unescape HTML entities
    (e.g. ``&amp;``); remove ``@user`` mentions; remove ASCII punctuation;
    remove all non-ASCII characters (emojis included); tokenize; drop the
    stop words in ``_STOP_WORDS``; Porter-stem the remaining tokens.

    Parameters
    ----------
    tweet : str
        Raw tweet text.  Any non-string value (for example the NaN pandas
        uses for a missing cell) yields an empty string.
    extractor : object, optional
        Object with a ``find_urls(text)`` method returning the URLs found in
        ``text``.  When omitted, a shared ``urlextract.URLExtract`` instance
        is created on first use, so the function does not rely on a global
        being defined by some other cell.

    Returns
    -------
    str
        The surviving stemmed tokens joined by single spaces; ``""`` when
        nothing survives.
    """
    if not isinstance(tweet, str):
        return ""

    # make all lower case
    text = tweet.lower()

    # remove URLs (extractor based: better, albeit slower, than a regex)
    if extractor is None:
        extractor = _default_url_extractor()
    # Longest first, so that a URL which is a prefix of another one cannot
    # leave the tail of the longer URL behind.
    for url in sorted(set(extractor.find_urls(text)), key=len, reverse=True):
        text = text.replace(url, "")

    # unescape HTML entities like &quot
    text = unescape(text)

    # remove user references (including username)
    text = _MENTION_RE.sub('', text)

    # remove punctuation
    # NOTE(review): punctuation and emojis are deleted rather than replaced by
    # a space, so tokens separated only by them are merged ("1🥰pls" -> "1pls").
    # Confirm whether that is intended.
    text = text.translate(_PUNCTUATION_TABLE)

    # remove emojis and any other non-ASCII character; after this step the
    # text is guaranteed to be plain ASCII, so no further check is needed
    text = _NON_ASCII_RE.sub('', text).strip()

    # tokenize, drop stop words, stem
    stemmer = _default_stemmer()
    return " ".join(
        stemmer.stem(token)
        for token in word_tokenize(text)
        if token not in _STOP_WORDS
    )

In [None]:
# Summarise the frame (row count, columns, dtypes, non-null counts) instead of
# dumping it; missing values in 'Text' would show up here before cleanup.
df.info()

In [19]:
# Take the last five tweets as a small sample for trying out the cleanup.
# tail() is used instead of hard-coded index labels so the cell keeps working
# when the number of collected tweets changes.
tweets_text = df["Text"].tail(5)

In [23]:
# Preview the raw sample before it is cleaned.
tweets_text

3995    @ClausPelz eant him full naked 1🥰😘😍🤩🤩🤩🤩🤩🤩🤩pls 🍆🍆🍑
3996    RT @onhi: Happy Tuesday! 😊\n\n https://t.co/u0...
3997    @tyler_hynes prime example of ur incredible ta...
3998    tw sexual assault.       pls if you are not de...
3999                           @ramesh_ydp Good morning 😊
Name: Text, dtype: object

In [21]:
# Create the module-level URL extractor, then clean each sampled tweet.
url_extractor = urlextract.URLExtract()
cleaned_tweets_text = list(map(cleanup_tweet, tweets_text))
cleaned_tweets_text

['eant him full nake 1pl',
 'happi tuesday',
 'prime exampl ur incred talentcreepi bad curnow hotwiredinsurburbia luvurtal',
 'tw sexual assault pl if you not deal thi okay then take break ili all i off w',
 'good morn']