In [102]:
import csv
import datetime
import json
import os
import re
import string
import time
from html import unescape

import pandas as pd
import schedule
import tweepy
import urlextract
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

In [14]:
def initiate_api(config_path='./.conf/config.json'):
    """Build an authenticated tweepy API client from a JSON credentials file.

    Parameters
    ----------
    config_path : str, optional
        Path to a JSON file holding the keys ``CONSUMER_KEY``,
        ``CONSUMER_SECRET``, ``ACCESS_KEY`` and ``ACCESS_SECRET``.
        Defaults to ``./.conf/config.json``.

    Returns
    -------
    tweepy.API
        Client created with ``wait_on_rate_limit=True``.

    Raises
    ------
    KeyError
        If one or more of the required keys is absent from the file. All
        missing keys are reported together, before any tweepy object is built.
    """
    with open(config_path, 'r') as f:
        config = json.load(f)

    # Validate up front so a half-filled config fails with one clear message
    # instead of a bare KeyError on whichever key happens to be read first.
    required = ("CONSUMER_KEY", "CONSUMER_SECRET", "ACCESS_KEY", "ACCESS_SECRET")
    missing = [key for key in required if key not in config]
    if missing:
        raise KeyError("{} is missing credential(s): {}".format(
            config_path, ", ".join(missing)))

    auth = tweepy.OAuthHandler(config["CONSUMER_KEY"], config["CONSUMER_SECRET"])
    auth.set_access_token(config["ACCESS_KEY"], config["ACCESS_SECRET"])
    return tweepy.API(auth, wait_on_rate_limit=True)

In [15]:
# Build the authenticated client once; the search cell below calls api.search on it.
api = initiate_api()

In [16]:
# Search query: tweets containing at least one "negative" emoji and none of the
# "positive" ones. The backslash continuations sit inside the string literal, so
# the indentation of the continued lines is part of the query text.
# NOTE(review): 😭 appears in BOTH the wanted list and the excluded list, and the
# recorded results still contain it -- confirm which side it belongs on and
# whether the search endpoint honours the `AND -( ... )` grouping at all.
negative_query = '(🤬 OR 🤮 OR 😡 OR 😤 OR 🥺 OR 🤢 OR 😣 OR 😟 OR 😣 OR 🤔 OR 🤥 OR 😫 OR 🤮 OR 🥵 OR 😨 OR 😰 \
                   OR 😭 OR 😥 OR 🙁) AND -(😃 OR 😄 OR 😁 OR 🥰 OR 😊 OR ❤️ OR 💋 OR 😍 OR 😂 OR 😎 OR 🤣 OR 😘 \
                   OR 😇 OR 🙃 OR 😉 OR 😇 OR 🤩 OR 😃 OR 😄 OR 🙂 OR 😭)'

# Lower bound passed to the search call as `since`.
date_since = "2019-01-01"

# Collect tweets
# Rate-limit handling is configured on the API object itself
# (wait_on_rate_limit=True in initiate_api). The former `monitor_rate_limit` /
# `wait_on_rate_limit` keyword arguments are not search parameters and were
# merely forwarded to the endpoint as unknown query parameters, so they are gone.
tweets = []
for status in tweepy.Cursor(api.search,
                            q=negative_query,
                            include_entities=True,
                            lang="en",
                            since=date_since).items(10):

    # One row per tweet: [id, timestamp, author handle, text]
    tweets.append([status.id_str,
                   status.created_at,
                   status.user.screen_name,
                   status.text])

In [137]:
# Tabulate the collected rows; the column order mirrors the per-tweet lists in `tweets`.
tweets_df = pd.DataFrame(tweets, columns=["ID", "Timestamp", "User", "Text"])

# Save raw data for processing later.
# The second-resolution timestamp prefix keeps successive runs in separate files.
now_prefix = datetime.datetime.now().strftime("%Y%m%d_%H%M%S_")

filepath = "./data/tweets/raw"
# exist_ok=True replaces the exists()/makedirs() pair: no check-then-create race,
# and the cell can be re-run safely.
os.makedirs(filepath, exist_ok=True)

tweets_df.to_csv(os.path.join(filepath, now_prefix + "tweets.csv"), index=False)

In [134]:
# TODO:
# - create a feature that identifies a retweet (RT)
# - add a label (-1; maybe use Python's TextBlob to compare against its sentiment analysis)
# - clean up the tweet text

In [135]:
tweets

[['1302377912545370112',
  datetime.datetime(2020, 9, 5, 22, 47, 46),
  'kimseok1204',
  'RT @monipersona: a thread of namjoon; but as you scroll down he gets older 🥺 @BTS_twt https://t.co/2LNRCnfjY7'],
 ['1302377911962480646',
  datetime.datetime(2020, 9, 5, 22, 47, 45),
  'Sabel014',
  'RT @SB19Official: What\'s your favorite track in "Get In The Zone!"? 🤔\n\n#SB191stAlbum #SB19 #SB19_SEJUN #SB19_JUSTIN #SB19_STELL #SB19_JOSH…'],
 ['1302377911580864513',
  datetime.datetime(2020, 9, 5, 22, 47, 45),
  'signor_Wilson',
  'RT @OlisaOsega: After dancing tonight, tomorrow morning Prince will open his hair dressing and barbing salon. 😭😭😭\n\nIf Darling don’t give th…'],
 ['1302377911521955845',
  datetime.datetime(2020, 9, 5, 22, 47, 45),
  'MalakiBusta',
  '@hoobisboobies 🥺 sending hugs'],
 ['1302377911387918336',
  datetime.datetime(2020, 9, 5, 22, 47, 45),
  'ceeitachi',
  'RT @rahm3sh: Y’all need to leave the jobs that require you to serve food on roller skates alone ‼️😭 https://t.co/qN

In [136]:
tweets_df

Unnamed: 0,ID,Timestamp,User,Text
0,1302377912545370112,2020-09-05 22:47:46,kimseok1204,RT @monipersona: a thread of namjoon; but as y...
1,1302377911962480646,2020-09-05 22:47:45,Sabel014,RT @SB19Official: What's your favorite track i...
2,1302377911580864513,2020-09-05 22:47:45,signor_Wilson,"RT @OlisaOsega: After dancing tonight, tomorro..."
3,1302377911521955845,2020-09-05 22:47:45,MalakiBusta,@hoobisboobies 🥺 sending hugs
4,1302377911387918336,2020-09-05 22:47:45,ceeitachi,RT @rahm3sh: Y’all need to leave the jobs that...
5,1302377911291392002,2020-09-05 22:47:45,_Divergent_00,RT @alyssabaptiste_: men love to play the assh...
6,1302377911132053513,2020-09-05 22:47:45,oyinks434,It’s the love emoji for me😭😭 https://t.co/UyA4...
7,1302377911001833472,2020-09-05 22:47:45,bongotttti,RT @lilkowch: lmaooo wtf my bro chilling in ir...
8,1302377910888792064,2020-09-05 22:47:45,vuyiiswaa,RT @sibahlesays: 😭😭😭 pls why is this ending me...
9,1302377910431440896,2020-09-05 22:47:45,versacepapij,I’m convinced the world ended in 2016 cus ain’...


In [94]:
# modified and improved version
def cleanup_tweet(tweet):
    """Normalize a raw tweet into a space-joined string of stemmed tokens.

    Steps, in order: lower-case, strip URLs, unescape HTML entities, drop
    ``@user`` mentions and a set of stray symbols, drop ASCII punctuation,
    drop every remaining non-ASCII character (emojis included), tokenize,
    remove a small stop-word list and Porter-stem what is left.

    Relies on the module-level ``url_extractor`` object (anything exposing
    ``find_urls(text)``), which must exist before the first call.

    Parameters
    ----------
    tweet : str
        Raw tweet text.

    Returns
    -------
    str
        Stemmed tokens joined by single spaces; empty when nothing survives.
    """
    # make all lower case
    tweet = tweet.lower()

    # remove URLs (urlextract is slower than a hand-written regex but more accurate).
    # Longest first, with a lexical tie-break: a short URL that is a prefix of a
    # longer one can no longer leave a dangling tail, and the result does not
    # depend on set iteration order.
    found_urls = set(url_extractor.find_urls(tweet))
    for url in sorted(found_urls, key=lambda u: (-len(u), u)):
        tweet = tweet.replace(url, "")

    # unescape HTML entities like &quot
    tweet = unescape(tweet)

    # remove user references (including username), hashtag marks and stray symbols.
    # Written as one character class on a single line: a raw string spread over
    # several lines with trailing backslashes keeps the backslash, newline and
    # indentation inside the pattern, which made the regex delete
    # "newline + indentation" (gluing words together) and kept the inverted
    # question mark from matching on its own.
    pattern = r'@\w+|[#¥â«»ÑÐ¼½¾!?¿\x82-\x8e°µ´º¹³]'
    tweet = re.sub(pattern, '', tweet)

    # remove punctuations
    tweet = tweet.translate(str.maketrans('', '', string.punctuation))

    # remove emojis (and any other non-ASCII character); after this the text is
    # pure ASCII, so no further "is it ASCII?" check is needed.
    tweet = re.sub(r'[^\x00-\x7F]+', '', tweet).strip()

    # remove stopwords
    # NLTK's set(stopwords.words('english')) removes too many words
    # using list of 25 semantically non-selective words (Reuters-RCV1 dataset)
    stop_words = frozenset([
        'a', 'an', 'and', 'are', 'as', 'at', 'be', 'by', 'for', 'from',
        'has', 'he', 'in', 'is', 'it', 'its', 'of', 'on', 'that', 'the',
        'to', 'was', 'were', 'will', 'with', 'rt'])  # plus rt for retweet

    ps = PorterStemmer()
    return " ".join(ps.stem(w) for w in word_tokenize(tweet) if w not in stop_words)

In [95]:
# Keep only the text field (position 3) of every collected row.
tweets_text = [row[3] for row in tweets]

In [96]:
tweets_text

['RT @monipersona: a thread of namjoon; but as you scroll down he gets older 🥺 @BTS_twt https://t.co/2LNRCnfjY7',
 'RT @SB19Official: What\'s your favorite track in "Get In The Zone!"? 🤔\n\n#SB191stAlbum #SB19 #SB19_SEJUN #SB19_JUSTIN #SB19_STELL #SB19_JOSH…',
 'RT @OlisaOsega: After dancing tonight, tomorrow morning Prince will open his hair dressing and barbing salon. 😭😭😭\n\nIf Darling don’t give th…',
 '@hoobisboobies 🥺 sending hugs',
 'RT @rahm3sh: Y’all need to leave the jobs that require you to serve food on roller skates alone ‼️😭 https://t.co/qNo58BAgXL',
 'RT @alyssabaptiste_: men love to play the asshole and no feelings role when they have all the feelings in the world 😭',
 'It’s the love emoji for me😭😭 https://t.co/UyA4sefiq3',
 'RT @lilkowch: lmaooo wtf my bro chilling in iraq 😭 https://t.co/Bo41xsTCA2',
 'RT @sibahlesays: 😭😭😭 pls why is this ending me?? https://t.co/iDiBbldP2c',
 'I’m convinced the world ended in 2016 cus ain’t shit been the same since 😭']

In [97]:
# cleanup_tweet looks up `url_extractor` as a global, so it has to exist before
# the cleaning pass on the next line runs.
url_extractor = urlextract.URLExtract()
cleanted_tweets_text = list(map(cleanup_tweet, tweets_text))

In [98]:
cleanted_tweets_text

['thread namjoon but you scroll down get older',
 'what your favorit track get zone sb191stalbum sb19 sb19sejun sb19justin sb19stell sb19josh',
 'after danc tonight tomorrow morn princ open hi hair dress barb salon if darl dont give th',
 'send hug',
 'yall need leav job requir you serv food roller skate alon',
 'men love play asshol no feel role when they have all feel world',
 'love emoji me',
 'lmaooo wtf my bro chill iraq',
 'pl whi thi end me',
 'im convinc world end 2016 cu aint shit been same sinc']

In [19]:
def twitter_bot(api, query, date_since):
    """Fetch tweets once and append them to today's CSV under ``./data/tweets``.

    Parameters
    ----------
    api
        Client object handed straight through to ``get_tweets``.
    query
        Search query, handed straight through to ``get_tweets``.
    date_since
        Lower date bound, handed straight through to ``get_tweets``.

    Notes
    -----
    If the first fetch raises, the function waits a little over an hour and
    tries exactly once more; an error from that second attempt propagates.
    NOTE(review): ``get_tweets`` is not defined in the visible part of this
    notebook -- presumably it returns an iterable of row lists; confirm.
    """
    retry_delay_seconds = 3605  # one hour plus a small safety margin

    # Fetch before touching the file so no handle is held open during the wait
    # and a failed retry cannot leak one.
    try:
        tweets = get_tweets(api, query, date_since)
    except Exception as exc:  # not a bare except: Ctrl-C / SystemExit must still get through
        print("API limit exceeded. Waiting 1 hour.")
        print("Underlying error: {!r}".format(exc))
        time.sleep(retry_delay_seconds)
        tweets = get_tweets(api, query, date_since)

    filepath = "./data/tweets"
    os.makedirs(filepath, exist_ok=True)

    # Join onto the directory: plain string concatenation produced
    # "./data/tweets_<date>_tweets.csv", i.e. a file NEXT TO the directory that
    # had just been created rather than inside it.
    today = datetime.datetime.today().strftime('%Y%m%d')
    csv_path = os.path.join(filepath, today + "_tweets.csv")

    # newline="" is what the csv module expects from the file object; utf-8 keeps
    # emoji-laden text writable regardless of the platform's default encoding.
    with open(csv_path, "a", newline="", encoding="utf-8") as file_tweets:
        csv.writer(file_tweets).writerows(tweets)

def main(query, date_since):
    """Register ``twitter_bot`` to run every 60 seconds and drive the scheduler forever.

    A fresh API client is created via ``initiate_api()`` and handed, together
    with ``query`` and ``date_since``, to every scheduled ``twitter_bot`` run.
    The function never returns: it polls for due jobs in an endless loop.
    """
    client = initiate_api()

    # Alternative cadence that was considered: once a day at midnight,
    # i.e. schedule.every().day.at("00:00") instead of every(60).seconds.
    schedule.every(60).seconds.do(twitter_bot, client, query, date_since)

    # Poll for due jobs, pausing 10 seconds between checks.
    while True:
        schedule.run_pending()
        time.sleep(10)

In [None]:
# run
# NOTE: main() loops forever (`while True`), so this cell never finishes on its
# own; interrupt the kernel to stop collecting.
main(negative_query, date_since)