In [5]:
import pandas as pd
from pathlib import Path
import re
from tqdm.auto import tqdm
import preprocessor as p

In [6]:
root_dir = 'Twitter/COVID19-Tweets-KaggleDataset/'
output_file = 'Twitter/COVID19-Tweets-KaggleDataset-parsed_cleaned.txt'
BEGINNING_OF_TWEET_SYMBOL = '<BOT> '
END_OF_TWEET_SYMBOL = ' <EOT> '
NUM_SYMBOL = ' '

In [7]:
#HappyEmoticons
emoticons_happy = set([
    ':-)', ':)', ';)', ':o)', ':]', ':3', ':c)', ':>', '=]', '8)', '=)', ':}',
    ':^)', ':-D', ':D', '8-D', '8D', 'x-D', 'xD', 'X-D', 'XD', '=-D', '=D',
    '=-3', '=3', ':-))', ":'-)", ":')", ':*', ':^*', '>:P', ':-P', ':P', 'X-P',
    'x-p', 'xp', 'XP', ':-p', ':p', '=p', ':-b', ':b', '>:)', '>;)', '>:-)',
    '<3'
    ])

# Sad Emoticons
emoticons_sad = set([
    ':L', ':-/', '>:/', ':S', '>:[', ':@', ':-(', ':[', ':-||', '=L', ':<',
    ':-[', ':-<', '=\\', '=/', '>:(', ':(', '>.<', ":'-(", ":'(", ':\\', ':-c',
    ':c', ':{', '>:\\', ';('
    ])

#Emoji patterns
emoji_pattern = re.compile("["
         u"\U0001F600-\U0001F64F"  # emoticons
         u"\U0001F300-\U0001F5FF"  # symbols & pictographs
         u"\U0001F680-\U0001F6FF"  # transport & map symbols
         u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
         u"\U00002702-\U000027B0"
         u"\U000024C2-\U0001F251"
         "]+", flags=re.UNICODE)

#combine sad and happy emoticons
emoticons = emoticons_happy.union(emoticons_sad)

punctuations_pattern = re.compile(r'([,/$%^&*;|<>:+@#{}\[\]\\=`~()])+')

In [8]:
p.set_options(p.OPT.URL, p.OPT.EMOJI, p.OPT.RESERVED, p.OPT.MENTION, p.OPT.SMILEY)

In [9]:
def clean_tweets(tweet:str):
    
    tweet = p.clean(tweet)
    
    tweet = re.sub(r':', '', tweet)
    tweet = re.sub(r'‚Ä¶', '', tweet)
    
    # remove # from hashtags
    tweet = re.sub(r'#(\S+)',r'\1',tweet)
    
#     #replace consecutive non-ASCII characters with a space except ’
    tweet = re.sub(r'(?![’])[^\x00-\x7F]+',' ', tweet)
    
#     #remove emojis from tweet
    tweet = emoji_pattern.sub(r'', tweet)
    
#     # consolidate repetitive punctuations
    tweet = re.sub(r'([.!?])[.!?]+',r'\1',tweet)
    
    #seperate punctuations from words
    tweet = re.sub(r'(\S)([.!?])',r'\1 \2',tweet)
    tweet = re.sub(r'([.!?])(\S)',r'\1 \2',tweet)
    
    #     # remove punctuations
    tweet = punctuations_pattern.sub(r' ', tweet)
    
#     # remove numbers
    tweet = re.sub(r'\s+\d+', NUM_SYMBOL, tweet)
    tweet = re.sub(r'^\d+', NUM_SYMBOL.lstrip(), tweet)
    
#     # remove repetitive white spaces
    tweet = re.sub('\s+', ' ', tweet)
    
    tweet = tweet.strip()
    
    tweet = tweet.lower()
    
    return tweet

In [10]:
with open(output_file,'w') as out_file:
    
    for path in Path(root_dir).rglob('*Tweets*.CSV'):
        print(path)
        tweets = pd.read_csv(path)
        tweets = tweets[['text','lang']]
        
        for index, row in tqdm(tweets.iterrows()):
            
            if row['lang'] == 'en':
                tweet = row['text']
                tweet = clean_tweets(tweet)
                out_file.write(BEGINNING_OF_TWEET_SYMBOL+tweet+END_OF_TWEET_SYMBOL)

Twitter/COVID19-Tweets-KaggleDataset/2020-03-00 Coronavirus Tweets (pre 2020-03-12).CSV


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


Twitter/COVID19-Tweets-KaggleDataset/2020-03-24 Coronavirus Tweets.CSV


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


Twitter/COVID19-Tweets-KaggleDataset/2020-03-15 Coronavirus Tweets.CSV


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


Twitter/COVID19-Tweets-KaggleDataset/2020-03-23 Coronavirus Tweets.CSV


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


Twitter/COVID19-Tweets-KaggleDataset/2020-03-12 Coronavirus Tweets.CSV


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


Twitter/COVID19-Tweets-KaggleDataset/2020-03-18 Coronavirus Tweets.CSV


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


Twitter/COVID19-Tweets-KaggleDataset/2020-03-20 Coronavirus Tweets.CSV


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


Twitter/COVID19-Tweets-KaggleDataset/2020-03-27 Coronavirus Tweets.CSV


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


Twitter/COVID19-Tweets-KaggleDataset/2020-03-16 Coronavirus Tweets.CSV


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


Twitter/COVID19-Tweets-KaggleDataset/2020-03-17 Coronavirus Tweets.CSV


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


Twitter/COVID19-Tweets-KaggleDataset/2020-03-26 Coronavirus Tweets.CSV


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


Twitter/COVID19-Tweets-KaggleDataset/2020-03-21 Coronavirus Tweets.CSV


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


Twitter/COVID19-Tweets-KaggleDataset/2020-03-19 Coronavirus Tweets.CSV


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


Twitter/COVID19-Tweets-KaggleDataset/2020-03-13 Coronavirus Tweets.CSV


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


Twitter/COVID19-Tweets-KaggleDataset/2020-03-22 Coronavirus Tweets.CSV


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


Twitter/COVID19-Tweets-KaggleDataset/2020-03-28 Coronavirus Tweets.CSV


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


Twitter/COVID19-Tweets-KaggleDataset/2020-03-14 Coronavirus Tweets.CSV


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


Twitter/COVID19-Tweets-KaggleDataset/2020-03-25 Coronavirus Tweets.CSV


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


