In [2]:
import numpy as np
import pandas as pd

import glob
import os
import re
import string

from cleantext import clean

# Define Dataset Files

In [27]:
# Corpus selector. Supported values: 'twitter', 'semeval'.
# Each value enables exactly one of the two configuration cells that follow.

dataset_type = 'twitter'
#dataset_type = 'semeval'

In [38]:
# Twitter: configuration used when dataset_type == 'twitter'
if dataset_type == 'twitter':
    # pd.read_csv options: comma-separated, column names taken from each
    # file's own header row (names=None together with header='infer').
    sep = ','
    header = 'infer'
    names = None

    # Input and output folders, relative to the notebook's working directory.
    relative_import_path = 'Datasets/ScrapedTwitter/'
    relative_export_path = 'Datasets/TwitterCleaned/'

    # Files to process, in processing order.
    dataset_files = [
        'dino_tweets_15k.csv',
        'league_tweets_15k.csv',
        'musk_tweets_15k.csv',
        'more_dino_tweets_15k.csv',
        'more_league_tweets_15k.csv',
        'more_chief_twit_15k.csv',
        'puppies_15k.csv',
        '2022_supreme_court_tweets_15k.csv',
        '2022_imac_tweets_15k.csv',
    ]

In [39]:
# Semeval: configuration used when dataset_type == 'semeval'
if dataset_type == 'semeval':
    relative_import_path = 'Datasets/semeval-datasets/2017_English_final/GOLD/Subtask_A/'
    relative_export_path = 'Datasets/SemevalCleaned/'

    # pd.read_csv options: the files are read as tab-separated with no header
    # row, so the column names are supplied here.
    # NOTE(review): 'blah' looks like a placeholder for an unused trailing
    # column -- confirm against the raw files.
    names = ['id', 'sentiment', 'tweet', 'blah']
    sep = '\t'
    header = None

    # glob returns matches in arbitrary order; sort them so that the order in
    # which files are processed is the same on every run.
    import_files = sorted(glob.glob(os.path.join(relative_import_path, 'twitter*.txt')))

    # Keep only the file names; the import path is prepended again on read.
    dataset_files = [os.path.basename(path) for path in import_files]

# Define text cleaning functions

In [40]:
def removePunctTokens(tokens):
    """Return a new list with single-character punctuation tokens removed.

    A token is dropped only when it is exactly one of the characters in
    ``string.punctuation``; longer tokens such as ``'...'`` are kept.

    Parameters
    ----------
    tokens : iterable of str
        Tokens to filter.

    Returns
    -------
    list of str
        The tokens that are not punctuation characters, in their original order.
    """
    punctuation = set(string.punctuation)
    # The original built this list but never returned it, so callers got None.
    return [token for token in tokens if token not in punctuation]
            
def replaceWithSpaces(string, characters):
    """Return ``string`` with every occurrence of each item in ``characters``
    replaced by a single space.

    Parameters
    ----------
    string : str
        Text to process.
    characters : iterable of str
        Substrings (typically single characters) to blank out.

    Returns
    -------
    str
        The processed text; ``string`` itself is not modified.
    """
    outString = string
    for character in characters:
        outString = outString.replace(character, ' ')

    return outString

def removeHtml(string):
    """Strip anchor tags and ``<br />`` tags from ``string``, keeping the text
    between them.

    Parameters
    ----------
    string : str
        Text that may contain ``<a ...>``, ``</a>`` and ``<br />`` markup.

    Returns
    -------
    str
        The text with those tags removed.
    """
    # Stop at the first '>' so that a literal '>' in the link text is not
    # swallowed together with the opening tag.
    outstring = re.sub(r'<a [^<>]*>', '', string)
    outstring = re.sub(r'</a>', '', outstring)
    # Second pass: removing a nested closing tag can leave a new '</a>' behind.
    outstring = outstring.replace('</a>', '')
    outstring = outstring.replace('<br />', '')

    return outstring

def removeStartingRetweet(string):
    """Drop a leading ``RT @<handle>: `` retweet marker from ``string``.

    Only a marker at the very start of the text is removed; the shortest
    possible prefix ending in ``': '`` is taken.
    """
    retweet_prefix = re.compile('^RT @.*?: ')
    return retweet_prefix.sub('', string)

def removeLink(string):
    """Replace every URL-like substring of ``string`` with a single space.

    The pattern is case-insensitive and recognises text starting with
    ``http://`` / ``https://``, ``www`` (optionally followed by up to three
    digits) and a dot, or a domain-like token followed by ``/``. Parenthesised
    groups inside the URL are allowed, and the match never ends on whitespace
    or on one of the listed punctuation characters.
    """
    url_regex = re.compile(
        r"(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’]))"
    )

    return url_regex.sub(' ', string)

def removeAts(string):
    """Replace each ``@`` and the word characters that directly follow it
    (e.g. ``@user_name``) with a single space.
    """
    # Raw string: '\w' in a normal string literal is an invalid escape sequence.
    return re.sub(r'@\w*', ' ', string)

def removeHashtags(string):
    """Replace each ``#`` and the word characters that directly follow it
    (e.g. ``#topic``) with a single space.
    """
    # Raw string: '\w' in a normal string literal is an invalid escape sequence.
    return re.sub(r'#\w*', ' ', string)

def collapseWhitespace(string):
    """Replace every run of whitespace (spaces, tabs, newlines, ...) in
    ``string`` with a single space character.
    """
    # Raw string: '\s' in a normal string literal is an invalid escape sequence.
    return re.sub(r'\s+', ' ', string)

def cleanDirtyWebText(string):
    """Run ``string`` through ``cleantext.clean`` and return the result.

    The call asks for emoji, e-mail addresses and phone numbers to be removed,
    with e-mails and phone numbers replaced by a single space. Every other
    option is left at the library default (not visible in this notebook).
    """
    clean_options = {
        'no_emoji': True,
        'no_emails': True,
        'no_phone_numbers': True,
        'replace_with_email': ' ',
        'replace_with_phone_number': ' ',
    }
    return clean(string, **clean_options)

def cleanTweet(string):
    """Apply the full tweet-cleaning pipeline to ``string`` and return the result.

    Steps, in order: drop a leading retweet marker, run the generic web-text
    cleaner, remove @-mentions, links and hashtags, collapse whitespace runs,
    and finally remove a leading/trailing blank.
    """
    steps = (
        removeStartingRetweet,
        cleanDirtyWebText,
        removeAts,
        removeLink,
        removeHashtags,
        collapseWhitespace,
    )

    cleaned = string
    for step in steps:
        cleaned = step(cleaned)

    # collapseWhitespace has just reduced every whitespace run to one space,
    # so at most a single blank can remain at either end.
    return cleaned.strip(' ')

# Read Datasets

In [41]:
# One DataFrame per input file, in the same order as dataset_files.
datasets = []

for dataset in dataset_files:
    print(f"Import {dataset}")
    df = pd.read_csv(os.path.join(relative_import_path, dataset), sep=sep, header=header, names=names)

    # Normalise the text column to the name 'tweet'.
    if 'tweet' in df:
        final_df = df[['tweet']].astype('string')
    elif 'text' in df:
        final_df = df[['text']].rename(columns={"text": 'tweet'})
    else:
        # Without this guard the frame from the previous file would be reused.
        raise KeyError(f"{dataset}: expected a 'tweet' or 'text' column, found {list(df.columns)}")

    # Carry the label column along when the file has one.
    if 'sentiment' in df:
        final_df = final_df.join(df['sentiment'])

    size_before = len(final_df)
    # Treat empty and single-blank tweets as missing. Assign the result back:
    # calling replace(..., inplace=True) on the selected column is a chained
    # update that does not reach final_df under pandas Copy-on-Write.
    final_df['tweet'] = final_df['tweet'].replace(['', ' '], np.nan)
    final_df = final_df.dropna()
    size_after = len(final_df)

    print(f'Dropped {size_before - size_after} null rows')
    print()

    datasets.append(final_df)

Import dino_tweets_15k.csv
Dropped 0 null rows

Import league_tweets_15k.csv
Dropped 0 null rows

Import musk_tweets_15k.csv
Dropped 0 null rows

Import more_dino_tweets_15k.csv
Dropped 0 null rows

Import more_league_tweets_15k.csv
Dropped 0 null rows

Import more_chief_twit_15k.csv
Dropped 0 null rows

Import puppies_15k.csv
Dropped 0 null rows

Import 2022_supreme_court_tweets_15k.csv
Dropped 0 null rows

Import 2022_imac_tweets_15k.csv
Dropped 0 null rows



In [33]:
# Display the first DataFrame in `datasets` as this cell's output.
datasets[0]

Unnamed: 0,tweet
0,RT @purinkoo: jungkookie is really the pretty ...
1,Light Painting 101: Illuminating a terrifying ...
2,Light Painting 101: Illuminating a terrifying ...
3,RT @Coolio_Art: Small art tip for drawing rept...
4,RT @nahi_tigray: There is never true peace wit...
...,...
15156,We kicked the day off w/ a rocking dinosaur! S...
15157,Dinosaur T-rex Colored Lights Christmas Hawaii...
15158,Dinosaur T-rex Colored Lights Christmas Xmas B...
15159,#hashtag2 Cartoon cute dinosaur backpack nylon...


# Clean Datasets

In [42]:
# Clean every loaded frame in place, reporting how many rows each step removes.
for (dataset, filename) in zip(datasets, dataset_files):
    print(f"Clean {filename}")
    dataset['tweet'] = dataset['tweet'].apply(cleanTweet)

    # Step 1: drop rows that are exact duplicates after cleaning.
    rowsBefore = len(dataset)
    dataset.drop_duplicates(inplace=True)
    rowsAfter = len(dataset)

    print(f"Dropped {rowsBefore - rowsAfter} duplicates.")

    # Step 2: drop rows whose tweet was cleaned down to an empty string.
    rowsBefore = rowsAfter
    emptyRows = dataset.loc[dataset['tweet'] == ''].index
    dataset.drop(emptyRows, inplace=True)
    rowsAfter = len(dataset)

    print(f"Dropped {rowsBefore - rowsAfter} empty strings. Final size {rowsAfter}.")

    # Renumber the surviving rows 0..n-1.
    dataset.reset_index(inplace=True, drop=True)

    print()

Clean dino_tweets_15k.csv
Dropped 6350 duplicates.
Dropped 1 empty strings. Final size 8810.

Clean league_tweets_15k.csv
Dropped 12551 duplicates.
Dropped 0 empty strings. Final size 2609.

Clean musk_tweets_15k.csv
Dropped 11529 duplicates.
Dropped 0 empty strings. Final size 3545.

Clean more_dino_tweets_15k.csv
Dropped 7006 duplicates.
Dropped 1 empty strings. Final size 8064.

Clean more_league_tweets_15k.csv
Dropped 12552 duplicates.
Dropped 0 empty strings. Final size 2457.

Clean more_chief_twit_15k.csv
Dropped 11564 duplicates.
Dropped 1 empty strings. Final size 3486.

Clean puppies_15k.csv
Dropped 8876 duplicates.
Dropped 1 empty strings. Final size 6126.

Clean 2022_supreme_court_tweets_15k.csv
Dropped 3481 duplicates.
Dropped 1 empty strings. Final size 11558.

Clean 2022_imac_tweets_15k.csv
Dropped 4856 duplicates.
Dropped 1 empty strings. Final size 10233.



In [21]:
# Display the tweet stored under index label 0 of the first DataFrame.
datasets[0]['tweet'][0]

'i asked soojin to scare me as she was a dinosaur but i actually fell in love instead'

In [22]:
# Display the first DataFrame in `datasets` as this cell's output.
datasets[0]

Unnamed: 0,tweet
0,i asked soojin to scare me as she was a dinosa...
1,"""it was high tide, and the sound of the waves ..."
2,lots of fun on riding a dinosaur and seeing so...
3,a grey cowled wood rail from cali . what a cut...
4,my handsome dinosaur
...,...
8059,dinosaur hunt
8060,the future swamp giant is growing steadily as ...
8061,"ha, at least twice. how he gets a sky gig is b..."
8062,well i wanted to be a dinosaur when i was younger


# Export cleaned datasets

In [43]:
# Create the export directory, including any missing parent directories.
# exist_ok=True makes this a no-op when the directory is already there.
os.makedirs(relative_export_path, exist_ok=True)

# Write each cleaned frame next to the others, always with a .csv extension
# regardless of the extension of the input file.
for (dataset, filename) in zip(datasets, dataset_files):
    stem, _ = os.path.splitext(filename)
    print(f"Export {stem}")
    dataset.to_csv(os.path.join(relative_export_path, stem + '.csv'), index=False)

Export dino_tweets_15k
Export league_tweets_15k
Export musk_tweets_15k
Export more_dino_tweets_15k
Export more_league_tweets_15k
Export more_chief_twit_15k
Export puppies_15k
Export 2022_supreme_court_tweets_15k
Export 2022_imac_tweets_15k


# Export final dataset

In [44]:
# Stack all cleaned frames into one and renumber the rows 0..n-1.
dataset = pd.concat(datasets, ignore_index=True)

In [45]:
# Write the combined dataset to data.csv inside the export directory.
# os.path.join keeps the path valid even if the directory has no trailing separator.
dataset.to_csv(os.path.join(relative_export_path, 'data.csv'), index=False)

In [46]:
# Report the number of rows in the combined dataset.
final_row_count = len(dataset)
print(f'Final Dataset Size: {final_row_count}')

Final Dataset Size: 56888
