In [1]:
import numpy as np
import pandas as pd

import glob
import os
import re
import string

from cleantext import clean

# Define Dataset Files

In [2]:
# Corpus selector. Supported values: 'twitter', 'semeval'.
# The `if dataset == ...` configuration cells that follow set the import/export
# paths and the CSV parsing options according to this value.

#dataset = 'twitter'
dataset = 'semeval'

In [3]:
# Twitter: comma-separated files whose column names are inferred from the file.
if dataset == 'twitter':
    # Where the raw files are read from and where cleaned files are written.
    relative_import_path = 'Datasets/ScrapedTwitter/'
    relative_export_path = 'Datasets/TwitterCleaned/'

    # Options forwarded to pd.read_csv when the datasets are loaded.
    sep = ','
    header = 'infer'
    names = None

    # Files to process, relative to relative_import_path.
    dataset_files = [
        'dino_tweets_15k.csv',
        'league_tweets_15k.csv',
        'musk_tweets_15k.csv',
    ]

In [4]:
# Semeval: tab-separated files read without a header row, so the column names
# are supplied explicitly.
if dataset == 'semeval':
    # Where the raw files are read from and where cleaned files are written.
    relative_import_path = 'Datasets/semeval-datasets/2017_English_final/GOLD/Subtask_A/'
    relative_export_path = 'Datasets/SemevalCleaned/'

    # Options forwarded to pd.read_csv when the datasets are loaded.
    # NOTE(review): 'blah' names a fourth column that is never used afterwards;
    # presumably a trailing/empty field in the source files -- confirm.
    names = ['id', 'sentiment', 'tweet', 'blah']
    sep = '\t'
    header = None

    # Get the file names. glob returns matches in an OS-dependent order, so
    # sort them to make the processing order (and therefore the row order of
    # the combined export) reproducible across machines.
    import_files = sorted(glob.glob(os.path.join(relative_import_path, 'twitter*.txt')))

    # Get the dataset file names (path stripped), relative to relative_import_path.
    dataset_files = [os.path.basename(path) for path in import_files]

# Define text cleaning functions

In [5]:
def removePunctTokens(tokens):
    """Return the tokens that are not a single punctuation character.

    Args:
        tokens: iterable of string tokens.

    Returns:
        A new list with every token that is a member of ``string.punctuation``
        removed; the relative order of the remaining tokens is preserved.
    """
    punctuation = set(string.punctuation)
    # The filtered list must be returned; without the return the function
    # always evaluated to None.
    return [token for token in tokens if token not in punctuation]
            
def replaceWithSpaces(string, characters):
    """Replace every occurrence of each given substring with a single space.

    Args:
        string: the text to process.
        characters: iterable of substrings (typically single characters) to
            blank out. It is iterated exactly once, so one-shot iterables
            such as generators are supported.

    Returns:
        A copy of ``string`` in which each occurrence of each item of
        ``characters`` has been replaced by ' '.
    """
    outString = string
    for character in characters:
        outString = outString.replace(character, ' ')

    return outString

def removeHtml(string):
    """Strip anchor tags and ``<br />`` tags from the text, keeping link text.

    Args:
        string: text that may contain ``<a ...>``, ``</a>`` and ``<br />``.

    Returns:
        The text with those tags removed; everything else is left untouched.
    """
    # Stop at the first '>' so that a '>' inside the link text is not
    # swallowed together with the opening tag.
    outstring = re.sub(r'<a [^<>]*>', '', string)
    outstring = outstring.replace('</a>', '')
    outstring = outstring.replace('<br />', '')

    return outstring

def removeStartingRetweet(string):
    """Drop a leading ``RT @<something>: `` retweet prefix, if present.

    Only a prefix at the very start of the text is removed; the match is
    non-greedy, so it ends at the first ': ' that follows 'RT @'.
    """
    retweet_prefix = re.compile(r'^RT @.*?: ')
    return retweet_prefix.sub('', string)

def removeLink(string):
    """Replace every URL-like substring of the text with a single space.

    The (case-insensitive) expression matches text starting with
    ``http://``/``https://``, ``www.`` or a ``domain.tld/`` prefix, and allows
    balanced parentheses inside the link.
    """
    url_regex = re.compile(
        r"(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’]))"
    )
    return url_regex.sub(' ', string)

def removeAts(string):
    """Replace every ``@`` and the word characters that follow it with a space.

    A lone '@' (followed by no word characters) is replaced as well.
    """
    # Raw string: '\w' in a normal string literal is an invalid escape
    # sequence and triggers a warning on current Python versions.
    return re.sub(r'@\w*', ' ', string)

def removeHashtags(string):
    """Replace every ``#`` and the word characters that follow it with a space.

    A lone '#' (followed by no word characters) is replaced as well.
    """
    # Raw string: '\w' in a normal string literal is an invalid escape
    # sequence and triggers a warning on current Python versions.
    return re.sub(r'#\w*', ' ', string)

def collapseWhitespace(string):
    """Replace each run of whitespace characters with a single space.

    Leading and trailing whitespace is collapsed to one space, not removed.
    """
    # Raw string: '\s' in a normal string literal is an invalid escape
    # sequence and triggers a warning on current Python versions.
    return re.sub(r'\s+', ' ', string)

def cleanDirtyWebText(string):
    """Pass the text through cleantext's ``clean`` with this notebook's options.

    Judging by the keyword names, emoji, e-mail addresses and phone numbers
    are removed, with e-mails and phone numbers replaced by a single space;
    all other options keep the library's defaults.
    """
    clean_options = dict(
        no_emoji=True,
        no_emails=True,
        no_phone_numbers=True,
        replace_with_email=' ',
        replace_with_phone_number=' ',
    )
    return clean(string, **clean_options)

def cleanTweet(string):
    """Apply the full tweet-cleaning pipeline to a single tweet.

    The steps run in a fixed order: strip a leading retweet prefix, run the
    generic web-text cleaner, blank out @-mentions, links and hashtags, and
    collapse whitespace. Finally one leading and one trailing space, if
    present, are removed.
    """
    # Each step receives the output of the previous one; the order is significant.
    steps = (
        removeStartingRetweet,
        cleanDirtyWebText,
        removeAts,
        removeLink,
        removeHashtags,
        collapseWhitespace,
    )
    for step in steps:
        string = step(string)

    # Remove starting and ending blank character
    if string.startswith(' '):
        string = string[1:]
    if string.endswith(' '):
        string = string[:-1]

    return string

# Read Datasets

In [6]:
# Load every source file, keep only the tweet text (plus the sentiment label
# when the file provides one) and drop rows with missing or blank values.
datasets = []

# The loop variable is not named `dataset` on purpose: `dataset` holds the
# corpus selector that the configuration cells compare against.
for dataset_file in dataset_files:
    print(f"Import {dataset_file}")
    df = pd.read_csv(
        os.path.join(relative_import_path, dataset_file),
        sep=sep,
        header=header,
        names=names,
    )

    # The text lives in a 'tweet' or a 'text' column depending on the corpus.
    # Fail loudly instead of silently reusing the frame of a previous file.
    if 'tweet' in df:
        text_column = 'tweet'
    elif 'text' in df:
        text_column = 'text'
    else:
        raise ValueError(f"{dataset_file} has neither a 'tweet' nor a 'text' column")

    # Normalise to a single string-typed 'tweet' column for both corpora.
    final_df = df[[text_column]].rename(columns={text_column: 'tweet'}).astype('string')

    if 'sentiment' in df:
        final_df = final_df.join(df['sentiment'])

    size_before = len(final_df)
    # Filter with a boolean mask rather than `final_df['tweet'].replace(...,
    # inplace=True)`: the latter operates on a temporary column object and is
    # not guaranteed to modify `final_df` (pandas Copy-on-Write).
    is_blank = final_df['tweet'].isin(['', ' '])
    final_df = final_df[~is_blank].dropna()
    size_after = len(final_df)

    print(f'Dropped {size_before - size_after} null rows')
    print()

    datasets.append(final_df)

Import twitter-2013dev-A.txt
Dropped 0 null rows

Import twitter-2013test-A.txt
Dropped 0 null rows

Import twitter-2013train-A.txt
Dropped 0 null rows

Import twitter-2014sarcasm-A.txt
Dropped 0 null rows

Import twitter-2014test-A.txt
Dropped 0 null rows

Import twitter-2015test-A.txt
Dropped 0 null rows

Import twitter-2015train-A.txt
Dropped 0 null rows

Import twitter-2016devtest-A.txt
Dropped 0 null rows

Import twitter-2016test-A.txt
Dropped 0 null rows



In [7]:
# Preview the first loaded dataset; at this point no cleaning has been applied yet.
datasets[0]

Unnamed: 0,tweet,sentiment
0,Won the match #getin . Plus\u002c tomorrow is ...,neutral
1,Some areas of New England could see the first ...,neutral
2,@francesco_con40 2nd worst QB. DEFINITELY Tony...,negative
3,#Thailand Washington - US President Barack Oba...,neutral
4,Did y\u2019all hear what Tony Romo dressed up ...,neutral
...,...,...
1649,#WEB YouTube improves upload process with opti...,neutral
1650,Gonna change my Tumblr theme. I hope I can fin...,positive
1651,I\u2019m so jealous of everyone at the Justin ...,neutral
1652,Jim Harbaugh\u002c Alex Smith Drive Giants Wor...,neutral


# Clean Datasets

In [8]:
# Clean every dataset in place: normalise the tweet text, then discard rows
# that are exact duplicates or whose tweet became an empty string.
for (dataset, filename) in zip(datasets, dataset_files):
    print(f"Clean {filename}")
    dataset['tweet'] = dataset['tweet'].apply(cleanTweet)

    # Exact duplicate rows (all columns equal) are removed first.
    rows_initial = len(dataset)
    dataset.drop_duplicates(inplace=True)
    rows_deduplicated = len(dataset)
    print(f"Dropped {rows_initial - rows_deduplicated} duplicates.")

    # Then rows whose tweet was reduced to nothing by the cleaning.
    empty_tweet_index = dataset[dataset['tweet'] == ''].index
    dataset.drop(empty_tweet_index, inplace=True)
    rows_final = len(dataset)
    print(f"Dropped {rows_deduplicated - rows_final} empty strings. Final size {rows_final}.")

    # Renumber the rows so the index is contiguous again.
    dataset.reset_index(drop=True, inplace=True)

    print()

Clean twitter-2013dev-A.txt
Dropped 5 duplicates.
Dropped 0 empty strings. Final size 1649.

Clean twitter-2013test-A.txt
Dropped 11 duplicates.
Dropped 0 empty strings. Final size 3536.

Clean twitter-2013train-A.txt
Dropped 64 duplicates.
Dropped 0 empty strings. Final size 9620.

Clean twitter-2014sarcasm-A.txt
Dropped 0 duplicates.
Dropped 0 empty strings. Final size 49.

Clean twitter-2014test-A.txt
Dropped 0 duplicates.
Dropped 0 empty strings. Final size 1853.

Clean twitter-2015test-A.txt
Dropped 42 duplicates.
Dropped 0 empty strings. Final size 2348.

Clean twitter-2015train-A.txt
Dropped 6 duplicates.
Dropped 0 empty strings. Final size 483.

Clean twitter-2016devtest-A.txt
Dropped 0 duplicates.
Dropped 0 empty strings. Final size 2000.

Clean twitter-2016test-A.txt
Dropped 55 duplicates.
Dropped 1 empty strings. Final size 20576.



In [9]:
# Show the full text of the first tweet of the first dataset after cleaning.
datasets[0]['tweet'][0]

"won the match . plus, tomorrow is a very busy day, with awareness day's and debates. gulp. debates..."

In [10]:
# Display the first dataset again, now cleaned, deduplicated and re-indexed.
datasets[0]

Unnamed: 0,tweet,sentiment
0,"won the match . plus, tomorrow is a very busy ...",neutral
1,some areas of new england could see the first ...,neutral
2,2nd worst qb. definitely tony romo. the man wh...,negative
3,washington - us president barack obama vowed w...,neutral
4,did y'all hear what tony romo dressed up as fo...,neutral
...,...,...
1644,youtube improves upload process with optional ...,neutral
1645,gonna change my tumblr theme. i hope i can fin...,positive
1646,i'm so jealous of everyone at the justin biebe...,neutral
1647,"jim harbaugh, alex smith drive giants world se...",neutral


# Export cleaned datasets

In [11]:
# Write each cleaned dataset to its own CSV file in the export directory.
# makedirs with exist_ok=True creates any missing parent directories and does
# not fail when the directory already exists (no separate existence check).
os.makedirs(relative_export_path, exist_ok=True)

for (dataset, filename) in zip(datasets, dataset_files):
    # Swap the source extension for '.csv' without overwriting the loop variable.
    stem, _ext = os.path.splitext(filename)
    print(f"Export {stem}")
    dataset.to_csv(os.path.join(relative_export_path, stem + '.csv'), index=False)

Export twitter-2013dev-A
Export twitter-2013test-A
Export twitter-2013train-A
Export twitter-2014sarcasm-A
Export twitter-2014test-A
Export twitter-2015test-A
Export twitter-2015train-A
Export twitter-2016devtest-A
Export twitter-2016test-A


# Export final dataset

In [12]:
# Stack all cleaned datasets into one frame with a fresh 0..n-1 row index.
dataset = pd.concat(datasets, ignore_index=True)

In [14]:
# Write the combined dataset to 'data.csv' in the export directory, without the index column.
dataset.to_csv(relative_export_path + 'data.csv', index=False)

In [15]:
# Report the number of rows in the combined dataset.
print(f'Final Dataset Size: {len(dataset)}')

Final Dataset Size: 42114
