In [1]:
import pandas as pd
import numpy as np
from datasets import load_dataset
from tqdm import tqdm

## Load Data

In [2]:
# Enable IPython's autoreload extension (mode 2) so edits to imported modules
# are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [3]:
# Directory where the downloaded dataset files are cached between runs.
cache_dir = "../data_cache"

# Load the three splits of the "emotion" configuration of "tweet_eval",
# reporting the size of each split as soon as it is available.
train_dataset = load_dataset("tweet_eval", name="emotion", split="train", cache_dir=cache_dir)
print(f"Training dataset with {len(train_dataset)} instances loaded")

val_dataset = load_dataset("tweet_eval", name="emotion", split="validation", cache_dir=cache_dir)
print(f"Development/validation dataset with {len(val_dataset)} instances loaded")

test_dataset = load_dataset("tweet_eval", name="emotion", split="test", cache_dir=cache_dir)
print(f"Test dataset with {len(test_dataset)} instances loaded")

# Pull the input texts and target labels out of each split as parallel columns.
train_texts, train_labels = train_dataset['text'], train_dataset['label']
val_texts, val_labels = val_dataset['text'], val_dataset['label']
test_texts, test_labels = test_dataset['text'], test_dataset['label']

Training dataset with 3257 instances loaded
Development/validation dataset with 374 instances loaded
Test dataset with 1421 instances loaded


In [4]:
# Spot-check one raw training tweet (index 2) as loaded from the dataset.
train_texts[2]

"No but that's so cute. Atsu was probably shy about photos before but cherry helped her out uwu"

## Data Preprocessing

In [5]:
import re
import emoji
import nltk

In [22]:
class DataPreprocessing:
    """Clean and tokenise the texts of a named dataset split.

    Calling an instance with a set name (``'train'``, ``'val'``, ``'test'`` or
    ``'trial'`` by default) runs every text of that set through the cleaning
    steps below and returns one list of tokens per text.

    Parameters
    ----------
    texts : mapping of str -> sequence of str, optional
        Named text collections to preprocess. When omitted, the notebook-level
        ``train_texts``, ``val_texts`` and ``test_texts`` are used, plus a
        ``'trial'`` set made of the first 20 training texts.
    """

    # Patterns are compiled once at class-definition time instead of on every call.
    _CONTROL_WHITESPACE = re.compile(r'[\t\n\r\x0b\x0c\x85]+')
    _NON_PRINTABLE = re.compile(r'[\x00-\x1F\x7F-\x9F]')
    _URL = re.compile(r'https?://\S+|www\.\S+')
    _MENTION = re.compile(r'@\w+')  # \w already covers '_'
    _HASHTAG = re.compile(r'#(\w+)')
    _NAME_SEPARATORS = re.compile(r'[_-]+')
    _NON_ALNUM = re.compile(r'[^a-zA-Z0-9]')

    def __init__(self, texts=None):
        if texts is None:
            # Default: the text lists defined earlier in the notebook.
            texts = {
                'train': train_texts,
                'val': val_texts,
                'test': test_texts,
                'trial': train_texts[:20],
            }
        # Copy so later changes to the caller's mapping do not affect this instance.
        self.__text = dict(texts)

    # Cleaning

    def remove_non_printable(self, text):
        """Return ``text`` without C0/C1 control characters.

        Runs of whitespace controls (tab, newline, carriage return, vertical
        tab, form feed, NEL) are replaced by a single space so the words on
        either side stay separate; every other control character is deleted.
        """
        text = self._CONTROL_WHITESPACE.sub(' ', text)
        # One substitution pass removes every remaining match; no re-check loop is needed.
        return self._NON_PRINTABLE.sub('', text)

    def emoji_to_text(self, emoji_str):
        """Return ``emoji_str`` after passing it through ``emoji.demojize``."""
        return emoji.demojize(emoji_str)

    def remove_urls(self, text):
        """Delete ``http://``, ``https://`` and ``www.`` URLs from ``text``."""
        return self._URL.sub('', text)

    def remove_mentions(self, text):
        """Delete ``@name`` mentions from ``text``."""
        return self._MENTION.sub('', text)

    def remove_hashtag_symbol(self, text):
        """Strip the leading ``#`` from hashtags, keeping the tag word itself."""
        return self._HASHTAG.sub(r'\1', text)

    # Tokenisation
    def text_tokenisation(self, text):
        """Split ``text`` into ASCII alphanumeric word tokens.

        ``text`` is tokenised with ``nltk.word_tokenize``. Tokens made of
        alphanumeric parts joined by ``_`` or ``-`` (for example a demojized
        emoji name such as ``face_with_tears_of_joy`` or a hashtag like
        ``so_happy``) are split into those parts rather than discarded. Any
        token that still contains another non-alphanumeric character is dropped.

        Returns
        -------
        list of str
        """
        cleaned_tokens = []
        for token in nltk.word_tokenize(text):
            parts = [part for part in self._NAME_SEPARATORS.split(token) if part]
            # Keep the token only if every part is purely [a-zA-Z0-9].
            if parts and not any(self._NON_ALNUM.search(part) for part in parts):
                cleaned_tokens.extend(parts)
        return cleaned_tokens

    # Lemmatisation: no lemmatisation step is implemented in this class.

    def __call__(self, text_set_type):
        """Preprocess every text of the set named ``text_set_type``.

        Returns
        -------
        list of list of str
            One token list per input text, in the original order.

        Raises
        ------
        KeyError
            If ``text_set_type`` is not one of the configured set names.
        """
        try:
            sentences = self.__text[text_set_type]
        except KeyError:
            available = list(self.__text)
            raise KeyError(
                f"Unknown text set {text_set_type!r}; available sets: {available}"
            ) from None

        clean_sents = []
        for sent in sentences:
            # Cleaning
            sent = self.remove_non_printable(sent)
            sent = self.emoji_to_text(sent)
            sent = self.remove_urls(sent)
            sent = self.remove_mentions(sent)
            sent = self.remove_hashtag_symbol(sent)
            # Tokenisation
            clean_sents.append(self.text_tokenisation(sent))
        return clean_sents

In [23]:
# Build the preprocessor; with no arguments it reads the train/val/test text
# lists defined earlier in the notebook.
data_preprocessor = DataPreprocessing()

In [24]:
# Clean and tokenise the 'trial' set (the first 20 training texts).
trial = data_preprocessor("trial")