<a href="https://colab.research.google.com/github/BoomLouke/ML-Final-Project-2026/blob/main/1_preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# ML Final Project 2026 - Preprocessing
This file contains the preprocessing of the data. Throughout this file I will comment on what has been done in order to keep this document structured.

## 0. Loading in packages and data

Packages:

In [None]:
from google.colab import drive
from datasets import load_dataset
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import re

Load data:


In [None]:
# Load the "mteb/emotion" dataset; the cells below index the result by the
# 'train', 'validation' and 'test' splits.
dataset = load_dataset("mteb/emotion")

First look into data (splits, size, etc.)

In [None]:
# Quick exploratory overview of the raw dataset: split sizes, label mapping,
# per-split class balance, one example per emotion, and text-length stats.
print("DATASET OVERVIEW")

# Number of examples in each split
split_sizes = {name: len(dataset[name]) for name in ('train', 'validation', 'test')}
print(f"\nTrain size: {split_sizes['train']}")
print(f"Validation size: {split_sizes['validation']}")
print(f"Test size: {split_sizes['test']}")

# Derive the id -> name label mapping from the training rows themselves
print("\n LABEL MAPPING")
train_df = pd.DataFrame(dataset['train'])
label_pairs = train_df[['label', 'label_text']].drop_duplicates()
label_mapping = label_pairs.sort_values('label')
print(label_mapping.to_string(index=False))

# How many examples of each emotion every split contains
print("CLASS DISTRIBUTION")
for split_name in ('train', 'validation', 'test'):
    print(f"\n{split_name.upper()}:")
    label_counts = pd.DataFrame(dataset[split_name])['label_text'].value_counts()
    print(label_counts)

# First training text found for each emotion
print("SAMPLE TEXTS (one per emotion)")
for emotion_name in train_df['label_text'].unique():
    is_emotion = train_df['label_text'] == emotion_name
    first_text = train_df.loc[is_emotion, 'text'].iloc[0]
    print(f"\n{emotion_name.upper()}: {first_text}")

# Character-length summary of the training texts
print("TEXT LENGTH STATISTICS")
char_counts = train_df['text'].str.len()
train_df['text_length'] = char_counts
print(train_df['text_length'].describe())

DATASET OVERVIEW

Train size: 15956
Validation size: 1988
Test size: 1986

 LABEL MAPPING
 label label_text
     0    sadness
     1        joy
     2       love
     3      anger
     4       fear
     5   surprise
CLASS DISTRIBUTION

TRAIN:
label_text
joy         5345
sadness     4663
anger       2152
fear        1931
love        1297
surprise     568
Name: count, dtype: int64

VALIDATION:
label_text
joy         700
sadness     550
anger       274
fear        211
love        173
surprise     80
Name: count, dtype: int64

TEST:
label_text
joy         688
sadness     579
anger       274
fear        224
love        156
surprise     65
Name: count, dtype: int64
SAMPLE TEXTS (one per emotion)

SADNESS: i didnt feel humiliated

ANGER: im grabbing a minute to post i feel greedy wrong

LOVE: i am ever feeling nostalgic about the fireplace i will know that it is still on the property

SURPRISE: ive been taking or milligrams or times recommended amount and ive fallen asleep a lot faster but i 

## 1. Preprocessing Data

First, mount Google Drive and create the directories for the preprocessed data.

In [None]:
# Mount Google Drive at /content/drive so that paths under it resolve to Drive.
drive.mount('/content/drive')
# Base directory (on the mounted Drive) under which later cells create the
# preprocessed_* subfolders and write the CSV files.
SAVE_PATH = '/content/drive/MyDrive/ML-Final-Project/data'

Mounted at /content/drive


In [None]:
# Create one output folder per preprocessing variant; exist_ok makes the cell
# safe to re-run when the folders are already there.
for variant_dir in ('preprocessed_minimal', 'preprocessed_extensive'):
    os.makedirs(f'{SAVE_PATH}/{variant_dir}', exist_ok=True)

Preprocessing the data for emotion detection (the extensive variant is for classical machine learning, while the minimal variant is for BERT).


In [None]:
def preprocess_text(text, extensive=False):
    """Clean a single raw text sample.

    Both modes lowercase the text and strip URLs (tokens starting with
    ``http`` or ``www``) and ``@username`` mentions.

    Args:
        text: Raw input string.
        extensive: If True, additionally remove every character that is not
            an ASCII letter, whitespace, or one of ``! ? .``. If False, each
            run of two or more ``! ? .`` characters is replaced by the final
            character of that run.

    Returns:
        The cleaned string, with every whitespace run collapsed to a single
        space and no leading or trailing whitespace.
    """
    text = text.lower()
    # `http\S+` already covers https URLs, so no separate alternative is needed.
    text = re.sub(r'http\S+|www\S+', '', text)
    text = re.sub(r'@\w+', '', text)

    if extensive:
        text = re.sub(r'[^a-zA-Z\s!?.]', '', text)
    else:
        text = re.sub(r'([!?.]){2,}', r'\1', text)

    # Normalise whitespace last: deleting characters above (e.g. a standalone
    # number between two words) can leave double spaces or trailing blanks
    # that an earlier collapse would miss.
    return re.sub(r'\s+', ' ', text).strip()

def create_preprocessed_dataset(dataset, extensive=False):
    """Apply ``preprocess_text`` to the 'train', 'validation' and 'test' splits.

    For each split a DataFrame is built from ``dataset[split]``, the raw text
    is kept in a new ``text_original`` column, ``text`` is replaced by its
    cleaned version, and rows whose cleaned text is empty are dropped. A
    one-line summary per split is printed.

    Args:
        dataset: Mapping from split name to data that ``pd.DataFrame`` accepts
            and that contains a ``text`` column.
        extensive: Forwarded to ``preprocess_text``.

    Returns:
        dict mapping each split name to its filtered DataFrame.
    """
    def _clean(raw_text):
        return preprocess_text(raw_text, extensive)

    cleaned_splits = {}
    for split_name in ('train', 'validation', 'test'):
        frame = pd.DataFrame(dataset[split_name])
        frame['text_original'] = frame['text']
        frame['text'] = frame['text'].apply(_clean)

        rows_before = len(frame)
        has_text = frame['text'].str.len() > 0
        frame = frame[has_text]
        dropped = rows_before - len(frame)

        cleaned_splits[split_name] = frame
        print(f"{split_name}: {len(frame)} samples (removed {dropped} empty)")

    return cleaned_splits


# Build both variants of the dataset (minimal first, then extensive).
preprocessed_minimal = create_preprocessed_dataset(dataset, extensive=False)
preprocessed_extensive = create_preprocessed_dataset(dataset, extensive=True)

# Write every split of every variant to <SAVE_PATH>/<variant folder>/<split>.csv.
# The minimal file is saved before the extensive one for each split.
variants = (
    ('preprocessed_minimal', preprocessed_minimal),
    ('preprocessed_extensive', preprocessed_extensive),
)
for split_name in ('train', 'validation', 'test'):
    for folder, frames in variants:
        csv_path = f'{SAVE_PATH}/{folder}/{split_name}.csv'
        frames[split_name].to_csv(csv_path, index=False)
        print(f"Saved: {csv_path}")

print("Preprocessing complete!")


train: 15956 samples (removed 0 empty)
validation: 1988 samples (removed 0 empty)
test: 1986 samples (removed 0 empty)
train: 15956 samples (removed 0 empty)
validation: 1988 samples (removed 0 empty)
test: 1986 samples (removed 0 empty)
Saved: /content/drive/MyDrive/ML-Final-Project/data/preprocessed_minimal/train.csv
Saved: /content/drive/MyDrive/ML-Final-Project/data/preprocessed_extensive/train.csv
Saved: /content/drive/MyDrive/ML-Final-Project/data/preprocessed_minimal/validation.csv
Saved: /content/drive/MyDrive/ML-Final-Project/data/preprocessed_extensive/validation.csv
Saved: /content/drive/MyDrive/ML-Final-Project/data/preprocessed_minimal/test.csv
Saved: /content/drive/MyDrive/ML-Final-Project/data/preprocessed_extensive/test.csv
Preprocessing complete!
