# DATA PREPARATION

Using the findings from the data exploration, we can now create our datasets. Since we will be using both standard and SOTA (state-of-the-art) models, we will prepare a dataset variant suited to each of them.

In [None]:
import pandas as pd

# Read the cleaned train/validation splits; every dataset variant built in this
# notebook is derived from these two frames.
train_df_cleaned, test_df_cleaned = map(
    pd.read_csv,
    ('data/training_cleaned.csv', 'data/validation_cleaned.csv'),
)



#### Including topic information

We simply prepend the topic to the tweet, separated by a space, and store the result in a new column, `topic_tweet`, which we will use in the following steps.

In [5]:
# Build the topic-aware variants of both splits. `assign` returns a new frame, so
# the cleaned frames are left untouched; the added `topic_tweet` column holds
# "<topic> <tweet>".
train_df_topic_merged = train_df_cleaned.assign(
    topic_tweet=lambda frame: frame['topic'] + ' ' + frame['tweet']
)
test_df_topic_merged = test_df_cleaned.assign(
    topic_tweet=lambda frame: frame['topic'] + ' ' + frame['tweet']
)


In [6]:
# Persist both topic-merged splits as CSV, without writing the row index.
for _frame, _csv_path in (
    (train_df_topic_merged, 'data/training_topic_merged.csv'),
    (test_df_topic_merged, 'data/validation_topic_merged.csv'),
):
    _frame.to_csv(_csv_path, index=False)

#### Making a balanced dataset



In [7]:
# Balance the sentiment classes within each topic. Two training sets are produced:
#   * undersampled: every class in a topic is cut down to that topic's smallest class
#   * oversampled:  every class in a topic is grown (with replacement) to that topic's largest class

from sklearn.utils import resample


def _balance_within_topics(df, oversample, random_state=123):
    """Return a copy of `df` whose sentiment classes are balanced inside each topic.

    Args:
        df: Frame with at least the columns 'topic' and 'sentiment'.
        oversample: If True, resample every class with replacement up to the size
            of the largest class in its topic. If False, resample every class
            without replacement down to the size of the smallest class in its topic.
        random_state: Seed forwarded to `sklearn.utils.resample` for every class.

    Returns:
        A DataFrame made of the resampled per-(topic, sentiment) pieces, ordered
        by first appearance of each topic and, within a topic, of each sentiment.
        Rows keep their original index labels.
    """
    pieces = []
    for topic in df['topic'].unique():
        topic_df = df[df['topic'] == topic]
        class_sizes = topic_df['sentiment'].value_counts()
        target_size = class_sizes.max() if oversample else class_sizes.min()

        for sentiment in topic_df['sentiment'].unique():
            sentiment_df = topic_df[topic_df['sentiment'] == sentiment]
            pieces.append(
                resample(
                    sentiment_df,
                    replace=oversample,
                    n_samples=target_size,
                    random_state=random_state,
                )
            )

    if not pieces:
        # No topics at all: return an empty frame with the same columns.
        return df.iloc[0:0].copy()

    # Concatenate once at the end instead of growing a frame inside the loops;
    # this avoids repeated copying and concatenation onto an empty placeholder frame.
    return pd.concat(pieces)


train_df_balanced_us = _balance_within_topics(train_df_topic_merged, oversample=False)
train_df_balanced_os = _balance_within_topics(train_df_topic_merged, oversample=True)

train_df_balanced_us.to_csv('data/training_balanced_us.csv', index=False)
train_df_balanced_os.to_csv('data/training_balanced_os.csv', index=False)


In [8]:
# Split every dataset variant into model inputs (X) and sentiment labels (y).

def _inputs_and_labels(frame, text_column):
    """Return (frame[text_column], frame['sentiment']) as an (X, y) pair."""
    return frame[text_column], frame['sentiment']


# 1. Cleaned data: raw cleaned tweet text
X_train_cleaned, y_train_cleaned = _inputs_and_labels(train_df_cleaned, 'tweet')
X_test_cleaned, y_test_cleaned = _inputs_and_labels(test_df_cleaned, 'tweet')

# 2. Topic merged data: topic prepended to the tweet
X_train_topic_merged, y_train_topic_merged = _inputs_and_labels(train_df_topic_merged, 'topic_tweet')
X_test_topic_merged, y_test_topic_merged = _inputs_and_labels(test_df_topic_merged, 'topic_tweet')

# 3. Balanced undersampled data: evaluated on the topic-merged test split
X_train_balanced_us, y_train_balanced_us = _inputs_and_labels(train_df_balanced_us, 'topic_tweet')
X_test_balanced_us, y_test_balanced_us = _inputs_and_labels(test_df_topic_merged, 'topic_tweet')

# 4. Balanced oversampled data: evaluated on the topic-merged test split
X_train_balanced_os, y_train_balanced_os = _inputs_and_labels(train_df_balanced_os, 'topic_tweet')
X_test_balanced_os, y_test_balanced_os = _inputs_and_labels(test_df_topic_merged, 'topic_tweet')

In [None]:
# now let us embed the data using pretrained DistilBERT model
from transformers import DistilBertTokenizer, DistilBertModel
import torch
from tqdm import tqdm
import pandas as pd

# Load the pretrained tokenizer and encoder; both come from the same checkpoint.
_CHECKPOINT = 'distilbert-base-uncased'
tokenizer = DistilBertTokenizer.from_pretrained(_CHECKPOINT)
model = DistilBertModel.from_pretrained(_CHECKPOINT)

def get_bert_embeddings(text):
    """Return the DistilBERT last-layer hidden state at token position 0 for `text`.

    Args:
        text: Input handed to the module-level `tokenizer`. Callers in this
            notebook pass one string at a time.

    Returns:
        `last_hidden_state[:, 0, :]` of the module-level `model`, i.e. the hidden
        state of the first token (the [CLS] position) for each encoded sequence.
        The tensor is computed without gradient tracking.
    """
    # truncation=True caps the encoded input at the tokenizer's maximum length,
    # so an unusually long text cannot exceed what the model accepts.
    inputs = tokenizer(text, return_tensors="pt", add_special_tokens=True, truncation=True)
    # Pure inference: do not build an autograd graph for every single text.
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state[:, 0, :]  # cls token

def get_bert_representations(X):
    """Embed every text in `X` and stack the results into a single tensor.

    Each item of `X` is passed to `get_bert_embeddings`; the result is detached,
    moved to the CPU, and all results are concatenated with `torch.cat`
    (along the first dimension). Progress is shown with `tqdm`.
    """
    cls_vectors = []
    for text in tqdm(X):
        embedding = get_bert_embeddings(text)
        cls_vectors.append(embedding.detach().cpu())
    return torch.cat(cls_vectors)


In [None]:
# Compute the DistilBERT embeddings for every dataset variant and save them to
# disk, so they do not have to be recomputed later.

def _embed_and_save(texts, path):
    """Embed `texts`, write the resulting tensor to `path`, and return it."""
    embeddings = get_bert_representations(texts)
    torch.save(embeddings, path)
    return embeddings


# Cleaned tweets
X_train_cleaned_bert = _embed_and_save(X_train_cleaned, 'data/X_train_cleaned_bert.pt')
X_test_cleaned_bert = _embed_and_save(X_test_cleaned, 'data/X_test_cleaned_bert.pt')

# Topic-merged tweets
X_train_topic_merged_bert = _embed_and_save(X_train_topic_merged, 'data/X_train_topic_merged_bert.pt')
X_test_topic_merged_bert = _embed_and_save(X_test_topic_merged, 'data/X_test_topic_merged_bert.pt')

# The balanced datasets use X_test_topic_merged_bert as their test set, so only
# their training splits are embedded here.
X_train_balanced_us_bert = _embed_and_save(X_train_balanced_us, 'data/X_train_balanced_us_bert.pt')
X_train_balanced_os_bert = _embed_and_save(X_train_balanced_os, 'data/X_train_balanced_os_bert.pt')