## **CONFIGURE SCRIPT**

In [1]:
# Install the latest version of torchtext library quietly without showing output
!pip install torchtext -qq
!pip install transformers evaluate wandb datasets accelerate -U -qq ## NEW LINES ##

[0m

In [2]:
#Pandas
import pandas as pd

# Importing PyTorch library for tensor computations and neural network modules
import torch
import torch.nn as nn

# For working with textual data vocabularies and for displaying model summaries
from torchtext.vocab import vocab

# General-purpose Python libraries for random number generation and numerical operations
import random
import numpy as np

# Utilities for efficient serialization/deserialization of Python objects and for element tallying
import joblib
from collections import Counter

# For creating lightweight attribute classes and for partial function application
from functools import partial

# For filesystem path handling, generating and displaying confusion matrices, and date-time manipulations
from pathlib import Path
from sklearn.metrics import confusion_matrix,hamming_loss, f1_score
from datetime import datetime

# For plotting and visualization
import matplotlib.pyplot as plt
import seaborn as sns
# %matplotlib inline

### NEW ##########################
# imports from Huggingface ecosystem
from transformers.modeling_outputs import SequenceClassifierOutput
from transformers import PreTrainedModel, PretrainedConfig
from transformers import TrainingArguments, Trainer
from datasets import Dataset
import evaluate

# wandb library
import wandb

#For Preprocessor
from sklearn.base import BaseEstimator, TransformerMixin
from bs4 import BeautifulSoup
import re
import spacy
import numpy as np
from nltk.stem.porter import PorterStemmer
import os
from sklearn.model_selection import train_test_split

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
comp_df = pd.read_csv("/content/drive/MyDrive/hw5/emotion-detection-spring2014/train.csv")

In [5]:
comp_df_initial = comp_df.sample(frac=1)

## **Preprocessing The Data Set**

In [6]:
class SpacyPreprocessor(BaseEstimator, TransformerMixin):

    def __init__(self, model, *, batch_size = 64, lemmatize=True, lower=True, remove_stop=True,
                remove_punct=True, remove_email=True, remove_url=True, remove_num=False, stemming = False,
                add_user_mention_prefix=True, remove_hashtag_prefix=False, basic_clean_only=False):

        self.model = model
        self.batch_size = batch_size
        self.remove_stop = remove_stop
        self.remove_punct = remove_punct
        self.remove_num = remove_num
        self.remove_url = remove_url
        self.remove_email = remove_email
        self.lower = lower
        self.add_user_mention_prefix = add_user_mention_prefix
        self.remove_hashtag_prefix = remove_hashtag_prefix
        self.basic_clean_only = basic_clean_only

        if lemmatize and stemming:
            raise ValueError("Only one of 'lemmatize' and 'stemming' can be True.")

        # Validate basic_clean_only option
        if self.basic_clean_only and (lemmatize or lower or remove_stop or remove_punct or remove_num or stemming or
                                      add_user_mention_prefix or remove_hashtag_prefix):
            raise ValueError("If 'basic_clean_only' is set to True, other processing options must be set to False.")

        # Assign lemmatize and stemming

        self.lemmatize = lemmatize
        self.stemming = stemming

    def basic_clean(self, text):
        soup = BeautifulSoup(text, "html.parser")
        text = soup.get_text()
        text = re.sub(r'[\n\r]', ' ', text)
        return text.strip()

    def spacy_preprocessor(self, texts):
        final_result = []
        nlp = spacy.load(self.model)

        # Disable unnecessary pipelines in spaCy model
        if self.lemmatize:
            # Disable parser and named entity recognition
            disabled_pipes = ['parser', 'ner']
        else:
            # Disable tagger, parser, attribute ruler, lemmatizer and named entity recognition
            disabled_pipes = ['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

        with nlp.select_pipes(disable=disabled_pipes):
          # Modify tokenizer behavior based on user_mention_prefix and hashtag_prefix settings
          if self.add_user_mention_prefix or self.remove_hashtag_prefix:
              prefixes = list(nlp.Defaults.prefixes)
              if self.add_user_mention_prefix:
                  prefixes += ['@']  # Treat '@' as a separate token
              if self.remove_hashtag_prefix:
                  prefixes.remove(r'#')  # Don't separate '#' from the following text
              prefix_regex = spacy.util.compile_prefix_regex(prefixes)
              nlp.tokenizer.prefix_search = prefix_regex.search

          # Process text data in parallel using spaCy's nlp.pipe()
          for doc in nlp.pipe(texts, batch_size=self.batch_size):
              filtered_tokens = []
              for token in doc:
                  # Check if token should be removed based on specified filters
                  if self.remove_stop and token.is_stop:
                      continue
                  if self.remove_punct and token.is_punct:
                      continue
                  if self.remove_num and token.like_num:
                      continue
                  if self.remove_url and token.like_url:
                      continue
                  if self.remove_email and token.like_email:
                      continue

                  # Append the token's text, lemma, or stemmed form to the filtered_tokens list
                  if self.lemmatize:
                      filtered_tokens.append(token.lemma_)
                  elif self.stemming:
                      filtered_tokens.append(PorterStemmer().stem(token.text))
                  else:
                      filtered_tokens.append(token.text)

              # Join the tokens and apply lowercasing if specified
              text = ' '.join(filtered_tokens)
              if self.lower:
                  text = text.lower()
              final_result.append(text.strip())

        return final_result


    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        try:
            if not isinstance(X, (list, np.ndarray)):
                raise TypeError(f'Expected list or numpy array, got {type(X)}')

            x_clean = [self.basic_clean(text).encode('utf-8', 'ignore').decode() for text in X]

            # Check if only basic cleaning is required
            if self.basic_clean_only:
                return x_clean  # Return the list of basic-cleaned texts

            x_clean_final = self.spacy_preprocessor(x_clean)
            return x_clean_final

        except Exception as error:
            print(f'An exception occurred: {repr(error)}')


In [7]:
# import spacy pre-processor from custom module
preprocessor = SpacyPreprocessor(model='en_core_web_sm', batch_size=64, lemmatize=False, lower=True,
                                    remove_stop=False, remove_punct=False, remove_email=False,
                                    remove_url=False, remove_num=False, stemming=False,
                                    add_user_mention_prefix=False, remove_hashtag_prefix=False, basic_clean_only=False)

In [8]:
cleaned_text = preprocessor.fit_transform(comp_df_initial['Tweet'].values)

  soup = BeautifulSoup(text, "html.parser")


In [9]:
comp_df_initial['cleaned_tweets']= cleaned_text
comp_df_initial.drop(['ID','Tweet'],axis=1 , inplace = True)
y = comp_df_initial.iloc[:,:-1].values
X = comp_df_initial['cleaned_tweets'].values
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [10]:
trainset = Dataset.from_dict({
    'texts': X_train,
    'labels': y_train
})

validset = Dataset.from_dict({
    'texts': X_val,
    'labels': y_val
})

## **Configuring The Model**

In [11]:
class CustomConfig(PretrainedConfig):
  def __init__(self, vocab_size=0, embedding_dim=0, hidden_dim1=0, hidden_dim2=0, num_labels=11, **kwargs):
      super().__init__()
      self.vocab_size = vocab_size
      self.embedding_dim = embedding_dim
      self.hidden_dim1 = hidden_dim1
      self.hidden_dim2 = hidden_dim2
      self.num_labels = num_labels

In [12]:
class CustomMLP(PreTrainedModel):
    config_class = CustomConfig

    def __init__(self, config):
        super().__init__(config)

        self.embedding_bag = nn.EmbeddingBag(config.vocab_size, config.embedding_dim)
        self.layers = nn.Sequential(
            nn.Linear(config.embedding_dim, config.hidden_dim1),
            nn.BatchNorm1d(num_features=config.hidden_dim1),
            nn.ReLU(),
            nn.Dropout(p=0.5),
            nn.Linear(config.hidden_dim1, config.hidden_dim2),
            nn.BatchNorm1d(num_features=config.hidden_dim2),
            nn.ReLU(),
            nn.Dropout(p=0.5),
            nn.Linear(config.hidden_dim2, config.num_labels)
        )


    def forward(self, input_ids, offsets, labels=None):
        embed_out = self.embedding_bag(input_ids, offsets)
        logits = self.layers(embed_out)


        loss = None
        if labels is not None:
            loss_fct = nn.BCEWithLogitsLoss()
            loss = loss_fct(logits,labels)

        return SequenceClassifierOutput(
            loss=loss,
            logits=logits
        )

In [13]:
def get_vocab(dataset, min_freq=1):
    """
    Generate a vocabulary from a dataset.

    Args:
        dataset (Dataset): A Hugging Face Dataset object. The dataset should
                           have a key 'texts' that contains the text data.
        min_freq (int): The minimum frequency for a token to be included in
                        the vocabulary.

    Returns:
        torchtext.vocab.Vocab: Vocabulary object containing tokens from the
                               dataset that meet or exceed the specified
                               minimum frequency. It also includes a special
                               '<unk>' token for unknown words.
    """
    # Initialize a counter object to hold token frequencies
    counter = Counter()

    # Update the counter with tokens from each text in the dataset
    # Iterating through texts in the dataset
    for text in dataset['texts']:  ###### Change from previous function ####
        counter.update(str(text).split())

    # Create a vocabulary using the counter object
    # Tokens that appear fewer times than `min_freq` are excluded
    my_vocab = vocab(counter, min_freq=min_freq)

    # Insert a '<unk>' token at index 0 to represent unknown words
    my_vocab.insert_token('<unk>', 0)

    # Set the default index to 0
    # This ensures that any unknown word will be mapped to '<unk>'
    my_vocab.set_default_index(0)

    return my_vocab

In [14]:
# Creating a function that will be used to get the indices of words from vocab
def tokenizer(text, vocab):
    """Converts text to a list of indices using a vocabulary dictionary"""
    return [vocab[token] for token in str(text).split()]

In [15]:
def collate_batch(batch, my_vocab):
    """
    Prepares a batch of data by transforming texts into indices based on a vocabulary and
    converting labels into a tensor.

    Args:
        batch (list of dict): A batch of data where each element is a dictionary with keys
                              'labels' and 'texts'. 'labels' are the sentiment labels, and
                              'texts' are the corresponding texts.
        my_vocab (torchtext.vocab.Vocab): A vocabulary object that maps tokens to indices.

    Returns:
        dict: A dictionary with three keys:
              - 'input_ids': a tensor containing concatenated indices of the texts.
              - 'offsets': a tensor representing the starting index of each text in 'input_ids'.
              - 'labels': a tensor of the labels for each text in the batch.

    The function transforms each text into a list of indices based on the provided vocabulary.
    It also converts the labels into a tensor. The 'offsets' are computed to keep track of the
    start of each text within the 'input_ids' tensor, which is a flattened representation of all text indices.
    """

    # Get labels and texts from batch dict samples
    labels = [sample['labels'] for sample in batch]
    texts = [sample['texts'] for sample in batch]

    # Convert the list of labels into a tensor of dtype int32
    labels = torch.tensor(labels, dtype=torch.float32)

    # Convert the list of texts into a list of lists; each inner list contains the vocabulary indices for a text
    list_of_list_of_indices = [tokenizer(text, my_vocab) for text in texts]

    # Concatenate all text indices into a single tensor
    input_ids = torch.cat([torch.tensor(i, dtype=torch.int64) for i in list_of_list_of_indices])

    # Compute the offsets for each text in the concatenated tensor
    offsets = [0] + [len(i) for i in list_of_list_of_indices]
    offsets = torch.tensor(offsets[:-1]).cumsum(dim=0)

    return {
        'input_ids': input_ids,
        'offsets': offsets,
        'labels': labels
    }

In [16]:
imdb_vocab = get_vocab(trainset, min_freq=2)
collate_fn = partial(collate_batch, my_vocab=imdb_vocab)

In [17]:
my_config = CustomConfig(vocab_size=len(imdb_vocab),
                         embedding_dim=300,
                         hidden_dim1=200,
                         hidden_dim2=100,
                         num_labels=11)

In [18]:
model = CustomMLP(config=my_config)

In [19]:
model

CustomMLP(
  (embedding_bag): EmbeddingBag(6548, 300, mode='mean')
  (layers): Sequential(
    (0): Linear(in_features=300, out_features=200, bias=True)
    (1): BatchNorm1d(200, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
    (3): Dropout(p=0.5, inplace=False)
    (4): Linear(in_features=200, out_features=100, bias=True)
    (5): BatchNorm1d(100, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (6): ReLU()
    (7): Dropout(p=0.5, inplace=False)
    (8): Linear(in_features=100, out_features=11, bias=True)
  )
)

In [20]:
from sklearn.metrics import hamming_loss,f1_score

def compute_metrics(eval_pred):
    #combined_metrics = evaluate.combine([evaluate.load("accuracy"),
                                        #evaluate.load("f1", average="macro")])

    logits, labels = eval_pred
    labels = labels.astype('int32')
    predictions = (logits>=0).astype('int32')
    hamming_loss_value = hamming_loss(labels, predictions)
    f1 = f1_score(labels, predictions, average = 'macro')
    evaluations = {
        'hamming_loss' : hamming_loss_value,
        'f1_score': f1
    }
    return evaluations

## **Model Initialization with Hyper Parameters**
### Hyperparameters were optimized through iteration

In [21]:
# Configure training parameters
training_args = TrainingArguments(

    # Training-specific configurations
    num_train_epochs=40,
    per_device_train_batch_size=32, # Number of samples per training batch
    per_device_eval_batch_size=32, # Number of samples per validation batch
    weight_decay=0.1, # weight decay (L2 regularization)
    learning_rate=0.001, # learning arte
    optim='adamw_torch', # optimizer
    remove_unused_columns=False, # flag to retain unused columns

    # Checkpoint saving and model evaluation settings
    output_dir='/content/drive/MyDrive/hw5/saved_models',  # Directory to save model checkpoints
    evaluation_strategy='steps',  # Evaluate model at specified step intervals
    eval_steps=25,  # Perform evaluation every 50 training steps
    save_strategy="steps",  # Save model checkpoint at specified step intervals
    save_steps=25,  # Save a model checkpoint every 50 training steps
    load_best_model_at_end=True,  # Reload the best model at the end of training
    save_total_limit=2,  # Retain only the best and the most recent model checkpoints
    # Use 'accuracy' as the metric to determine the best model
    metric_for_best_model="f1_score",
    greater_is_better=True,  # A model is 'better' if its accuracy is higher


    # Experiment logging configurations
    logging_strategy='steps',
    logging_steps=50,
    report_to='wandb',  # Log metrics and results to Weights & Biases platform
    run_name='imdb_hf_trainer',  # Experiment name for Weights & Biases
)

In [22]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=trainset,
    eval_dataset = validset,
    data_collator=collate_fn,
    compute_metrics=compute_metrics,
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [23]:
wandb.login()

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [24]:
 #specify the project name where the experiment will be logged
%env WANDB_PROJECT = ahson_ml_project

env: WANDB_PROJECT=ahson_ml_project


## **Model Training**

In [25]:
trainer.train()

[34m[1mwandb[0m: Currently logged in as: [33mahsonriaz98[0m ([33mahson_ml[0m). Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss,Validation Loss,Hamming Loss,F1 Score,Runtime,Samples Per Second,Steps Per Second
25,No log,0.599077,0.225596,0.060852,0.3451,4477.391,142.001
50,0.579400,0.527876,0.218535,0.055853,0.3006,5139.94,163.014
75,0.579400,0.497883,0.21724,0.04681,0.3214,4806.792,152.448
100,0.493400,0.48319,0.214122,0.049212,0.2148,7192.446,228.11
125,0.493400,0.475447,0.213475,0.034724,0.2143,7208.191,228.609
150,0.487200,0.470859,0.212886,0.023809,0.2145,7201.886,228.409
175,0.487200,0.471281,0.210944,0.042547,0.2099,7359.991,233.424
200,0.482500,0.469954,0.210003,0.047015,0.2225,6944.725,220.253
225,0.482500,0.461667,0.207237,0.082906,0.3097,4988.656,158.216
250,0.461900,0.456741,0.206355,0.083299,0.3209,4814.57,152.695


TrainOutput(global_step=7760, training_loss=0.2682370384943854, metrics={'train_runtime': 474.4563, 'train_samples_per_second': 520.933, 'train_steps_per_second': 16.356, 'total_flos': 59341335039360.0, 'train_loss': 0.2682370384943854, 'epoch': 40.0})

In [26]:
trainer.evaluate()

{'eval_loss': 0.42336711287498474,
 'eval_hamming_loss': 0.15834068843777582,
 'eval_f1_score': 0.44868038293130164,
 'eval_runtime': 0.2255,
 'eval_samples_per_second': 6849.946,
 'eval_steps_per_second': 217.247,
 'epoch': 40.0}

## **Deploying Model for Validation Set**

In [27]:
outputs = trainer.predict(validset)

In [28]:
valid_preds = (outputs.predictions>0).astype('int32')
valid_labels = outputs.label_ids

In [29]:
print("Validation F1 Score: ", f1_score(valid_labels,valid_preds, average= 'macro'), "\nValidation Hamming Loss: ",hamming_loss(valid_labels,valid_preds))

Validation F1 Score:  0.44868038293130164 
Validation Hamming Loss:  0.15834068843777582


In [30]:
# After training, let us check the best checkpoint
# We need this for Inference
best_model_checkpoint_step = trainer.state.best_model_checkpoint.split('-')[-1]
print(f"The best model was saved at step {best_model_checkpoint_step}.")

The best model was saved at step 6800.


In [31]:
wandb.finish()

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
eval/f1_score,▁▁▃▅▅▆▆▇▇▇▇▇▇▇▇▇██████▇█████████████████
eval/hamming_loss,██▅▄▃▂▁▂▁▂▁▁▁▁▁▁▁▁▁▁▁▁▂▂▁▁▂▂▂▂▂▂▂▂▂▂▂▂▂▂
eval/loss,█▇▅▃▃▂▁▁▁▁▁▁▁▁▁▂▂▂▂▂▂▂▂▃▃▃▃▄▃▃▄▄▄▄▄▄▄▅▅▅
eval/runtime,▁▁▁▁▂▂▁▁▁▃▂▁▁▁▃▃▂▂▂█▄▂▂▂▄▃▂▂▂▃▄▅▃▃▃▃▄▃▅▄
eval/samples_per_second,████▅▆█▇█▄▅▇█▇▄▃▇▆▆▁▃▅▅▅▃▄▅▅▅▄▃▂▄▄▄▄▃▄▂▃
eval/steps_per_second,████▅▆█▇█▄▅▇█▇▄▃▇▆▆▁▃▅▅▅▃▄▅▅▅▄▃▂▄▄▄▄▃▄▂▃
train/epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
train/global_step,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
train/grad_norm,▇▄▃▆▃▅▆▃▂▅▃▃▆▅▄▂▁▆▆▇▄▃▄▄▄█▅▄▃▄▄▂▇▄▃▅█▄█▇
train/learning_rate,████▇▇▇▇▇▆▆▆▆▆▆▅▅▅▅▅▄▄▄▄▄▄▃▃▃▃▃▂▂▂▂▂▂▁▁▁

0,1
eval/f1_score,0.44868
eval/hamming_loss,0.15834
eval/loss,0.42337
eval/runtime,0.2255
eval/samples_per_second,6849.946
eval/steps_per_second,217.247
total_flos,59341335039360.0
train/epoch,40.0
train/global_step,7760.0
train/grad_norm,0.4989


## **Model Inference on Test Set from Kaggle**

In [32]:
test_df = pd.read_csv('/content/drive/MyDrive/hw5/emotion-detection-spring2014/test.csv')

In [33]:
test_df.head(2)

Unnamed: 0,ID,Tweet,anger,anticipation,disgust,fear,joy,love,optimism,pessimism,sadness,surprise,trust
0,2018-01559,@Adnan__786__ @AsYouNotWish Dont worry Indian ...,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE
1,2018-03739,"Academy of Sciences, eschews the normally sobe...",NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE


In [34]:
# Convert the list of texts into a list of lists; each inner list contains the vocabulary indices for a text
list_of_list_of_indices = [tokenizer(text, imdb_vocab) for text in test_df['Tweet'].values]

# Compute the offsets for each text in the concatenated tensor
offsets = [0] + [len(i) for i in list_of_list_of_indices]
offsets = torch.tensor(offsets[:-1]).cumsum(dim=0)

# Concatenate all text indices into a single tensor
indices = torch.cat([torch.tensor(i, dtype=torch.int64) for i in list_of_list_of_indices])

In [35]:
# put model in evaluation mode
model.eval()

# get outputs (logits) from model
outputs = model(indices, offsets)
outputs

SequenceClassifierOutput(loss=None, logits=tensor([[ 1.4171, -1.7904,  0.9803,  ..., -2.5416, -4.1564, -4.3358],
        [-3.1547, -2.3462, -2.6858,  ..., -1.8907, -5.1289, -3.7264],
        [-6.2990, -0.5413, -5.6393,  ..., -3.8259, -4.1584, -0.4776],
        ...,
        [ 1.5871, -3.0663,  0.3265,  ..., -5.0263, -8.1408, -8.0116],
        [-5.8920, -1.4668, -5.5915,  ..., -4.8247, -3.2419, -1.1149],
        [-2.6097, -0.6212, -2.4520,  ..., -4.7579, -2.2754, -5.1601]],
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

In [36]:
predictions = (outputs.logits>=0).int()

In [37]:
predict_df = test_df.drop(['ID','Tweet'],axis=1)

In [38]:
predict_X = test_df[['ID']].copy()

In [39]:
predict_map_df = pd.DataFrame(predictions.numpy(), columns = predict_df.columns)

In [40]:
test_df = pd.concat([predict_X,predict_map_df],axis=1)

In [41]:
test_df.head(5)

Unnamed: 0,ID,anger,anticipation,disgust,fear,joy,love,optimism,pessimism,sadness,surprise,trust
0,2018-01559,1,0,1,0,0,0,0,0,0,0,0
1,2018-03739,0,0,0,0,0,0,0,0,0,0,0
2,2018-00385,0,0,0,0,1,0,1,0,0,0,0
3,2018-03001,0,0,0,0,0,0,0,0,1,0,0
4,2018-01988,1,0,1,0,0,0,0,0,0,0,0


In [42]:
test_df.to_csv('test_set_predictions_kaggle')