In [1]:
!pip install transformers
!pip install datasets
!pip install transformers[torch]
!pip install accelerate -U
from abc import ABC, abstractmethod
import pandas as pd
import numpy as np


class Model(ABC):
    """
    Abstract class for a machine learning model. Whenever it is needed to
    implement a new model it should inherit and implement each of its methods.
    Each inheritted model might be implemented differently but should respect
    the signature of the abstract class.
    """

    def __init__(self, output_dir: str) -> None:
        self.output_dir = output_dir

    @abstractmethod
    def fit(self,
            x_train: pd.Series,
            y_train: pd.Series,
            x_dev: pd.Series = None,
            y_dev: pd.Series = None):
        """
        Abstract fit method that takes training text documents `x_train` and
        their labels `y_train` and train a model. `x_dev` and `y_dev` can be
        used to obtain cross-validation insights, early stopping, or simply
        ignore them.

        parameters:
            - `x_train` (pd.Series[str]) training text documents.
            - `y_train` (pd.Series[int]) training labels.
            - `x_dev` (pd.Series[str]) dev text documents.
            - `y_dev` (pd.Series[int]) dev labels.
        """
        pass

    @abstractmethod
    def predict(self, x: pd.Series) -> np.array:
        """
        Abstract method to perform classification on samples in `x`.

        parameters:
            - `x` (pd.Series[str]) sample to predict.

        returns:
            - `y_pred` (np.array[int]) class labels for sample `x`.
        """
        pass

    @abstractmethod
    def predict_proba(self, x: pd.Series) -> np.array:
        """
        Abstract method to estimate classification probabilities on samples in
        `x`.

        parameters:
            - `x` (pd.Series[str]) sample to predict.

        returns:
            - `y_pred` (np.array of floats with n classes columns) probability
              labels for sample `x`.
        """
        pass

    @abstractmethod
    def save_model(self) -> None:
        """
        Save model weights as a pickle python file in `self.output_dir` using
        its identifier `self.model_name`.
        """
        pass

    @abstractmethod
    def load_model(self, model_dirpath: str) -> None:
        """
        Load model weights. It takes directory path `model_dirpath` where the
        model necessary data is in.

        parameters:
            - `model_dirpath` (str) Directory path where the model is saved.
        """
        pass

Collecting datasets
  Downloading datasets-2.15.0-py3-none-any.whl (521 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m521.2/521.2 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow-hotfix (from datasets)
  Downloading pyarrow_hotfix-0.6-py3-none-any.whl (7.9 kB)
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pyarrow-hotfix, dill, multiprocess, datasets
Successfully installed datasets-2.15.0 dill-0.3.7 multiprocess-0.70.15 pyarrow-hotfix-0.6
Collecting accelerate>=0.20.3 (from transformers[torch])
  Downloading accelerate-0.25.0-py

In [2]:
from transformers import (AutoModelForSequenceClassification, AutoTokenizer,
                          TextClassificationPipeline, TrainingArguments,
                          Trainer, DataCollatorWithPadding)
from datasets import Dataset
import pandas as pd
import numpy as np
import os
import warnings
from typing import Tuple
import numpy as np


class TransformerModel(Model):
    """
    Huggingface Transformer model for classification such as BERT, DeBERTa,
    RoBERTa, etc.

    parameters:
        - `output_dir` (str) Directory path where the model outputs will be
          recorded. That is weights, predictions, etc.

        - `model_name` (str) Identifier of the model. It is used to recognize an
          instance of the class. For example, if multiple runs are executed with
          different parameters, `model_name` can be used to assign a different
          name. Also, when saving an instance of the model, it will create a
          directory using this parameters as its name and will be saved in
          `output_dir`.

        - `huggingface-path` (str) the name of the model in the hub of
          huggingface. For example: `bert-base-uncased` or
          `microsoft/deberta-v3-large`.

        - `checkpoint-path` (str) [optional] path to a huggingface checkpoint
        directory containing its configuration.

        - `epochs` (int) number of epochs for training the transformer.

        - `batch-size` (int) batch size used for training the transformer.

        - `random_state` (int) integer number to initialize the random state
          during the training process.

        - `lr` (float) learning rate for training the transformer.

        - `weight-decay` (float) weight decay penalty applied to the
          transformer.

        - `device` (str) Use `cpu` or `gpu`.
    """

    def __init__(self,
                 huggingface_path: str = "GroNLP/hateBERT",
                 checkpoint_path: str = None,
                 epochs: int = 4,
                 batch_size: int = 32,
                 random_state: int = 42,
                 lr: float = 2e-5,
                 weight_decay: float = 0.01,
                 num_labels: int = 2,
                 output_dir: str = "./default_output_dir",
                 device: str = "cpu") -> None:
        super(TransformerModel, self).__init__(output_dir)

        # Load model from hugginface hub.
        model = AutoModelForSequenceClassification.from_pretrained(
            huggingface_path,
            num_labels=num_labels,
            output_attentions=False,
            output_hidden_states=False,
        )

        # Load tokenizer from huggingface hub.
        tokenizer = AutoTokenizer.from_pretrained(huggingface_path,
                                                  do_lower_case=True)
        # Set class attributes.
        self.model = model
        self.tokenizer = tokenizer
        self.checkpoint_path = checkpoint_path
        self.epochs = epochs
        self.batch_size = batch_size
        self.random_state = random_state
        self.lr = lr
        self.weight_decay = weight_decay
        self.device = device
        self.num_labels = num_labels
        self.args = None
        self.trainer = None

    def set_training_args(self):
        self.args = TrainingArguments(
            output_dir=self.output_dir,
            evaluation_strategy="epoch",
            save_strategy="epoch",
            logging_strategy="epoch",
            learning_rate=self.lr,
            per_device_train_batch_size=self.batch_size,
            per_device_eval_batch_size=self.batch_size,
            num_train_epochs=self.epochs,
            weight_decay=self.weight_decay,
            seed=self.random_state,
            #data_seed=self.random_state,
            optim="adamw_hf")

    def tokenize(self, example: str):
        """
        Tokenize a sentence using the model tokenizer.
        """
        return self.tokenizer(example["text"], truncation=True)

    def build_loader(self, sentences: pd.Series, labels: pd.Series = None):
        """
        Create a Dataset loader from huggingface tokenizing each sentence.

        parameters:
            - `sentences` (pd.Series[str])
            - `labels` (pd.Series[int])
        """
        dataset = Dataset.from_dict({"text": sentences}
                                    | ({
                                        "label": labels
                                    } if labels is not None else {}))
        return dataset.map(self.tokenize, batched=True)

    def fit(self,
            x_train: pd.Series,
            y_train: pd.Series,
            x_dev: pd.Series = None,
            y_dev: pd.Series = None) -> None:
        """
        Fit method that takes training text documents `x_train` and their labels
        `y_train` and train a transformer based model. In this case the `x_dev`
        and `y_dev` are used to evaluate the model in each epoch. When saving
        the model, train and dev losses are saved too.

        parameters:
            - `x_train` (pd.Series[str]) training text documents.
            - `y_train` (pd.Series[int]) training labels.
            - `x_dev` (pd.Series[str]) dev text documents.
            - `y_dev` (pd.Series[int]) dev labels.
        """
        self.set_training_args()

        # Create data collator.
        data_collator = DataCollatorWithPadding(tokenizer=self.tokenizer,
                                                padding=True)

        # Create dataset loaders for train and dev sets.
        train = self.build_loader(sentences=x_train, labels=y_train)
        dev = self.build_loader(sentences=x_dev, labels=y_dev)

       # Move huggingface model to the device indicated.
        self.model = self.model.to(self.device)

        # Instance huggingface Trainer.
        self.trainer = Trainer(model=self.model,
                               args=self.args,
                               train_dataset=train,
                               eval_dataset=dev,
                               tokenizer=self.tokenizer,
                               data_collator=data_collator)

        # If there is any checkpoint provided, training is resumed from it.
        if self.checkpoint_path is not None:
            self.trainer.train(self.checkpoint_path)
        else:
            self.trainer.train()

    def predict_proba(self, x: pd.Series) -> np.array:
        pipe = TextClassificationPipeline(model=self.model,
                                          tokenizer=self.tokenizer,
                                          return_all_scores=True,
                                          framework="pt")
        preds = pipe(x.tolist())
        y_prob = np.array([[pred[i]["score"] for i in range(self.num_labels)]
                           for pred in preds])
        return y_prob

    def predict(self, x: pd.Series) -> np.array:
        y_prob = self.predict_proba(x)
        y_pred = np.argmax(y_prob, axis=1)
        return y_pred

    def save_model(self):
        """
        Save model weights and its configuration in `self.output_dir`. It
        follows huggingface save standards so the model can be re-loaded using
        huggingface `from_pretrained()` functionality.
        """
        if self.trainer is not None:
            os.makedirs(f"{self.output_dir}/model", exist_ok=True)
            self.trainer.save_model(output_dir=f"{self.output_dir}/model")
        else:
            warnings.warn(
                "Method ignored. Trying to save model without training it."
                "Please use `fit` before `save_model`",
                UserWarning,
            )

    def load_model(self, model_dirpath):
        """
        Load model weights. It takes directory path `model_dirpath` where the
        model necessary data is in.

        parameters:
            - `model_dirpath` (str) Directory path where the model is saved.
        """
        self.model = AutoModelForSequenceClassification.from_pretrained(
            model_dirpath)
        self.tokenizer = AutoTokenizer.from_pretrained(model_dirpath)

    def embed(self, x: pd.Series) -> Tuple[np.array, np.array]:
        inputs = self.tokenizer(x.tolist(),
                                truncation=True,
                                padding= "max_length",
                                max_length = 512,
                                return_tensors="pt")
        outputs = self.model(**inputs, output_hidden_states=True)

        # Get the last hidden state
        last_hidden_states = outputs.hidden_states[-1]

        # Get only the CLS token for each instance in `x` (the one used for classification).
        cls = last_hidden_states[:, 0, :]

        return last_hidden_states.detach().numpy(),cls.detach().numpy()

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [6]:
import pandas as pd

# Step 1: Load data from CSV files
train_data = pd.read_csv("drive/MyDrive/Dual Contrastive Approach/Data2/train.csv")
dev_data = pd.read_csv("drive/MyDrive/Dual Contrastive Approach/Data2/dev.csv")
test_data = pd.read_csv("drive/MyDrive/Dual Contrastive Approach/Data2/test.csv")

# Step 2: Preprocess the data, separating sentences and labels
x_train, y_train = train_data["cleaned_text"], train_data["class"]
x_dev, y_dev = dev_data["cleaned_text"], dev_data["class"]
x_test, y_test = test_data["cleaned_text"], test_data["class"]

# Step 3: Initialize the TransformerModel
model = TransformerModel(huggingface_path="bert-base-uncased",
                         epochs=4,
                         batch_size=16,
                         random_state=42,
                         lr=2e-5,
                         weight_decay=0.01,
                         num_labels=3,
                         device="cuda")

# Step 4: Train the model on the training data
model.fit(x_train, y_train, x_dev, y_dev)

# Set the output directory where you want to save the model
output_dir = "drive/MyDrive/Dual Contrastive Approach/Data2/model_implicit_bert"  # Replace this with your desired output directory

# Set the output_dir in the model instance
model.output_dir = output_dir

model.save_model()

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/15036 [00:00<?, ? examples/s]

Map:   0%|          | 0/3222 [00:00<?, ? examples/s]

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss
1,0.6693,0.592241
2,0.5033,0.604027
3,0.3717,0.69175
4,0.27,0.816377


In [7]:
from sklearn.metrics import classification_report
import torch

# Step 1: Load data from CSV files
train_data = pd.read_csv("drive/MyDrive/Dual Contrastive Approach/Data2/train.csv")
dev_data = pd.read_csv("drive/MyDrive/Dual Contrastive Approach/Data2/dev.csv")
test_data = pd.read_csv("drive/MyDrive/Dual Contrastive Approach/Data2/test.csv")

# Step 2: Preprocess the data, separating sentences and labels
x_train, y_train = train_data["cleaned_text"], train_data["class"]
x_dev, y_dev = dev_data["cleaned_text"], dev_data["class"]
x_test, y_test = test_data["cleaned_text"], test_data["class"]

# Assuming 'model' is your trained PyTorch model
device = torch.device("cpu" if torch.cuda.is_available() else "cpu")

# Move the model's parameters to the specified device
model.model.to(device)

y_test_pred = model.predict(x_test)

print(classification_report(y_test, y_test_pred, digits = 6))



              precision    recall  f1-score   support

           0   0.827416  0.829461  0.828437      2023
           1   0.634921  0.656371  0.645467      1036
           2   0.512195  0.386503  0.440559       163

    accuracy                       0.751397      3222
   macro avg   0.658177  0.624112  0.638155      3222
weighted avg   0.749574  0.751397  0.749983      3222



In [8]:
import pandas as pd

# Step 1: Load data from CSV files
train_data = pd.read_csv("drive/MyDrive/Dual Contrastive Approach/Data2/train.csv")
dev_data = pd.read_csv("drive/MyDrive/Dual Contrastive Approach/Data2/dev.csv")
test_data = pd.read_csv("drive/MyDrive/Dual Contrastive Approach/Data2/test.csv")

# Step 2: Preprocess the data, separating sentences and labels
x_train, y_train = train_data["cleaned_text"], train_data["class"]
x_dev, y_dev = dev_data["cleaned_text"], dev_data["class"]
x_test, y_test = test_data["cleaned_text"], test_data["class"]

# Step 3: Initialize the TransformerModel
model = TransformerModel(huggingface_path="GroNLP/hateBERT",
                         epochs=4,
                         batch_size=16,
                         random_state=42,
                         lr=2e-5,
                         weight_decay=0.01,
                         num_labels=3,
                         device="cuda")

# Step 4: Train the model on the training data
model.fit(x_train, y_train, x_dev, y_dev)

# Set the output directory where you want to save the model
output_dir = "drive/MyDrive/Dual Contrastive Approach/Data2/model_implicit_hatebert"  # Replace this with your desired output directory

# Set the output_dir in the model instance
model.output_dir = output_dir

model.save_model()

config.json:   0%|          | 0.00/1.24k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at GroNLP/hateBERT and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/151 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Map:   0%|          | 0/15036 [00:00<?, ? examples/s]

Map:   0%|          | 0/3222 [00:00<?, ? examples/s]

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss
1,0.6666,0.58922
2,0.5006,0.604239
3,0.3764,0.696627
4,0.2829,0.785835


In [9]:
from sklearn.metrics import classification_report
import torch

# Assuming 'model' is your trained PyTorch model
device = torch.device("cpu" if torch.cuda.is_available() else "cpu")

# Move the model's parameters to the specified device
model.model.to(device)

y_test_pred = model.predict(x_test)

# Print the classification report
print(classification_report(y_test, y_test_pred, digits = 6))



              precision    recall  f1-score   support

           0   0.817303  0.835887  0.826491      2023
           1   0.624404  0.632239  0.628297      1036
           2   0.538462  0.343558  0.419476       163

    accuracy                       0.745500      3222
   macro avg   0.660056  0.603895  0.624755      3222
weighted avg   0.741172  0.745500  0.742173      3222



In [10]:
import pandas as pd

# Step 1: Load data from CSV files
train_data = pd.read_csv("drive/MyDrive/Dual Contrastive Approach/Data2/train.csv")
dev_data = pd.read_csv("drive/MyDrive/Dual Contrastive Approach/Data2/dev.csv")
test_data = pd.read_csv("drive/MyDrive/Dual Contrastive Approach/Data2/test.csv")

# Step 2: Preprocess the data, separating sentences and labels
x_train, y_train = train_data["cleaned_text"], train_data["class"]
x_dev, y_dev = dev_data["cleaned_text"], dev_data["class"]
x_test, y_test = test_data["cleaned_text"], test_data["class"]

# Step 3: Initialize the TransformerModel
model = TransformerModel(huggingface_path="roberta-base",
                         epochs=4,
                         batch_size=16,
                         random_state=42,
                         lr=2e-5,
                         weight_decay=0.01,
                         num_labels=3,
                         device="cuda")

# Step 4: Train the model on the training data
model.fit(x_train, y_train, x_dev, y_dev)

# Set the output directory where you want to save the model
output_dir = "drive/MyDrive/Dual Contrastive Approach/Data2/model_implicit_roberta"  # Replace this with your desired output directory

# Set the output_dir in the model instance
model.output_dir = output_dir

model.save_model()

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight', 'classifier.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Map:   0%|          | 0/15036 [00:00<?, ? examples/s]

Map:   0%|          | 0/3222 [00:00<?, ? examples/s]

You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss
1,0.6791,0.602179
2,0.5525,0.590442
3,0.4721,0.609986
4,0.4021,0.664306


In [11]:
from sklearn.metrics import classification_report
import torch

# Assuming 'model' is your trained PyTorch model
device = torch.device("cpu" if torch.cuda.is_available() else "cpu")

# Move the model's parameters to the specified device
model.model.to(device)

y_test_pred = model.predict(x_test)

# Print the classification report
print(classification_report(y_test, y_test_pred, digits = 3))



              precision    recall  f1-score   support

           0      0.816     0.844     0.830      2023
           1      0.641     0.631     0.636      1036
           2      0.509     0.344     0.410       163

    accuracy                          0.750      3222
   macro avg      0.655     0.606     0.625      3222
weighted avg      0.744     0.750     0.746      3222



In [12]:
!pip install transformers
!pip install datasets
!pip install transformers[torch]
!pip install accelerate -U
!pip install sentencepiece
from abc import ABC, abstractmethod
import pandas as pd
import numpy as np
import pandas as pd
from transformers import DebertaTokenizer, DebertaForSequenceClassification

Collecting sentencepiece
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentencepiece
Successfully installed sentencepiece-0.1.99


In [14]:
# Step 1: Load data from CSV files
train_data = pd.read_csv("drive/MyDrive/Dual Contrastive Approach/Data2/train.csv")
dev_data = pd.read_csv("drive/MyDrive/Dual Contrastive Approach/Data2/dev.csv")
test_data = pd.read_csv("drive/MyDrive/Dual Contrastive Approach/Data2/test.csv")

# Step 2: Preprocess the data, separating sentences and labels
x_train, y_train = train_data["cleaned_text"], train_data["class"]
x_dev, y_dev = dev_data["cleaned_text"], dev_data["class"]
x_test, y_test = test_data["cleaned_text"], test_data["class"]

# Step 3: Initialize the TransformerModel
model = TransformerModel(huggingface_path="microsoft/deberta-base",
                         epochs=4,
                         batch_size=16,
                         random_state=42,
                         lr=2e-5,
                         weight_decay=0.01,
                         num_labels=3,
                         device="cuda")

# Step 4: Train the model on the training data
model.fit(x_train, y_train, x_dev, y_dev)

# Set the output directory where you want to save the model
output_dir = "drive/MyDrive/Dual Contrastive Approach/Data2/model_implicit_deberta"  # Replace this with your desired output directory

# Set the output_dir in the model instance
model.output_dir = output_dir

model.save_model()

config.json:   0%|          | 0.00/474 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/559M [00:00<?, ?B/s]

Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-base and are newly initialized: ['classifier.weight', 'pooler.dense.weight', 'pooler.dense.bias', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Map:   0%|          | 0/15036 [00:00<?, ? examples/s]

Map:   0%|          | 0/3222 [00:00<?, ? examples/s]

You're using a DebertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss
1,0.6759,0.605412
2,0.5333,0.59545
3,0.4304,0.657783
4,0.3383,0.730528


In [15]:
from sklearn.metrics import classification_report
import torch

# Assuming 'model' is your trained PyTorch model
device = torch.device("cpu" if torch.cuda.is_available() else "cpu")

# Move the model's parameters to the specified device
model.model.to(device)

y_test_pred = model.predict(x_test)

# Print the classification report
print(classification_report(y_test, y_test_pred, digits = 6))

              precision    recall  f1-score   support

           0   0.817693  0.826990  0.822315      2023
           1   0.620983  0.634170  0.627507      1036
           2   0.474576  0.343558  0.398577       163

    accuracy                       0.740534      3222
   macro avg   0.637751  0.601573  0.616133      3222
weighted avg   0.737085  0.740534  0.738240      3222



In [16]:
from sklearn.svm import SVC
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import make_pipeline
import tensorflow_hub as hub
import pickle
import pandas as pd
import numpy as np


class USETransformer(BaseEstimator, TransformerMixin):
    """
    Custom scikit-learn wrapper encoder/transformer that implements Universal
    Sentence Encoder. It follows scikit-learn conventions to be used in
    scikit-learn pipelines.
    """

    def fit(self, X, y):
        """
        Dummy fit implementation that implements identity function and
        passthrough its own instance classifier.
        """
        return self

    def transform(self, X):
        """
        Encode text documents and returns an array like of features.
        """
        module_url = "https://tfhub.dev/google/universal-sentence-encoder/4"
        encode = hub.load(module_url)
        return encode(X)


class USE_SVM(Model):
    """
    Support Vector Machine with Universal Sentence Encoder for codification.

    parameters:
        - `output_dir` (str) Directory path where the model outputs will be
          recorded. That is weights, predictions, etc.

        - `model_name` (str) Identifier of the model. It is used to recognize an
          instance of the class. For example, if multiple runs are executed with
          different parameters, `model_name` can be used to assign a different
          name. Also, when saving an instance of the model, it will create a
          directory using this parameters as its name and will be saved in
          `output_dir`.

        - `C` (float) Regularization parameter. The strength of the
          regularization is inversely proportional to C. Must be strictly
          positive. The penalty is a squared l2 penalty.

        - `kernel` (str) Specifies the kernel type to be used in the algorithm.
          If none is given, `rbf` will be used:
            - `linear`
            - `poly`
            - `rbf`
            - `sigmoid`
            - `precomputed`

        - `gamma` (float) Kernel coefficient for `rbf`, `poly` and `sigmoid`.

        - `probability` (bool) Whether to enable probability estimates.

        - `verbose` (bool) Enable verbose output during SVM training.

        - `class-weight` (bool) Set the parameter C of class i to
          class_weight[i]*C for SVC. If not given, all classes are supposed to
          have weight one. Good for unbalanced datasets.

        - `random_state` (int) Controls the pseudo random number generation.
    """

    def __init__(self,
                 output_dir: str = "./default_output_dir",
                 C: float = 1.0,
                 kernel: str = "rbf",
                 degree: int = 3,
                 gamma: str = "scale",
                 probability: bool = True,
                 verbose: bool = True,
                 class_weight: bool = True,
                 random_state: int = 0) -> None:
        # Define attributes.
        super().__init__(output_dir)
        self.kernel = kernel
        self.degree = degree
        self.gamma = gamma
        self.probability = probability
        self.verbose = verbose
        self.class_weight = class_weight
        self.random_state = random_state

        # Instance Universal Sentence Encoder. Note that is an custom
        # scikit-learn transformer.that can be used with the Pipeline
        # scikit-learn class.
        self.use = USETransformer()

        # Instance Support Vector Machine algorithm from scikit-learn.
        self.svm = SVC(C=C,
                       kernel=kernel,
                       degree=degree,
                       gamma=gamma,
                       probability=probability,
                       verbose=verbose,
                       class_weight="balanced" if class_weight else None,
                       random_state=random_state)

        # Make a scikit-learn pipeline combining the Universal Sentence Encoder,
        # and SVM.
        self.model = make_pipeline(self.use, self.svm)

    def fit(self,
            x_train: pd.Series,
            y_train: pd.Series,
            x_dev: pd.Series = None,
            y_dev: pd.Series = None) -> None:
        """
        Fit method that takes training text documents `x_train` and their labels
        `y_train` and train the pipeline USE + SVM. In this case the `x_dev` and
        `y_dev` sets are not used as dev sets in scikit-learn algorithms do not
        use early stopping criterias. All the series need to have the same
        shape.

        parameters:
            - `x_train` (pd.Series[str]) training text documents.
            - `y_train` (pd.Series[int]) training labels.
            - `x_dev` (pd.Series[str]) dev text documents.
            - `y_dev` (pd.Series[int]) dev labels.
        """
        self.model.fit(x_train, y_train)

    def predict(self, x: pd.Series) -> np.array:
        """
        Perform classification on samples in `x`.

        parameters:
            - `x` (pd.Series[str]) sample to predict.

        returns:
            - `y_pred` (np.array[int]) class labels for sample `x`.
        """
        return self.model.predict(x)

    def predict_proba(self, x: pd.Series) -> np.array:
        """
        Estimate classification probabilities on samples in `x`.

        parameters:
            - `x` (pd.Series[str]) sample to predict.

        returns:
            - `y_pred` (np.array of floats with n classes columns) probability
              labels for sample `x`.
        """
        return self.model.predict_proba(x)

    def save_model(self) -> None:
        """
        Save model weights as a pickle python file in `self.output_dir` using
        its identifier `self.model_name`.
        """
        pickle.dump(self.model, open(f"{self.output_dir}/model.pkl", "wb"))

    def load_model(self, model_dirpath: str) -> None:
        """
        Load model weights. It takes directory path `model_dirpath` and the
        refered directory has to contain a pickle file in it named `model.pkl`.

        parameters:
            - `model_dirpath` (str) Directory path where the model is saved.
        """
        with open(f"{model_dirpath}/model.pkl", 'rb') as model_pkl:
            self.model = pickle.load(model_pkl)

In [18]:
from google.colab import drive
drive.mount('/content/drive')

# Step 1: Load data from CSV files
train_data = pd.read_csv("drive/MyDrive/Dual Contrastive Approach/Data2/train.csv")
dev_data = pd.read_csv("drive/MyDrive/Dual Contrastive Approach/Data2/dev.csv")
test_data = pd.read_csv("drive/MyDrive/Dual Contrastive Approach/Data2/test.csv")

# Step 2: Preprocess the data, separating sentences and labels
x_train, y_train = train_data["cleaned_text"], train_data["class"]
x_dev, y_dev = dev_data["cleaned_text"], dev_data["class"]
x_test, y_test = test_data["cleaned_text"], test_data["class"]

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [19]:
import pandas as pd
from sklearn.metrics import classification_report

# Create an instance of USE_SVM with the specified parameters
svm_model = USE_SVM(
    C=1.0,
    kernel="rbf",
    degree=3,
    gamma="scale",
    probability=True,
    verbose=True,
    class_weight=True,
    random_state=42
)

# Fit the model on the training data
svm_model.fit(train_data["cleaned_text"], train_data["class"])

# Make predictions on the test data
predictions = svm_model.predict(test_data["cleaned_text"])

# Print the classification report
report = classification_report(test_data["class"], predictions)
print("Classification Report:\n", report)

[LibSVM]Classification Report:
               precision    recall  f1-score   support

           0       0.83      0.74      0.78      2023
           1       0.55      0.68      0.61      1036
           2       0.33      0.31      0.32       163

    accuracy                           0.70      3222
   macro avg       0.57      0.58      0.57      3222
weighted avg       0.72      0.70      0.70      3222



In [20]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

# TF-IDF vectorization
tfidf_vectorizer = TfidfVectorizer()
train_tfidf = tfidf_vectorizer.fit_transform(x_train)
dev_tfidf = tfidf_vectorizer.transform(x_dev)
test_tfidf = tfidf_vectorizer.transform(x_test)

# Support Vector Machine (SVM) classifier
svm_classifier = SVC(kernel='linear')  # You can choose different kernels (linear, rbf, etc.)
svm_classifier.fit(train_tfidf, y_train)

# Predictions
dev_predictions = svm_classifier.predict(dev_tfidf)
test_predictions = svm_classifier.predict(test_tfidf)

dev_accuracy = accuracy_score(y_dev, dev_predictions)
dev_report = classification_report(y_dev, dev_predictions, digits=3)

# Evaluation on test set
test_accuracy = accuracy_score(y_test, test_predictions)
test_report = classification_report(y_test, test_predictions, digits=3)

# Print results with three decimal places
print('Dev Set:')
print(f'Accuracy: {dev_accuracy:.3f}')
print('Classification Report:\n', dev_report)

print('\nTest Set:')
print(f'Accuracy: {test_accuracy:.3f}')
print('Classification Report:\n', test_report)

Dev Set:
Accuracy: 0.703
Classification Report:
               precision    recall  f1-score   support

           0      0.739     0.864     0.797      1990
           1      0.602     0.497     0.544      1054
           2      0.808     0.118     0.206       178

    accuracy                          0.703      3222
   macro avg      0.716     0.493     0.516      3222
weighted avg      0.698     0.703     0.682      3222


Test Set:
Accuracy: 0.713
Classification Report:
               precision    recall  f1-score   support

           0      0.746     0.875     0.805      2023
           1      0.614     0.489     0.545      1036
           2      0.826     0.117     0.204       163

    accuracy                          0.713      3222
   macro avg      0.729     0.494     0.518      3222
weighted avg      0.707     0.713     0.691      3222



In [21]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score, classification_report

# Create a pipeline with CountVectorizer and SVM
model = make_pipeline(CountVectorizer(), SVC(kernel='linear'))

# Train the model on the training data
model.fit(x_train, y_train)

y_dev_pred = model.predict(x_dev)

# Evaluate the model on the development set
dev_accuracy = accuracy_score(y_dev, y_dev_pred)
print(f"Development Set Accuracy: {dev_accuracy:.3f}")

# Predict on the test set
y_test_pred = model.predict(x_test)

# Evaluate the model on the test set
test_accuracy = accuracy_score(y_test, y_test_pred)
print(f"Test Set Accuracy: {test_accuracy:.3f}")

# Display classification report for the test set with three decimal places
print("Classification Report:")
print(classification_report(y_test, y_test_pred, digits=3))

Development Set Accuracy: 0.674
Test Set Accuracy: 0.681
Classification Report:
              precision    recall  f1-score   support

           0      0.745     0.818     0.780      2023
           1      0.553     0.493     0.521      1036
           2      0.372     0.178     0.241       163

    accuracy                          0.681      3222
   macro avg      0.557     0.496     0.514      3222
weighted avg      0.664     0.681     0.669      3222



In [15]:
import pandas as pd
import tensorflow as tf
from tensorflow.keras import layers, models, preprocessing
import numpy as np
from sklearn.metrics import classification_report
import tensorflow as tf
from tensorflow.keras import layers, models

# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

# Load data from CSV files
train_data = pd.read_csv("/content/drive/MyDrive/Dual Contrastive Approach/Data2/train.csv")
dev_data = pd.read_csv("/content/drive/MyDrive/Dual Contrastive Approach/Data2/dev.csv")
test_data = pd.read_csv("/content/drive/MyDrive/Dual Contrastive Approach/Data2/test.csv")

# Preprocess the data, separating sentences and labels
x_train, y_train = train_data["cleaned_text"], train_data["class"]
x_dev, y_dev = dev_data["cleaned_text"], dev_data["class"]
x_test, y_test = test_data["cleaned_text"], test_data["class"]

# Tokenize and pad the sequences
tokenizer = preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(x_train)

x_train = tokenizer.texts_to_sequences(x_train)
x_dev = tokenizer.texts_to_sequences(x_dev)
x_test = tokenizer.texts_to_sequences(x_test)

x_train = preprocessing.sequence.pad_sequences(x_train, maxlen=50, padding='post')
x_dev = preprocessing.sequence.pad_sequences(x_dev, maxlen=50, padding='post')
x_test = preprocessing.sequence.pad_sequences(x_test, maxlen=50, padding='post')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [18]:
from tensorflow.keras import layers, models
from tensorflow.keras.utils import to_categorical

def build_bichat_multi_class_model(embedding_dim=768, max_seq_length=50, cnn_filter_size=3, num_cnn_filters=256, num_lstm_neurons=256, dropout_rate=0.3, num_classes=3):
    input_layer = layers.Input(shape=(max_seq_length,))
    embedding_layer = layers.Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=embedding_dim, input_length=max_seq_length)(input_layer)

    # BERT layer (embedding)
    bert_output = layers.Bidirectional(layers.LSTM(units=embedding_dim, return_sequences=True))(embedding_layer)

    # Update the input shape for the next layer
    bert_output = layers.Reshape((max_seq_length, -1))(bert_output)

    # Deep CNN layer
    cnn_output = layers.Conv1D(filters=num_cnn_filters, kernel_size=cnn_filter_size, activation='relu')(bert_output)
    cnn_output = layers.MaxPooling1D(pool_size=3)(cnn_output)

    # BiLSTM layer
    lstm_output = layers.Bidirectional(layers.LSTM(units=num_lstm_neurons, return_sequences=True))(cnn_output)

    # Reduce dimensionality
    lstm_output = layers.TimeDistributed(layers.Dense(num_cnn_filters))(lstm_output)

    # High-level attention layer
    attention_output = layers.Attention(use_scale=False)([cnn_output, lstm_output])

    # Dense layer
    dense_output = layers.Dense(128, activation='relu')(attention_output)
    dense_output = layers.Dropout(rate=dropout_rate)(dense_output)

    # Global Average Pooling layer to reduce sequence length dimension
    global_avg_pooling = layers.GlobalAveragePooling1D()(attention_output)

    # Output layer for multi-class classification
    output_layer = layers.Dense(num_classes, activation='softmax')(global_avg_pooling)

    # Model compilation
    model = models.Model(inputs=input_layer, outputs=output_layer)
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

    return model

y_train_one_hot = to_categorical(y_train, num_classes=3)
y_dev_one_hot = to_categorical(y_dev, num_classes=3)

# Instantiate the multi-class model
bichat_multi_class_model = build_bichat_multi_class_model(num_classes=3)

# Print model summary
bichat_multi_class_model.summary()

# Compile and train the multi-class model
bichat_multi_class_model.fit(x_train, y_train_one_hot, epochs=10, batch_size=16, validation_data=(x_dev, y_dev_one_hot))

Model: "model_8"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_9 (InputLayer)        [(None, 50)]                 0         []                            
                                                                                                  
 embedding_8 (Embedding)     (None, 50, 768)              1392537   ['input_9[0][0]']             
                                                          6                                       
                                                                                                  
 bidirectional_16 (Bidirect  (None, 50, 1536)             9443328   ['embedding_8[0][0]']         
 ional)                                                                                           
                                                                                            

<keras.src.callbacks.History at 0x7ab6af6f0160>

In [19]:
y_pred_probs = bichat_multi_class_model.predict(x_test)

# Convert probabilities to class labels
y_pred = np.argmax(y_pred_probs, axis=1)

# Display the classification report
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.76      0.68      0.72      2023
           1       0.46      0.59      0.52      1036
           2       0.28      0.17      0.21       163

    accuracy                           0.62      3222
   macro avg       0.50      0.48      0.48      3222
weighted avg       0.64      0.62      0.63      3222



In [20]:
report = classification_report(y_test, y_pred, digits=3)
print(report)

              precision    recall  f1-score   support

           0      0.760     0.680     0.717      2023
           1      0.463     0.589     0.518      1036
           2      0.284     0.166     0.209       163

    accuracy                          0.624      3222
   macro avg      0.502     0.478     0.482      3222
weighted avg      0.640     0.624     0.628      3222

