In [1]:
!pip install transformers
!pip install datasets
!pip install transformers[torch]
!pip install accelerate -U
!rm -rf /kaggle/working/*
from abc import ABC, abstractmethod
import pandas as pd
import numpy as np

class Model(ABC):
    """
    Abstract base class for a text-classification model.

    Every new model must inherit from this class and implement each of its
    abstract methods. Implementations may differ internally but must respect
    the signatures declared here.

    parameters:
        - `output_dir` (str) Directory path kept on the instance as
          `self.output_dir`.
    """

    def __init__(self, output_dir: str) -> None:
        self.output_dir = output_dir

    @abstractmethod
    def fit(self,
            x_train: pd.Series,
            y_train: pd.Series,
            x_dev: pd.Series = None,
            y_dev: pd.Series = None) -> None:
        """
        Train the model on the text documents `x_train` and their labels
        `y_train`. `x_dev` and `y_dev` are optional: an implementation may use
        them for validation insights or early stopping, or simply ignore them.

        parameters:
            - `x_train` (pd.Series[str]) training text documents.
            - `y_train` (pd.Series[int]) training labels.
            - `x_dev` (pd.Series[str]) dev text documents.
            - `y_dev` (pd.Series[int]) dev labels.
        """

    @abstractmethod
    def predict(self, x: pd.Series) -> np.ndarray:
        """
        Perform classification on the samples in `x`.

        parameters:
            - `x` (pd.Series[str]) samples to predict.

        returns:
            - `y_pred` (np.ndarray[int]) class labels for the samples in `x`.
        """

    @abstractmethod
    def predict_proba(self, x: pd.Series) -> np.ndarray:
        """
        Estimate classification probabilities for the samples in `x`.

        parameters:
            - `x` (pd.Series[str]) samples to predict.

        returns:
            - `y_prob` (np.ndarray of floats, one column per class)
              class probabilities for the samples in `x`.
        """

    @abstractmethod
    def save_model(self) -> None:
        """
        Persist the model. The on-disk format and the exact location are
        chosen by each implementation; `self.output_dir` is available as the
        destination directory.
        """

    @abstractmethod
    def load_model(self, model_dirpath: str) -> None:
        """
        Load a previously saved model from the directory `model_dirpath`.

        parameters:
            - `model_dirpath` (str) Directory path where the model is saved.
        """



In [2]:
import os
import warnings

import numpy as np
import pandas as pd
import torch
from datasets import Dataset
from transformers import (AutoModelForSequenceClassification, AutoTokenizer,
                          TextClassificationPipeline, TrainingArguments,
                          Trainer, DataCollatorWithPadding, set_seed)


class TransformerModel(Model):
    """
    Huggingface Transformer model for classification such as BERT, DeBERTa,
    RoBERTa, etc.

    parameters:
        - `huggingface_path` (str) the name of the model in the hub of
          huggingface. For example: `bert-base-uncased` or
          `microsoft/deberta-v3-large`.

        - `checkpoint_path` (str) [optional] path to a huggingface checkpoint
          directory. When given, `fit` resumes training from it.

        - `epochs` (int) number of epochs for training the transformer.

        - `batch_size` (int) per-device batch size used for training and
          evaluation.

        - `random_state` (int) seed passed to `set_seed` and to the training
          arguments.

        - `lr` (float) learning rate for training the transformer.

        - `weight_decay` (float) weight decay penalty applied to the
          transformer.

        - `num_labels` (int) number of target classes.

        - `output_dir` (str) Directory path where the model outputs will be
          recorded. That is weights, checkpoints, etc.

        - `device` (str) torch device string, e.g. `cpu` or `cuda`.
    """

    def __init__(self,
                 huggingface_path: str = "bert-base-uncased",
                 checkpoint_path: str = None,
                 epochs: int = 4,
                 batch_size: int = 32,
                 random_state: int = 42,
                 lr: float = 2e-5,
                 weight_decay: float = 0.01,
                 num_labels: int = 2,
                 output_dir: str = "./default_output_dir",
                 device: str = "cpu") -> None:
        super().__init__(output_dir)

        # Seed before the model is created so that the newly initialised
        # classification head is reproducible.
        set_seed(random_state)

        # Load model from huggingface hub.
        self.model = AutoModelForSequenceClassification.from_pretrained(
            huggingface_path,
            num_labels=num_labels,
            output_attentions=False,
            output_hidden_states=False,
        )

        # Load tokenizer from huggingface hub.
        self.tokenizer = AutoTokenizer.from_pretrained(huggingface_path,
                                                       do_lower_case=True)

        # Set the remaining class attributes.
        self.checkpoint_path = checkpoint_path
        self.epochs = epochs
        self.batch_size = batch_size
        self.random_state = random_state
        self.lr = lr
        self.weight_decay = weight_decay
        self.device = device
        self.num_labels = num_labels
        self.args = None      # TrainingArguments, created by `set_training_args`.
        self.trainer = None   # Trainer, created by `fit`.

    def set_training_args(self, evaluate: bool = True):
        """
        Build the huggingface `TrainingArguments` from the instance
        hyper-parameters and store them in `self.args`.

        parameters:
            - `evaluate` (bool) when True (default) the model is evaluated
              at the end of every epoch; when False evaluation is disabled,
              which is required when no dev set is available.
        """
        self.args = TrainingArguments(
            output_dir=self.output_dir,
            evaluation_strategy="epoch" if evaluate else "no",
            save_strategy="epoch",
            logging_strategy="epoch",
            learning_rate=self.lr,
            per_device_train_batch_size=self.batch_size,
            per_device_eval_batch_size=self.batch_size,
            num_train_epochs=self.epochs,
            weight_decay=self.weight_decay,
            seed=self.random_state,
            optim="adamw_hf")

    def tokenize(self, example: dict):
        """
        Tokenize the `"text"` entry of `example` using the model tokenizer.
        It is applied through `Dataset.map(..., batched=True)`, so `example`
        is a batch whose `"text"` entry holds several sentences.
        """
        return self.tokenizer(example["text"], truncation=True)

    def build_loader(self, sentences: pd.Series, labels: pd.Series = None):
        """
        Create a huggingface `Dataset` and tokenize each sentence.

        parameters:
            - `sentences` (pd.Series[str])
            - `labels` (pd.Series[int]) [optional] when omitted the dataset
              has no `label` column.
        """
        columns = {"text": sentences}
        if labels is not None:
            columns["label"] = labels
        dataset = Dataset.from_dict(columns)
        return dataset.map(self.tokenize, batched=True)

    def fit(self,
            x_train: pd.Series,
            y_train: pd.Series,
            x_dev: pd.Series = None,
            y_dev: pd.Series = None) -> None:
        """
        Fit method that takes training text documents `x_train` and their labels
        `y_train` and train a transformer based model. When both `x_dev` and
        `y_dev` are given they are used to evaluate the model after each
        epoch; otherwise training runs without evaluation.

        parameters:
            - `x_train` (pd.Series[str]) training text documents.
            - `y_train` (pd.Series[int]) training labels.
            - `x_dev` (pd.Series[str]) [optional] dev text documents.
            - `y_dev` (pd.Series[int]) [optional] dev labels.
        """
        # Evaluation needs both texts and labels of the dev set.
        has_dev = x_dev is not None and y_dev is not None
        self.set_training_args(evaluate=has_dev)

        # Create data collator (pads every batch to its longest sequence).
        data_collator = DataCollatorWithPadding(tokenizer=self.tokenizer,
                                                padding=True)

        # Create datasets for the train and (optional) dev sets.
        train = self.build_loader(sentences=x_train, labels=y_train)
        dev = None
        if has_dev:
            dev = self.build_loader(sentences=x_dev, labels=y_dev)

        # Move huggingface model to the device indicated.
        self.model = self.model.to(self.device)

        # Instance huggingface Trainer.
        self.trainer = Trainer(model=self.model,
                               args=self.args,
                               train_dataset=train,
                               eval_dataset=dev,
                               tokenizer=self.tokenizer,
                               data_collator=data_collator)

        # If a checkpoint was provided training is resumed from it; a value
        # of None starts a regular training run.
        self.trainer.train(resume_from_checkpoint=self.checkpoint_path)

    def predict_proba(self, x: pd.Series) -> np.ndarray:
        """
        Estimate classification probabilities on samples in `x`.

        parameters:
            - `x` (pd.Series[str]) sample to predict.

        returns:
            - `y_prob` (np.ndarray of floats with n classes columns)
              probabilities for sample `x`; column `i` holds the score of
              class id `i`.
        """
        texts = x.tolist()
        n_classes = self.model.config.num_labels
        if not texts:
            return np.empty((0, n_classes), dtype=float)

        # Disable dropout for inference.
        self.model.eval()

        # Run the pipeline on whatever device the model currently lives on so
        # the inputs and the weights are always on the same device.
        pipe = TextClassificationPipeline(model=self.model,
                                          tokenizer=self.tokenizer,
                                          top_k=None,
                                          framework="pt",
                                          device=self.model.device)
        # Truncate so texts longer than the model's maximum length do not fail.
        preds = pipe(texts, truncation=True)

        # With `top_k=None` each prediction lists every class ordered by
        # score, so each score is placed in its column by label name.
        label_to_column = {
            label: int(idx)
            for idx, label in self.model.config.id2label.items()
        }
        y_prob = np.zeros((len(preds), n_classes), dtype=float)
        for row, scores in enumerate(preds):
            for entry in scores:
                y_prob[row, label_to_column[entry["label"]]] = entry["score"]
        return y_prob

    def predict(self, x: pd.Series) -> np.ndarray:
        """
        Perform classification on samples in `x`.

        parameters:
            - `x` (pd.Series[str]) sample to predict.

        returns:
            - `y_pred` (np.ndarray[int]) class labels for sample `x`.
        """
        y_prob = self.predict_proba(x)
        return np.argmax(y_prob, axis=1)

    def save_model(self) -> None:
        """
        Save model weights and its configuration in `{self.output_dir}/model`.
        It follows huggingface save standards so the model can be re-loaded
        using huggingface `from_pretrained()` functionality. If the model has
        not been trained with `fit`, nothing is saved and a warning is issued.
        """
        if self.trainer is None:
            warnings.warn(
                "Method ignored. Trying to save model without training it. "
                "Please use `fit` before `save_model`",
                UserWarning,
            )
            return

        model_dir = f"{self.output_dir}/model"
        os.makedirs(model_dir, exist_ok=True)
        self.trainer.save_model(output_dir=model_dir)

    def load_model(self, model_dirpath: str) -> None:
        """
        Load model weights and tokenizer. It takes directory path
        `model_dirpath` where the model necessary data is in. The model is
        not moved to `self.device` here.

        parameters:
            - `model_dirpath` (str) Directory path where the model is saved.
        """
        self.model = AutoModelForSequenceClassification.from_pretrained(
            model_dirpath)
        self.tokenizer = AutoTokenizer.from_pretrained(model_dirpath)
        # Keep the attribute in sync with the checkpoint that was loaded.
        self.num_labels = self.model.config.num_labels

    def embed(self, x: pd.Series) -> np.ndarray:
        """
        Compute one embedding per sample in `x`: the last hidden state at the
        first token position (the `[CLS]` token for BERT-style models).

        All of `x` is tokenized and forwarded as a single batch, so callers
        should pass chunks that fit in memory.

        parameters:
            - `x` (pd.Series[str]) samples to embed.

        returns:
            - (np.ndarray of floats) one row per sample in `x`.
        """
        inputs = self.tokenizer(x.tolist(),
                                truncation=True,
                                padding=True,
                                return_tensors="pt")

        # Move the model and the inputs to the configured device.
        self.model = self.model.to(self.device)
        inputs = {key: value.to(self.device) for key, value in inputs.items()}

        # Disable dropout so that embeddings are deterministic even when this
        # is called after training.
        self.model.eval()

        with torch.no_grad():
            outputs = self.model(**inputs, output_hidden_states=True)

        # Last hidden state, first token of every sequence.
        cls = outputs.hidden_states[-1][:, 0, :]

        # Convert the Pytorch tensor to a Numpy array on the CPU.
        return cls.cpu().detach().numpy()

2024-05-03 03:30:13.092406: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-05-03 03:30:13.092548: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-05-03 03:30:13.218313: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [3]:
import pandas as pd

# Step 1: Load the train / validation / test splits from CSV files.
train_data = pd.read_csv("/kaggle/input/mib-dataset/Data/train.csv")
dev_data = pd.read_csv("/kaggle/input/mib-dataset/Data/val.csv")
test_data = pd.read_csv("/kaggle/input/mib-dataset/Data/test.csv")

# Step 2: Separate the sentences from the labels. The `type` column contains
# the strings 'real' and 'social_spam', which are encoded as 0 and 1.
LABEL_TO_ID = {'real': 0, 'social_spam': 1}
x_train, y_train = train_data["text"], train_data["type"].map(LABEL_TO_ID)
x_dev, y_dev = dev_data["text"], dev_data["type"].map(LABEL_TO_ID)
x_test, y_test = test_data["text"], test_data["type"].map(LABEL_TO_ID)

# `Series.map` turns every value that is not a key of LABEL_TO_ID into NaN
# without complaining, so report such rows instead of letting them slip by.
for split_name, split_labels in (("train", y_train),
                                 ("dev", y_dev),
                                 ("test", y_test)):
    n_unmapped = int(split_labels.isna().sum())
    if n_unmapped:
        warnings.warn(
            f"{n_unmapped} row(s) of the {split_name} split have a `type` "
            f"outside {sorted(LABEL_TO_ID)} and were mapped to NaN.",
            UserWarning,
        )

# Step 3: Initialize the TransformerModel. The constructor loads the
# `bert-base-uncased` model and tokenizer from the huggingface hub.
model = TransformerModel(huggingface_path="bert-base-uncased",
                         epochs=1,
                         batch_size=30,
                         random_state=42,
                         lr=2e-5,
                         weight_decay=0.01,
                         num_labels=2,
                         device="cuda")

# Define the directory path where the model is saved
model_dirpath = "/kaggle/input/epoch1/model"

# Load the model: replaces the model and tokenizer created by the constructor
# with the ones stored in `model_dirpath`.
model.load_model(model_dirpath)

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [None]:
import torch

# Instantiate TransformerModel with GPU
model = TransformerModel(device="cuda")

# Define the directory path where the model is saved
model_dirpath = "/kaggle/input/epoch1/model"

# Load the model
model.load_model(model_dirpath)

from tqdm import tqdm

# Number of texts embedded per forward pass.
batch_size = 300


def embed_in_batches(embedder, texts, desc, batch_size):
    """
    Embed `texts` in consecutive chunks and return all embeddings stacked.

    parameters:
        - `embedder` object exposing `embed(pd.Series) -> np.ndarray`.
        - `texts` (pd.Series[str]) texts to embed.
        - `desc` (str) label shown on the progress bar.
        - `batch_size` (int) number of texts passed to `embed` at once.

    returns:
        - (np.ndarray) embeddings of every chunk concatenated in order.
    """
    chunks = []
    with tqdm(total=len(texts), desc=desc) as pbar:
        for start in range(0, len(texts), batch_size):
            # Positional slicing, independent of the Series index labels.
            batch = texts.iloc[start:start + batch_size]
            chunks.append(embedder.embed(batch))
            pbar.update(len(batch))
    return np.concatenate(chunks)


# # Get embeddings for train data
# train_embeddings = embed_in_batches(model, x_train, "Processing train data",
#                                     batch_size)
# np.save("/kaggle/working/train_embeddings.npy", train_embeddings)

# Get embeddings for dev data and save them to a .npy file.
dev_embeddings = embed_in_batches(model, x_dev, "Processing dev data",
                                  batch_size)
np.save("/kaggle/working/dev_embeddings.npy", dev_embeddings)

# Drop the reference so the array can be freed before the test split.
del dev_embeddings

# Get embeddings for test data and save them to a .npy file.
test_embeddings = embed_in_batches(model, x_test, "Processing test data",
                                   batch_size)
np.save("/kaggle/working/test_embeddings.npy", test_embeddings)
del test_embeddings

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Processing dev data: 100%|██████████| 1306221/1306221 [1:01:24<00:00, 354.56it/s]
Processing test data:  62%|██████▏   | 1567800/2513746 [1:05:53<44:41, 352.80it/s]  

In [None]:
# import pandas as pd

# # Step 1: Load data from CSV files
# train_data = pd.read_csv("drive/MyDrive/Dual Contrastive Approach/train.csv")
# dev_data = pd.read_csv("drive/MyDrive/Dual Contrastive Approach/dev.csv")
# test_data = pd.read_csv("drive/MyDrive/Dual Contrastive Approach/test.csv")

# # Step 2: Preprocess the data, separating sentences and labels
# x_train, y_train = train_data["cleaned_text"], train_data["label"]
# x_dev, y_dev = dev_data["cleaned_text"], dev_data["label"]
# x_test, y_test = test_data["cleaned_text"], test_data["label"]

# # Step 3: Initialize the TransformerModel
# model = TransformerModel(huggingface_path="GroNLP/hateBERT",
#                          epochs=4,
#                          batch_size=16,
#                          random_state=42,
#                          lr=2e-5,
#                          weight_decay=0.01,
#                          num_labels=3,
#                          device="cuda")

# # Step 4: Train the model on the training data
# model.fit(x_train, y_train, x_dev, y_dev)

# # Set the output directory where you want to save the model
# output_dir = "drive/MyDrive/Dual Contrastive Approach/model_implicit_hatebert"  # Replace this with your desired output directory

# # Set the output_dir in the model instance
# model.output_dir = output_dir

# model.save_model()

In [None]:
# from sklearn.metrics import classification_report
# import torch

# # Assuming 'model' is your trained PyTorch model
# device = torch.device("cpu" if torch.cuda.is_available() else "cpu")

# # Move the model's parameters to the specified device
# model.model.to(device)

# y_test_pred = model.predict(x_test)

# # Print the classification report
# print(classification_report(y_test, y_test_pred, digits = 6))

In [None]:
# import pandas as pd

# # Step 1: Load data from CSV files
# train_data = pd.read_csv("drive/MyDrive/Dual Contrastive Approach/train.csv")
# dev_data = pd.read_csv("drive/MyDrive/Dual Contrastive Approach/dev.csv")
# test_data = pd.read_csv("drive/MyDrive/Dual Contrastive Approach/test.csv")

# # Step 2: Preprocess the data, separating sentences and labels
# x_train, y_train = train_data["cleaned_text"], train_data["label"]
# x_dev, y_dev = dev_data["cleaned_text"], dev_data["label"]
# x_test, y_test = test_data["cleaned_text"], test_data["label"]

# # Step 3: Initialize the TransformerModel
# model = TransformerModel(huggingface_path="roberta-base",
#                          epochs=4,
#                          batch_size=16,
#                          random_state=42,
#                          lr=2e-5,
#                          weight_decay=0.01,
#                          num_labels=3,
#                          device="cuda")

# # Step 4: Train the model on the training data
# model.fit(x_train, y_train, x_dev, y_dev)

# # Set the output directory where you want to save the model
# output_dir = "drive/MyDrive/Dual Contrastive Approach/model_implicit_roberta"  # Replace this with your desired output directory

# # Set the output_dir in the model instance
# model.output_dir = output_dir

# model.save_model()

In [None]:
from sklearn.metrics import classification_report
import torch

# Evaluation runs on the CPU: the model is moved there before predicting.
# NOTE(review): this used to be a conditional on `torch.cuda.is_available()`
# whose two branches both selected "cpu". Running on the GPU instead requires
# the classification pipeline to use the same device as the model - confirm
# that before changing the device here.
device = torch.device("cpu")

# Move the model's parameters to the specified device
model.model.to(device)

# Predicted class ids for the test texts.
y_test_pred = model.predict(x_test)

# Print the classification report
print(classification_report(y_test, y_test_pred, digits=6))

In [None]:
# !pip install transformers
# !pip install datasets
# !pip install transformers[torch]
# !pip install accelerate -U
# !pip install sentencepiece
# from abc import ABC, abstractmethod
# import pandas as pd
# import numpy as np
# import pandas as pd
# from transformers import DebertaTokenizer, DebertaForSequenceClassification

In [None]:
# # Step 1: Load data from CSV files
# train_data = pd.read_csv("drive/MyDrive/Dual Contrastive Approach/train.csv")
# dev_data = pd.read_csv("drive/MyDrive/Dual Contrastive Approach/dev.csv")
# test_data = pd.read_csv("drive/MyDrive/Dual Contrastive Approach/test.csv")

# # Step 2: Preprocess the data, separating sentences and labels
# x_train, y_train = train_data["cleaned_text"], train_data["label"]
# x_dev, y_dev = dev_data["cleaned_text"], dev_data["label"]
# x_test, y_test = test_data["cleaned_text"], test_data["label"]

# # Step 3: Initialize the TransformerModel
# model = TransformerModel(huggingface_path="microsoft/deberta-base",
#                          epochs=4,
#                          batch_size=16,
#                          random_state=42,
#                          lr=2e-5,
#                          weight_decay=0.01,
#                          num_labels=3,
#                          device="cuda")

# # Step 4: Train the model on the training data
# model.fit(x_train, y_train, x_dev, y_dev)

# # Set the output directory where you want to save the model
# output_dir = "drive/MyDrive/Dual Contrastive Approach/model_implicit_deberta"  # Replace this with your desired output directory

# # Set the output_dir in the model instance
# model.output_dir = output_dir

# model.save_model()

In [None]:
# from sklearn.metrics import classification_report
# import torch

# # Assuming 'model' is your trained PyTorch model
# device = torch.device("cpu" if torch.cuda.is_available() else "cpu")

# # Move the model's parameters to the specified device
# model.model.to(device)

# y_test_pred = model.predict(x_test)

# # Print the classification report
# print(classification_report(y_test, y_test_pred, digits = 6))

In [None]:
# from sklearn.svm import SVC
# from sklearn.base import BaseEstimator, TransformerMixin
# from sklearn.pipeline import make_pipeline
# import tensorflow_hub as hub
# import pickle
# import pandas as pd
# import numpy as np


# class USETransformer(BaseEstimator, TransformerMixin):
#     """
#     Custom scikit-learn wrapper encoder/transformer that implements Universal
#     Sentence Encoder. It follows scikit-learn conventions to be used in
#     scikit-learn pipelines.
#     """

#     def fit(self, X, y):
#         """
#         Dummy fit implementation that implements identity function and
#         passthrough its own instance classifier.
#         """
#         return self

#     def transform(self, X):
#         """
#         Encode text documents and returns an array like of features.
#         """
#         module_url = "https://tfhub.dev/google/universal-sentence-encoder/4"
#         encode = hub.load(module_url)
#         return encode(X)


# class USE_SVM(Model):
#     """
#     Support Vector Machine with Universal Sentence Encoder for codification.

#     parameters:
#         - `output_dir` (str) Directory path where the model outputs will be
#           recorded. That is weights, predictions, etc.

#         - `model_name` (str) Identifier of the model. It is used to recognize an
#           instance of the class. For example, if multiple runs are executed with
#           different parameters, `model_name` can be used to assign a different
#           name. Also, when saving an instance of the model, it will create a
#           directory using this parameters as its name and will be saved in
#           `output_dir`.

#         - `C` (float) Regularization parameter. The strength of the
#           regularization is inversely proportional to C. Must be strictly
#           positive. The penalty is a squared l2 penalty.

#         - `kernel` (str) Specifies the kernel type to be used in the algorithm.
#           If none is given, `rbf` will be used:
#             - `linear`
#             - `poly`
#             - `rbf`
#             - `sigmoid`
#             - `precomputed`

#         - `gamma` (float) Kernel coefficient for `rbf`, `poly` and `sigmoid`.

#         - `probability` (bool) Whether to enable probability estimates.

#         - `verbose` (bool) Enable verbose output during SVM training.

#         - `class-weight` (bool) Set the parameter C of class i to
#           class_weight[i]*C for SVC. If not given, all classes are supposed to
#           have weight one. Good for unbalanced datasets.

#         - `random_state` (int) Controls the pseudo random number generation.
#     """

#     def __init__(self,
#                  output_dir: str = "./default_output_dir",
#                  C: float = 1.0,
#                  kernel: str = "rbf",
#                  degree: int = 3,
#                  gamma: str = "scale",
#                  probability: bool = True,
#                  verbose: bool = True,
#                  class_weight: bool = True,
#                  random_state: int = 0) -> None:
#         # Define attributes.
#         super().__init__(output_dir)
#         self.kernel = kernel
#         self.degree = degree
#         self.gamma = gamma
#         self.probability = probability
#         self.verbose = verbose
#         self.class_weight = class_weight
#         self.random_state = random_state

#         # Instance Universal Sentence Encoder. Note that is an custom
#         # scikit-learn transformer.that can be used with the Pipeline
#         # scikit-learn class.
#         self.use = USETransformer()

#         # Instance Support Vector Machine algorithm from scikit-learn.
#         self.svm = SVC(C=C,
#                        kernel=kernel,
#                        degree=degree,
#                        gamma=gamma,
#                        probability=probability,
#                        verbose=verbose,
#                        class_weight="balanced" if class_weight else None,
#                        random_state=random_state)

#         # Make a scikit-learn pipeline combining the Universal Sentence Encoder,
#         # and SVM.
#         self.model = make_pipeline(self.use, self.svm)

#     def fit(self,
#             x_train: pd.Series,
#             y_train: pd.Series,
#             x_dev: pd.Series = None,
#             y_dev: pd.Series = None) -> None:
#         """
#         Fit method that takes training text documents `x_train` and their labels
#         `y_train` and train the pipeline USE + SVM. In this case the `x_dev` and
#         `y_dev` sets are not used as dev sets in scikit-learn algorithms do not
#         use early stopping criterias. All the series need to have the same
#         shape.

#         parameters:
#             - `x_train` (pd.Series[str]) training text documents.
#             - `y_train` (pd.Series[int]) training labels.
#             - `x_dev` (pd.Series[str]) dev text documents.
#             - `y_dev` (pd.Series[int]) dev labels.
#         """
#         self.model.fit(x_train, y_train)

#     def predict(self, x: pd.Series) -> np.array:
#         """
#         Perform classification on samples in `x`.

#         parameters:
#             - `x` (pd.Series[str]) sample to predict.

#         returns:
#             - `y_pred` (np.array[int]) class labels for sample `x`.
#         """
#         return self.model.predict(x)

#     def predict_proba(self, x: pd.Series) -> np.array:
#         """
#         Estimate classification probabilities on samples in `x`.

#         parameters:
#             - `x` (pd.Series[str]) sample to predict.

#         returns:
#             - `y_pred` (np.array of floats with n classes columns) probability
#               labels for sample `x`.
#         """
#         return self.model.predict_proba(x)

#     def save_model(self) -> None:
#         """
#         Save model weights as a pickle python file in `self.output_dir` using
#         its identifier `self.model_name`.
#         """
#         pickle.dump(self.model, open(f"{self.output_dir}/model.pkl", "wb"))

#     def load_model(self, model_dirpath: str) -> None:
#         """
#         Load model weights. It takes directory path `model_dirpath` and the
#         refered directory has to contain a pickle file in it named `model.pkl`.

#         parameters:
#             - `model_dirpath` (str) Directory path where the model is saved.
#         """
#         with open(f"{model_dirpath}/model.pkl", 'rb') as model_pkl:
#             self.model = pickle.load(model_pkl)

In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

# # Step 1: Load data from CSV files
# train_data = pd.read_csv("drive/MyDrive/Dual Contrastive Approach/train.csv")
# dev_data = pd.read_csv("drive/MyDrive/Dual Contrastive Approach/dev.csv")
# test_data = pd.read_csv("drive/MyDrive/Dual Contrastive Approach/test.csv")

# # Step 2: Preprocess the data, separating sentences and labels
# x_train, y_train = train_data["cleaned_text"], train_data["label"]
# x_dev, y_dev = dev_data["cleaned_text"], dev_data["label"]
# x_test, y_test = test_data["cleaned_text"], test_data["label"]

In [None]:
# import pandas as pd
# from sklearn.metrics import classification_report

# # Create an instance of USE_SVM with the specified parameters
# svm_model = USE_SVM(
#     C=1.0,
#     kernel="rbf",
#     degree=3,
#     gamma="scale",
#     probability=True,
#     verbose=True,
#     class_weight=True,
#     random_state=42
# )

# # Fit the model on the training data
# svm_model.fit(train_data["cleaned_text"], train_data["class"])

# # Make predictions on the test data
# predictions = svm_model.predict(test_data["cleaned_text"])

# # Print the classification report
# report = classification_report(test_data["class"], predictions)
# print("Classification Report:\n", report)