<a href="https://colab.research.google.com/github/AkankshaGiliyal/binaryClassifier/blob/main/main.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
!pip install transformers datasets torch scikit-learn
import re
from transformers import DistilBertTokenizerFast
from datasets import Dataset
import pandas as pd


train_df = pd.read_csv("train_enc.tsv", sep="\t", header=None, names=["label", "text"])
dev_df = pd.read_csv("dev_enc.tsv", sep="\t", header=None, names=["label", "text"])


train_dataset = Dataset.from_pandas(train_df)
dev_dataset = Dataset.from_pandas(dev_df)


tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")


def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)

train_dataset = train_dataset.map(tokenize_function, batched=True)
dev_dataset = dev_dataset.map(tokenize_function, batched=True)
from transformers import DistilBertForSequenceClassification, Trainer, TrainingArguments,DistilBertModel
from sklearn.metrics import accuracy_score
from transformers import EarlyStoppingCallback
import torch
import torch.nn as nn

class DistilBertLSTM(nn.Module):
    def __init__(self):
        super(DistilBertLSTM, self).__init__()
        self.distilbert = DistilBertModel.from_pretrained("distilbert-base-uncased")
        self.lstm = nn.LSTM(input_size=self.distilbert.config.hidden_size, hidden_size=128, batch_first=True)
        self.fc = nn.Linear(128, 2)

    def forward(self, input_ids, attention_mask, labels=None):

        distilbert_output = self.distilbert(input_ids, attention_mask=attention_mask)
        hidden_states = distilbert_output.last_hidden_state


        lstm_out, (h_n, c_n) = self.lstm(hidden_states)
        last_hidden_state = h_n[-1]


        logits = self.fc(last_hidden_state)


        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, 2), labels.view(-1))
            return loss, logits
        else:
            return logits


model = DistilBertLSTM()


def compute_metrics(p):
    preds = p.predictions.argmax(axis=1)
    acc = accuracy_score(p.label_ids, preds)
    print(f"Evaluation Accuracy: {acc:.4f}")
    return {"accuracy": acc}


training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    num_train_epochs=10,
    weight_decay=0.01,
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    fp16=True,
    report_to=["none"],
    gradient_accumulation_steps=4,
    warmup_steps=500,

)
early_stopping = EarlyStoppingCallback(early_stopping_patience=2, early_stopping_threshold=0.01)


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=dev_dataset,
    compute_metrics=compute_metrics,
    callbacks=[early_stopping],
)


trainer.train()
test_df = pd.read_csv("test_enc_unlabeled.tsv", sep="\t", header=None, names=["text"])
test_dataset = Dataset.from_pandas(test_df)
test_dataset = test_dataset.map(tokenize_function, batched=True)

test_predictions = trainer.predict(test_dataset).predictions.argmax(axis=1)


with open("upload_predictions.txt", "w") as f:
    for pred in test_predictions:
        f.write(f"{pred}\n")

print("Predictions saved to upload_predictions.txt")

Collecting datasets
  Downloading datasets-3.3.2-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Map:   0%|          | 0/16220 [00:00<?, ? examples/s]

Map:   0%|          | 0/2027 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]



Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.674933,0.585594
2,0.641200,0.520872,0.74741
3,0.641200,0.426912,0.810557
4,0.404600,0.393281,0.844598
5,0.404600,0.387518,0.860878
6,0.210000,0.417954,0.862358
7,0.210000,0.399408,0.876665
8,0.120500,0.469088,0.875678
9,0.120500,0.484594,0.873705


Evaluation Accuracy: 0.5856
Evaluation Accuracy: 0.7474
Evaluation Accuracy: 0.8106
Evaluation Accuracy: 0.8446
Evaluation Accuracy: 0.8609
Evaluation Accuracy: 0.8624
Evaluation Accuracy: 0.8767
Evaluation Accuracy: 0.8757
Evaluation Accuracy: 0.8737


Map:   0%|          | 0/2028 [00:00<?, ? examples/s]

Predictions saved to upload_predictions.txt
