In [None]:
!pip install transformers datasets torch scikit-learn
import re
from transformers import DistilBertTokenizerFast
from datasets import Dataset
import pandas as pd


# The train/dev splits are headerless, tab-separated files; the two columns
# are named "label" and "text" on load.
train_df = pd.read_csv(
    "train_enc.tsv",
    sep="\t",
    header=None,
    names=["label", "text"],
)
dev_df = pd.read_csv(
    "dev_enc.tsv",
    sep="\t",
    header=None,
    names=["label", "text"],
)

# Wrap each DataFrame in a Hugging Face Dataset.
train_dataset, dev_dataset = (
    Dataset.from_pandas(frame) for frame in (train_df, dev_df)
)

# Fast tokenizer matching the "distilbert-base-uncased" checkpoint.
tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")


def tokenize_function(examples):
    """Tokenize the ``"text"`` entries of a batch with the module-level tokenizer.

    Every sequence is truncated and padded to the maximum length, so all
    encoded examples come out the same length.

    Args:
        examples: A mapping with a ``"text"`` entry holding the text(s) to encode.

    Returns:
        The tokenizer's encoding for ``examples["text"]``.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, padding="max_length", truncation=True)
    return encoded

# Tokenize both splits in batches; the encoded columns are added to each dataset.
train_dataset = train_dataset.map(tokenize_function, batched=True)
dev_dataset = dev_dataset.map(tokenize_function, batched=True)

# (Start of the next notebook cell: the statement above and this import were
# fused onto one line by the export, which is a syntax error.)
from transformers import DistilBertForSequenceClassification, Trainer, TrainingArguments,DistilBertModel
from sklearn.metrics import accuracy_score
from transformers import EarlyStoppingCallback
import torch
import torch.nn as nn

class DistilBertLSTM(nn.Module):
    """DistilBERT encoder followed by an LSTM and a linear classification head.

    The token-level hidden states of the pretrained ``distilbert-base-uncased``
    encoder are fed through an LSTM; the LSTM's final hidden state is projected
    to ``num_labels`` logits.

    Args:
        lstm_hidden_size: Size of the LSTM hidden state (default 128).
        num_labels: Number of output classes (default 2).
    """

    def __init__(self, lstm_hidden_size=128, num_labels=2):
        super().__init__()
        self.distilbert = DistilBertModel.from_pretrained("distilbert-base-uncased")
        self.lstm = nn.LSTM(
            input_size=self.distilbert.config.hidden_size,
            hidden_size=lstm_hidden_size,
            batch_first=True,
        )
        self.fc = nn.Linear(lstm_hidden_size, num_labels)

    def forward(self, input_ids, attention_mask, labels=None):
        """Compute class logits, and the cross-entropy loss when labels are given.

        Args:
            input_ids: Token id tensor passed to the DistilBERT encoder.
            attention_mask: Mask passed to the encoder and used to find each
                sequence's real (unpadded) length. May be ``None``, in which
                case every position is treated as real.
            labels: Optional class-index tensor; when given, the loss is
                returned alongside the logits.

        Returns:
            ``(loss, logits)`` if ``labels`` is provided, otherwise ``logits``.
        """
        # Contextual token embeddings from the encoder.
        distilbert_output = self.distilbert(input_ids, attention_mask=attention_mask)
        hidden_states = distilbert_output.last_hidden_state

        if attention_mask is not None:
            # Pack the batch so the LSTM stops at each sequence's last real
            # token. Without this, h_n is the state after the trailing padding
            # positions, so the prediction depends on how much padding was added.
            # Assumes right-padded inputs (mask is a run of 1s followed by 0s).
            # pack_padded_sequence needs int64 lengths on the CPU, each >= 1.
            lengths = attention_mask.sum(dim=1).clamp(min=1).long().cpu()
            lstm_input = nn.utils.rnn.pack_padded_sequence(
                hidden_states, lengths, batch_first=True, enforce_sorted=False
            )
        else:
            lstm_input = hidden_states

        # h_n[-1] is the final hidden state of the last LSTM layer, returned in
        # the original batch order even though packing sorts internally.
        _, (h_n, _) = self.lstm(lstm_input)
        last_hidden_state = h_n[-1]

        # Project the sequence summary to class logits.
        logits = self.fc(last_hidden_state)

        if labels is None:
            return logits

        # Labels are provided (training / evaluation): also return the loss.
        loss_fct = nn.CrossEntropyLoss()
        loss = loss_fct(logits.view(-1, logits.size(-1)), labels.view(-1))
        return loss, logits

# Initialize the model (the constructor loads the pretrained
# "distilbert-base-uncased" encoder via from_pretrained)
model = DistilBertLSTM()


def compute_metrics(p):
    """Compute accuracy for an evaluation pass and print it.

    Args:
        p: Object exposing ``predictions`` (per-class scores, arg-maxed over
            axis 1) and ``label_ids`` (the reference labels).

    Returns:
        A dict with a single ``"accuracy"`` entry.
    """
    predicted_labels = p.predictions.argmax(axis=1)
    accuracy = accuracy_score(p.label_ids, predicted_labels)
    print(f"Evaluation Accuracy: {accuracy:.4f}")
    return dict(accuracy=accuracy)


# Training configuration: evaluate and checkpoint once per epoch so that the
# checkpoint with the best dev accuracy can be reloaded when training ends.
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    num_train_epochs=10,
    weight_decay=0.01,
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    # Mixed precision needs a GPU; fall back to full precision on CPU-only
    # machines instead of failing when the arguments are constructed.
    fp16=torch.cuda.is_available(),
    report_to=["none"],
    gradient_accumulation_steps=4,  # effective train batch: 16 * 4 per device
    warmup_steps=500,
)

# Stop early once accuracy has not improved by more than 0.01 for 2
# consecutive evaluations.
early_stopping = EarlyStoppingCallback(early_stopping_patience=2, early_stopping_threshold=0.01)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=dev_dataset,
    compute_metrics=compute_metrics,
    callbacks=[early_stopping],
)

# Train the model
trainer.train()

# Load the unlabeled test split (headerless TSV with a single "text" column)
# and tokenize it the same way as the training data.
# (The export fused the training call and this cell onto one line, which is a
# syntax error.)
test_df = pd.read_csv("test_enc_unlabeled.tsv", sep="\t", header=None, names=["text"])
test_dataset = Dataset.from_pandas(test_df)
test_dataset = test_dataset.map(tokenize_function, batched=True)

# Predicted class = index of the largest logit for each example.
test_predictions = trainer.predict(test_dataset).predictions.argmax(axis=1)

# Save predictions to file, one label per line
with open("upload_predictions.txt", "w") as f:
    f.writelines(f"{pred}\n" for pred in test_predictions)

print("Predictions saved to upload_predictions.txt")