In [None]:

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, random_split
import random
import numpy as np
from transformers import (
    BertTokenizer,
    BertForMaskedLM,
    BertModel,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments
)
from datasets import Dataset as HFDataset
from sklearn.metrics import accuracy_score, classification_report
from tqdm import tqdm


# Seed every RNG used in this notebook (PyTorch, NumPy, Python) with the same value.
torch.manual_seed(42)
np.random.seed(42)
random.seed(42)

# Run on the GPU when CUDA is available, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("Using device:", device)


def load_unlabeled_data(file_path):
    """Load a one-sentence-per-line text file as a Hugging Face dataset.

    Each line is stripped of surrounding whitespace; lines that are empty
    after stripping are dropped. Every remaining line becomes one record
    with a single ``"text"`` field.

    Args:
        file_path: Path of the UTF-8 text file to read.

    Returns:
        The result of ``HFDataset.from_list`` applied to the list of
        ``{"text": ...}`` records, in file order.
    """
    records = []
    with open(file_path, 'r', encoding='utf-8') as handle:
        for raw_line in handle:
            stripped = raw_line.strip()
            if stripped:
                records.append({"text": stripped})
    return HFDataset.from_list(records)

# Read the unlabeled corpus used for masked-language-model training below.
unlabeled_dataset = load_unlabeled_data('/content/cleaned_train.txt')
print("Unlabeled dataset size:", len(unlabeled_dataset))

# Tokenizer for the 'bert-base-uncased' checkpoint; it is reused later in the
# notebook for the labeled classification data as well.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def tokenize_function(examples):
    """Tokenize the ``"text"`` field of a batch with the module-level ``tokenizer``.

    Sequences are truncated and padded to a fixed length of 128.

    Args:
        examples: Mapping with a ``"text"`` entry holding the texts to encode.

    Returns:
        Whatever the tokenizer returns for that batch.
    """
    encode_options = dict(truncation=True, padding="max_length", max_length=128)
    return tokenizer(examples["text"], **encode_options)

# Tokenize the whole unlabeled corpus in batches. Note that this rebinds
# `unlabeled_dataset` to the tokenized version.
unlabeled_dataset = unlabeled_dataset.map(tokenize_function, batched=True)
# Collator configured for masked-language-modeling with a masking probability of 0.15.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=0.15)

# Training configuration for the MLM stage; intermediate checkpoints go to
# /content/mlm_model (the final model is saved separately below).
mlm_training_args = TrainingArguments(
    output_dir="/content/mlm_model",
    overwrite_output_dir=True,
    num_train_epochs=10,
    per_device_train_batch_size=32,
    save_steps=500,
    save_total_limit=2,
    logging_steps=100,
)

# Start from the pretrained 'bert-base-uncased' weights and continue MLM
# training on the unlabeled corpus.
mlm_model = BertForMaskedLM.from_pretrained('bert-base-uncased').to(device)
trainer = Trainer(
    model=mlm_model,
    args=mlm_training_args,
    data_collator=data_collator,
    train_dataset=unlabeled_dataset,
)
print("Starting MLM pre-training...")
trainer.train()
# This directory is the default `bert_model_name` that DebugCNNClassifier
# loads its encoder from in the next cell.
trainer.save_model("/content/fine_tuned_bert_mlm")



In [None]:

def load_labeled_data(file_path):
    """Read a labeled corpus with one ``<text> <integer label>`` example per line.

    The label is the last whitespace-separated field of the line; everything
    before it is the text. Blank lines, and lines that consist of a single
    field only (no text/label pair), are skipped.

    Args:
        file_path: Path of the UTF-8 text file to read.

    Returns:
        A ``(texts, labels)`` tuple of two parallel lists: the texts as
        strings and the labels as ints, in file order.

    Raises:
        ValueError: If the last field of a line is not an integer. The
            message names the file and the 1-based line number.
    """
    texts = []
    labels = []
    with open(file_path, 'r', encoding='utf-8') as f:
        for line_no, line in enumerate(f, start=1):
            line = line.strip()
            if not line:
                continue
            # sep=None splits on any run of whitespace, so a tab (or several
            # spaces) between text and label works, not only a single space.
            parts = line.rsplit(None, 1)
            if len(parts) != 2:
                continue
            text, label_str = parts
            try:
                label = int(label_str)
            except ValueError as exc:
                raise ValueError(
                    f"{file_path}:{line_no}: expected an integer label at the "
                    f"end of the line, got {label_str!r}"
                ) from exc
            # Append only after the label parsed, keeping both lists aligned.
            texts.append(text)
            labels.append(label)
    return texts, labels

# Load the labeled examples; they are split into train and test subsets below.
labeled_texts, labeled_labels = load_labeled_data('/content/processed_test.txt')
print("Labeled dataset size:", len(labeled_texts))


class SentimentDatasetIntegers(Dataset):
    """Map-style dataset pairing raw texts with integer class labels.

    Texts are tokenized lazily, one item at a time, when indexed.

    Args:
        texts: Sequence of input strings.
        labels: Sequence of integer labels, parallel to ``texts``.
        tokenizer: Callable used to encode a text; it is invoked with
            ``truncation``, ``padding``, ``max_length`` and ``return_tensors``
            keyword arguments and must return a mapping with ``"input_ids"``
            and ``"attention_mask"`` tensors.
        max_length: Length every encoded sequence is truncated/padded to.
    """

    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts, self.labels = texts, labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of examples (the number of texts)."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Encode example ``idx``.

        Returns:
            A dict with ``"input_ids"`` and ``"attention_mask"`` (leading
            dimension of the tokenizer output squeezed away) and ``"labels"``
            (a scalar ``torch.long`` tensor).
        """
        target = torch.tensor(self.labels[idx], dtype=torch.long)
        encoded = self.tokenizer(
            self.texts[idx],
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
            return_tensors="pt",
        )
        # The tokenizer output carries a leading dimension of size 1 for the
        # single text; drop it so the DataLoader can stack items itself.
        sample = {field: encoded[field].squeeze(0) for field in ("input_ids", "attention_mask")}
        sample["labels"] = target
        return sample

# Wrap the labeled examples and hold out 20% of them for testing.
full_dataset = SentimentDatasetIntegers(labeled_texts, labeled_labels, tokenizer, max_length=128)
train_size = int(0.8 * len(full_dataset))
test_size = len(full_dataset) - train_size
# Use a dedicated, explicitly seeded generator for the split. Without it,
# random_split draws from the global torch RNG, whose state at this point
# depends on whatever ran earlier in the kernel (e.g. how much MLM training),
# so the train/test partition would change from run to run.
train_dataset, test_dataset = random_split(
    full_dataset,
    [train_size, test_size],
    generator=torch.Generator().manual_seed(42),
)
print("Train size:", len(train_dataset), "Test size:", len(test_dataset))

# Shuffle only the training batches; keep evaluation order fixed.
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)


class DebugCNNClassifier(nn.Module):
    """BERT encoder followed by a multi-width 1-D CNN head for classification.

    The encoder's per-token hidden states are fed through three parallel
    convolutions (kernel sizes 3, 4 and 5), the feature maps are concatenated,
    layer-normalized per position, max-pooled over the sequence, passed
    through dropout and projected to ``num_labels`` logits.

    Padded positions (``attention_mask == 0``) are excluded: their hidden
    states are zeroed before the convolutions and they are masked out of the
    max-pooling, so the logits do not depend on how much padding a sequence
    carries.

    Args:
        num_labels: Number of output classes.
        bert_model_name: Name or path handed to ``BertModel.from_pretrained``.
        cnn_out_channels: Output channels of each of the three convolutions.
        dropout: Dropout probability applied to the pooled features.
    """

    def __init__(self, num_labels=3, bert_model_name="/content/fine_tuned_bert_mlm",
                 cnn_out_channels=128, dropout=0.5):
        super().__init__()

        self.bert = BertModel.from_pretrained(bert_model_name)
        bert_hidden_size = self.bert.config.hidden_size

        # Paddings are chosen so every branch yields at least seq_len
        # positions; the even kernel (4) yields seq_len + 1 and is trimmed
        # in forward().
        self.conv3 = nn.Conv1d(in_channels=bert_hidden_size, out_channels=cnn_out_channels,
                               kernel_size=3, padding=1)
        self.conv4 = nn.Conv1d(in_channels=bert_hidden_size, out_channels=cnn_out_channels,
                               kernel_size=4, padding=2)
        self.conv5 = nn.Conv1d(in_channels=bert_hidden_size, out_channels=cnn_out_channels,
                               kernel_size=5, padding=2)

        self.layernorm = nn.LayerNorm(cnn_out_channels * 3)
        self.dropout = nn.Dropout(dropout)

        self.classifier = nn.Linear(cnn_out_channels * 3, num_labels)

    def forward(self, input_ids, attention_mask):
        """Compute class logits for a batch.

        Args:
            input_ids: Token ids, forwarded to the BERT encoder.
            attention_mask: Mask forwarded to the encoder; non-zero marks a
                real token. May be ``None``, in which case no position is
                treated as padding.

        Returns:
            Tensor of shape ``(batch, num_labels)`` with unnormalized logits.
        """
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        sequence_output = outputs.last_hidden_state

        keep = None
        if attention_mask is not None:
            keep = attention_mask.bool()
            # A row without any real token would leave nothing to pool over;
            # treat all of its positions as valid instead of emitting the
            # fill value for every feature.
            keep = keep | ~keep.any(dim=1, keepdim=True)
            # Zero padded hidden states so the convolutions see them exactly
            # like their own zero padding at the sequence boundary.
            sequence_output = sequence_output * keep.unsqueeze(-1).to(sequence_output.dtype)

        # Conv1d expects channels before the sequence axis.
        x = sequence_output.transpose(1, 2)
        seq_len = x.size(2)
        # Trim every branch to seq_len so the feature maps can be concatenated
        # (only the kernel-size-4 branch is actually longer).
        branches = [
            F.relu(conv(x))[:, :, :seq_len]
            for conv in (self.conv3, self.conv4, self.conv5)
        ]
        conv_cat = torch.cat(branches, dim=1)

        # Back to (batch, seq, channels) so LayerNorm normalizes the channels
        # of each position independently.
        conv_cat = conv_cat.transpose(1, 2)
        conv_cat = self.layernorm(conv_cat)

        if keep is not None:
            # Padded positions must never win the max below.
            conv_cat = conv_cat.masked_fill(~keep.unsqueeze(-1), torch.finfo(conv_cat.dtype).min)

        pooled, _ = torch.max(conv_cat, dim=1)
        pooled = self.dropout(pooled)
        logits = self.classifier(pooled)
        return logits

# Build the classifier with its default encoder path (the MLM-trained BERT
# saved in the previous cell) and move it to the selected device.
debug_cnn_model = DebugCNNClassifier(num_labels=3).to(device)
criterion_debug_cnn = nn.CrossEntropyLoss()
# parameters() includes the BERT encoder held by the model, so the encoder is
# updated together with the CNN head at this single learning rate.
optimizer_debug_cnn = torch.optim.Adam(debug_cnn_model.parameters(), lr=2e-5)
num_epochs_debug_cnn = 15


# Supervised training: one pass over train_loader per epoch, reporting the
# mean per-batch cross-entropy loss at the end of each epoch.
for epoch in range(num_epochs_debug_cnn):
    # Training mode (enables the model's dropout).
    debug_cnn_model.train()
    total_loss = 0
    for batch in tqdm(train_loader, desc=f"DebugCNN Epoch {epoch+1}/{num_epochs_debug_cnn}"):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)
        # Clear gradients left over from the previous step before backprop.
        optimizer_debug_cnn.zero_grad()
        logits = debug_cnn_model(input_ids, attention_mask)
        loss = criterion_debug_cnn(logits, labels)
        loss.backward()
        optimizer_debug_cnn.step()
        # .item() extracts a Python float so the running sum holds no graph.
        total_loss += loss.item()
    # Average over the number of batches (not the number of examples).
    avg_loss = total_loss / len(train_loader)
    print(f"DebugCNN Epoch {epoch+1}/{num_epochs_debug_cnn}, Loss: {avg_loss:.4f}")


# Evaluate on the held-out split: eval mode disables dropout, no_grad skips
# gradient bookkeeping.
debug_cnn_model.eval()
all_preds_debug_cnn = []
all_labels_debug_cnn = []
with torch.no_grad():
    for batch in test_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)
        logits = debug_cnn_model(input_ids, attention_mask)
        # Predicted class = index of the largest logit.
        preds = torch.argmax(logits, dim=1)
        all_preds_debug_cnn.extend(preds.cpu().numpy())
        all_labels_debug_cnn.extend(labels.cpu().numpy())

debug_cnn_acc = accuracy_score(all_labels_debug_cnn, all_preds_debug_cnn)
print("\nDebugCNNClassifier Test Accuracy:", debug_cnn_acc)
# Pin the label ids explicitly. Without `labels`, classification_report infers
# the classes from the data and raises ValueError when a class is missing from
# both the true labels and the predictions, because the number of inferred
# classes then no longer matches the three target_names.
print(classification_report(
    all_labels_debug_cnn,
    all_preds_debug_cnn,
    labels=[0, 1, 2],
    target_names=["Negative", "Neutral", "Positive"],
))
