In [None]:

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, random_split
import random
import numpy as np
from transformers import (
    BertTokenizer,
    BertForMaskedLM,
    BertForSequenceClassification,
    BertModel,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments
)
from datasets import Dataset as HFDataset
from sklearn.metrics import accuracy_score, classification_report
from tqdm import tqdm
import matplotlib.pyplot as plt


# Fix the seeds of the three RNG sources used in this notebook
# (Python's `random`, NumPy, and PyTorch) to the same constant.
random.seed(42)
np.random.seed(42)
torch.manual_seed(42)
# Use the GPU when PyTorch reports one as available, otherwise the CPU.
# Every model and batch below is moved to this device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)


def load_unlabeled_texts(file_path):
    """Read a UTF-8 text file and return its non-blank lines.

    Args:
        file_path: Path of the text file to read (one sentence per line).

    Returns:
        list[str]: Each line with surrounding whitespace stripped, in file
        order. Lines that are empty after stripping are left out.
    """
    sentences = []
    with open(file_path, 'r', encoding='utf-8') as handle:
        for raw_line in handle:
            cleaned = raw_line.strip()
            if cleaned:
                sentences.append(cleaned)
    return sentences

# Load the unlabeled corpus used for the adversarial fine-tuning stage.
all_texts = load_unlabeled_texts('/content/cleaned_train.txt')
print("Number of raw sentences:", len(all_texts))


# Wrap the sentences in a Hugging Face dataset with a single "text" column.
unlabeled_dataset = HFDataset.from_list([{"text": t} for t in all_texts])

# This tokenizer is a notebook-level global; the labeled-data cell reuses it.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
def tokenize_function(examples):
    """Tokenize the "text" entry of `examples` to fixed-length BERT inputs.

    Sequences are truncated and padded to exactly 128 tokens, so every
    example yields `input_ids` / `attention_mask` of the same length.
    """
    return tokenizer(examples["text"], truncation=True, padding="max_length", max_length=128)
# batched=True hands `tokenize_function` a batch of rows per call.
unlabeled_dataset = unlabeled_dataset.map(tokenize_function, batched=True)
# Only expose the two tensor columns the training loop reads.
unlabeled_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask'])


# Shuffled mini-batches of 32 sentences for the GAN loop.
unlabeled_dataloader = DataLoader(unlabeled_dataset, batch_size=32, shuffle=True)


# Generator: masked-language-model head on BERT; it proposes tokens for
# masked positions in the training loop.
generator = BertForMaskedLM.from_pretrained('bert-base-uncased').to(device)

# Discriminator: two-class sentence classifier. The training loop uses
# label 1 for original sentences and label 0 for generator-filled ones.
discriminator = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2).to(device)


# One Adam optimizer per network, both at learning rate 2e-5.
gen_optimizer = torch.optim.Adam(generator.parameters(), lr=2e-5)
disc_optimizer = torch.optim.Adam(discriminator.parameters(), lr=2e-5)


# Adversarial fine-tuning of the masked-LM generator.
#
# Per batch:
#   1. Mask roughly `mask_prob` of the real (non-padding, non-[CLS]/[SEP]) tokens.
#   2. Let the generator fill the masked positions.
#   3. Discriminator step: original sentences -> label 1, filled sentences -> label 0.
#   4. Generator step: push the discriminator towards label 1 on filled sentences.
#
# The filled-in token ids come from an argmax, which has no gradient. If the
# generator step fed those ids to the discriminator, backward() would never
# reach the generator and its weights would stay at their pretrained values.
# The generator step therefore feeds the discriminator *embeddings* built with
# a straight-through estimator: the forward value is the embedding of the
# argmax token, while the gradient flows through the softmax probabilities.
num_epochs_gan = 10
mask_prob = 0.15
# Word-embedding table of the discriminator, used to embed the generator's
# (differentiable) token choices in the discriminator's input space.
disc_word_embeddings = discriminator.get_input_embeddings()
for epoch in range(num_epochs_gan):
    generator.train()
    discriminator.train()
    total_gen_loss = 0
    total_disc_loss = 0
    for batch in tqdm(unlabeled_dataloader, desc=f"GAN Epoch {epoch+1}/{num_epochs_gan}"):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)

        # Pick positions to mask. Padding is excluded via attention_mask:
        # the discriminator does not attend to it, so masking it is wasted work.
        rand = torch.rand(input_ids.shape, device=device)
        mask_positions = (
            (rand < mask_prob)
            & attention_mask.bool()
            & (input_ids != tokenizer.cls_token_id)
            & (input_ids != tokenizer.sep_token_id)
        )
        has_masked = bool(mask_positions.any())

        masked_input_ids = input_ids.clone()
        masked_input_ids[mask_positions] = tokenizer.mask_token_id

        gen_outputs = generator(masked_input_ids, attention_mask=attention_mask)
        gen_logits = gen_outputs.logits

        # Hard (integer) fake sentences for the discriminator update.
        gen_tokens = input_ids.clone()
        if has_masked:
            masked_logits = gen_logits[mask_positions]        # (n_masked, vocab)
            predicted_ids = masked_logits.argmax(dim=-1)      # (n_masked,)
            gen_tokens[mask_positions] = predicted_ids

        real_labels = torch.ones(input_ids.size(0), dtype=torch.long, device=device)
        fake_labels = torch.zeros(input_ids.size(0), dtype=torch.long, device=device)

        # ---- Discriminator step -------------------------------------------
        # zero_grad() also discards the discriminator gradients that the
        # previous generator step accumulated as a by-product.
        disc_optimizer.zero_grad()
        disc_real_outputs = discriminator(input_ids=input_ids, attention_mask=attention_mask)
        loss_real = F.cross_entropy(disc_real_outputs.logits, real_labels)

        disc_fake_outputs = discriminator(input_ids=gen_tokens, attention_mask=attention_mask)
        loss_fake = F.cross_entropy(disc_fake_outputs.logits, fake_labels)

        disc_loss = (loss_real + loss_fake) / 2
        # gen_tokens are plain integers, so this graph contains only the
        # discriminator; nothing has to be retained for the generator step.
        disc_loss.backward()
        disc_optimizer.step()

        total_disc_loss += disc_loss.item()

        # ---- Generator step -----------------------------------------------
        fake_embeds = disc_word_embeddings(input_ids)          # (batch, seq, hidden)
        if has_masked:
            soft_choice = F.softmax(masked_logits, dim=-1)
            hard_choice = F.one_hot(predicted_ids, num_classes=soft_choice.size(-1)).to(soft_choice.dtype)
            # Straight-through: value of `hard_choice`, gradient of `soft_choice`.
            st_choice = hard_choice + soft_choice - soft_choice.detach()
            fake_embeds = fake_embeds.clone()
            fake_embeds[mask_positions] = st_choice @ disc_word_embeddings.weight

        disc_fake_outputs_for_gen = discriminator(inputs_embeds=fake_embeds, attention_mask=attention_mask)
        gen_adv_loss = F.cross_entropy(disc_fake_outputs_for_gen.logits, real_labels)

        gen_optimizer.zero_grad()
        gen_adv_loss.backward()
        gen_optimizer.step()

        total_gen_loss += gen_adv_loss.item()

    avg_gen_loss = total_gen_loss / len(unlabeled_dataloader)
    avg_disc_loss = total_disc_loss / len(unlabeled_dataloader)
    print(f"Epoch {epoch+1}/{num_epochs_gan} - Generator Loss: {avg_gen_loss:.4f}, Discriminator Loss: {avg_disc_loss:.4f}")


# Persist the adversarially fine-tuned generator; the classifier cell loads
# its BERT encoder from this directory.
generator.save_pretrained("/content/fine_tuned_bert_gan")



In [None]:


def load_labeled_data(file_path):
    """Load ``<text> <integer label>`` lines from a UTF-8 text file.

    The label is the last whitespace-separated token of each line, so the
    separator may be a space, a tab, or a run of whitespace. Blank lines are
    ignored. Lines that have no separate trailing token, or whose trailing
    token is not an integer, are skipped rather than aborting the whole load;
    a single warning reports how many lines were skipped.

    Args:
        file_path: Path of the labeled text file.

    Returns:
        tuple[list[str], list[int]]: Texts and their labels, index-aligned
        and in file order.
    """
    import warnings

    texts = []
    labels = []
    skipped = 0
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            # sep=None splits on any whitespace run, so tab-separated labels
            # work and the text carries no trailing whitespace.
            parts = line.rsplit(None, 1)
            if len(parts) != 2:
                skipped += 1
                continue
            text, label_str = parts
            try:
                label = int(label_str)
            except ValueError:
                # The last word is part of the sentence, not a label.
                skipped += 1
                continue
            texts.append(text)
            labels.append(label)
    if skipped:
        warnings.warn(
            f"load_labeled_data: skipped {skipped} line(s) without a trailing "
            f"integer label in {file_path!r}"
        )
    return texts, labels

# Read the labeled corpus; the two lists are index-aligned (text i <-> label i).
labeled_texts, labeled_labels = load_labeled_data('/content/processed_test.txt')
print("Labeled dataset size:", len(labeled_texts))


class SentimentDatasetIntegers(Dataset):
    """Map-style dataset pairing raw texts with integer class labels.

    Nothing is tokenized up front: each ``__getitem__`` call tokenizes the
    requested text with the tokenizer supplied at construction time.
    """

    def __init__(self, texts, labels, tokenizer, max_length=128):
        """Keep references to the data and the tokenization settings.

        Args:
            texts: Indexable collection of input strings.
            labels: Indexable collection of integer labels, aligned with ``texts``.
            tokenizer: Callable invoked as
                ``tokenizer(text, truncation=True, padding="max_length",
                max_length=..., return_tensors="pt")``.
            max_length: Length every example is truncated/padded to.
        """
        self.texts, self.labels = texts, labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Number of examples, taken from ``texts``."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return one example as a dict of tensors.

        Keys: ``input_ids`` and ``attention_mask`` (the tokenizer output with
        its leading batch axis squeezed away) and ``labels`` (a ``torch.long``
        scalar tensor).
        """
        sample_text = self.texts[idx]
        target = torch.tensor(self.labels[idx], dtype=torch.long)
        encoded = self.tokenizer(
            sample_text,
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
            return_tensors="pt",
        )
        return {
            "input_ids": encoded["input_ids"].squeeze(0),
            "attention_mask": encoded["attention_mask"].squeeze(0),
            "labels": target,
        }

# Build the labeled dataset and hold out 20% of it for evaluation.
full_dataset = SentimentDatasetIntegers(labeled_texts, labeled_labels, tokenizer, max_length=128)
train_size = int(0.8 * len(full_dataset))
test_size = len(full_dataset) - train_size
# Use a dedicated, seeded generator for the split. Without it the split is
# drawn from the global RNG, so the held-out set would depend on how much
# randomness earlier cells consumed and would change each time this cell is
# re-run, making reported metrics incomparable between runs.
train_dataset, test_dataset = random_split(
    full_dataset,
    [train_size, test_size],
    generator=torch.Generator().manual_seed(42),
)
print("Train size:", len(train_dataset), "Test size:", len(test_dataset))

# Training batches are reshuffled every epoch; evaluation order is fixed.
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)


class DebugCNNBiLSTMHighwayAttentionClassifier(nn.Module):
    """Sequence classifier stacked on top of a BERT encoder.

    Pipeline (see ``forward``): BERT token states -> three parallel Conv1d
    branches (kernel sizes 3, 4, 5) concatenated on the channel axis ->
    LayerNorm -> bidirectional LSTM -> dropout -> highway layer ->
    attention pooling over the sequence axis -> linear classifier.

    Args:
        num_labels: Number of output classes (size of the logits' last axis).
        bert_model_name: Name or path handed to ``BertModel.from_pretrained``.
        cnn_out_channels: Output channels of each convolution branch.
        lstm_hidden: Hidden size of the LSTM, per direction.
        lstm_layers: Number of stacked LSTM layers.
        dropout: Dropout probability applied to the LSTM output (and between
            LSTM layers when ``lstm_layers > 1``).
    """

    def __init__(self, num_labels=3, bert_model_name="/content/fine_tuned_bert_gan",
                 cnn_out_channels=128, lstm_hidden=256, lstm_layers=1, dropout=0.5):
        super().__init__()

        # Only token-level states are consumed, so the pooling head is not built.
        self.bert = BertModel.from_pretrained(bert_model_name, add_pooling_layer=False)
        bert_hidden_size = self.bert.config.hidden_size

        # Three n-gram detectors over the token axis. Kernel sizes 3 and 5
        # keep the sequence length with their padding; kernel size 4 with
        # padding=2 produces one extra step, which `forward` trims.
        self.conv3 = nn.Conv1d(in_channels=bert_hidden_size, out_channels=cnn_out_channels,
                               kernel_size=3, padding=1)
        self.conv4 = nn.Conv1d(in_channels=bert_hidden_size, out_channels=cnn_out_channels,
                               kernel_size=4, padding=2)
        self.conv5 = nn.Conv1d(in_channels=bert_hidden_size, out_channels=cnn_out_channels,
                               kernel_size=5, padding=2)
        self.layernorm = nn.LayerNorm(cnn_out_channels * 3)

        # nn.LSTM only applies inter-layer dropout when there is more than one
        # layer, so pass 0.0 for a single layer.
        self.lstm = nn.LSTM(input_size=cnn_out_channels * 3, hidden_size=lstm_hidden,
                            num_layers=lstm_layers, batch_first=True, bidirectional=True,
                            dropout=dropout if lstm_layers > 1 else 0.0)
        self.dropout = nn.Dropout(dropout)

        # Highway layer over the concatenated forward/backward LSTM states.
        self.highway_linear = nn.Linear(2 * lstm_hidden, 2 * lstm_hidden)
        self.highway_gate = nn.Linear(2 * lstm_hidden, 2 * lstm_hidden)

        # One scalar attention score per time step.
        self.attention_fc = nn.Linear(2 * lstm_hidden, 1)

        self.classifier = nn.Linear(2 * lstm_hidden, num_labels)

    def highway(self, x):
        """Gated mix of a ReLU transform of ``x`` and ``x`` itself.

        Returns ``H * T + x * (1 - T)`` where ``H = relu(linear(x))`` and
        ``T = sigmoid(gate(x))``; the output has the same shape as ``x``.
        """
        H = F.relu(self.highway_linear(x))
        T = torch.sigmoid(self.highway_gate(x))
        return H * T + x * (1 - T)

    def attention_pooling(self, x, attention_mask=None):
        """Collapse the sequence axis (dim 1) with learned attention weights.

        Args:
            x: Tensor of shape ``(batch, seq_len, features)``.
            attention_mask: Optional ``(batch, seq_len)`` tensor; positions
                whose value is 0/False are excluded from the softmax so they
                receive zero weight. ``None`` attends to every position.

        Returns:
            Tensor of shape ``(batch, features)``.
        """
        scores = self.attention_fc(x)  # (batch, seq_len, 1)
        if attention_mask is not None:
            keep = attention_mask.to(torch.bool).unsqueeze(-1)
            # The most negative finite value (rather than -inf) keeps a fully
            # masked row from producing NaNs in the softmax.
            scores = scores.masked_fill(~keep, torch.finfo(scores.dtype).min)
        weights = F.softmax(scores, dim=1)
        pooled = torch.sum(weights * x, dim=1)
        return pooled

    def forward(self, input_ids, attention_mask):
        """Compute class logits of shape ``(batch, num_labels)``.

        Args:
            input_ids: Token ids, passed to the BERT encoder.
            attention_mask: Mask passed to the BERT encoder and also used to
                keep masked-out positions from receiving attention-pooling
                weight.
        """
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        sequence_output = outputs.last_hidden_state
        # Conv1d convolves over the last axis: (batch, seq, hidden) -> (batch, hidden, seq).
        x = sequence_output.transpose(1, 2)
        conv3 = F.relu(self.conv3(x))
        conv4 = F.relu(self.conv4(x))
        conv5 = F.relu(self.conv5(x))
        # Trim the extra step produced by the even-sized kernel so all three
        # branches line up for concatenation.
        expected_seq_len = x.size(2)
        if conv4.size(2) > expected_seq_len:
            conv4 = conv4[:, :, :expected_seq_len]
        conv_cat = torch.cat((conv3, conv4, conv5), dim=1)
        conv_cat = conv_cat.transpose(1, 2)  # back to (batch, seq, channels)
        conv_cat = self.layernorm(conv_cat)
        lstm_out, _ = self.lstm(conv_cat)
        lstm_out = self.dropout(lstm_out)
        highway_out = self.highway(lstm_out)
        pooled = self.attention_pooling(highway_out, attention_mask)
        logits = self.classifier(pooled)
        return logits


# Instantiate the 3-class classifier (its BERT encoder is loaded from the
# class's default `bert_model_name` path) and move it to the chosen device.
debug_cnn_bilstm_highway_attention_model = DebugCNNBiLSTMHighwayAttentionClassifier(num_labels=3).to(device)
criterion_debug_attention = nn.CrossEntropyLoss()
# A single Adam optimizer updates every parameter, BERT encoder included,
# at the same learning rate.
optimizer_debug_attention = torch.optim.Adam(debug_cnn_bilstm_highway_attention_model.parameters(), lr=2e-5)
num_epochs_debug_attention = 3

# Per-epoch mean losses; filled in by the training loop.
train_losses_debug = []
val_losses_debug = []


# Train for `num_epochs_debug_attention` epochs; after each epoch, measure the
# mean loss on `test_loader` with gradients disabled.
# NOTE(review): the "validation" loss is computed on `test_loader`, the same
# loader the final evaluation uses - there is no separate validation split.
for epoch in range(num_epochs_debug_attention):
    # ---- training pass (dropout active) ----
    debug_cnn_bilstm_highway_attention_model.train()
    total_loss = 0
    for batch in tqdm(train_loader, desc=f"Attention Epoch {epoch+1}/{num_epochs_debug_attention}"):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)
        optimizer_debug_attention.zero_grad()
        logits = debug_cnn_bilstm_highway_attention_model(input_ids, attention_mask)
        loss = criterion_debug_attention(logits, labels)
        loss.backward()
        optimizer_debug_attention.step()
        total_loss += loss.item()
    # Mean of the per-batch losses (not weighted by batch size).
    avg_loss = total_loss / len(train_loader)
    train_losses_debug.append(avg_loss)


    # ---- evaluation pass (eval mode, no gradient tracking) ----
    debug_cnn_bilstm_highway_attention_model.eval()
    total_val_loss = 0
    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)
            logits = debug_cnn_bilstm_highway_attention_model(input_ids, attention_mask)
            loss = criterion_debug_attention(logits, labels)
            total_val_loss += loss.item()
    avg_val_loss = total_val_loss / len(test_loader)
    val_losses_debug.append(avg_val_loss)

    print(f"Attention Epoch {epoch+1}/{num_epochs_debug_attention}, Train Loss: {avg_loss:.4f}, Val Loss: {avg_val_loss:.4f}")

# Dump the loss history as CSV: a header row, then one row per epoch with
# 1-based epoch number and both losses rounded to four decimals.
with open("losses_debug.txt", "w") as f:
    f.write("Epoch,Train_Loss,Val_Loss\n")
    f.writelines(
        f"{i+1},{train_losses_debug[i]:.4f},{val_losses_debug[i]:.4f}\n"
        for i in range(num_epochs_debug_attention)
    )
print("Losses saved to losses_debug.txt")

# Draw both loss curves against the 1-based epoch number on one figure.
plt.figure(figsize=(8, 6))
epoch_ticks = range(1, num_epochs_debug_attention+1)
for curve_values, curve_marker, curve_label in (
    (train_losses_debug, 'o', 'Train Loss'),
    (val_losses_debug, 's', 'Validation Loss'),
):
    plt.plot(epoch_ticks, curve_values, marker=curve_marker, label=curve_label)
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.title("Training and Validation Loss Curve")
plt.legend()
plt.show()


# Final evaluation on `test_loader`: collect arg-max predictions and the true
# labels, then report accuracy and per-class precision/recall/F1.
debug_cnn_bilstm_highway_attention_model.eval()
all_preds_debug_attention = []
all_labels_debug_attention = []
with torch.no_grad():
    for batch in test_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)
        logits = debug_cnn_bilstm_highway_attention_model(input_ids, attention_mask)
        preds = torch.argmax(logits, dim=1)
        all_preds_debug_attention.extend(preds.cpu().numpy())
        all_labels_debug_attention.extend(labels.cpu().numpy())

debug_attention_acc = accuracy_score(all_labels_debug_attention, all_preds_debug_attention)
print("\nDebugCNNBiLSTMHighwayAttentionClassifier Test Accuracy:", debug_attention_acc)
# Pass the class ids explicitly. Otherwise classification_report infers the
# classes from the data and raises ValueError whenever one of the three
# classes is absent from both the true and predicted labels, because the
# number of inferred classes no longer matches `target_names`.
print(classification_report(
    all_labels_debug_attention,
    all_preds_debug_attention,
    labels=[0, 1, 2],
    target_names=["Negative", "Neutral", "Positive"],
))