In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from nltk.corpus import brown
import nltk

nltk.download('brown')

class ELMo(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_layers=2):
        super(ELMo, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.bilstm_layers = nn.ModuleList([
            nn.LSTM(
                embedding_dim if i == 0 else hidden_dim * 2,
                hidden_dim,
                num_layers=1,
                bidirectional=True,
                batch_first=True
            )
            for i in range(num_layers)
        ])
        self.linear = nn.Linear(hidden_dim * 2, vocab_size)

    def forward(self, x):
        embeds = self.embedding(x)
        layer_outputs = [embeds]
        for i, lstm in enumerate(self.bilstm_layers):
            out, _ = lstm(layer_outputs[-1])
            layer_outputs.append(out)
        final_output = self.linear(out)
        return final_output, layer_outputs[1:]

class BrownDataset(Dataset):
    def __init__(self):
        self.sentences = brown.sents()
        self.vocab = set(brown.words())
        self.word2idx = {word: idx + 2 for idx, word in enumerate(self.vocab)}
        self.word2idx['<UNK>'] = 0
        self.word2idx['<PAD>'] = 1
        self.idx2word = {idx: word for word, idx in self.word2idx.items()}

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, idx):
        sentence = self.sentences[idx]
        input_ids = [self.word2idx.get(word, 0) for word in sentence]
        return torch.tensor(input_ids)

def collate_fn(batch):
    max_len = max(len(seq) for seq in batch)
    padded_batch = torch.zeros(len(batch), max_len, dtype=torch.long)
    for i, seq in enumerate(batch):
        padded_batch[i, :len(seq)] = seq
        padded_batch[i, len(seq):] = 1
    return padded_batch

def train_elmo(model, dataloader, epochs=10, device='cuda' if torch.cuda.is_available() else 'cpu'):
    model = model.to(device)
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    criterion = nn.CrossEntropyLoss(ignore_index=1)

    model.train()
    for epoch in range(epochs):
        total_loss = 0
        for batch in dataloader:
            batch = batch.to(device)
            optimizer.zero_grad()
            if batch.size(1) < 2:
                continue
            input_seq = batch[:, :-1]
            target_seq = batch[:, 1:]
            output, _ = model(input_seq)
            loss = criterion(output.view(-1, len(dataset.vocab) + 2), target_seq.reshape(-1))
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print(f"Epoch {epoch+1}, Loss: {total_loss / len(dataloader):.4f}")
    torch.save(model.state_dict(), "bilstm.pt")
    print("Model saved as bilstm.pt")

if __name__ == "__main__":
    dataset = BrownDataset()
    with open("vocab.txt", "w") as f:
        f.write("\n".join(dataset.word2idx.keys()))
    print("Vocabulary saved as vocab.txt")

    dataloader = DataLoader(dataset, batch_size=32, shuffle=True, num_workers=2, collate_fn=collate_fn)
    vocab_size = len(dataset.vocab) + 2
    embedding_dim = 300
    hidden_dim = 256
    model = ELMo(vocab_size, embedding_dim, hidden_dim)
    train_elmo(model, dataloader, epochs=10)

[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Unzipping corpora/brown.zip.


Vocabulary saved as vocab.txt
Epoch 1, Loss: 2.8427
Epoch 2, Loss: 0.6321
Epoch 3, Loss: 0.1237
Epoch 4, Loss: 0.0085
Epoch 5, Loss: 0.0011
Epoch 6, Loss: 0.0005
Epoch 7, Loss: 0.0004
Epoch 8, Loss: 0.0019
Epoch 9, Loss: 0.0002
Epoch 10, Loss: 0.0004
Model saved as bilstm.pt


In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import pandas as pd

class ELMo(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_layers=2):
        super(ELMo, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.bilstm_layers = nn.ModuleList([
            nn.LSTM(
                embedding_dim if i == 0 else hidden_dim * 2,
                hidden_dim,
                num_layers=1,
                bidirectional=True,
                batch_first=True
            )
            for i in range(num_layers)
        ])
        self.linear = nn.Linear(hidden_dim * 2, vocab_size)

    def forward(self, x):
        embeds = self.embedding(x)
        layer_outputs = [embeds]
        for i, lstm in enumerate(self.bilstm_layers):
            out, _ = lstm(layer_outputs[-1])
            layer_outputs.append(out)
        final_output = self.linear(out)
        return final_output, layer_outputs[1:]

class NewsClassifier(nn.Module):
    def __init__(self, elmo_model, hidden_dim, num_classes=4):
        super(NewsClassifier, self).__init__()
        self.elmo = elmo_model
        for param in self.elmo.parameters():
            param.requires_grad = False
        self.gru = nn.GRU(hidden_dim * 2, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, num_classes)
        self.lambda_weights = nn.Parameter(torch.ones(2) / 2)

    def forward(self, x):
        _, elmo_embeds = self.elmo(x)
        weighted_embeds = sum(w * e for w, e in zip(self.lambda_weights, elmo_embeds))
        out, _ = self.gru(weighted_embeds)
        out = self.fc(out[:, -1, :])
        return out

class AGNewsDataset(Dataset):
    def __init__(self, csv_file, word2idx):
        self.df = pd.read_csv(csv_file)
        self.texts = self.df["Description"].tolist()
        self.labels = self.df["Class Index"].tolist()
        self.word2idx = word2idx

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        words = self.texts[idx].split()
        input_ids = [self.word2idx.get(word.lower(), 0) for word in words]
        return torch.tensor(input_ids), self.labels[idx] - 1  # Adjust to 0-3

def collate_fn(batch):
    texts, labels = zip(*batch)
    max_len = max(len(seq) for seq in texts)
    padded_texts = torch.zeros(len(texts), max_len, dtype=torch.long)
    for i, seq in enumerate(texts):
        padded_texts[i, :len(seq)] = seq
        padded_texts[i, len(seq):] = 1
    return padded_texts, torch.tensor(labels)

def train_classifier(classifier, dataloader, epochs=5, device='cuda' if torch.cuda.is_available() else 'cpu'):
    classifier = classifier.to(device)
    optimizer = optim.Adam(classifier.parameters(), lr=0.001)
    criterion = nn.CrossEntropyLoss()

    classifier.train()
    for epoch in range(epochs):
        total_loss = 0
        for texts, labels in dataloader:
            texts, labels = texts.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = classifier(texts)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print(f"Epoch {epoch+1}, Loss: {total_loss / len(dataloader):.4f}")
    torch.save(classifier.state_dict(), "classifier.pt")
    print("Classifier saved as classifier.pt")

if __name__ == "__main__":
    word2idx = {'<UNK>': 0, '<PAD>': 1}
    with open("vocab.txt", "r") as f:
        for idx, word in enumerate(f.read().splitlines()):
            word2idx[word] = idx
    vocab_size = len(word2idx)

    csv_file = "train.csv"
    dataset = AGNewsDataset(csv_file, word2idx)
    dataloader = DataLoader(dataset, batch_size=32, shuffle=True, num_workers=2, collate_fn=collate_fn)

    embedding_dim = 300
    hidden_dim = 256
    elmo = ELMo(vocab_size, embedding_dim, hidden_dim)
    elmo.load_state_dict(torch.load("bilstm.pt"))

    classifier = NewsClassifier(elmo, hidden_dim)
    train_classifier(classifier, dataloader, epochs=5)

Epoch 1, Loss: 0.8008
Epoch 2, Loss: 0.4651
Epoch 3, Loss: 0.3862
Epoch 4, Loss: 0.3293
Epoch 5, Loss: 0.2840
Classifier saved as classifier.pt


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import pandas as pd
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix

class ELMo(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_layers=2):
        super(ELMo, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.bilstm_layers = nn.ModuleList([
            nn.LSTM(
                embedding_dim if i == 0 else hidden_dim * 2,
                hidden_dim,
                num_layers=1,
                bidirectional=True,
                batch_first=True
            )
            for i in range(num_layers)
        ])
        self.linear = nn.Linear(hidden_dim * 2, vocab_size)

    def forward(self, x):
        embeds = self.embedding(x)
        layer_outputs = [embeds]
        for i, lstm in enumerate(self.bilstm_layers):
            out, _ = lstm(layer_outputs[-1])
            layer_outputs.append(out)
        final_output = self.linear(out)
        return final_output, layer_outputs[1:]

class NewsClassifier(nn.Module):
    def __init__(self, elmo_model, hidden_dim, num_classes=4, mode="trainable"):
        super(NewsClassifier, self).__init__()
        self.elmo = elmo_model
        for param in self.elmo.parameters():
            param.requires_grad = False
        self.gru = nn.GRU(hidden_dim * 2, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, num_classes)
        self.mode = mode

        if mode == "trainable":
            self.lambda_weights = nn.Parameter(torch.ones(2) / 2)
        elif mode == "frozen":
            self.lambda_weights = torch.ones(2) / 2
        elif mode == "learnable":
            self.combiner = nn.Sequential(
                nn.Linear(hidden_dim * 2 * 2, hidden_dim),
                nn.ReLU(),
                nn.Linear(hidden_dim, hidden_dim * 2)
            )

    def forward(self, x):
        _, elmo_embeds = self.elmo(x)
        if self.mode in ["trainable", "frozen"]:
            weighted_embeds = sum(w * e for w, e in zip(self.lambda_weights, elmo_embeds))
            out, _ = self.gru(weighted_embeds)
        elif self.mode == "learnable":
            combined = torch.cat(elmo_embeds, dim=-1)
            combined = self.combiner(combined)
            out, _ = self.gru(combined)
        out = self.fc(out[:, -1, :])
        return out

class AGNewsDataset(Dataset):
    def __init__(self, csv_file, word2idx):
        self.df = pd.read_csv(csv_file)
        self.texts = self.df["Description"].tolist()
        self.labels = self.df["Class Index"].tolist()
        self.word2idx = word2idx

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        words = self.texts[idx].split()
        input_ids = [self.word2idx.get(word.lower(), 0) for word in words]
        return torch.tensor(input_ids), self.labels[idx] - 1

def collate_fn(batch):
    texts, labels = zip(*batch)
    max_len = max(len(seq) for seq in texts)
    padded_texts = torch.zeros(len(texts), max_len, dtype=torch.long)
    for i, seq in enumerate(texts):
        padded_texts[i, :len(seq)] = seq
        padded_texts[i, len(seq):] = 1
    return padded_texts, torch.tensor(labels)

def train_classifier(classifier, dataloader, epochs=5, device='cuda' if torch.cuda.is_available() else 'cpu'):
    classifier = classifier.to(device)
    optimizer = optim.Adam(classifier.parameters(), lr=0.001)
    criterion = nn.CrossEntropyLoss()

    classifier.train()
    for epoch in range(epochs):
        total_loss = 0
        for texts, labels in dataloader:
            texts, labels = texts.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = classifier(texts)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print(f"Epoch {epoch+1}, Loss: {total_loss / len(dataloader):.4f}")
    torch.save(classifier.state_dict(), f"classifier_{classifier.mode}.pt")

def evaluate_classifier(classifier, dataloader, device='cuda' if torch.cuda.is_available() else 'cpu'):
    classifier = classifier.to(device)
    classifier.eval()
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for texts, labels in dataloader:
            texts, labels = texts.to(device), labels.to(device)
            outputs = classifier(texts)
            preds = torch.argmax(outputs, dim=1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    accuracy = accuracy_score(all_labels, all_preds)
    f1 = f1_score(all_labels, all_preds, average='weighted')
    precision = precision_score(all_labels, all_preds, average='weighted')
    recall = recall_score(all_labels, all_preds, average='weighted')
    cm = confusion_matrix(all_labels, all_preds)

    print(f"Mode: {classifier.mode}")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"Confusion Matrix:\n{cm}\n")
    return accuracy, f1, precision, recall, cm

if __name__ == "__main__":
    word2idx = {'<UNK>': 0, '<PAD>': 1}
    with open("vocab.txt", "r") as f:
        for idx, word in enumerate(f.read().splitlines()):
            word2idx[word] = idx
    vocab_size = len(word2idx)

    train_csv = "/content/train.csv"
    test_csv = "/content/test.csv"
    train_dataset = AGNewsDataset(train_csv, word2idx)
    test_dataset = AGNewsDataset(test_csv, word2idx)
    train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True, num_workers=2, collate_fn=collate_fn)
    test_dataloader = DataLoader(test_dataset, batch_size=32, shuffle=False, num_workers=2, collate_fn=collate_fn)

    embedding_dim = 300
    hidden_dim = 256
    elmo = ELMo(vocab_size, embedding_dim, hidden_dim)
    elmo.load_state_dict(torch.load("bilstm.pt"))

    for mode in ["trainable", "frozen", "learnable"]:
        print(f"\nTraining with {mode} mode")
        classifier = NewsClassifier(elmo, hidden_dim, mode=mode)
        train_classifier(classifier, train_dataloader, epochs=5)
        evaluate_classifier(classifier, test_dataloader)


Training with trainable mode
Epoch 1, Loss: 0.7871


In [None]:
!python3 inference.py classifier_trainable.pt "Stocks rose today after positive earnings reports"

class-1 0.0
class-2 0.0
class-3 1.0
class-4 0.0


In [None]:
!python inference.py classifier_trainable.pt "War broke out in the Middle East"
!python inference.py classifier_trainable.pt "New smartphone released with AI features"
!python inference.py classifier_trainable.pt "Team wins championship title"

class-1 0.9
class-2 0.0
class-3 0.0
class-4 0.1
class-1 0.2
class-2 0.0
class-3 0.2
class-4 0.6
class-1 0.2
class-2 0.7
class-3 0.0
class-4 0.1


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import pandas as pd
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix

class AGNewsDataset(Dataset):
    def __init__(self, csv_file, word2idx):
        self.df = pd.read_csv(csv_file)
        self.texts = self.df["Description"].tolist()
        self.labels = self.df["Class Index"].tolist()
        self.word2idx = word2idx

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        words = self.texts[idx].split()
        input_ids = [self.word2idx.get(word.lower(), 0) for word in words]
        return torch.tensor(input_ids), self.labels[idx] - 1

def collate_fn(batch):
    texts, labels = zip(*batch)
    max_len = max(len(seq) for seq in texts)
    padded_texts = torch.zeros(len(texts), max_len, dtype=torch.long)
    for i, seq in enumerate(texts):
        padded_texts[i, :len(seq)] = seq
        padded_texts[i, len(seq):] = 1
    return padded_texts, torch.tensor(labels)

class StaticClassifier(nn.Module):
    def __init__(self, embedding_matrix, hidden_dim, num_classes=4):
        super(StaticClassifier, self).__init__()
        vocab_size, embedding_dim = embedding_matrix.shape
        self.embedding = nn.Embedding.from_pretrained(embedding_matrix, freeze=True)
        self.gru = nn.GRU(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, num_classes)

    def forward(self, x):
        embeds = self.embedding(x)
        out, _ = self.gru(embeds)
        out = self.fc(out[:, -1, :])
        return out

def train_and_evaluate(model, train_loader, test_loader, emb_name, epochs=5, device='cuda' if torch.cuda.is_available() else 'cpu'):
    model = model.to(device)
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    criterion = nn.CrossEntropyLoss()

    model.train()
    for epoch in range(epochs):
        total_loss = 0
        for texts, labels in train_loader:
            texts, labels = texts.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = model(texts)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print(f"{emb_name} - Epoch {epoch+1}, Loss: {total_loss / len(train_loader):.4f}")

    model.eval()
    all_preds, all_labels = [], []
    with torch.no_grad():
        for texts, labels in test_loader:
            texts, labels = texts.to(device), labels.to(device)
            outputs = model(texts)
            preds = torch.argmax(outputs, dim=1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    accuracy = accuracy_score(all_labels, all_preds)
    f1 = f1_score(all_labels, all_preds, average='weighted')
    precision = precision_score(all_labels, all_preds, average='weighted')
    recall = recall_score(all_labels, all_preds, average='weighted')
    cm = confusion_matrix(all_labels, all_preds)
    print(f"{emb_name} - Accuracy: {accuracy:.4f}, F1: {f1:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}")
    print(f"{emb_name} - Confusion Matrix:\n{cm}\n")
    return accuracy, f1, precision, recall, cm

if __name__ == "__main__":
    word2idx = {'<UNK>': 0, '<PAD>': 1}
    with open("vocab.txt", "r") as f:
        for idx, word in enumerate(f.read().splitlines()):
            word2idx[word] = idx
    vocab_size = len(word2idx)

    train_csv = "/content/train.csv"
    test_csv = "/content/test.csv"
    train_dataset = AGNewsDataset(train_csv, word2idx)
    test_dataset = AGNewsDataset(test_csv, word2idx)
    train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, num_workers=2, collate_fn=collate_fn)
    test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False, num_workers=2, collate_fn=collate_fn)

    hidden_dim = 256
    embeddings = {}
    for name in ["svd", "skipgram", "cbow"]:
        emb_dict = torch.load(f"{name}.pt", weights_only=False)

        embedding_dim = len(next(iter(emb_dict.values())))
        emb_matrix = torch.zeros((vocab_size, embedding_dim))

        for word, idx in word2idx.items():
            if word in emb_dict:
                emb_matrix[idx] = torch.tensor(emb_dict[word])
            else:
                emb_matrix[idx] = torch.randn(embedding_dim) * 0.01

        embeddings[name] = emb_matrix

    results = {}
    for emb_name, emb_matrix in embeddings.items():
        print(f"\nTraining with {emb_name} embeddings")
        if emb_matrix.shape[0] != vocab_size:
            print(f"Warning: {emb_name} vocab size mismatch. Expected {vocab_size}, got {emb_matrix.shape[0]}. Padding/truncating.")
            emb_matrix = torch.nn.functional.pad(emb_matrix, (0, 0, 0, max(0, vocab_size - emb_matrix.shape[0])))[:vocab_size]
        classifier = StaticClassifier(emb_matrix, hidden_dim)
        results[emb_name] = train_and_evaluate(classifier, train_loader, test_loader, emb_name)

  emb_matrix[idx] = torch.tensor(emb_dict[word])



Training with svd embeddings
svd - Epoch 1, Loss: 1.2823
svd - Epoch 2, Loss: 0.8985
svd - Epoch 3, Loss: 0.7613
svd - Epoch 4, Loss: 0.6778
svd - Epoch 5, Loss: 0.6211
svd - Accuracy: 0.7508, F1: 0.7515, Precision: 0.7551, Recall: 0.7508
svd - Confusion Matrix:
[[1440  109  211  140]
 [ 142 1520  120  118]
 [ 118   60 1475  247]
 [ 148   92  389 1271]]


Training with skipgram embeddings
skipgram - Epoch 1, Loss: 0.6810
skipgram - Epoch 2, Loss: 0.4398
skipgram - Epoch 3, Loss: 0.3638
skipgram - Epoch 4, Loss: 0.2991
skipgram - Epoch 5, Loss: 0.2378
skipgram - Accuracy: 0.8496, F1: 0.8500, Precision: 0.8521, Recall: 0.8496
skipgram - Confusion Matrix:
[[1612   78   82  128]
 [  54 1739   34   73]
 [  82   29 1497  292]
 [  81   51  159 1609]]


Training with cbow embeddings
cbow - Epoch 1, Loss: 0.5492
cbow - Epoch 2, Loss: 0.3743
cbow - Epoch 3, Loss: 0.3049
cbow - Epoch 4, Loss: 0.2559
cbow - Epoch 5, Loss: 0.2226
cbow - Accuracy: 0.8433, F1: 0.8431, Precision: 0.8432, Recall: 0.84

In [None]:
import torch
for name in ["svd.pt", "skipgram.pt", "cbow.pt"]:
    emb = torch.load(name, weights_only=False)
    print(f"{name}: type={type(emb)}, shape={emb.shape if isinstance(emb, torch.Tensor) else list(emb.keys())}")

