In [2]:
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from IPython.display import clear_output
import torch.nn as nn
import torch.nn.functional as F

# Enable interactive mode
plt.ion()


<contextlib.ExitStack at 0x7ffdd75edcd0>

In [2]:
import mlflow
# set the experiment id
mlflow.set_tracking_uri("/scratch/project_2006600/fin_experiment_cnn")
mlflow.set_experiment('fin_experiment_cnn')



<Experiment: artifact_location='/scratch/project_2006600/fin_experiment/372080994826105752', creation_time=1734112413085, experiment_id='372080994826105752', last_update_time=1734112413085, lifecycle_stage='active', name='fin_experiment', tags={}>

In [4]:
def plot_losses(train_losses, val_losses):
    clear_output(wait=True)  # Clear previous output in Jupyter
    plt.figure(figsize=(10, 6))
    plt.plot(train_losses, label='Training Loss')
    plt.plot(val_losses, label='Validation Loss')
    plt.xlabel('N Batches')
    plt.ylabel('Loss')
    plt.title('Training and Validation Loss')
    plt.legend()
    plt.grid()
    plt.show()


In [3]:
def load_glove_embeddings(filepath):
    embeddings = {}
    with open(filepath, "r", encoding="utf-8") as f:
        for line in f:
            values = line.split()
            word = values[0]
            vector = np.asarray(values[1:], dtype='float32')
            embeddings[word] = vector
    return embeddings

# Load GloVe embeddings
glove_path = "/projappl/project_2006600/fin_experiment/embeddings/glove.6B.100d.txt"
glove_embeddings = load_glove_embeddings(glove_path)
print(f"Loaded {len(glove_embeddings)} word vectors.")

Loaded 400000 word vectors.


In [5]:
data_dir = '/projappl/project_2006600/fin_experiment/data'
data_combined_news = pd.read_csv(os.path.join(data_dir, 'data_combined_news.csv'), sep='\t', encoding='utf-8')

In [4]:
from sklearn.model_selection import train_test_split

x = data_combined_news['All_news_clean']
y = data_combined_news['Label']

X_train, X_valid, y_train, y_valid = train_test_split(x, y, test_size=0.2, random_state=42)

In [6]:
len(X_train)

1591

In [9]:
y_train.value_counts()

Label
1    838
0    753
Name: count, dtype: int64

In [6]:
class Vocab:
    """Vocabulary for text."""
    def __init__(self, texts, min_freq=5, reserved_tokens=[]):
        counter = dict()
        for text in texts:
            for word in text.split():
                counter[word] = counter.get(word, 0) + 1
        self.token_freqs = sorted(counter.items(), key=lambda x: x[1],
                                  reverse=True)
        # The list of unique tokens
        self.idx_to_token = list(sorted(set(['<unk>'] + reserved_tokens + [
            token for token, freq in self.token_freqs if freq >= min_freq])))
        self.token_to_idx = {token: idx
                             for idx, token in enumerate(self.idx_to_token)}

    def __len__(self):
        return len(self.idx_to_token)

    def __getitem__(self, tokens):
        if not isinstance(tokens, (list, tuple)):
            return self.token_to_idx.get(tokens, self.unk)
        return [self.__getitem__(token) for token in tokens]

    def get(self, tokens):
        if not isinstance(tokens, (list, tuple)):
            return self.token_to_idx.get(tokens, self.unk)
        return [self.__getitem__(token) for token in tokens]

    def to_tokens(self, indices):
        if hasattr(indices, '__len__') and len(indices) > 1:
            return [self.idx_to_token[int(index)] for index in indices]
        return self.idx_to_token[indices]

    @property
    def unk(self):  # Index for the unknown token
        return self.token_to_idx['<unk>']

In [14]:
vocab = Vocab(data_combined_news['All_news_clean'])
vocab_size = len(vocab)

In [12]:
y_valid.value_counts()

Label
1    227
0    171
Name: count, dtype: int64

In [8]:
X_valid[1173]



In [15]:
from nltk import sent_tokenize, word_tokenize 
import torch
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, TensorDataset, Dataset

# Below we use indexes of tokens in a vocab dict, without any segmentation of tokens, because we use count-based approaches
# or pre-trained word embeddings like glove/word2vec

def tokenize(text):
    return text.split()

def nltk_tokenize(text):
    sentences = sent_tokenize(text)
    return [word_tokenize(sent) for sent in sentences]

def yield_tokens(data_iter):
    for text in data_iter:
        yield tokenize(text)

def build_vocab(texts):
    counter = dict()
    for text in texts:
        for word in text.split():
            counter[word] = counter.get(word, 0) + 1
    vocab = {token: idx + 1 for idx, token in enumerate(counter.keys())}
    vocab['<unk>'] = 0
    vocab['<pad>'] = len(vocab)
    print(f'Vocab length: {len(vocab)}')
    return vocab

def encode_texts(texts, vocab):
    return [torch.tensor([vocab.get(token, 0) for token in tokenize(text)]) for text in texts]

In [17]:
def load_fasttext_embeddings(filepath, embedding_dim):
    embeddings = {}
    with open(filepath, "r", encoding="utf-8") as f:
        next(f)  # Skip the first line (header)
        for line in f:
            values = line.split()
            word = values[0]  # The word
            vector = np.asarray(values[1:], dtype='float32')  # The embedding vector
            embeddings[word] = vector
    print(f"Loaded {len(embeddings)} word vectors.")
    return embeddings

# Path to your FastText embedding file (e.g., cc.en.300.vec)
fasttext_path = "/projappl/project_2006600/fin_experiment/embeddings/crawl-300d-2M.vec"
embedding_dim = 300
fasttext_embeddings = load_fasttext_embeddings(fasttext_path, embedding_dim)


Loaded 1999995 word vectors.


In [18]:
embedding_dim = 300#100  # GloVe embedding dimension
vocab_size = len(vocab)

# Initialize embedding matrix with random values
embedding_matrix = np.random.uniform(-0.01, 0.01, (vocab_size, embedding_dim))

# Fill the embedding matrix with GloVe embeddings
for word, idx in vocab.token_to_idx.items():
    if word in fasttext_embeddings:#glove_embeddings:
        embedding_matrix[idx] = fasttext_embeddings[word]
    elif word == "<pad>" or word == "<unk>":
        embedding_matrix[idx] = np.zeros(embedding_dim)  # Padding token

# Convert to PyTorch tensor
embedding_matrix = torch.tensor(embedding_matrix, dtype=torch.float32)


In [12]:
embedding_matrix.shape

torch.Size([16358, 300])

In [19]:
from torch.nn.utils.rnn import pad_sequence


class NewsDatasetFixedLen(Dataset):
    def __init__(self, texts, labels, max_len):
        self.texts = texts
        self.labels = labels
        self.max_len = max_len

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        text = self.texts.values[idx]
        tokenized = tokenize(text)[:self.max_len]
        # tokenized += [0] * (self.max_len - len(tokenized))
        text = torch.tensor([vocab.get(token) for token in tokenized])
        label = torch.tensor(self.labels.values[idx], dtype=torch.float32)
        return text, label

In [87]:
train_dataset = NewsDatasetFixedLen(X_train, y_train, 900)
valid_dataset = NewsDatasetFixedLen(X_valid, y_valid, 900)

In [15]:
len(train_dataset) / 16

99.4375

In [88]:
# print(next(iter(valid_dataset)))
# Collate function to pad sequences
from torch.nn.utils.rnn import pad_sequence

def collate_fn(batch):
    texts, labels = zip(*batch)
    padded_sequence = pad_sequence(texts, batch_first=True)
    labels = torch.stack(labels)
    return padded_sequence, labels

In [89]:
# Define the embedding layer
embedding_layer = nn.Embedding(vocab_size, embedding_dim)

# Load the pre-trained weights
embedding_layer.weight = nn.Parameter(embedding_matrix)

# Optionally freeze the embeddings
embedding_layer.weight.requires_grad = False

In [64]:
# CNN model


class TextCNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, num_channels, kernel_sizes, **kwargs):
        super(TextCNN, self).__init__(**kwargs)
        # self.embedding = embedding_layer
        # self.embedding.weight.requires_grad = True
        self.constant_embedding = embedding_layer
        self.convs = nn.ModuleList()
        self.batch_norms = nn.ModuleList()
        for c, k in zip(num_channels, kernel_sizes):
            self.convs.append(nn.Conv1d(embedding_dim, c, k))
            self.batch_norms.append(nn.BatchNorm1d(c))
        self.pool = nn.AdaptiveAvgPool1d(1)
        self.relu = nn.LeakyReLU()
        self.dropout = nn.Dropout(p=0.5)
        # self.dense = nn.Linear(sum(num_channels), 128)
        self.decoder = nn.Linear(sum(num_channels), 1)


    def forward(self, x):
        # emb. layer = batch size * seq len * emb.dim
        embedded = self.constant_embedding(x)
        # embedded = torch.cat((self.embedding(x), self.constant_embedding(x)), dim=2)
        # print(embedded.shape)
        embedded = torch.permute(embedded, (0, 2, 1))
        # print(torch.squeeze(self.relu(self.pool(self.convs[0](embedded))), dim=-1).shape)
        # encoding = torch.cat([torch.squeeze(self.relu(self.pool(conv(embedded))), dim=-1) for conv in self.convs], dim=1)
        encoding = []
        for conv, batch_norm in zip(self.convs, self.batch_norms):
            pooled_out = self.relu(self.pool(batch_norm(conv(embedded))))
            encoding.append(torch.squeeze(pooled_out, dim=-1))
        encoding = torch.cat(encoding, dim=1)
        output = self.decoder(encoding)
        return torch.squeeze(output)

    

In [90]:
# Setting device on GPU if available
torch.cuda.empty_cache()
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', device)
print()

if device.type == 'cuda':
    print(torch.cuda.get_device_name(0))
    print('Memory Usage:')
    print('Allocated:', round(torch.cuda.memory_allocated(0)/1024**3,1), 'GB')
    print('Cached:   ', round(torch.cuda.memory_reserved(0)/1024**3,1), 'GB')

torch.backends.cudnn.deterministic = True

Using device: cuda

NVIDIA A100-SXM4-40GB MIG 1g.5gb
Memory Usage:
Allocated: 0.6 GB
Cached:    0.6 GB


In [91]:
embed_size, kernel_sizes, nums_channels = embedding_dim, [2, 3, 4], [32, 64, 128]
net = TextCNN(len(vocab), embed_size, nums_channels, kernel_sizes)

def init_weights(module):
    if type(module) in (nn.Linear, nn.Conv1d):
        nn.init.xavier_uniform_(module.weight)

net.apply(init_weights)

TextCNN(
  (constant_embedding): Embedding(16358, 300)
  (convs): ModuleList(
    (0): Conv1d(300, 32, kernel_size=(2,), stride=(1,))
    (1): Conv1d(300, 64, kernel_size=(3,), stride=(1,))
    (2): Conv1d(300, 128, kernel_size=(4,), stride=(1,))
  )
  (batch_norms): ModuleList(
    (0): BatchNorm1d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (1): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  )
  (pool): AdaptiveAvgPool1d(output_size=1)
  (relu): LeakyReLU(negative_slope=0.01)
  (dropout): Dropout(p=0.5, inplace=False)
  (decoder): Linear(in_features=224, out_features=1, bias=True)
)

In [92]:
from dataclasses import dataclass

@dataclass
class TrainArgs:
    learning_rate: float
    batch_size: int
    epochs: int

cnn_args = TrainArgs(1e-3, 64, 30)

In [93]:
from torch.utils.data import DataLoader
from torch.optim import Adam

train_dataloader = DataLoader(train_dataset, batch_size=cnn_args.batch_size, collate_fn=collate_fn)
valid_dataloader = DataLoader(valid_dataset, batch_size=cnn_args.batch_size, collate_fn=collate_fn)

    

In [94]:
optimizer = Adam(net.parameters(), lr=cnn_args.learning_rate, weight_decay=1e-6)
loss_fn = nn.BCEWithLogitsLoss()
from torch.optim.lr_scheduler import StepLR
scheduler = StepLR(optimizer, step_size=5, gamma=0.1)  # Reduce LR by 10x every 2 epochs
softmax = nn.Softmax(dim=1)

train_losses = []
valid_losses = []

def train_model(model, train_loader, val_loader, optimizer, loss_fn, device, epochs):
    model.to(device)
    best_val_loss = float("inf")

    for epoch in range(epochs):
        print(f"Epoch {epoch + 1}/{epochs}")
        
        # Training phase
        model.train()
        train_loss = 0
        for input_ids, labels in train_loader:
            input_ids, labels = input_ids.to(device), labels.to(device)
            
            optimizer.zero_grad()
            logits = model(input_ids)
            loss = loss_fn(logits, labels)
            loss.backward()
            # torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

            optimizer.step()

            train_loss += loss.item()
            
        train_loss /= len(train_loader)
        print(f"Train Loss: {train_loss:.4f}")
        train_losses.append(train_loss)
        
        # Validation phase
        val_loss = evaluate_model(model, val_loader, loss_fn, device)
        print(f"Validation Loss: {val_loss:.4f}")
        scheduler.step(val_loss)
        valid_losses.append(val_loss)
        
        # Save the best model
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            torch.save(model.state_dict(), "best_model.pth")
            print("Saved best model.")

def evaluate_model(model, val_loader, loss_fn, device):
    model.eval()
    val_loss = 0
    running_corrects = 0
    total_samples = 0

    with torch.no_grad():
        for input_ids, labels in val_loader:
            input_ids, labels = input_ids.to(device), labels.to(device)
            logits = model(input_ids)
            loss = loss_fn(logits, labels)
            val_loss += loss.item()
            probs = torch.sigmoid(logits)
            predictions = (probs > 0.5).float()
            # Update correct predictions
            running_corrects += (predictions == labels).sum().item()
            total_samples += labels.size(0)

    acc = running_corrects / total_samples
    print(f'Validation accuracy: {acc}')

    val_loss /= len(val_loader)
    return val_loss



In [31]:
train_model(net, train_dataloader, valid_dataloader, optimizer, loss_fn, device, cnn_args.epochs)

Epoch 1/30
Train Loss: 0.7034
Validation accuracy: 0.4271356783919598
Validation Loss: 0.7108
Saved best model.
Epoch 2/30




Train Loss: 0.6748
Validation accuracy: 0.4396984924623116
Validation Loss: 0.7336
Epoch 3/30
Train Loss: 0.6605
Validation accuracy: 0.4271356783919598
Validation Loss: 0.7315
Epoch 4/30
Train Loss: 0.6460
Validation accuracy: 0.43467336683417085
Validation Loss: 0.7459
Epoch 5/30
Train Loss: 0.6300
Validation accuracy: 0.4271356783919598
Validation Loss: 0.7519
Epoch 6/30
Train Loss: 0.6122
Validation accuracy: 0.44221105527638194
Validation Loss: 0.7532
Epoch 7/30
Train Loss: 0.5925
Validation accuracy: 0.46733668341708545
Validation Loss: 0.7489
Epoch 8/30
Train Loss: 0.5721
Validation accuracy: 0.4798994974874372
Validation Loss: 0.7471
Epoch 9/30
Train Loss: 0.5516
Validation accuracy: 0.4648241206030151
Validation Loss: 0.7686
Epoch 10/30
Train Loss: 0.5316
Validation accuracy: 0.48743718592964824
Validation Loss: 0.7625
Epoch 11/30
Train Loss: 0.5118
Validation accuracy: 0.4824120603015075
Validation Loss: 0.7779
Epoch 12/30
Train Loss: 0.4915
Validation accuracy: 0.47487437185

In [96]:
class ResidualBlockWithProjection(nn.Module):
    def __init__(self, in_channels, out_channels, kernel_size=3, stride=1, padding=1):
        super(ResidualBlockWithProjection, self).__init__()
        
        # Convolutional layers
        self.conv1 = nn.Conv1d(in_channels, out_channels, kernel_size, stride, padding, bias=False)
        self.bn1 = nn.BatchNorm1d(out_channels)
        self.relu = nn.LeakyReLU(inplace=True)
        self.conv2 = nn.Conv1d(out_channels, out_channels, kernel_size, stride=1, padding=1, bias=False)
        self.bn2 = nn.BatchNorm1d(out_channels)
        
        # Shortcut connection (projection using 1x1 convolution)
        self.projection = nn.Conv1d(in_channels, out_channels, kernel_size=1, stride=stride, bias=False)
        self.bn_proj = nn.BatchNorm1d(out_channels)

    def forward(self, x):
        identity = self.projection(x)  # Project input to match output dimensions
        identity = self.bn_proj(identity)

        # Forward pass through convolutional layers
        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)
        out = self.conv2(out)
        out = self.bn2(out)
        
        # Add the shortcut connection
        out += identity
        out = self.relu(out)
        
        return out


In [97]:
class ResidualCNN(nn.Module):
    def __init__(self, num_classes=1):
        super(ResidualCNN, self).__init__()
        self.constant_embedding = embedding_layer

        # Initial convolutional layer
        self.conv1 = nn.Conv1d(900, 64, kernel_size=3, stride=1, padding=3, bias=False)
        self.bn1 = nn.BatchNorm1d(64)
        self.relu = nn.LeakyReLU(inplace=True)
        self.pool = nn.MaxPool1d(kernel_size=3, stride=2, padding=1)

        # Residual blocks
        self.layer2 = ResidualBlockWithProjection(64, 128, stride=2)  # Change dimensions with projection

        # Fully connected layer
        self.avgpool = nn.AdaptiveAvgPool1d(1)
        self.fc = nn.Linear(128, num_classes)

    def forward(self, x):
        x = self.constant_embedding(x)
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)
        x = self.pool(x)

        x = self.layer2(x)

        x = self.avgpool(x)
        x = torch.flatten(x, 1)
        x = self.fc(x)
        return torch.squeeze(x)



In [98]:
model = ResidualCNN()

In [99]:
model

ResidualCNN(
  (constant_embedding): Embedding(16358, 300)
  (conv1): Conv1d(900, 64, kernel_size=(3,), stride=(1,), padding=(3,), bias=False)
  (bn1): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): LeakyReLU(negative_slope=0.01, inplace=True)
  (pool): MaxPool1d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer2): ResidualBlockWithProjection(
    (conv1): Conv1d(64, 128, kernel_size=(3,), stride=(2,), padding=(1,), bias=False)
    (bn1): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (relu): LeakyReLU(negative_slope=0.01, inplace=True)
    (conv2): Conv1d(128, 128, kernel_size=(3,), stride=(1,), padding=(1,), bias=False)
    (bn2): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (projection): Conv1d(64, 128, kernel_size=(1,), stride=(2,), bias=False)
    (bn_proj): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)


In [100]:
train_model(model, train_dataloader, valid_dataloader, optimizer, loss_fn, device, cnn_args.epochs)

Epoch 1/30
Train Loss: 0.6950
Validation accuracy: 0.5703517587939698
Validation Loss: 0.6833
Saved best model.
Epoch 2/30
Train Loss: 0.6950
Validation accuracy: 0.5175879396984925
Validation Loss: 0.6928
Epoch 3/30
Train Loss: 0.6950
Validation accuracy: 0.4472361809045226
Validation Loss: 0.6998
Epoch 4/30
Train Loss: 0.6950
Validation accuracy: 0.44221105527638194
Validation Loss: 0.7005
Epoch 5/30
Train Loss: 0.6950
Validation accuracy: 0.44221105527638194
Validation Loss: 0.7006
Epoch 6/30
Train Loss: 0.6950
Validation accuracy: 0.44221105527638194
Validation Loss: 0.7006
Epoch 7/30
Train Loss: 0.6950
Validation accuracy: 0.44221105527638194
Validation Loss: 0.7006
Epoch 8/30
Train Loss: 0.6950
Validation accuracy: 0.44221105527638194
Validation Loss: 0.7006
Epoch 9/30
Train Loss: 0.6950
Validation accuracy: 0.44221105527638194
Validation Loss: 0.7006
Epoch 10/30
Train Loss: 0.6950
Validation accuracy: 0.44221105527638194
Validation Loss: 0.7006
Epoch 11/30
Train Loss: 0.6950
Val