In [3]:
import re
import os
import numpy as np
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from numpy import genfromtxt

#%% Import libraries
import torch
import torchmetrics
from torch.utils.data import DataLoader, TensorDataset
import os
from glob import glob
from sklearn.model_selection import train_test_split

# Load training and test dataframes & TF-IDF matrices

In [4]:
# Load sentences and labels
train = pd.read_csv('train_dataframe.csv')
test = pd.read_csv('test_dataframe.csv')

In [5]:
train_sentences = train['Sentences'].tolist()
train_labels = train["Labels"].tolist()

test_sentences = test['Sentences'].tolist()
test_labels = test["Labels"].tolist()

In [None]:
train_tf_idf = pd.read_csv('DTTFIDFM_train.csv')
test_tf_idf = pd.read_csv('DTTFIDFM_test.csv')

# CNN

## Load SVD-matrix:

In [118]:
# Load SVD-matrix from csv-file
V_transpose = genfromtxt('V200.csv', delimiter=',')

In [149]:
V_transpose.shape

(10731, 200)

## Sentence LSA embedding

## Make tensors

In [139]:
# Convert inputs
train_input_tensor = torch.tensor(train_input, dtype=torch.float32)  # Shape: (num_train_sentences, 200)
test_input_tensor = torch.tensor(test_input, dtype=torch.float32)    # Shape: (num_test_sentences, 200)

# Convert labels
train_labels_tensor = torch.tensor(train_labels, dtype=torch.long)  # Shape: (num_train_sentences,)
test_labels_tensor = torch.tensor(test_labels, dtype=torch.long)    # Shape: (num_test_sentences,)

In [151]:
# Reshape the tensors for CNN input (adding the channel dimension)
train_input_tensor = train_input_tensor.unsqueeze(1)  # Shape: (num_train_sentences, 1, 200)
test_input_tensor = test_input_tensor.unsqueeze(1)    # Shape: (num_test_sentences, 1, 200)

In [154]:
train_input_tensor.shape, test_input_tensor.shape

(torch.Size([8584, 1, 200]), torch.Size([2147, 1, 200]))

In [172]:
batch_size = 256
# Create datasets
train_dataset = TensorDataset(train_input_tensor, train_labels_tensor)
test_dataset = TensorDataset(test_input_tensor, test_labels_tensor)

# Create dataloaders
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)  # Shuffle training data
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)   # No shuffle for testing

## Build the CNN

In [137]:
num_epochs = 100 
learning_rate = 0.01
weight_decay = 0.

In [156]:
#%% Device
# Run on GPU if available
device = "cuda" if torch.cuda.is_available() else "cpu"

In [185]:
 #%% Neural network
net = torch.nn.Sequential(
    torch.nn.Conv1d(1, 16, kernel_size=3), # 200 - 3 + 1 = 198 
    torch.nn.ReLU(),
    torch.nn.Dropout(p=0.2),
    torch.nn.MaxPool1d(kernel_size=2),     # 198 // 2 = 99

    torch.nn.Conv1d(16, 16, kernel_size=3),  # 99 - 3 + 1 = 97
    torch.nn.ReLU(),
    torch.nn.Dropout(p=0.2),
    torch.nn.MaxPool1d(kernel_size=2),     # 97 // 2 = 48

    torch.nn.Flatten(),                    
    torch.nn.Linear(16*48, 254),
    torch.nn.ReLU(),
    torch.nn.Linear(254, 1),
    torch.nn.Sigmoid()       # Sigmoid activation for binary classification
            

).to(device)

In [None]:
# %% Load trained network from file
# net.load_state_dict(torch.load('net.pt'))

In [182]:
#%% Loss and optimizer
loss_function = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(net.parameters(), lr=learning_rate, weight_decay=weight_decay)

In [183]:
#%% Metrics
accuracy_metric = torchmetrics.classification.BinaryAccuracy().to(device)

## Train

In [None]:
#%% Train
step = 0
for epoch in range(num_epochs):
    accuracy_metric.reset()
    for inputs, labels in train_loader:
        step += 1

        # Put data on GPU 
        inputs = inputs.to(device)
        labels = labels.to(device).float()

        # Compute loss and take gradient step
        prediction = net(inputs)
        prediction = prediction.squeeze(1)  # Shape: [batch_size]
        loss = loss_function(prediction, labels)
        optimizer.zero_grad()
        
        loss.backward()
        optimizer.step()

        # Update accuracy metric
        accuracy_metric.update(prediction, labels)

    # Print accuracy for epoch            
    acc = accuracy_metric.compute()
    print(f'Training accuracy = {acc}')

Training accuracy = 0.5031453967094421
Training accuracy = 0.49743708968162537
Training accuracy = 0.48858341574668884
Training accuracy = 0.49580615758895874
Training accuracy = 0.5010484457015991
Training accuracy = 0.4939422309398651


In [None]:
# %% Save the trained model
# torch.save(net.state_dict(), 'net.pt')