In [39]:
import re
import os
import numpy as np
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from numpy import genfromtxt

#%% Import libraries
import torch
import torchmetrics
from torch.utils.data import DataLoader, TensorDataset
import os
from glob import glob
from sklearn.model_selection import train_test_split

# Load training and test dataframes & TF-IDF matrices

In [40]:
# Load sentences and labels
train = pd.read_csv(r'Test_og_Train_df\train_dataframe.csv')
test = pd.read_csv(r'Test_og_Train_df\test_dataframe.csv')

In [41]:
train_labels = train["Labels"].tolist()

test_labels = test["Labels"].tolist()

In [42]:
train_tf_idf = pd.read_csv(r'DTTFIDFM_data\DTTFIDFM_train.csv')
test_tf_idf = pd.read_csv(r'DTTFIDFM_data\DTTFIDFM_test.csv')

train_tf_idf = train_tf_idf.drop('Unnamed: 0', axis=1)
test_tf_idf = test_tf_idf.drop('Unnamed: 0', axis=1)

In [43]:
train_tf_idf_numpy = train_tf_idf.to_numpy()
test_tf_idf_numpy = test_tf_idf.to_numpy()

In [44]:
# train_tf_idf_numpy

In [45]:
train_tf_idf_numpy.shape,test_tf_idf_numpy.shape

((8048, 24653), (2683, 24653))

# CNN

## Load SVD-matrix:

In [46]:
# Load SVD-matrix from csv-file
V_200 = genfromtxt('V200.csv', delimiter=',')

In [47]:
V_200

array([[-1.01522614e-01,  1.11495275e-01, -1.94767568e-01, ...,
         1.68914466e-02, -1.59969860e-02, -4.14657793e-02],
       [-8.21504456e-02,  5.38646152e-02, -5.83989941e-03, ...,
        -6.59796006e-03, -5.48038621e-03,  3.82854767e-02],
       [-9.73437764e-04,  7.31839751e-04, -1.98419009e-04, ...,
        -4.65777427e-04,  7.10243628e-04,  7.45946970e-04],
       ...,
       [-6.25781132e-04,  4.19897409e-04,  4.62287900e-05, ...,
         1.21027324e-04,  6.06961375e-06,  7.62899044e-04],
       [-5.13986347e-04,  4.40283044e-05,  2.03693542e-04, ...,
        -1.50341156e-04,  7.16554560e-05,  6.35583134e-04],
       [ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00, ...,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00]])

In [48]:
V_200.shape

(24653, 200)

In [49]:
type(V_200)

numpy.ndarray

## Sentence LSA embedding

In [50]:
def sentence_LSA_embedding(TF_IDF_matrix_numpy, V_k):
    liste_LSA_vektorer = []
    for i in range(TF_IDF_matrix_numpy.shape[0]):
        lsa_vector = np.dot(TF_IDF_matrix_numpy[i], V_k)
        liste_LSA_vektorer.append(lsa_vector)
    return np.array(liste_LSA_vektorer)


In [51]:
train_lsa = sentence_LSA_embedding(train_tf_idf_numpy, V_200)
test_lsa = sentence_LSA_embedding(test_tf_idf_numpy, V_200)

## Make tensors

In [52]:
# Convert inputs
train_input_tensor = torch.tensor(train_lsa, dtype=torch.float32)  # Shape: (num_train_sentences, 200)
test_input_tensor = torch.tensor(test_lsa, dtype=torch.float32)    # Shape: (num_test_sentences, 200)

# Convert labels
train_labels_tensor = torch.tensor(train_labels, dtype=torch.long)  # Shape: (num_train_sentences,)
test_labels_tensor = torch.tensor(test_labels, dtype=torch.long)    # Shape: (num_test_sentences,)

In [53]:
# Reshape the tensors for CNN input (adding the channel dimension)
train_input_tensor = train_input_tensor.unsqueeze(1)  # Shape: (num_train_sentences, 1, 200)
test_input_tensor = test_input_tensor.unsqueeze(1)    # Shape: (num_test_sentences, 1, 200)

In [54]:
train_input_tensor.shape, test_input_tensor.shape

(torch.Size([8048, 1, 200]), torch.Size([2683, 1, 200]))

In [100]:
batch_size = 400
# Create datasets
train_dataset = TensorDataset(train_input_tensor, train_labels_tensor)
test_dataset = TensorDataset(test_input_tensor, test_labels_tensor)

# Create dataloaders
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)  # Shuffle training data
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)   # No shuffle for testing

## Build the CNN

In [194]:
num_epochs = 100 
learning_rate = 0.01
weight_decay = 0.

In [195]:
#%% Device
# Run on GPU if available
device = "cuda" if torch.cuda.is_available() else "cpu"

In [196]:
 #%% Neural network
net = torch.nn.Sequential(
    torch.nn.Conv1d(1, 16, kernel_size=3), # 200 - 3 + 1 = 198 
    torch.nn.ReLU(),
    torch.nn.Dropout(p=0.05),
    torch.nn.MaxPool1d(kernel_size=2),     # 198 // 2 = 99

    torch.nn.Conv1d(16, 16, kernel_size=3),  # 99 - 3 + 1 = 97
    torch.nn.ReLU(),
    torch.nn.Dropout(p=0.2),
    torch.nn.MaxPool1d(kernel_size=2),     # 97 // 2 = 48
    
    torch.nn.Flatten(),                    
    torch.nn.Linear(16*48, 254),
    torch.nn.ReLU(),
    torch.nn.Linear(254, 128), # Output 2 logits (for two classes: 0 or 1)
    torch.nn.ReLU(),
    torch.nn.Linear(128, 64),
    torch.nn.ReLU(),
    torch.nn.Linear(64, 16),
    torch.nn.ReLU(),
    torch.nn.Linear(16, 2)
).to(device)

In [197]:
#%% Loss and optimizer
loss_function = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(net.parameters(), lr=learning_rate, weight_decay=weight_decay)

In [198]:
#%% Metrics
accuracy_metric = torchmetrics.classification.BinaryAccuracy().to(device)

In [165]:
# %% Load trained network from file
net.load_state_dict(torch.load('netV5.pt'))

  net.load_state_dict(torch.load('netV5.pt'))


<All keys matched successfully>

## Train

In [199]:
#%% Train
step = 0
for epoch in range(num_epochs):
    accuracy_metric.reset()
    running_loss = 0.0
    correct = 0
    total = 0
    
    for inputs, labels in train_loader:
        net.train()

        # Put data on GPU 
        inputs = inputs.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()

        # Compute loss and take gradient step
        outputs = net(inputs)
        loss = loss_function(outputs, labels)

        loss.backward()
        optimizer.step()

        predicted = torch.argmax(outputs, dim=1)

        # Update accuracy metric
        accuracy_metric.update(predicted, labels)

        running_loss += loss.item()
        correct += (predicted == labels).sum().item()
        total += labels.size(0)

    # Print accuracy for epoch            
    acc = accuracy_metric.compute()
    print(f'Training accuracy = {acc}')

    epoch_loss = running_loss / len(train_loader)
    epoch_accuracy = 100 * correct / total
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {epoch_loss:.4f}, Accuracy: {epoch_accuracy:.2f}%")

Training accuracy = 0.6903578639030457
Epoch 1/100, Loss: 0.6739, Accuracy: 69.04%
Training accuracy = 0.6903578639030457
Epoch 2/100, Loss: 0.6002, Accuracy: 69.04%
Training accuracy = 0.6903578639030457
Epoch 3/100, Loss: 0.5890, Accuracy: 69.04%
Training accuracy = 0.6903578639030457
Epoch 4/100, Loss: 0.5739, Accuracy: 69.04%
Training accuracy = 0.7022863030433655
Epoch 5/100, Loss: 0.5649, Accuracy: 70.23%
Training accuracy = 0.7200546860694885
Epoch 6/100, Loss: 0.5528, Accuracy: 72.01%
Training accuracy = 0.7328528761863708
Epoch 7/100, Loss: 0.5409, Accuracy: 73.29%
Training accuracy = 0.7327286005020142
Epoch 8/100, Loss: 0.5298, Accuracy: 73.27%
Training accuracy = 0.741923451423645
Epoch 9/100, Loss: 0.5259, Accuracy: 74.19%
Training accuracy = 0.7421719431877136
Epoch 10/100, Loss: 0.5222, Accuracy: 74.22%
Training accuracy = 0.7439115047454834
Epoch 11/100, Loss: 0.5267, Accuracy: 74.39%
Training accuracy = 0.7502484917640686
Epoch 12/100, Loss: 0.5087, Accuracy: 75.02%
Tr

In [124]:
# %% Save the trained model
# torch.save(net.state_dict(), 'netV5.pt')

# Test

In [200]:
# Evaluate the model
net.eval()
correct = 0
total = 0

with torch.no_grad():
    for inputs, labels in test_loader:
        # Put data on GPU 
        inputs = inputs.to(device)
        labels = labels.to(device).float()
        
        # Forward pass
        outputs = net(inputs)
        
        predicted = torch.argmax(outputs, dim=1)
        
        # Count correct predictions
        correct += (predicted == labels).sum().item()
        total += labels.size(0)

test_accuracy = 100 * correct / total
print(f"Test Accuracy: {test_accuracy:.2f}%")


Test Accuracy: 71.60%
