In [115]:
import re
import os
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
from numpy import genfromtxt

#%% Import libraries
import torch
import torchmetrics
from torch.utils.data import DataLoader, TensorDataset
import os
from glob import glob
from sklearn.model_selection import train_test_split

# Load previous code

### Speeches to sentences with labels

In [3]:
path1 = r"Talerne\Margrethe"
path2 = r"Talerne\Statsministeren"

def sentence_and_label(path):

    sentences = {}

    for file in os.listdir(path):
        filename = os.path.join(path, file)

        # Label the sentence
        if "Margrethe" in file:
            label = 0
        else:
            label = 1

        with open(filename, encoding='utf-8') as speech:
            speech = speech.read()
        
        speech = re.sub("[,*\"'-]"," ",speech)

        speech = speech.lower()

        tokens = nltk.sent_tokenize(speech, language="danish")

        for sentence in tokens:
            sentence = re.sub("[.:;?!`–”«»%…‘’/()]"," ",sentence) 
            sentence = sentence.strip() # Uden mellemrum forrest eller bagerst
            sentences[sentence] = label
        
    return sentences

Margrethe_sentences_with_labels = sentence_and_label(path1)
Statsministeren_sentences_with_labels = sentence_and_label(path2)

In [4]:
# from pprint import pprint

# # Pretty-print to look nice
# pprint(Margrethe_sentences_with_labels)
# pprint(Statsministeren_sentences_with_labels)


In [5]:
all_sentences = Margrethe_sentences_with_labels | Statsministeren_sentences_with_labels
# pprint(all_sentences)
print(len(all_sentences))

10731


In [6]:
# # Inspect the tokens that look suspicious
# for sentence in all_sentences:
#     tokens = word_tokenize(sentence)
#     # Look for weird tokens
#     weird_tokens = [t for t in tokens if not t.isalnum()]
#     if weird_tokens:
#         print("Weird tokens found:", weird_tokens)

### Separate keys and values into two lists (For vectorization)

In [7]:
sentences_list = []
labels_list = []

for sentence in all_sentences:
    sentences_list.append(sentence)
    labels_list.append(all_sentences[sentence])

print(sentences_list)
print(labels_list)

['for min familie og for mig selv blev dette år skelsættende ved min fader kong frederiks sygdom og død', 'den dybe sorg  der ramte os  følte vi  at hele folket tog del i  og jeg kan ikke begynde denne min første nytårshilsen uden at bringe en tak for al den varme og sympati  som blev prins henrik og mig  men ikke mindst min moder  dronning ingrid  til del', 'de hjertelige følelser  som i de tunge dage støttede og løftede os  har jeg siden i årets løb mødt så mange gange  og det har været mig en glæde og inspiration i min gerning som danmarks dronning', '1972 vil blive et historisk år på mange måder', 'i danmarkshistorien vil året blive husket for den betydningsfulde beslutning  som det danske folk traf om danmarks medlemskab af det udvidede europæiske fællesskab', 'hermed træder vi ind i et nært samarbejde med de vigtigste vesteuropæiske nationer  vi går ind til det med store forhåbninger  men selvsagt også med en bevidsthed om  at dette vil stille krav til dansk kultur  og samfundsli

In [8]:
print(len(sentences_list))

10731


## SVD (Singular Value decompostion)

# CNN

## Load data:

In [137]:
batch_size = 256
num_epochs = 100 
learning_rate = 0.01
weight_decay = 0.

In [118]:
# Load SVD-matrix from csv-file
A_k = genfromtxt('SVD.csv', delimiter=',')

In [148]:
A_k

array([[-6.16473675e+00, -4.02082074e-01, -1.52501388e-01, ...,
        -4.01377674e-02, -9.24744540e-03,  2.00122848e-02],
       [-4.61413986e+00, -3.23343196e-01,  8.82422583e-03, ...,
         4.69880154e-03,  1.84452465e-02, -1.13838840e-02],
       [-5.46674470e+00, -2.37277261e-01,  4.61381935e-01, ...,
         2.01689315e-02,  3.47052724e-02, -1.68292466e-02],
       ...,
       [-5.39011312e+00, -1.48593809e+00,  1.13009978e+00, ...,
        -2.06424785e-02,  6.46485565e-02, -1.12069831e-01],
       [-7.26047414e+00,  1.32972128e+00, -1.20220301e+00, ...,
        -1.34227097e-01,  1.34009804e-01, -5.79210589e-02],
       [-5.64561839e+00, -2.04792378e+00,  7.58238299e-01, ...,
        -2.92956351e-03,  5.25144745e-02,  3.60375463e-02]])

In [149]:
A_k.shape

(10731, 200)

In [138]:
# Split SVD-matrix (A_k) into training- and test data:
train_input, test_input, train_labels, test_labels = train_test_split(
    A_k,             # The SVD-reduced matrix
    labels_list,    # The labels
    test_size=0.2,   # Percentage of data to use for testing (e.g., 20%)
    random_state=42, # For reproducibility
)

In [139]:
# Convert inputs
train_input_tensor = torch.tensor(train_input, dtype=torch.float32)  # Shape: (num_train_sentences, 200)
test_input_tensor = torch.tensor(test_input, dtype=torch.float32)    # Shape: (num_test_sentences, 200)

# Convert labels
train_labels_tensor = torch.tensor(train_labels, dtype=torch.long)  # Shape: (num_train_sentences,)
test_labels_tensor = torch.tensor(test_labels, dtype=torch.long)    # Shape: (num_test_sentences,)

In [151]:
# Reshape the tensors for CNN input (adding the channel dimension)
train_input_tensor = train_input_tensor.unsqueeze(1)  # Shape: (num_train_sentences, 1, 200)
test_input_tensor = test_input_tensor.unsqueeze(1)    # Shape: (num_test_sentences, 1, 200)

In [154]:
train_input_tensor.shape, test_input_tensor.shape

(torch.Size([8584, 1, 200]), torch.Size([2147, 1, 200]))

In [172]:
# Create datasets
train_dataset = TensorDataset(train_input_tensor, train_labels_tensor)
test_dataset = TensorDataset(test_input_tensor, test_labels_tensor)

# Create dataloaders
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)  # Shuffle training data
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)   # No shuffle for testing

## Build the CNN

In [156]:
#%% Device
# Run on GPU if available
device = "cuda" if torch.cuda.is_available() else "cpu"

In [185]:
#%% Neural network
net = torch.nn.Sequential(
    torch.nn.Conv1d(1, 16, kernel_size=3), # 200 - 3 + 1 = 198 
    torch.nn.ReLU(),
    torch.nn.Dropout(p=0.2),
    torch.nn.MaxPool1d(kernel_size=2),     # 198 // 2 = 99

    torch.nn.Conv1d(16, 16, kernel_size=3),  # 99 - 3 + 1 = 97
    torch.nn.ReLU(),
    torch.nn.Dropout(p=0.2),
    torch.nn.MaxPool1d(kernel_size=2),     # 97 // 2 = 48

    torch.nn.Flatten(),                    
    torch.nn.Linear(16*48, 254),
    torch.nn.ReLU(),
    torch.nn.Linear(254, 1),
    torch.nn.Sigmoid()       # Sigmoid activation for binary classification
            

).to(device)

In [None]:
# %% Load trained network from file
# net.load_state_dict(torch.load('net.pt'))

In [182]:
#%% Loss and optimizer
loss_function = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(net.parameters(), lr=learning_rate, weight_decay=weight_decay)

In [183]:
#%% Metrics
accuracy_metric = torchmetrics.classification.BinaryAccuracy().to(device)

## Train

In [189]:
#%% Train
step = 0
for epoch in range(num_epochs):
    accuracy_metric.reset()
    for inputs, labels in train_loader:
        step += 1

        # Put data on GPU 
        inputs = inputs.to(device)
        labels = labels.to(device).float()

        # Compute loss and take gradient step
        optimizer.zero_grad()
        prediction = net(inputs)
        prediction = prediction.squeeze(1)  # Shape: [batch_size]
        loss = loss_function(prediction, labels)
        loss.backward()
        optimizer.step()

        # Update accuracy metric
        accuracy_metric.update(prediction, labels)

    # Print accuracy for epoch            
    acc = accuracy_metric.compute()
    print(f'Training accuracy = {acc}')

KeyboardInterrupt: 

In [None]:
# %% Save the trained model
# torch.save(net.state_dict(), 'net.pt')

## Test