In [65]:
import random
import os
import torch
from torch import nn
from torch.utils.data import DataLoader
from torchvision import datasets, transforms
import numpy as np
import pandas as pd
from torch.utils.data import Dataset, DataLoader
import matplotlib.pyplot as plt
from numpy import array

In [125]:
# Import the CSV file containing HN titles, mean vectors and scores
df = pd.read_hdf('hn_title_mean_vectors_scores.h5', key='df_clean')

df['score']

0        19
1         1
2         2
3         1
4         1
         ..
99995     1
99996     1
99997     6
99998     1
99999    36
Name: score, Length: 98735, dtype: int64

In [67]:
df.shape

(98735, 3)

In [68]:
# Create dictionary in matrix and score format
data = {
    "matrix": df['mean_vector'].tolist(),
    "score": df['score'].values
}
dataframe = pd.DataFrame(data)

In [127]:
# #Generating random data to test training
data = {
    "matrix": [torch.rand(256) for _ in range(98735)],
    "score": df['score'],
    }


dataframe = pd.DataFrame(data)

In [128]:
dataframe.head()

Unnamed: 0,matrix,score
0,"[tensor(0.7606), tensor(0.8390), tensor(0.9581...",19
1,"[tensor(0.6599), tensor(0.2378), tensor(0.7250...",1
2,"[tensor(0.9299), tensor(0.9484), tensor(0.4627...",2
3,"[tensor(0.4715), tensor(0.6072), tensor(0.5657...",1
4,"[tensor(0.2310), tensor(0.3687), tensor(0.9965...",1


In [129]:


class ProcessedData(Dataset):
    def __init__(self, dataframe):
        self.dataframe = dataframe

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        # Extract the matrix and score
        matrix = self.dataframe.iloc[idx]["matrix"]
        score = self.dataframe.iloc[idx]["score"]
        
        # Convert to PyTorch tensors
        matrix_tensor = torch.tensor(matrix, dtype=torch.float32).squeeze() 
        score_tensor = torch.tensor(score, dtype=torch.float32)
        
        return matrix_tensor, score_tensor

# Create Dataset
dataset = ProcessedData(dataframe)

# Defining DataLoader
dataloader = DataLoader(dataset, batch_size=64, shuffle=True)

class NeuralNetwork(nn.Module):
    def __init__(self):
        super().__init__()
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(256, 64),
            nn.ReLU(),
            nn.Dropout(p=0.2),
            nn.Linear(64, 4),
            nn.ReLU(),
            nn.Dropout(p=0.1),
            nn.Linear(4, 1),
        )

    def forward(self, x):
        logits = self.linear_relu_stack(x)
        return logits
criterion = nn.MSELoss()
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


In [130]:


epoch_losses = []

model = NeuralNetwork()
optimiser = torch.optim.Adam(model.parameters(), lr=0.001)
epochs=30
for epoch in range(epochs):
    epoch_loss = 0.0
    for batch in dataloader:
        # Get the input matrix and target score
        inputs, targets = batch
        
        # Forward pass
        predictions = model(inputs)
        loss = criterion(predictions.squeeze(), targets)  # Squeeze to match dimensions
        
        # Backward pass and optimisation
        optimiser.zero_grad()
        loss.backward()
        optimiser.step()
        
        epoch_loss += loss.item()

    epoch_loss /= len(dataloader)
    epoch_losses.append(epoch_loss)




print(f"Epoch [{epoch+1}/{epochs}], Loss: {loss.item():.4f}")

epoch_losses = [round(loss, 4) for loss in epoch_losses]
epoch_losses


  matrix_tensor = torch.tensor(matrix, dtype=torch.float32).squeeze()


Epoch [30/30], Loss: 455.5966


[1380.8891,
 1376.99,
 1377.7854,
 1376.7496,
 1376.5855,
 1376.2679,
 1375.5403,
 1375.3747,
 1374.6278,
 1374.0849,
 1373.8191,
 1373.031,
 1374.1704,
 1373.227,
 1373.3287,
 1374.0691,
 1372.6496,
 1373.0252,
 1372.1963,
 1372.1023,
 1372.7076,
 1372.4536,
 1372.7663,
 1372.1275,
 1371.122,
 1370.8126,
 1372.1424,
 1371.2656,
 1370.4037,
 1370.5083]

In [26]:
epoch_losses[-20:]


[1361.5208,
 1352.8646,
 1348.5354,
 1343.6366,
 1340.6144,
 1336.0978,
 1327.7114,
 1323.219,
 1310.7708,
 1296.9789,
 1283.6761,
 1269.4318,
 1252.3672,
 1224.143,
 1209.8325,
 1173.3435,
 1145.2942,
 1119.6048,
 1101.9909,
 1073.3053]

In [131]:
sampled_data = dataframe.sample(n=100, random_state=42)  # n=100 to get 100 random samples

# Extract the input features (second column) and target scores (third column)
inputs = list(sampled_data.iloc[:, 0])  # Extracting the feature vectors
targets = torch.tensor(sampled_data.iloc[:, 1].values, dtype=torch.float32)  # Extracting the actual scores

# Convert inputs to tensors if necessary
inputs_tensor = torch.stack([torch.tensor(input_vec) for input_vec in inputs])

# Pass the inputs through the trained model to get predictions
model.eval()  # Set the model to evaluation mode
with torch.no_grad():  # No need to track gradients during inference
    predictions = model(inputs_tensor)

# Calculate the absolute difference between predictions and actual scores
differences = torch.abs(predictions.squeeze() - targets)

# Compute the average difference
average_difference = torch.mean(differences)

average_difference

  inputs_tensor = torch.stack([torch.tensor(input_vec) for input_vec in inputs])


tensor(19.2857)

In [132]:
average_score = dataframe.iloc[:, 1].mean()
average_score

9.669337114498404