In [1]:
# Step 1: Imports
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import gensim.downloader as api
import re
import os
from gensim.models import KeyedVectors
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import r2_score, mean_absolute_error

# For Hacker News dataset
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import pandas as pd
torch.set_num_threads(8)  # or up to 10 for M4
torch.set_num_interop_threads(2)


In [2]:
# Integration with W&B
import random
import wandb
run = wandb.init(
    # Set the wandb entity where your project will be logged (generally your team name).
    entity="clemha-mli",
    # Set the wandb project where this run will be logged.
    project="HackerNews Prediction",
    # Track hyperparameters and run metadata.
    config={
        "learning_rate": 1e-4,
        "architecture": "MLP with 2 hidden layers (64, 32)",
        "dataset": "Hackernews text and text length",
        "epochs": 30,
    })


[34m[1mwandb[0m: Currently logged in as: [33mclemha[0m ([33mclemha-mli[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [3]:

df = pd.read_csv("Inputs1stGen.csv", low_memory = False, nrows=100000)
# Scale the score to 0 to 1
df = df.dropna()
scaler = MinMaxScaler()
df['score_scaled'] = scaler.fit_transform(df[['score']])



In [4]:
# Step 2: Load GloVe 100 model (download once, save for reuse)
model_path = "glove-wiki-gigaword-100.kv"

if os.path.exists(model_path):
    print("Loading saved GloVe model from disk...")
    glove = KeyedVectors.load(model_path, mmap='r')
else:
    print("Downloading GloVe model...")
    glove = api.load("glove-wiki-gigaword-100")
    glove.save(model_path)

embedding_dim = glove.vector_size

Loading saved GloVe model from disk...


In [5]:
# Step 3: Sentence preprocessing + vector averaging
def preprocess(text):
    text = text.lower()
    return re.findall(r'\b\w+\b', text)

def sentence_to_vec_and_len(sentence, model, dim):
    tokens = preprocess(sentence)
    vectors = [model[word] for word in tokens if word in model.key_to_index]
    if not vectors:
        return np.zeros(dim), 0
    return np.mean(vectors, axis=0), len(tokens)

In [6]:
# Step 5: Vectorise sentences and get lengths
vectors = []
sentence_lengths = []
#for sent in data['sentence']:
for sent in df['title']:
    vec, length = sentence_to_vec_and_len(sent, glove, embedding_dim)
    vectors.append(vec)
    sentence_lengths.append(length)

df['vector'] = vectors
df['length'] = sentence_lengths

In [7]:
# Step 6: Custom Dataset with sentence length
class SentenceScoreDataset(Dataset):
    def __init__(self, df):
        self.X_embed = torch.tensor(np.stack(df['vector'].values), dtype=torch.float32)
        self.X_len = torch.tensor(df['length'].values, dtype=torch.float32).view(-1, 1)
        self.X = torch.cat([self.X_embed, self.X_len], dim=1)
        self.y = torch.tensor(df['score_scaled'].values, dtype=torch.float32).view(-1, 1)

    def __len__(self):
        return len(self.y)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

In [8]:
# Step 7: Split and prepare DataLoader
#train_df, temp_df = train_test_split(data, test_size=0.4, random_state=42)
train_df, temp_df = train_test_split(df, test_size=0.4, random_state=42)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

train_ds = SentenceScoreDataset(train_df)
val_ds = SentenceScoreDataset(val_df)
test_ds = SentenceScoreDataset(test_df)

train_loader = DataLoader(train_ds, batch_size=2, shuffle=True)
val_loader = DataLoader(val_ds, batch_size=2)
test_loader = DataLoader(test_ds, batch_size=2)


In [9]:
# Step 8: MLP Regression Model
class MLPRegressor(nn.Module):
    def __init__(self, input_dim):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(input_dim, 64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 1)
        )

    def forward(self, x):
        return self.net(x)

model = MLPRegressor(embedding_dim + 1)  # +1 for sentence length
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=wandb.config['learning_rate'])


In [10]:
#wandb.watch(model, log='all')

In [11]:
# Step 9: Training Loop with Validation
epochs = wandb.config["epochs"]
for epoch in range(epochs):
    model.train()
    total_loss = 0
    for X_batch, y_batch in train_loader:
        optimizer.zero_grad()
        output = model(X_batch)
        loss = criterion(output, y_batch)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    # Validation
    model.eval()
    val_loss = 0
    y_true, y_pred = [], []
    with torch.no_grad():
        for X_val, y_val in val_loader:
            preds = model(X_val)
            val_loss += criterion(preds, y_val).item()
            y_true.extend(y_val.numpy())
            y_pred.extend(preds.numpy())

    avg_val_loss = val_loss / len(val_loader)
    r2 = r2_score(y_true, y_pred)
    mae = mean_absolute_error(y_true, y_pred)

    print(f"Epoch {epoch+1}/{epochs}, Train Loss: {total_loss:.4f}, Val Loss: {avg_val_loss:.4f}, R2: {r2:.4f}, MAE: {mae:.4f}")
    wandb.log({
        "epoch": epoch + 1,
        "train_loss": total_loss / len(train_loader),
        "val_loss": avg_val_loss,
        "val_r2": r2,
        "val_mae": mae
    })

Epoch 1/30, Train Loss: 17.6089, Val Loss: 0.0006, R2: -0.0210, MAE: 0.0075
Epoch 2/30, Train Loss: 14.7942, Val Loss: 0.0006, R2: 0.0019, MAE: 0.0098
Epoch 3/30, Train Loss: 14.6121, Val Loss: 0.0006, R2: 0.0062, MAE: 0.0085
Epoch 4/30, Train Loss: 14.5218, Val Loss: 0.0006, R2: 0.0074, MAE: 0.0088
Epoch 5/30, Train Loss: 14.4599, Val Loss: 0.0006, R2: 0.0048, MAE: 0.0084
Epoch 6/30, Train Loss: 14.3959, Val Loss: 0.0006, R2: 0.0017, MAE: 0.0074
Epoch 7/30, Train Loss: 14.3288, Val Loss: 0.0006, R2: 0.0018, MAE: 0.0097
Epoch 8/30, Train Loss: 14.2771, Val Loss: 0.0006, R2: 0.0059, MAE: 0.0080
Epoch 9/30, Train Loss: 14.1600, Val Loss: 0.0006, R2: 0.0026, MAE: 0.0080
Epoch 10/30, Train Loss: 14.0237, Val Loss: 0.0006, R2: -0.0112, MAE: 0.0074
Epoch 11/30, Train Loss: 13.8709, Val Loss: 0.0006, R2: -0.0103, MAE: 0.0073
Epoch 12/30, Train Loss: 13.7443, Val Loss: 0.0006, R2: -0.0074, MAE: 0.0088
Epoch 13/30, Train Loss: 13.5307, Val Loss: 0.0006, R2: -0.0133, MAE: 0.0080
Epoch 14/30, Tra

In [12]:
# Step 10: Final Test Evaluation
model.eval()
y_true_test, y_pred_test = [], []
with torch.no_grad():
    for X_test, y_test in test_loader:
        preds = model(X_test)
        y_true_test.extend(y_test.numpy())
        y_pred_test.extend(preds.numpy())
        
r2_test = r2_score(y_true_test, y_pred_test)
mae_test = mean_absolute_error(y_true_test, y_pred_test)
mse_test = np.mean((np.array(y_true_test) - np.array(y_pred_test)) ** 2)

print(f"\nTest Results: R2: {r2_test:.4f}, MAE: {mae_test:.4f}, MSE: {mse_test:.4f}")
wandb.log({
    "test_r2": r2_test,
    "test_mae": mae_test,
    "test_mse": mse_test
})


Test Results: R2: -0.1185, MAE: 0.0080, MSE: 0.0006


In [13]:
# Step 11: Show sample predictions
# print("\nSample Predictions:")
# for true, pred in zip(y_true_test, y_pred_test):
#     print(f"True: {true[0]:.2f}, Predicted: {pred[0]:.2f}")

In [14]:
# Step 12: Generate a score from custom sentence
print("\nScore Prediction from New Sentence:")
custom_sentence = "Elon Musk just died."
vec, length = sentence_to_vec_and_len(custom_sentence, glove, embedding_dim)
input_tensor = torch.tensor(np.append(vec, length), dtype=torch.float32).unsqueeze(0)  # shape (1, dim+1)
model.eval()
with torch.no_grad():
    scaled_pred = model(input_tensor).item()
    predicted_score = scaler.inverse_transform([[scaled_pred]])[0][0]
    print(f"Sentence: {custom_sentence}\nPredicted Score: {predicted_score:.4f}")



Score Prediction from New Sentence:
Sentence: Elon Musk just died.
Predicted Score: 7.1863
