In [1]:
import numpy as np
import torch # This is where torch is imported
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
df = pd.read_csv("~/Documents/Inputs1stGen.csv", low_memory = False)

In [2]:
df.columns

Index(['by', 'karma', 'title', 'url', 'time', 'score'], dtype='object')

In [3]:
from datetime import datetime
from urllib.parse import urlparse

'''
The following section gives us all the parameters relating to time:
1. Day of week
2. Time of day
3. Month of Year
4. Year of past 
'''

#Converting UNIX timestamp to datetime
df['datetime'] = pd.to_datetime(df['time'])

#Getting the names of the days of the week
df['day_of_week'] = df['datetime'].dt.day_name()

#Creating a key of days of the week to numbers
day_map = {
    'Monday': 0, 'Tuesday': 1, 'Wednesday':2, 'Thursday':3, 'Friday':4, 'Saturday':5, 'Sunday':6
}

#Assigning numbers to the days of the week corresponding to the dataset
df["day_of_week_num"] = df['day_of_week'].map(day_map)

#Getting the names of the months of year
df['month'] = df['datetime'].dt.month_name()

#Creating a key of months of the year to numbers
month_map = {
    'January': 1, 'February': 2, 'March': 3, 'April':4, 'May': 5, 'June': 6, 'July': 7, 'August': 8, 'September': 9, 'October':10, 'November': 11, 'December': 12
}

df['month_num'] = df['month'].map(month_map)

#Getting the hour of the post
df['hour'] = df['datetime'].dt.hour

#Getting the year of the post
df['year'] = df['datetime'].dt.year


In [4]:
df.columns

Index(['by', 'karma', 'title', 'url', 'time', 'score', 'datetime',
       'day_of_week', 'day_of_week_num', 'month', 'month_num', 'hour', 'year'],
      dtype='object')

In [5]:
max_time = df['datetime'].max()
print(max_time)

2024-10-14 00:00:50


In [6]:
df['time_since_post1'] = max_time - df['datetime']

In [7]:
df.columns

Index(['by', 'karma', 'title', 'url', 'time', 'score', 'datetime',
       'day_of_week', 'day_of_week_num', 'month', 'month_num', 'hour', 'year',
       'time_since_post1'],
      dtype='object')

In [8]:
df['time_since_post1']

0         4738 days 07:33:50
1         4647 days 12:21:25
2         4738 days 07:33:14
3         4107 days 18:44:24
4         6041 days 14:14:25
                 ...        
5116176      0 days 00:11:08
5116177      0 days 00:07:50
5116178      0 days 00:03:39
5116179      0 days 00:02:32
5116180      0 days 00:00:00
Name: time_since_post1, Length: 5116181, dtype: timedelta64[ns]

In [9]:
'''
The following code gives us:
2. Time since the post was posted
'''

df['time_since_post'] = df['time_since_post1'].dt.total_seconds()

In [10]:
print(df['time_since_post'])

0          409390430.0
1          401545285.0
2          409390394.0
3          354912264.0
4          521993665.0
              ...     
5116176          668.0
5116177          470.0
5116178          219.0
5116179          152.0
5116180            0.0
Name: time_since_post, Length: 5116181, dtype: float64


In [11]:
df.columns

Index(['by', 'karma', 'title', 'url', 'time', 'score', 'datetime',
       'day_of_week', 'day_of_week_num', 'month', 'month_num', 'hour', 'year',
       'time_since_post1', 'time_since_post'],
      dtype='object')

In [12]:
'''
The following code gives us:
1. URL
2. Domain
'''

# Define a safe parser
def safe_urlparse(url):
    try: 
        parsed = urlparse(url)
        return parsed.netloc, parsed.path
    except Exception:
        return '', ''

# Apply safely
df['url'] = df['url'].fillna('').astype(str)  # Ensure it's string
df[['domain_name', 'url_path']] = df['url'].apply(
    lambda u: pd.Series(safe_urlparse(u))
)


In [13]:
df.columns

Index(['by', 'karma', 'title', 'url', 'time', 'score', 'datetime',
       'day_of_week', 'day_of_week_num', 'month', 'month_num', 'hour', 'year',
       'time_since_post1', 'time_since_post', 'domain_name', 'url_path'],
      dtype='object')

In [14]:
'''
The following code gives us:
1. User name
2. Title
3. Length of Title
4. Number of Upvotes
'''

df['by'] = df['by'].fillna('').astype(str)
df['title'] = df['title'].fillna('').astype(str)
df['title_length_chars'] = df['title'].str.len()
df['title_length_words'] = df['title'].str.split().str.len()

In [15]:
'''
Getting a file that has:
- User ('by')
- Title ('title')
- Domain ('domain_name')
- Day of the week ('day_of_week_num')
- Month ('month')
- Hour ('hour')
- Year ('year')
- Time since post ('time_since_post')
- Title length chars ('title_length_chars')
- Score ('score')
'''

selected_columns = ['by', 'title', 'domain_name', 'day_of_week_num', 'month_num', 'hour', 'year', 'time_since_post', 'title_length_chars', 'score']
df_selected = df[selected_columns]
df_selected.to_csv('RelevantDataScrape.csv', index=False)

df['domain_name'].value_counts()
domain_name_counts = df['domain_name'].value_counts()
df['by'].value_counts()
username_counts = df['by'].value_counts()

In [16]:
domain_name_counts

domain_name
                           512396
github.com                 160136
medium.com                 119464
www.youtube.com            118512
www.nytimes.com             70045
                            ...  
www.cummins-engine.es           1
yaketyhack.blogspot.com         1
forum.thinkpads.com             1
www.wb6nvh.com                  1
yongebai.github.io              1
Name: count, Length: 575001, dtype: int64

In [17]:
username_counts

by
rbanffy         30849
Tomte           23651
tosh            20919
pseudolus       16944
bookofjoe       16097
                ...  
giladha             1
skynetswed          1
18567478            1
sophxkath123        1
partime             1
Name: count, Length: 512588, dtype: int64

In [18]:
domaincolumn_names = domain_name_counts.name
print(domaincolumn_names)
usernamecolumn_names = username_counts.name
print(usernamecolumn_names)

count
count


In [19]:
cutoff_value =  1000
filtered_domain_counts = domain_name_counts[domain_name_counts > cutoff_value]
print(filtered_domain_counts)

filtered_username_counts = username_counts[username_counts > cutoff_value]
print(filtered_username_counts)

domain_name
                       512396
github.com             160136
medium.com             119464
www.youtube.com        118512
www.nytimes.com         70045
                        ...  
journals.plos.org        1014
readwrite.com            1012
www.sciencenews.org      1010
foreignpolicy.com        1010
t.co                     1006
Name: count, Length: 320, dtype: int64
by
rbanffy           30849
Tomte             23651
tosh              20919
pseudolus         16944
bookofjoe         16097
                  ...  
Bostonian          1018
ValentineC         1009
abraham            1005
danw               1005
lyricsongation     1004
Name: count, Length: 365, dtype: int64


In [20]:
#pip install scikit-learn

In [21]:
#pip install tensorflow

In [22]:
df = pd.read_csv('RelevantDataScrape.csv', low_memory = False)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")



Using device: cpu


In [23]:
features = df[[
    'score',
    'day_of_week_num',
    'month_num',
    'hour',
    'year',
    'time_since_post',
    'title_length_chars'
]].values

numerical_features = df[[
    'day_of_week_num',
    'month_num',
    'hour',
    'year',
    'time_since_post',
    'title_length_chars']].values
print(f"[DEBUG] Shape of numerical_features: {numerical_features.shape}")



[DEBUG] Shape of numerical_features: (5116181, 6)


In [24]:
# --- USERS (i.e. by) ---

if 'by' in df.columns:
    num_top_users = 100
    top_users = df['by'].value_counts().nlargest(num_top_users).index.tolist()
    print(f"[DEBUG] Identified top {len(top_users)} users for one-hot encoding.")

    # Create 'user_group' column with top users or 'OTHER'
    df['user_group'] = df['by'].apply(lambda x: x if x in top_users else 'OTHER')

    # One-hot encode
    user_one_hot_features = pd.get_dummies(df['user_group'], prefix='user')

    # Debug
    print(f"[DEBUG] User feature columns: {user_one_hot_features.columns.tolist()}")
    print(f"[DEBUG] Shape of user_one_hot_features: {user_one_hot_features.shape}")

    user_one_hot_features_array = user_one_hot_features.values
else:
    user_one_hot_features_array = np.empty((len(df), 0))
    print("[DEBUG] 'by' column not available, skipping user one-hot encoding.")



[DEBUG] Identified top 100 users for one-hot encoding.
[DEBUG] User feature columns: ['user_Anon84', 'user_BerislavLopac', 'user_Brajeshwar', 'user_CapitalistCartr', 'user_ColinWright', 'user_CrankyBear', 'user_DanielRibeiro', 'user_DiabloD3', 'user_DyslexicAtheist', 'user_Garbage', 'user_JumpCrisscross', 'user_LinuxBender', 'user_OTHER', 'user_PaulHoule', 'user_Tomte', 'user_aaronbrethorst', 'user_adamnemecek', 'user_adrian_mrd', 'user_amichail', 'user_anigbrowl', 'user_based2', 'user_belter', 'user_bookofjoe', 'user_bootload', 'user_bryanrasmussen', 'user_colinprince', 'user_coloneltcb', 'user_cwan', 'user_danso', 'user_dnetesn', 'user_doener', 'user_dsr12', 'user_edw519', 'user_edward', 'user_elorant', 'user_elsewhen', 'user_evo_9', 'user_fanf2', 'user_feross', 'user_fortran77', 'user_geox', 'user_ghosh', 'user_giuliomagnifico', 'user_gk1', 'user_gmays', 'user_happy-go-lucky', 'user_headalgorithm', 'user_hhs', 'user_howard941', 'user_iProject', 'user_iafrikan', 'user_ilamont', 'user

In [25]:
# --- DOMAINS ---

if 'domain_name' in df.columns:
    num_top_domain = 100
    top_domain = df['domain_name'].value_counts().nlargest(num_top_domain).index.tolist()
    print(f"[DEBUG] Identified top {len(top_domain)} domains for one-hot encoding.")

    # Create 'domain_group' column with top domains or 'OTHER'
    df['domain_group'] = df['domain_name'].apply(lambda x: x if x in top_domain else 'OTHER')

    # One-hot encode
    domain_one_hot_features = pd.get_dummies(df['domain_group'], prefix='domain')

    # Debug
    print(f"[DEBUG] Domain feature columns: {domain_one_hot_features.columns.tolist()}")
    print(f"[DEBUG] Shape of domain_one_hot_features: {domain_one_hot_features.shape}")

    domain_one_hot_features_array = domain_one_hot_features.values
else:
    domain_one_hot_features_array = np.empty((len(df), 0))
    print("[DEBUG] 'domain_name' column not available, skipping domain one-hot encoding.")


[DEBUG] Identified top 100 domains for one-hot encoding.
[DEBUG] Domain feature columns: ['domain_OTHER', 'domain_aeon.co', 'domain_apnews.com', 'domain_arstechnica.com', 'domain_arxiv.org', 'domain_aws.amazon.com', 'domain_bit.ly', 'domain_chrome.google.com', 'domain_dev.to', 'domain_docs.google.com', 'domain_edition.cnn.com', 'domain_en.wikipedia.org', 'domain_finance.yahoo.com', 'domain_fortune.com', 'domain_gigaom.com', 'domain_gist.github.com', 'domain_github.com', 'domain_gizmodo.com', 'domain_goo.gl', 'domain_hackaday.com', 'domain_hackernoon.com', 'domain_itunes.apple.com', 'domain_lwn.net', 'domain_mashable.com', 'domain_medium.com', 'domain_motherboard.vice.com', 'domain_nautil.us', 'domain_news.cnet.com', 'domain_news.ycombinator.com', 'domain_old.reddit.com', 'domain_online.wsj.com', 'domain_phys.org', 'domain_play.google.com', 'domain_qz.com', 'domain_spectrum.ieee.org', 'domain_stackoverflow.com', 'domain_techcrunch.com', 'domain_theconversation.com', 'domain_thenextweb.c

In [51]:
X = np.hstack((numerical_features, user_one_hot_features_array, domain_one_hot_features_array))
y = df[['score']].values.reshape(-1, 1)
print(f"[DEBUG] Shape of target variable (y): {y.shape}")

X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Convert numpy arrays to PyTorch tensors
X_train_tensor = torch.tensor(X_train_scaled, dtype=torch.float32).to(device)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32).to(device)
X_test_tensor = torch.tensor(X_test_scaled, dtype=torch.float32).to(device)
y_test_tensor = torch.tensor(y_test, dtype=torch.float32).to(device)

# Create DataLoader for batching
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

batch_size = 32
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)



[DEBUG] Shape of target variable (y): (5116181, 1)


In [52]:
# --- 5. Define the Neural Network Model in PyTorch ---
# Refactored to use nn.Sequential
import os
class HackerNewsPredictor(nn.Module):
    def __init__(self, input_dim):
        super(HackerNewsPredictor, self).__init__()
        self.network = nn.Sequential(
            nn.Linear(input_dim, 128),
            nn.ReLU(), # Activation function for the first hidden layer
            nn.Dropout(0.3),
            nn.Linear(128, 64),
            nn.ReLU(), # Activation function for the second hidden layer
            nn.Dropout(0.3),
            nn.Linear(64, 1) # Output layer with linear activation (default for nn.Linear)
        )

    def forward(self, x):
        return self.network(x)

# Instantiate the model
input_dim = X_train_scaled.shape[1]
model = HackerNewsPredictor(input_dim).to(device) # Move model to device (CPU/GPU)
if os.path.exists('best_model.pth'):
   model.load_state_dict(torch.load('best_model.pth'))  # Load the best model weights

print("\nPyTorch Neural Network Model Defined:")
print(model)



PyTorch Neural Network Model Defined:
HackerNewsPredictor(
  (network): Sequential(
    (0): Linear(in_features=208, out_features=128, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.3, inplace=False)
    (3): Linear(in_features=128, out_features=64, bias=True)
    (4): ReLU()
    (5): Dropout(p=0.3, inplace=False)
    (6): Linear(in_features=64, out_features=1, bias=True)
  )
)


In [53]:
# --- 6. Define Loss Function and Optimizer ---
criterion = nn.MSELoss() # Mean Squared Error Loss for regression
optimizer = optim.Adam(model.parameters(), lr=0.001) # Adam optimizer with learning rate

print("\nLoss function (MSE) and Optimizer (Adam) defined.")




Loss function (MSE) and Optimizer (Adam) defined.


In [54]:
# --- 7. Train the Model ---
num_epochs = 100 # Max epochs, EarlyStopping will manage it
patience = 10 # Number of epochs to wait for improvement
best_val_loss = float('inf')
epochs_no_improve = 0

print("\nStarting model training...")
for epoch in range(num_epochs):
    model.train() # Set model to training mode
    running_loss = 0.0
    for inputs, targets in train_loader:
        inputs, targets = inputs.to(device), targets.to(device) # Move data to device
        optimizer.zero_grad() # Zero the gradients
        outputs = model(inputs) # Forward pass
        loss = criterion(outputs, targets) # Calculate loss
        loss.backward() # Backward pass (compute gradients)
        optimizer.step() # Update weights
        running_loss += loss.item() * inputs.size(0)
    
    epoch_loss = running_loss / len(train_dataset)

    # Validation phase
    model.eval() # Set model to evaluation mode
    val_running_loss = 0.0
    val_predictions = []
    val_actuals = []
    with torch.no_grad(): # Disable gradient calculations during validation
        for inputs, targets in test_loader: # Using test_loader for simplicity in this example for validation
            inputs, targets = inputs.to(device), targets.to(device)
            outputs = model(inputs)
            val_loss = criterion(outputs, targets)
            val_running_loss += val_loss.item() * inputs.size(0)
            val_predictions.extend(outputs.cpu().numpy())
            val_actuals.extend(targets.cpu().numpy())
    
    val_epoch_loss = val_running_loss / len(test_dataset) # Use test_dataset for size calculation here

    print(f"Epoch [{epoch+1}/{num_epochs}], Train Loss: {epoch_loss:.4f}, Val Loss: {val_epoch_loss:.4f}")

    # Early Stopping check
    if val_epoch_loss < best_val_loss:
        best_val_loss = val_epoch_loss
        epochs_no_improve = 0
        torch.save(model.state_dict(), 'best_model.pth') # Save best model
    else:
        epochs_no_improve += 1
        if epochs_no_improve == patience:
            print(f"Early stopping at epoch {epoch+1} as validation loss did not improve for {patience} epochs.")
            break

print("Model training finished.")

torch.save(model.state_dict(), 'best_model.pth') # Save best model


Starting model training...
Epoch [1/100], Train Loss: 3335.4574, Val Loss: 3366.8234
Epoch [2/100], Train Loss: 3334.4240, Val Loss: 3367.4970
Epoch [3/100], Train Loss: 3334.3548, Val Loss: 3365.6197
Epoch [4/100], Train Loss: 3334.6003, Val Loss: 3366.7110
Epoch [5/100], Train Loss: 3335.8338, Val Loss: 3367.9162
Epoch [6/100], Train Loss: 3336.2692, Val Loss: 3367.5730
Epoch [7/100], Train Loss: 3337.1862, Val Loss: 3368.8800
Epoch [8/100], Train Loss: 3337.5078, Val Loss: 3367.5418
Epoch [9/100], Train Loss: 3336.9953, Val Loss: 3367.8629
Epoch [10/100], Train Loss: 3337.7119, Val Loss: 3367.3289
Epoch [11/100], Train Loss: 3336.8226, Val Loss: 3369.9222
Epoch [12/100], Train Loss: 3337.0052, Val Loss: 3365.9089
Epoch [13/100], Train Loss: 3336.8986, Val Loss: 3370.6719
Early stopping at epoch 13 as validation loss did not improve for 10 epochs.
Model training finished.


In [55]:
# --- 8. Evaluate the Model on the Test Set (or load best model and evaluate) ---
model.load_state_dict(torch.load('best_model.pth'))  # Load the best model weights
model.eval()  # Set to evaluation mode

all_predictions = []
all_actuals = []

with torch.no_grad():
    for inputs, targets in test_loader:
        inputs, targets = inputs.to(device), targets.to(device)
        outputs = model(inputs)
        all_predictions.append(outputs)
        all_actuals.append(targets)

# Concatenate tensors
all_predictions = torch.cat(all_predictions).flatten()
all_actuals = torch.cat(all_actuals).flatten()

# Compute metrics using PyTorch only
mse = torch.mean((all_predictions - all_actuals) ** 2).item()
mae = torch.mean(torch.abs(all_predictions - all_actuals)).item()

# R² calculation: 1 - SSR/SST
ss_res = torch.sum((all_predictions - all_actuals) ** 2)
ss_tot = torch.sum((all_actuals - torch.mean(all_actuals)) ** 2)
r2 = 1 - (ss_res / ss_tot).item()

print(f"\nModel Evaluation on Test Set:")
print(f"Mean Squared Error (MSE): {mse:.2f}")
print(f"Mean Absolute Error (MAE): {mae:.2f}")
print(f"R-squared (R²): {r2:.2f}")



Model Evaluation on Test Set:
Mean Squared Error (MSE): 3370.67
Mean Absolute Error (MAE): 20.79
R-squared (R²): 0.01


In [56]:
# --- 9. Make Predictions (Optional) ---
# Prepare a sample for prediction (e.g., the first sample from the test set)
sample_index = 0
sample_input_scaled = torch.tensor(X_test_scaled[sample_index:sample_index+1], dtype=torch.float32).to(device)

model.eval()
with torch.no_grad():
    predicted_upvotes_tensor = model(sample_input_scaled)
    predicted_upvotes = predicted_upvotes_tensor.item()  # Convert to scalar

# Actual value (assuming y_test is a PyTorch tensor or converted here)
actual_upvotes = torch.tensor(y_test[sample_index][0]).item()

print(f"\nPrediction for sample {sample_index}:")
print(f"Actual Upvotes: {actual_upvotes}")
print(f"Predicted Upvotes: {predicted_upvotes:.2f}")


Prediction for sample 0:
Actual Upvotes: 32
Predicted Upvotes: 13.43
