In [1]:
import gzip
from collections import defaultdict

def readJSON(path):
  for l in gzip.open(path, 'rt'):
    d = eval(l)
    u = d['userID']
    try:
      g = d['gameID']
    except Exception as e:
      g = None
    yield u,g,d

train_json = '/home/scotty/dsc_256/fall_25/make_up/assignment1/train.json.gz'
pairs_hours = '/home/scotty/dsc_256/fall_25/make_up/assignment1/pairs_Hours.csv'
pairs_played = '/home/scotty/dsc_256/fall_25/make_up/assignment1/pairs_Played.csv'

# create containers for users and games
user_dict = defaultdict(list)
game_dict = defaultdict(list)
train_data = []

# read train.json.gz and populate user_dict and game_dict
for u,g,d in readJSON(train_json):
    user_dict[u].append(g)
    game_dict[g].append(u)
    train_data.append(d)

In [None]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.linear_model import Ridge
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import mean_squared_error
from scipy.sparse import hstack, csr_matrix
from surprise import SVD, Dataset, Reader

# --- 1. Setup ---
df = pd.DataFrame(train_data)
df['hours_transformed'] = np.log2(df['hours'] + 1)

# --- 2. Ridge Feature (The "Bias" Expert) ---
print("Generating Ridge (Bias) Features...")
ohe = OneHotEncoder(handle_unknown='ignore')
sparse_ids = ohe.fit_transform(df[['userID', 'gameID']])

tfidf = TfidfVectorizer(max_features=3000, stop_words='english')
sparse_text = tfidf.fit_transform(df['text'].fillna(''))

X_sparse = hstack([sparse_ids, sparse_text])
y_target = df['hours_transformed'].values

# OOF Ridge
ridge_preds = np.zeros(len(df))
kf = KFold(n_splits=5, shuffle=True, random_state=42)

for train_idx, val_idx in kf.split(X_sparse):
    # Lower alpha to 0.1 to reduce underfitting
    model = Ridge(alpha=0.1, solver='sag', random_state=42)
    model.fit(X_sparse[train_idx], y_target[train_idx])
    ridge_preds[val_idx] = model.predict(X_sparse[val_idx])

df['ridge_score'] = ridge_preds

# --- 3. SVD Feature (The "Interaction" Expert) ---
print("Generating SVD (Interaction) Features...")
# We use Surprise here because it handles explicit rating interactions naturally
svd_preds = np.zeros(len(df))
reader = Reader(rating_scale=(0, df['hours_transformed'].max()))

for train_idx, val_idx in kf.split(df):
    fold_train = df.iloc[train_idx]
    fold_val = df.iloc[val_idx]
    
    data_train = Dataset.load_from_df(fold_train[['userID', 'gameID', 'hours_transformed']], reader)
    trainset = data_train.build_full_trainset()
    
    # Standard SVD params
    algo = SVD(n_factors=20, n_epochs=20, lr_all=0.005, reg_all=0.02)
    algo.fit(trainset)
    
    svd_preds[val_idx] = [algo.predict(row['userID'], row['gameID']).est for _, row in fold_val.iterrows()]

df['svd_score'] = svd_preds

# --- 4. The Final Stack (XGBoost) ---
print("Training Ensemble...")

features = ['ridge_score', 'svd_score'] # The two experts
X = df[features]
y = df['hours_transformed']

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# We let XGBoost decide how much to trust Bias (Ridge) vs Interaction (SVD)
xgb_model = xgb.XGBRegressor(
    n_estimators=100,
    learning_rate=0.05,
    max_depth=4,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='reg:squarederror',
    random_state=42
)

xgb_model.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=False)

train_p = xgb_model.predict(X_train)
val_p = xgb_model.predict(X_val)

print(f"\nFinal Train MSE: {mean_squared_error(y_train, train_p):.4f}")
print(f"Final Val MSE:   {mean_squared_error(y_val, val_p):.4f}")

# --- 5. Generate Test Submission ---
print("Generating Test Predictions...")
pairs_hours = pd.read_csv('pairs_Hours.csv')

# A. Final Ridge Model (Full Data)
final_ridge = Ridge(alpha=0.1, solver='sag', random_state=42)
final_ridge.fit(X_sparse, y_target)

# B. Final SVD Model (Full Data)
full_data = Dataset.load_from_df(df[['userID', 'gameID', 'hours_transformed']], reader)
full_trainset = full_data.build_full_trainset()
final_svd = SVD(n_factors=20, n_epochs=20, lr_all=0.005, reg_all=0.02)
final_svd.fit(full_trainset)

# C. Predict Test Features
# Ridge Prep
test_ids = ohe.transform(pairs_hours[['userID', 'gameID']])
test_text = csr_matrix((len(pairs_hours), 3000)) # Empty text for test
X_test_sparse = hstack([test_ids, test_text])
pairs_hours['ridge_score'] = final_ridge.predict(X_test_sparse)

# SVD Prep
pairs_hours['svd_score'] = [final_svd.predict(u, g).est for u, g in zip(pairs_hours['userID'], pairs_hours['gameID'])]

# XGBoost Final Predict
final_preds = xgb_model.predict(pairs_hours[features])

# Clip
min_rating = 0
max_rating = df['hours_transformed'].max()
pairs_hours['prediction'] = [max(min_rating, min(max_rating, p)) for p in final_preds]

print(pairs_hours.head())
# pairs_hours.to_csv('predictions_Hours_Ensemble.csv', index=False, columns=['userID', 'gameID', 'prediction'])

In [27]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import KFold, train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from scipy.sparse import hstack, csr_matrix
import gc 

# --- 1. Setup & Data Loading ---
df = pd.DataFrame(train_data)
df['hours_transformed'] = np.log2(df['hours'] + 1)

# --- 2. Sparse Feature Engineering (The Scikit-Learn Way) ---
print("Constructing Sparse Features...")

# A. One-Hot Encoding Users and Items
# handle_unknown='ignore' is CRITICAL. It ensures that if a user appears 
# in the test set but not the train set, their row is all 0s (Global Average).
ohe = OneHotEncoder(handle_unknown='ignore')
# We fit on the full dataset ID space to establish the vocabulary
ohe.fit(df[['userID', 'gameID']])
sparse_ids = ohe.transform(df[['userID', 'gameID']])

# B. Text Features (TF-IDF)
tfidf = TfidfVectorizer(max_features=3000, stop_words='english')
sparse_text = tfidf.fit_transform(df['text'].fillna(''))

# C. Combine into one massive sparse matrix
# This matrix has: [User_1...User_N, Game_1...Game_M, Word_1...Word_K]
X_sparse = hstack([sparse_ids, sparse_text])
y_target = df['hours_transformed'].values

print(f"Sparse Matrix Shape: {X_sparse.shape}")

# --- 3. Out-of-Fold (OOF) Ridge Predictions ---
print("Generating OOF Ridge Features...")
# We replace SVD with Ridge Regression. 
# Ridge is essentially solving: y = b_u + b_i + w_text
ridge_preds = np.zeros(len(df))

kf = KFold(n_splits=5, shuffle=True, random_state=42)

for fold_i, (train_idx, val_idx) in enumerate(kf.split(X_sparse)):
    print(f"  Processing Fold {fold_i + 1}/5...")
    
    # Slice the sparse matrix
    X_fold_train = X_sparse[train_idx]
    y_fold_train = y_target[train_idx]
    
    X_fold_val = X_sparse[val_idx]
    
    # Train Ridge (Alpha controls regularization, similar to 'reg_all' in SVD)
    # solver='sag' is fast for large sparse matrices
    model = Ridge(alpha=1.0, solver='sag', random_state=42)
    model.fit(X_fold_train, y_fold_train)
    
    # Predict
    ridge_preds[val_idx] = model.predict(X_fold_val)

# Add the OOF prediction to the dataframe
df['ridge_feature'] = ridge_preds

# --- 4. Train Final Ridge (For Test Set) ---
print("Training Final Ridge on full dataset...")
final_ridge = Ridge(alpha=1.0, solver='sag', random_state=42)
final_ridge.fit(X_sparse, y_target)

# --- 5. Stacking with XGBoost ---
print("Preparing XGBoost Data...")

# We no longer need separate "Bias" features (u_bias, i_bias) because 
# Ridge Regression ALREADY captured them perfectly in 'ridge_feature'.
# XGBoost just needs to correct the Ridge residuals.

def build_stacking_features(dataframe, ridge_scores):
    return pd.DataFrame({
        'ridge_score': ridge_scores,
        'text_len': dataframe['text'].str.len().fillna(0)
        # We can add other metadata here if available
    })

# Split for XGBoost Training
train_idx, val_idx = train_test_split(range(len(df)), test_size=0.2, random_state=42)

X_stack_train = build_stacking_features(df.iloc[train_idx], df.iloc[train_idx]['ridge_feature'])
y_stack_train = df.iloc[train_idx]['hours_transformed']

X_stack_val = build_stacking_features(df.iloc[val_idx], df.iloc[val_idx]['ridge_feature'])
y_stack_val = df.iloc[val_idx]['hours_transformed']

print("Training XGBoost Stack...")
xgb_model = xgb.XGBRegressor(
    n_estimators=100,      # Ridge does the heavy lifting, XGB just polishes
    learning_rate=0.05,
    max_depth=4,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='reg:squarederror',
    random_state=42
)

xgb_model.fit(X_stack_train, y_stack_train, eval_set=[(X_stack_val, y_stack_val)], verbose=False)

train_p = xgb_model.predict(X_stack_train)
val_p = xgb_model.predict(X_stack_val)

print(f"\nFinal Train MSE: {mean_squared_error(y_stack_train, train_p):.4f}")
print(f"Final Val MSE:   {mean_squared_error(y_stack_val, val_p):.4f}")

# --- 6. Prediction on Test Set ---
pairs_hours = pd.read_csv('pairs_Hours.csv')
print("Generating predictions for Test Set...")

# A. Prepare Test Features for Ridge
# We must apply the SAME OneHotEncoder and Tfidf transform
# Note: pairs_Hours likely lacks 'text', so we pass empty strings for TFIDF part
# Or, better yet, we just rely on IDs.
# Limitation: If pairs_Hours doesn't have review text, we assume empty string.
test_ids = ohe.transform(pairs_hours[['userID', 'gameID']])
# Create dummy text matrix (all zeros) for the test set since we don't have reviews at test time
test_text = csr_matrix((len(pairs_hours), 3000), dtype=np.float64) 

X_test_sparse = hstack([test_ids, test_text])

# B. Predict Ridge
test_ridge_scores = final_ridge.predict(X_test_sparse)

# C. Predict XGBoost
X_test_stack = pd.DataFrame({
    'ridge_score': test_ridge_scores,
    'text_len': [0] * len(pairs_hours) # No text len at test time
})

raw_predictions = xgb_model.predict(X_test_stack)

# D. Clip and Save
min_rating = 0
max_rating = df['hours_transformed'].max()
pairs_hours['prediction'] = [max(min_rating, min(max_rating, p)) for p in raw_predictions]

print(pairs_hours.head())
# pairs_hours.to_csv('predictions_Hours_RidgeStack.csv', index=False, columns=['userID', 'gameID', 'prediction'])

Constructing Sparse Features...
Sparse Matrix Shape: (175000, 12147)
Generating OOF Ridge Features...
  Processing Fold 1/5...
  Processing Fold 2/5...
  Processing Fold 3/5...
  Processing Fold 4/5...
  Processing Fold 5/5...
Training Final Ridge on full dataset...
Preparing XGBoost Data...
Training XGBoost Stack...

Final Train MSE: 2.8259
Final Val MSE:   2.8601
Generating predictions for Test Set...
      userID     gameID  prediction
0  u04763917  g51093074    3.253604
1  u10668484  g42523222    1.368647
2  u82502949  g39422502    4.491645
3  u14336188  g83517324    2.587467
4  u10096161  g10962300    2.587467


In [26]:
import pandas as pd
import numpy as np
import xgboost as xgb
from surprise import SVD, Dataset, Reader
from sklearn.model_selection import KFold, train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import mean_squared_error
from collections import defaultdict

# --- 1. Setup & Data Loading ---
# (Assuming 'train_data' is your list of dictionaries loaded from json)
df = pd.DataFrame(train_data)
df['hours_transformed'] = np.log2(df['hours'] + 1)

# --- 2. NLP Pipeline (Semantic Profiles) ---
print("Vectorizing Text...")
tfidf = TfidfVectorizer(max_features=5000, stop_words='english')
tfidf_matrix = tfidf.fit_transform(df['text'].fillna(''))

# Reduce dimensions
svd_text = TruncatedSVD(n_components=15, random_state=42)
text_latents = svd_text.fit_transform(tfidf_matrix)

# Map Latents to User/Item Profiles
user_text_sum = defaultdict(lambda: np.zeros(15))
user_text_cnt = defaultdict(int)
item_text_sum = defaultdict(lambda: np.zeros(15))
item_text_cnt = defaultdict(int)

for idx, (u, i) in enumerate(zip(df['userID'], df['gameID'])):
    vec = text_latents[idx]
    user_text_sum[u] += vec
    user_text_cnt[u] += 1
    item_text_sum[i] += vec
    item_text_cnt[i] += 1

# Average the vectors
user_profile = {u: user_text_sum[u] / c for u, c in user_text_cnt.items()}
item_profile = {i: item_text_sum[i] / c for i, c in item_text_cnt.items()}

def get_semantic_affinity(uid, iid):
    # Dot product of User Profile and Item Profile
    if uid not in user_profile or iid not in item_profile: return 0.0
    return np.dot(user_profile[uid], item_profile[iid])

# --- 3. Out-of-Fold (OOF) SVD Feature Generation ---
print("Generating OOF SVD Features...")
df['svd_feature'] = 0.0 # Placeholder

kf = KFold(n_splits=5, shuffle=True, random_state=42)
reader = Reader(rating_scale=(0, df['hours_transformed'].max()))

for fold_i, (train_idx, val_idx) in enumerate(kf.split(df)):
    print(f"  Processing Fold {fold_i + 1}/5...")
    
    fold_train = df.iloc[train_idx]
    fold_val = df.iloc[val_idx]
    
    # Train SVD only on this fold's training data
    data_train = Dataset.load_from_df(fold_train[['userID', 'gameID', 'hours_transformed']], reader)
    trainset = data_train.build_full_trainset()
    
    model = SVD(n_factors=20, n_epochs=20, lr_all=0.005, reg_all=0.05)
    model.fit(trainset)
    
    # Predict on the hold-out fold
    preds = [model.predict(row['userID'], row['gameID']).est for _, row in fold_val.iterrows()]
    
    # Update the main dataframe with these "clean" predictions
    df.loc[val_idx, 'svd_feature'] = preds

# --- 4. Train Final SVD (For Test Set Only) ---
print("Training Final SVD on full dataset...")
full_data = Dataset.load_from_df(df[['userID', 'gameID', 'hours_transformed']], reader)
full_trainset = full_data.build_full_trainset()
final_svd = SVD(n_factors=20, n_epochs=20, lr_all=0.005, reg_all=0.05)
final_svd.fit(full_trainset)

# --- 5. Feature Assembly & XGBoost Training ---
print("Preparing XGBoost Data...")

# Pre-compute Global/User/Item Averages
global_mean = df['hours_transformed'].mean()
user_means = df.groupby('userID')['hours_transformed'].mean().to_dict()
item_means = df.groupby('gameID')['hours_transformed'].mean().to_dict()

def build_features(dataframe, mode='internal'):
    """
    mode='internal': Use the pre-calculated OOF 'svd_feature' column. 
                     (For Train and Validation sets)
    mode='external': Use 'final_svd' to predict. 
                     (For the Test set / Submission)
    """
    
    # 1. Semantic Affinity
    text_scores = [get_semantic_affinity(u, i) for u, i in zip(dataframe['userID'], dataframe['gameID'])]
    
    # 2. SVD Score (The Critical Logic Fix)
    if mode == 'internal':
        # Use the column we generated in Step 3
        svd_scores = dataframe['svd_feature'].values
    else:
        # Use the model trained in Step 4
        svd_scores = [final_svd.predict(u, i).est for u, i in zip(dataframe['userID'], dataframe['gameID'])]

    # 3. Bias Features
    u_bias = [user_means.get(u, global_mean) for u in dataframe['userID']]
    i_bias = [item_means.get(i, global_mean) for i in dataframe['gameID']]

    return pd.DataFrame({
        'svd_rating': svd_scores,
        'semantic_affinity': text_scores,
        'user_avg': u_bias,
        'item_avg': i_bias
    })

# Split the dataframe that ALREADY has OOF features
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

# Both use mode='internal' because they come from the training file
X_train = build_features(train_df, mode='internal')
y_train = train_df['hours_transformed']

X_val = build_features(val_df, mode='internal')
y_val = val_df['hours_transformed']

print("Training XGBoost...")
xgb_model = xgb.XGBRegressor(
    n_estimators=200,
    learning_rate=0.02,
    max_depth=4,
    subsample=0.7,
    colsample_bytree=0.7,
    reg_lambda=1.0,
    objective='reg:squarederror',
    random_state=42,
    n_jobs=-1
)

xgb_model.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=False)

train_preds = xgb_model.predict(X_train)
val_preds = xgb_model.predict(X_val)

print(f"\nCorrected Train MSE: {mean_squared_error(y_train, train_preds):.4f}")
print(f"Corrected Val MSE:   {mean_squared_error(y_val, val_preds):.4f}")

# --- 6. Prediction on Test Set ---
pairs_hours = pd.read_csv('pairs_Hours.csv')

print("Generating predictions for Test Set...")

# We use mode='external' here because these pairs are new
X_test = build_features(pairs_hours, mode='external')

raw_predictions = xgb_model.predict(X_test)

# Safeguards
min_rating = 0
max_rating = df['hours_transformed'].max()
pairs_hours['prediction'] = [max(min_rating, min(max_rating, p)) for p in raw_predictions]

print(pairs_hours.head())
pairs_hours.to_csv('predictions_Hours.csv', index=False, columns=['userID', 'gameID', 'prediction'])

Vectorizing Text...
Generating OOF SVD Features...
  Processing Fold 1/5...
  Processing Fold 2/5...
  Processing Fold 3/5...
  Processing Fold 4/5...
  Processing Fold 5/5...
Training Final SVD on full dataset...
Preparing XGBoost Data...
Training XGBoost...

Corrected Train MSE: 2.8833
Corrected Val MSE:   2.8970
Generating predictions for Test Set...
      userID     gameID  prediction
0  u04763917  g51093074    3.799462
1  u10668484  g42523222    1.337331
2  u82502949  g39422502    4.715211
3  u14336188  g83517324    3.223988
4  u10096161  g10962300    3.296594


In [17]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import itertools

# --- 1. Data Prep & Monotonic Chain ---
# Assuming 'train_data' is a list of dicts loaded from json
if 'train_data' not in locals():
    # Placeholder for the clinic context; ensure this is loaded!
    train_data = [] 

df = pd.DataFrame(train_data)

# Transformations
df['hours_transformed'] = np.log2(df['hours'] + 1)
df['text_len_norm'] = df['text'].str.len().fillna(0)
df['text_len_norm'] = df['text_len_norm'] / df['text_len_norm'].max()

# Mappings (Handling Unknowns for later)
# We reserve index 0 for "Unknown" users/items
unique_users = df['userID'].unique()
unique_items = df['gameID'].unique()

user_map = {u: i+1 for i, u in enumerate(unique_users)} # Start at 1
item_map = {i: x+1 for x, i in enumerate(unique_items)} # Start at 1

df['user_idx'] = df['userID'].map(user_map)
df['item_idx'] = df['gameID'].map(item_map)

num_users = len(user_map) + 1 # +1 for the padding/unknown index
num_items = len(item_map) + 1

# Thresholds
hours_median = df['hours_transformed'].median()
hours_75th = df['hours_transformed'].quantile(0.75)
text_threshold = df['text_len_norm'].mean()

def create_monotonic_chain(row):
    # Stage 1: Played (Always 1 for training data)
    y1 = 1
    # Stage 2: Engaged (Above Median)
    y2 = 1 if row['hours_transformed'] > hours_median else 0
    # Stage 3: Deeply Engaged (Above 75th)
    y3 = 1 if row['hours_transformed'] > hours_75th else 0
    # Stage 4: Reviewed (Has significant text)
    y4 = 1 if row['text_len_norm'] > text_threshold else 0
    
    # Enforce Monotonicity Constraints
    if y4 == 1: y3 = 1
    if y3 == 1: y2 = 1
    
    return [y1, y2, y3, y4]

# Create Edge Indices (The "Target" class for the network)
# If chain is [1, 1, 0, 0], sum is 2. The edge is stage 2.
df['chain_labels'] = df.apply(create_monotonic_chain, axis=1)
df['edge_index'] = df['chain_labels'].apply(sum)

# Split
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df['edge_index'])

# Tensors
train_u = torch.LongTensor(train_df['user_idx'].values)
train_i = torch.LongTensor(train_df['item_idx'].values)
train_edge = torch.LongTensor(train_df['edge_index'].values)

val_u = torch.LongTensor(val_df['user_idx'].values)
val_i = torch.LongTensor(val_df['item_idx'].values)
val_edge = torch.LongTensor(val_df['edge_index'].values)

# --- 2. The ChainRec Model ---
class ChainRec(nn.Module):
    def __init__(self, num_users, num_items, num_stages=4, embed_dim=16):
        super().__init__()
        self.num_stages = num_stages
        
        # Embeddings
        self.user_emb = nn.Embedding(num_users, embed_dim, padding_idx=0)
        self.item_emb = nn.Embedding(num_items, embed_dim, padding_idx=0)
        
        # Stage-specific projections
        # We project the interaction vector into 'num_stages' outputs
        self.stage_proj = nn.Linear(embed_dim, num_stages)
        
        # Biases
        self.user_bias = nn.Embedding(num_users, 1, padding_idx=0)
        self.item_bias = nn.Embedding(num_items, 1, padding_idx=0)
        self.global_bias = nn.Parameter(torch.zeros(1))
        
        # Init
        nn.init.xavier_uniform_(self.user_emb.weight)
        nn.init.xavier_uniform_(self.item_emb.weight)

    def forward(self, u_idx, i_idx):
        u = self.user_emb(u_idx)
        i = self.item_emb(i_idx)
        
        # Element-wise product interaction
        interaction = u * i 
        
        # Calculate raw deltas for each stage
        # Shape: [batch_size, num_stages]
        deltas = self.stage_proj(interaction)
        
        # Softplus to ensure positive contributions (Monotonicity constraint)
        deltas_plus = F.softplus(deltas)
        
        # Cumulative sum implies: If you passed stage 4, you effectively passed 1, 2, and 3.
        # We flip, cumsum, and flip back so stage 1 accumulates all subsequent probabilities
        scores = torch.flip(torch.cumsum(torch.flip(deltas_plus, [1]), dim=1), [1])
        
        # Add biases
        b = self.global_bias + self.user_bias(u_idx).squeeze() + self.item_bias(i_idx).squeeze()
        
        # Broadcast bias across stages
        return scores + b.unsqueeze(1)

def edge_loss(model, u_idx, i_idx_pos, edge_targets, i_idx_neg):
    """
    Maximizes the likelihood of the specific 'edge' (highest stage reached)
    while minimizing likelihood of negative samples.
    """
    # Positive Scores
    pos_scores = model(u_idx, i_idx_pos)
    
    # We want to select the score corresponding to the actual edge reached
    # edge_targets range from 1 to 4. We map to indices 0 to 3.
    # Note: If edge is 0 (shouldn't happen in train), we clamp.
    gather_indices = (edge_targets - 1).clamp(min=0).unsqueeze(1)
    s_edge_pos = pos_scores.gather(1, gather_indices).squeeze()
    
    # Negative Scores (Standard BPR approach)
    # We only care about Stage 1 for negatives (Did they play it at all?)
    neg_scores = model(u_idx, i_idx_neg)
    s_neg = neg_scores[:, 0] 
    
    # BPR-like Log Sigmoid Loss
    loss = -torch.mean(F.logsigmoid(s_edge_pos - s_neg))
    return loss

# --- 3. Training Loop ---
print("Training ChainRec...")
# Fixed params for demonstration (Use your Grid Search values here)
embed_dim = 16
batch_size = 1024
epochs = 5
lr = 0.01

model = ChainRec(num_users, num_items, num_stages=4, embed_dim=embed_dim)
optimizer = optim.Adam(model.parameters(), lr=lr)

dataset = TensorDataset(train_u, train_i, train_edge)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

for epoch in range(epochs):
    model.train()
    total_loss = 0
    for b_u, b_i, b_edge in dataloader:
        optimizer.zero_grad()
        # Random Negative Sampling
        b_neg_i = torch.randint(1, num_items, (len(b_u),))
        
        loss = edge_loss(model, b_u, b_i, b_edge, b_neg_i)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        
    print(f"Epoch {epoch+1}: Loss {total_loss:.4f}")

# --- 4. Final Prediction Strategy ---
print("Generating Predictions...")
model.eval()

# A. Predict PLAYED (Classification)
pairs_played = pd.read_csv('pairs_Played.csv')
# Map Users/Items (Handle unknowns by mapping to 0)
pairs_played['u_idx'] = pairs_played['userID'].map(lambda x: user_map.get(x, 0))
pairs_played['i_idx'] = pairs_played['gameID'].map(lambda x: item_map.get(x, 0))

u_test = torch.LongTensor(pairs_played['u_idx'].values)
i_test = torch.LongTensor(pairs_played['i_idx'].values)

with torch.no_grad():
    scores = model(u_test, i_test)
    # We only care about Stage 1 (Played) probability
    probs = torch.sigmoid(scores[:, 0]).numpy()

# THE RANKING HACK (Crucial for balanced test sets)
pairs_played['raw_score'] = probs
median_score = pairs_played['raw_score'].median()
pairs_played['prediction'] = (pairs_played['raw_score'] > median_score).astype(int)

print(pairs_played.head())
pairs_played.to_csv('predictions_Played.csv', columns=['userID', 'gameID', 'prediction'], index=False)


Training ChainRec...
Epoch 1: Loss 149.3796
Epoch 2: Loss 81.9664
Epoch 3: Loss 63.4963
Epoch 4: Loss 55.9711
Epoch 5: Loss 51.1110
Generating Predictions...
      userID     gameID  prediction  u_idx  i_idx  raw_score
0  u04836696  g41031307           1    264    322   0.986283
1  u32377855  g62450068           0    250   2131   0.968902
2  u58289072  g71021765           1   2659   1924   0.998017
3  u74685029  g26732871           1   1459    420   0.999361
4  u06266052  g69433247           1   3995   1923   0.993517


In [20]:
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix, classification_report
import matplotlib.pyplot as plt
import seaborn as sns

def evaluate_classification(model, val_df, user_map, item_map, num_items):
    print("Building Balanced Validation Set...")
    
    # 1. Positives (The real data)
    # We take the validation split we already made
    pos_u = [user_map.get(u, 0) for u in val_df['userID']]
    pos_i = [item_map.get(i, 0) for i in val_df['gameID']]
    labels = [1] * len(pos_u)
    
    # 2. Negatives (The fake data)
    # We generate an equal number of random negative samples
    neg_u = pos_u # Same users
    neg_i = []
    
    # Simple negative sampling: pick a random item. 
    # In a perfect world, we verify the user hasn't actually played it, 
    # but for fast validation, random selection is usually sufficient.
    for _ in range(len(pos_u)):
        neg_i.append(np.random.randint(1, num_items))
        
    labels.extend([0] * len(neg_u))
    
    # Combine
    all_u = torch.LongTensor(pos_u + neg_u)
    all_i = torch.LongTensor(pos_i + neg_i)
    y_true = np.array(labels)
    
    # 3. Predict
    model.eval()
    with torch.no_grad():
        scores = model(all_u, all_i)
        # Get probability of Stage 1 (Played)
        y_probs = torch.sigmoid(scores[:, 0]).numpy()
        
    # 4. Metrics
    print("\n--- Evaluation Metrics ---")
    
    # AUC (Area Under Curve) - The best metric for ranking ability
    # 0.5 = Random Guessing, 1.0 = Perfect
    auc = roc_auc_score(y_true, y_probs)
    print(f"ROC AUC Score: {auc:.4f}")
    
    # Accuracy (at 0.5 threshold)
    y_pred = (y_probs > 0.5).astype(int)
    acc = accuracy_score(y_true, y_pred)
    print(f"Accuracy (Raw Threshold): {acc:.4f}")
    
    # Confusion Matrix
    cm = confusion_matrix(y_true, y_pred)
    print("\nConfusion Matrix:")
    print(f"True Negatives: {cm[0][0]} | False Positives: {cm[0][1]}")
    print(f"False Negatives: {cm[1][0]} | True Positives: {cm[1][1]}")
    
    return y_true, y_probs

# Run the evaluation
y_true, y_probs = evaluate_classification(model, val_df, user_map, item_map, num_items)

import numpy as np
from sklearn.metrics import accuracy_score, confusion_matrix

print("\n--- Re-evaluating with Median Threshold ---")

# 1. Find the median score of your predictions
# This effectively forces the model to predict exactly 50% "Played" and 50% "Not Played"
median_thresh = np.median(y_probs)
print(f"New Threshold (Median): {median_thresh:.4f}")

# 2. Apply the new threshold
y_pred_calibrated = (y_probs > median_thresh).astype(int)

# 3. Recalculate Accuracy
new_acc = accuracy_score(y_true, y_pred_calibrated)
print(f"Calibrated Accuracy: {new_acc:.4f}")

# 4. New Confusion Matrix
cm = confusion_matrix(y_true, y_pred_calibrated)
print("\nNew Confusion Matrix:")
print(f"True Negatives: {cm[0][0]} | False Positives: {cm[0][1]}")
print(f"False Negatives: {cm[1][0]} | True Positives: {cm[1][1]}")

Building Balanced Validation Set...

--- Evaluation Metrics ---
ROC AUC Score: 0.6977
Accuracy (Raw Threshold): 0.5097

Confusion Matrix:
True Negatives: 885 | False Positives: 34115
False Negatives: 208 | True Positives: 34792

--- Re-evaluating with Median Threshold ---
New Threshold (Median): 0.9856
Calibrated Accuracy: 0.6465

New Confusion Matrix:
True Negatives: 22629 | False Positives: 12371
False Negatives: 12371 | True Positives: 22629


In [21]:
import pandas as pd
import numpy as np
import xgboost as xgb
from surprise import SVD, Dataset, Reader
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score
import random

# --- 1. Load and Prep Data ---
# Assuming train_data is loaded
df = pd.DataFrame(train_data)
df['hours_transformed'] = np.log2(df['hours'] + 1)

# A. Calculate Statistics (The "Heuristic" Features)
item_popularity = df['gameID'].value_counts().to_dict()
user_activity = df['userID'].value_counts().to_dict()
global_pop_median = df['gameID'].value_counts().median()
global_act_median = df['userID'].value_counts().median()

# --- 2. Train SVD (Latent Feature Extractor) ---
# We use SVD not to predict the final output, but to generate a "Compatibility Score"
print("Training SVD for Feature Extraction...")
reader = Reader(rating_scale=(0, df['hours_transformed'].max()))
data = Dataset.load_from_df(df[['userID', 'gameID', 'hours_transformed']], reader)
trainset = data.build_full_trainset()

# We use fewer factors to capture broad strokes, not noise
svd = SVD(n_factors=10, n_epochs=10, lr_all=0.005, reg_all=0.02)
svd.fit(trainset)

def get_svd_score(uid, iid):
    return svd.predict(uid, iid).est

# --- 3. Build the Classification Dataset (Positives + Negatives) ---
print("Constructing Negative Samples...")

# Positives: The actual interactions
pos_df = df[['userID', 'gameID']].copy()
pos_df['label'] = 1

# Negatives: We need to generate roughly the same amount of 0s
all_games = list(item_popularity.keys())
n_negatives = len(pos_df)

# Optimized Negative Sampling
# We randomly sample users from the existing data, and assign them random games
neg_users = df['userID'].sample(n_negatives, replace=True).values
neg_games = random.choices(all_games, k=n_negatives)

neg_df = pd.DataFrame({'userID': neg_users, 'gameID': neg_games})
neg_df['label'] = 0

# Concatenate and Shuffle
train_class_df = pd.concat([pos_df, neg_df]).sample(frac=1.0, random_state=42)

# --- 4. Feature Engineering ---
print("Building Features...")

def extract_features(dataframe):
    # 1. Compatibility (SVD Score)
    # This captures: "Does this user usually like this KIND of game?"
    dataframe['svd_score'] = [get_svd_score(u, i) for u, i in zip(dataframe['userID'], dataframe['gameID'])]
    
    # 2. Popularity (Global Bias)
    # This captures: "Is this game just generally popular?"
    dataframe['item_pop'] = dataframe['gameID'].map(item_popularity).fillna(0)
    
    # 3. Activity (User Bias)
    # This captures: "Does this user click 'play' on everything?"
    dataframe['user_act'] = dataframe['userID'].map(user_activity).fillna(0)
    
    return dataframe[['svd_score', 'item_pop', 'user_act']]

X = extract_features(train_class_df)
y = train_class_df['label']

# --- 5. Train XGBoost Classifier ---
print("Training XGBoost Classifier...")
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Note: We use XGBClassifier now, not Regressor
clf = xgb.XGBClassifier(
    n_estimators=100,
    max_depth=4,         # Keep it shallow to prevent overfitting
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    use_label_encoder=False,
    eval_metric='logloss',
    random_state=42
)

clf.fit(X_train, y_train)

# --- 6. Validate ---
val_probs = clf.predict_proba(X_val)[:, 1] # Probability of Class 1
roc = roc_auc_score(y_val, val_probs)
print(f"\nValidation ROC AUC: {roc:.4f}")

# Check Accuracy at 0.5 threshold
val_preds = (val_probs > 0.5).astype(int)
acc = accuracy_score(y_val, val_preds)
print(f"Validation Accuracy: {acc:.4f}")

# --- 7. Prediction on Test Set ---
print("\nGenerating Predictions for pairs_Played.csv...")
pairs_played = pd.read_csv('pairs_Played.csv')

# Generate features for test data
X_test = extract_features(pairs_played)

# Predict Probabilities
test_probs = clf.predict_proba(X_test)[:, 1]

# Apply the Median Ranking Strategy
# Since we know the test set is 50/50, we take the top 50% as played
threshold = np.median(test_probs)
pairs_played['prediction'] = (test_probs > threshold).astype(int)

print(pairs_played.head())
print(f"Test Set Threshold used: {threshold:.4f}")

# Save
# pairs_played.to_csv('predictions_Played_XGB.csv', index=False, columns=['userID', 'gameID', 'prediction'])

Training SVD for Feature Extraction...
Constructing Negative Samples...
Building Features...
Training XGBoost Classifier...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



Validation ROC AUC: 0.7646
Validation Accuracy: 0.7019

Generating Predictions for pairs_Played.csv...
      userID     gameID  prediction  svd_score  item_pop  user_act
0  u04836696  g41031307           1   5.138286       144      59.0
1  u32377855  g62450068           0   2.755522        22     173.0
2  u58289072  g71021765           1   5.127018        81      17.0
3  u74685029  g26732871           1   3.842400       422      26.0
4  u06266052  g69433247           0   4.001235        35      48.0
Test Set Threshold used: 0.4673


In [23]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score
import random

# --- 1. Load Data ---
df = pd.DataFrame(train_data)

# --- 2. Create Implicit Features (The Fix) ---
print("Generating Implicit Latent Factors...")

# A. Build the User-Item Binary Matrix
# We want a matrix where rows=users, cols=games, value=1 (Played)
# Since the matrix is too big for RAM, we use a trick:
# We learn embeddings for Users and Items separately using the known pairs.

# Map IDs to integers 0..N
user_ids = df['userID'].unique()
item_ids = df['gameID'].unique()
u_map = {u: i for i, u in enumerate(user_ids)}
i_map = {i: i for i, i in enumerate(item_ids)}

# Create a sparse matrix-like structure using pivoting is expensive.
# Instead, we will effectively do "Content-Based" filtering using User History.

# B. Jaccard-Based "Popularity Overlap" (Very strong feature)
# Logic: "People who played Game X also played Game Y"
print("Building Co-occurrence Dictionary...")
game_users = df.groupby('gameID')['userID'].apply(set).to_dict()
user_games = df.groupby('userID')['gameID'].apply(set).to_dict()

# Statistics
item_popularity = df['gameID'].value_counts().to_dict()
user_activity = df['userID'].value_counts().to_dict()

# --- 3. Construct Training Data (Positives + Negatives) ---
print("Constructing Negative Samples...")
pos_df = df[['userID', 'gameID']].copy()
pos_df['label'] = 1

# Negative Sampling
all_games = list(item_popularity.keys())
n_negatives = len(pos_df)

neg_users = df['userID'].sample(n_negatives, replace=True).values
neg_games = random.choices(all_games, k=n_negatives)
neg_df = pd.DataFrame({'userID': neg_users, 'gameID': neg_games})
neg_df['label'] = 0

train_class_df = pd.concat([pos_df, neg_df]).sample(frac=1.0, random_state=42)

# --- 4. Feature Engineering (The "Jaccard" Pivot) ---
print("Building Similarity Features...")

def calculate_overlap_score(row):
    """
    Approximates Item-Item Similarity without the full matrix.
    Measures: How popular is this game among the user's specific history?
    """
    u = row['userID']
    target_g = row['gameID']
    
    # If this is a cold-start user or game
    if u not in user_games or target_g not in game_users:
        return 0.0
    
    # Get the set of users who played the target game
    target_g_players = game_users[target_g]
    
    # Get the games the user has played
    u_history = user_games[u]
    
    # METRIC: Co-occurrence Sum
    # "Of the games the user played, how many times did they appear 
    # in the histories of people who played the target game?"
    # This is effectively: sum(freq(g') for g' in user_history if g' is related to target)
    # But that's slow. 
    
    # FASTER METRIC: Jaccard Proxy
    # Does the user play "popular" games? We already have that.
    # We want: Does the user play games that share users with the target?
    
    # Let's stick to pure popularity + activity stats which are O(1) lookups
    # and add a simplified "Jaccard" if possible, but for speed, let's rely on
    # the interaction of User Activity * Item Popularity in the Tree.
    return 0.0 

# REVISED FEATURE STRATEGY: 
# XGBoost is great at learning non-linear combos of basic stats.
# Let's give it the raw ingredients to learn the "Gravity" model: 
# P(play) ~ (Activity * Popularity) / Distance

def extract_features(dataframe):
    # 1. Item Popularity (Global)
    dataframe['item_pop'] = dataframe['gameID'].map(item_popularity).fillna(0)
    
    # 2. User Activity (Global)
    dataframe['user_act'] = dataframe['userID'].map(user_activity).fillna(0)
    
    # 3. Interaction Term (Gravity)
    dataframe['gravity'] = np.log1p(dataframe['item_pop']) * np.log1p(dataframe['user_act'])
    
    # 4. Target Encoding (Optional but risky - skip for now to avoid leakage)
    
    return dataframe[['item_pop', 'user_act', 'gravity']]

X = extract_features(train_class_df)
y = train_class_df['label']

# --- 5. Train XGBoost ---
print("Training XGBoost...")
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

clf = xgb.XGBClassifier(
    n_estimators=200,        # More trees
    max_depth=5,             # Slightly deeper
    learning_rate=0.05,      # Slower learning
    subsample=0.8,
    colsample_bytree=0.8,
    eval_metric='logloss',
    random_state=42,
    n_jobs=-1
)

clf.fit(X_train, y_train)

# --- 6. Validate ---
val_probs = clf.predict_proba(X_val)[:, 1]
roc = roc_auc_score(y_val, val_probs)
print(f"\nValidation ROC AUC: {roc:.4f}")

# Check Median Accuracy
threshold = np.median(val_probs)
val_preds = (val_probs > threshold).astype(int)
acc = accuracy_score(y_val, val_preds)
print(f"Validation Accuracy (Median Thresh): {acc:.4f}")

# --- 7. Prediction ---
print("\nGenerating Predictions...")
pairs_played = pd.read_csv('pairs_Played.csv')

# Features
X_test = extract_features(pairs_played)

# Predict
test_probs = clf.predict_proba(X_test)[:, 1]

# RANKING STRATEGY (Crucial)
# 1. Sort predictions
# 2. Top 50% = 1, Bottom 50% = 0
pairs_played['raw_score'] = test_probs
median_thresh = pairs_played['raw_score'].median()
pairs_played['prediction'] = (pairs_played['raw_score'] > median_thresh).astype(int)

print(pairs_played.head())
print(f"Test Threshold: {median_thresh:.4f}")

# Save
# pairs_played.to_csv('predictions_Played_Ranking.csv', index=False, columns=['userID', 'gameID', 'prediction'])

Generating Implicit Latent Factors...
Building Co-occurrence Dictionary...
Constructing Negative Samples...
Building Similarity Features...
Training XGBoost...

Validation ROC AUC: 0.7601
Validation Accuracy (Median Thresh): 0.6989

Generating Predictions...
      userID     gameID  prediction  item_pop  user_act    gravity  raw_score
0  u04836696  g41031307           1       144      59.0  20.376463   0.670874
1  u32377855  g62450068           0        22     173.0  16.176188   0.326026
2  u58289072  g71021765           1        81      17.0  12.737057   0.505203
3  u74685029  g26732871           1       422      26.0  19.931152   0.844870
4  u06266052  g69433247           0        35      48.0  13.946412   0.318320
Test Threshold: 0.4683


In [None]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score
from scipy.sparse import csr_matrix
import random

# --- 1. Load Data ---
df = pd.DataFrame(train_data)

# --- 2. Build the Implicit Interaction Matrix ---
print("Building Implicit Matrix...")
# Map IDs to contiguous integers for matrix operations
unique_users = df['userID'].unique()
unique_items = df['gameID'].unique()

u_map = {u: i for i, u in enumerate(unique_users)}
i_map = {j: i for i, j in enumerate(unique_items)}

row_ind = [u_map[u] for u in df['userID']]
col_ind = [i_map[i] for i in df['gameID']]
data_vals = np.ones(len(row_ind))

# Create Sparse Matrix (Rows=Users, Cols=Games)
# This is the "Implicit" matrix (1 = Played, 0 = Not Played)
interaction_matrix = csr_matrix((data_vals, (row_ind, col_ind)), 
                                shape=(len(unique_users), len(unique_items)))

# --- 3. Implicit SVD (Latent Features) ---
print("Extracting Latent Factors (SVD)...")
# We reduce the matrix to 32 dimensions. 
# This captures "Genre Affinity" and "User Taste" without needing text.
n_components = 32
svd = TruncatedSVD(n_components=n_components, random_state=42)

# Fit on the sparse matrix
user_factors = svd.fit_transform(interaction_matrix)
item_factors = svd.components_.T

# Create fast lookup dictionaries
user_vec_dict = {u_id: user_factors[i] for u_id, i in u_map.items()}
item_vec_dict = {i_id: item_factors[i] for i_id, i in i_map.items()}

# Global Averages for Cold Start
avg_user_vec = np.mean(user_factors, axis=0)
avg_item_vec = np.mean(item_factors, axis=0)

# --- 4. Feature Engineering Function ---
item_popularity = df['gameID'].value_counts().to_dict()
user_activity = df['userID'].value_counts().to_dict()

def extract_features(dataframe):
    # 1. Latent Compatibility (Dot Product of SVD vectors)
    dots = []
    for u, i in zip(dataframe['userID'], dataframe['gameID']):
        u_vec = user_vec_dict.get(u, avg_user_vec)
        i_vec = item_vec_dict.get(i, avg_item_vec)
        dots.append(np.dot(u_vec, i_vec))
    
    dataframe['latent_score'] = dots
    
    # 2. Gravity Features (Pop/Act)
    dataframe['item_pop'] = dataframe['gameID'].map(item_popularity).fillna(0)
    dataframe['user_act'] = dataframe['userID'].map(user_activity).fillna(0)
    dataframe['gravity'] = np.log1p(dataframe['item_pop']) * np.log1p(dataframe['user_act'])
    
    return dataframe[['latent_score', 'item_pop', 'user_act', 'gravity']]

# --- 5. Construct Training Set (Positives + Negatives) ---
print("Constructing Negative Samples...")
pos_df = df[['userID', 'gameID']].copy()
pos_df['label'] = 1

# Negative Sampling
all_games = list(item_popularity.keys())
n_negatives = len(pos_df)

neg_users = df['userID'].sample(n_negatives, replace=True).values
neg_games = random.choices(all_games, k=n_negatives)
neg_df = pd.DataFrame({'userID': neg_users, 'gameID': neg_games})
neg_df['label'] = 0

train_class_df = pd.concat([pos_df, neg_df]).sample(frac=1.0, random_state=42)

# --- 6. Train XGBoost ---
print("Training Hybrid XGBoost...")
X = extract_features(train_class_df)
y = train_class_df['label']

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

clf = xgb.XGBClassifier(
    n_estimators=300,        # More trees to leverage the new latent features
    max_depth=6,             # Deeper trees to capture interaction between Latent & Gravity
    learning_rate=0.03,
    subsample=0.8,
    colsample_bytree=0.8,
    eval_metric='logloss',
    random_state=42,
    n_jobs=-1
)

clf.fit(X_train, y_train)

# --- 7. Validate ---
val_probs = clf.predict_proba(X_val)[:, 1]
roc = roc_auc_score(y_val, val_probs)
print(f"\nValidation ROC AUC: {roc:.4f}")

# Median Accuracy
threshold = np.median(val_probs)
val_preds = (val_probs > threshold).astype(int)
acc = accuracy_score(y_val, val_preds)
print(f"Validation Accuracy (Median Thresh): {acc:.4f}")

# --- 8. Final Prediction ---
print("\nGenerating Predictions for pairs_Played.csv...")
pairs_played = pd.read_csv('pairs_Played.csv')

# Extract Hybrid Features
X_test = extract_features(pairs_played)

# Predict
test_probs = clf.predict_proba(X_test)[:, 1]

# Ranking Strategy
pairs_played['raw_score'] = test_probs
median_thresh = pairs_played['raw_score'].median()
pairs_played['prediction'] = (pairs_played['raw_score'] > median_thresh).astype(int)

print(pairs_played.head())
pairs_played.to_csv('predictions_Played.csv', index=False, columns=['userID', 'gameID', 'prediction'])

Building Implicit Matrix...
Extracting Latent Factors (SVD)...
Constructing Negative Samples...
Training Hybrid XGBoost...

Validation ROC AUC: 0.8547
Validation Accuracy (Median Thresh): 0.7686

Generating Predictions for pairs_Played.csv...
      userID     gameID  prediction  latent_score  item_pop  user_act  \
0  u04836696  g41031307           1      0.073228       144      59.0   
1  u32377855  g62450068           0      0.004881        22     173.0   
2  u58289072  g71021765           1      0.017212        81      17.0   
3  u74685029  g26732871           1      0.181650       422      26.0   
4  u06266052  g69433247           0      0.007638        35      48.0   

     gravity  raw_score  
0  20.376463   0.703755  
1  16.176188   0.005892  
2  12.737057   0.630762  
3  19.931152   0.803112  
4  13.946412   0.182023  


In [None]:
import pandas as pd
import numpy as np
import xgboost as xgb
from surprise import SVD, Dataset, Reader
from sklearn.model_selection import KFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from collections import defaultdict

# --- 1. Setup & Text Processing (Same as before) ---
df = pd.DataFrame(train_data)
df['hours_transformed'] = np.log2(df['hours'] + 1)

# NLP Pipeline (Keep this, it was good)
print("Vectorizing Text...")
tfidf = TfidfVectorizer(max_features=5000, stop_words='english')
tfidf_matrix = tfidf.fit_transform(df['text'].fillna(''))
svd_text = TruncatedSVD(n_components=15, random_state=42)
text_latents = svd_text.fit_transform(tfidf_matrix)

# Map Latents (Keep existing logic here)
user_text_sum = defaultdict(lambda: np.zeros(15))
user_text_cnt = defaultdict(int)
item_text_sum = defaultdict(lambda: np.zeros(15))
item_text_cnt = defaultdict(int)

for idx, (u, i) in enumerate(zip(df['userID'], df['gameID'])):
    vec = text_latents[idx]
    user_text_sum[u] += vec
    user_text_cnt[u] += 1
    item_text_sum[i] += vec
    item_text_cnt[i] += 1

user_profile = {u: user_text_sum[u] / c for u, c in user_text_cnt.items()}
item_profile = {i: item_text_sum[i] / c for i, c in item_text_cnt.items()}

def get_semantic_affinity(uid, iid):
    if uid not in user_profile or iid not in item_profile: return 0.0
    return np.dot(user_profile[uid], item_profile[iid])

# --- 2. Out-of-Fold SVD Feature Generation ---
print("Generating OOF SVD Features...")

# We need a column to store the SVD predictions for the training data
df['svd_feature'] = 0.0

# 5-Fold Cross Validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)
reader = Reader(rating_scale=(0, df['hours_transformed'].max()))

for fold_i, (train_idx, val_idx) in enumerate(kf.split(df)):
    print(f"  Processing Fold {fold_i + 1}/5...")
    
    # Create temporary Train/Val sets for this fold
    fold_train = df.iloc[train_idx]
    fold_val = df.iloc[val_idx]
    
    # Train SVD on the Train part
    data_train = Dataset.load_from_df(fold_train[['userID', 'gameID', 'hours_transformed']], reader)
    trainset = data_train.build_full_trainset()
    
    # Regularization is key here
    model = SVD(n_factors=20, n_epochs=20, lr_all=0.005, reg_all=0.05)
    model.fit(trainset)
    
    # Predict on the Val part (This mimics test time!)
    preds = [model.predict(row['userID'], row['gameID']).est for _, row in fold_val.iterrows()]
    
    # Store these "clean" predictions in the main dataframe
    df.loc[val_idx, 'svd_feature'] = preds

# --- 3. Train Final SVD for the Actual Test/Validation Set ---
# Now we need a model trained on ALL data to use for the final validation split
print("Training Final SVD on all data...")
full_data = Dataset.load_from_df(df[['userID', 'gameID', 'hours_transformed']], reader)
full_trainset = full_data.build_full_trainset()
final_svd = SVD(n_factors=20, n_epochs=20, lr_all=0.005, reg_all=0.05)
final_svd.fit(full_trainset)

# --- 4. Feature Assembly & XGBoost ---

# Now we do the Train/Val split for XGBoost
# Note: We use the 'svd_feature' column we generated via OOF for training
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

# Global Averages for Fallback
global_mean = df['hours_transformed'].mean()
user_means = df.groupby('userID')['hours_transformed'].mean().to_dict()
item_means = df.groupby('gameID')['hours_transformed'].mean().to_dict()

def build_features(dataframe, is_training=True):
    # 1. Semantic Affinity
    text_scores = [get_semantic_affinity(u, i) for u, i in zip(dataframe['userID'], dataframe['gameID'])]
    
    # 2. SVD Score
    if is_training:
        # Use the OOF predictions we calculated earlier
        svd_scores = dataframe['svd_feature'].values
    else:
        # For validation/test, use the Final SVD model
        svd_scores = [final_svd.predict(u, i).est for u, i in zip(dataframe['userID'], dataframe['gameID'])]

    # 3. Explicit Bias Features (Helps XGBoost correct SVD mistakes)
    u_bias = [user_means.get(u, global_mean) for u in dataframe['userID']]
    i_bias = [item_means.get(i, global_mean) for i in dataframe['gameID']]

    return pd.DataFrame({
        'svd_rating': svd_scores,
        'semantic_affinity': text_scores,
        'user_avg': u_bias,
        'item_avg': i_bias
    })

X_train = build_features(train_df, is_training=True)
y_train = train_df['hours_transformed']

X_val = build_features(val_df, is_training=False)
y_val = val_df['hours_transformed']

print("Training XGBoost...")
xgb_model = xgb.XGBRegressor(
    n_estimators=200,
    learning_rate=0.02, # Slow learner
    max_depth=4,        # Shallow trees to prevent overfitting
    subsample=0.7,      # Random sampling
    colsample_bytree=0.7,
    reg_lambda=1.0,     # L2 Regularization
    objective='reg:squarederror',
    random_state=42,
    n_jobs=-1
)

xgb_model.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=False)

train_preds = xgb_model.predict(X_train)
val_preds = xgb_model.predict(X_val)

print(f"\nFinal Train MSE: {mean_squared_error(y_train, train_preds):.4f}")
print(f"Final Val MSE:   {mean_squared_error(y_val, val_preds):.4f}")


1.26.4


In [None]:
df_users = pd.DataFrame(user_dict,columns=['userID','games_u'])
df_users['games_u'] = df_users['games_u'].apply(set)
df_games = pd.DataFrame(user_dict,columns=['gameID','users_g'])
df_games['user_g'] = df_users['user_g'].apply(set)

In [4]:
import pandas as pd
from surprise import Reader, Dataset, SVD, SVDpp, NMF, KNNBaseline
from surprise.model_selection import cross_validate

# 1. Prepare your DataFrame (Assume train_data is loaded)
df = pd.DataFrame(train_data)
# Use the transformed hours as the "Rating" to predict
# We drop columns we don't need for Surprise (it only wants User, Item, Rating)
df_surprise = df[['userID', 'gameID', 'hours_transformed']]

# 2. Define the Reader
# IMPORTANT: Define the scale. Min is 0, Max is the max transformed hour in your set.
min_rating = df['hours_transformed'].min()
max_rating = df['hours_transformed'].max()
reader = Reader(rating_scale=(min_rating, max_rating))

# 3. Create the Surprise Dataset
data = Dataset.load_from_df(df_surprise[['userID', 'gameID', 'hours_transformed']], reader)

# 4. Your Benchmarking Code
# (I added SVDpp which models implicit interactions too, potentially good here)
algo_svd = SVD()
algo_svdpp = SVDpp() 
algo_nmf = NMF()
algo_knn = KNNBaseline(sim_options={'name': 'pearson_baseline', 'user_based': False})

algorithms = [algo_svd, algo_svdpp, algo_nmf, algo_knn]

print(f"Benchmarking on {len(df)} interactions...")

for algo in algorithms:
    print(f"Evaluating {algo.__class__.__name__}...")
    # metrics: RMSE (Root Mean Squared Error) is the standard for Hours prediction
    results = cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)
    print(f"  Mean RMSE: {results['test_rmse'].mean():.4f}")
    print(f"  Mean MAE:  {results['test_mae'].mean():.4f}\n")

Benchmarking on 175000 interactions...
Evaluating SVD...
Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.8199  1.7818  1.8231  1.8263  1.8050  1.8112  0.0164  
MAE (testset)     1.3710  1.3468  1.3712  1.3725  1.3607  1.3644  0.0098  
Fit time          2.40    2.93    2.89    2.78    2.20    2.64    0.29    
Test time         0.44    0.34    0.42    0.34    0.25    0.36    0.07    
  Mean RMSE: 1.8112
  Mean MAE:  1.3644

Evaluating SVDpp...
Evaluating RMSE, MAE of algorithm SVDpp on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.9215  1.9313  1.9080  1.9048  1.9351  1.9201  0.0121  
MAE (testset)     1.4201  1.4248  1.4090  1.4057  1.4321  1.4183  0.0098  
Fit time          9.70    11.62   9.81    11.56   11.85   10.91   0.95    
Test time         1.94    1.73    2.05    1.95    2.18    1.97    0.15    
  Mean RMSE: 1.9201
  

In [6]:
import pandas as pd
from surprise import SVD, Dataset, Reader
from surprise.model_selection import GridSearchCV

# --- 1. Load Data ---
# (Assuming you have your dataframe 'df' from previous steps)
# If loading from scratch:
# df = pd.DataFrame(train_data) 
# Ensure you are using the TRANSFORMED hours
df_surprise = df[['userID', 'gameID', 'hours_transformed']]

# --- 2. Setup Surprise Data Objects ---
# Define the scale (Min to Max of your transformed target)
min_rating = df['hours_transformed'].min()
max_rating = df['hours_transformed'].max()
reader = Reader(rating_scale=(min_rating, max_rating))

data = Dataset.load_from_df(df_surprise, reader)

# --- 3. Define Hyperparameter Grid ---
# These are the most critical parameters for SVD
param_grid = {
    # Number of latent factors (Dimensions of the user/item vectors)
    # 20 is standard, 50/100 allows more complex patterns but risks overfitting
    'n_factors': [20, 50, 100],
    
    # Number of epochs (iterations)
    # Too few = underfit, Too many = waste of time (SVD converges fast)
    'n_epochs': [20, 30,50],
    
    # Learning Rate
    # 0.005 is safe, 0.01 is aggressive
    'lr_all': [0.005, 0.01,0.0005],
    
    # Regularization (The most important one for you!)
    # Higher reg (0.1) prevents the model from memorizing outliers (1000+ hour players)
    'reg_all': [0.02, 0.05, 0.1]
}

# --- 4. Run Grid Search ---
print("Starting Grid Search... (This may take a while)")
# measures=['rmse'] optimizes for Root Mean Squared Error (Standard for regression)
gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=5, n_jobs=-1)

gs.fit(data)

# --- 5. Analyze Results ---
print("\n--- Grid Search Results ---")
print(f"Best RMSE Score: {gs.best_score['rmse']:.4f}")
print(f"Best Parameters: {gs.best_params['rmse']}")

# --- 6. Train Final Model with Best Parameters ---
print("\nRetraining SVD on full dataset with best parameters...")

# Extract best params
best_params = gs.best_params['rmse']
final_algo = SVD(
    n_factors=best_params['n_factors'],
    n_epochs=best_params['n_epochs'],
    lr_all=best_params['lr_all'],
    reg_all=best_params['reg_all']
)

# Fit on full data
trainset = data.build_full_trainset()
final_algo.fit(trainset)

# --- 7. Generate Predictions for 'Hours' ---
# Load Test Data
pairs_hours = pd.read_csv('pairs_Hours.csv')

def predict_hours(row):
    # .predict returns a Prediction object, .est is the value
    # Clip predictions to min/max to avoid crazy outliers
    pred = final_algo.predict(row['userID'], row['gameID']).est
    return max(min_rating, min(max_rating, pred))

pairs_hours['prediction'] = pairs_hours.apply(predict_hours, axis=1)

print("\nFinal Predictions Sample:")
print(pairs_hours.head())

# Save
# pairs_hours.to_csv('prediction_Hours_SVD_Optimized.csv', index=False)

Starting Grid Search... (This may take a while)

--- Grid Search Results ---
Best RMSE Score: 1.7671
Best Parameters: {'n_factors': 20, 'n_epochs': 20, 'lr_all': 0.005, 'reg_all': 0.1}

Retraining SVD on full dataset with best parameters...

Final Predictions Sample:
      userID     gameID  prediction
0  u04763917  g51093074    4.125587
1  u10668484  g42523222    1.342928
2  u82502949  g39422502    5.397711
3  u14336188  g83517324    3.086567
4  u10096161  g10962300    3.164058


In [7]:
import pandas as pd
import numpy as np
import xgboost as xgb
from surprise import SVD, Dataset, Reader
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

df = pd.DataFrame(train_data)
df['text_len'] = df['text'].str.len().fillna(0)

# 1. Create Train/Val Split using SKLearn
# We split the DATAFRAME first so we don't leak information
print("Splitting Data...")
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

# --- 2. Train SVD (Layer 1) ---
print("Training SVD (Layer 1)...")
# Prepare data for Surprise
reader = Reader(rating_scale=(df['hours_transformed'].min(), df['hours_transformed'].max()))
train_data_surprise = Dataset.load_from_df(
    train_df[['userID', 'gameID', 'hours_transformed']], 
    reader
)
trainset = train_data_surprise.build_full_trainset()

# Train SVD
svd_model = SVD(n_factors=20, n_epochs=20, lr_all=0.005, reg_all=0.01)
svd_model.fit(trainset)

# Generate SVD Feature for Train and Val
def get_svd_score(uid, iid):
    # .est returns the predicted rating
    return svd_model.predict(uid, iid).est

train_df['svd_feature'] = train_df.apply(lambda x: get_svd_score(x['userID'], x['gameID']), axis=1)
val_df['svd_feature'] = val_df.apply(lambda x: get_svd_score(x['userID'], x['gameID']), axis=1)

# --- 3. Feature Engineering (Similarity Scores) ---
print("Calculating Jaccard Similarities...")

# A. Build Lookups (Adjacency Lists) strictly from TRAINING data
# User -> Set of Games
user_history = train_df.groupby('userID')['gameID'].apply(set).to_dict()
# Game -> Set of Users
game_history = train_df.groupby('gameID')['userID'].apply(set).to_dict()

# Helper Jaccard Function
def calculate_jaccard(set_a, set_b):
    if not set_a or not set_b: return 0.0
    intersection = len(set_a.intersection(set_b))
    union = len(set_a.union(set_b))
    return intersection / union if union > 0 else 0.0

# Feature 1: Item-Based Jaccard
# "How similar is this game to other games this user has played?"
def get_mean_item_jaccard(row):
    user, target_game = row['userID'], row['gameID']
    
    # Games the user played (from history)
    history_games = user_history.get(user, set())
    
    # Target game's users
    target_game_users = game_history.get(target_game, set())
    
    similarities = []
    for other_game in history_games:
        if other_game == target_game: continue
        
        # Calculate Jaccard between Target Game and Other Game
        # (Based on the users who played them)
        other_game_users = game_history.get(other_game, set())
        sim = calculate_jaccard(target_game_users, other_game_users)
        similarities.append(sim)
        
    return np.mean(similarities) if similarities else 0.0

# Feature 2: User-Based Jaccard
# "How similar is this user to other users who played this game?"
def get_mean_user_jaccard(row):
    target_user, game = row['userID'], row['gameID']
    
    # Users who played this game
    game_users = game_history.get(game, set())
    
    # Target user's games
    target_user_games = user_history.get(target_user, set())
    
    similarities = []
    for other_user in game_users:
        if other_user == target_user: continue
        
        # Calculate Jaccard between Target User and Other User
        # (Based on the games they played)
        other_user_games = user_history.get(other_user, set())
        sim = calculate_jaccard(target_user_games, other_user_games)
        similarities.append(sim)
        
    return np.mean(similarities) if similarities else 0.0

# Apply Calculations (This can be slow on large data!)
train_df['item_jaccard'] = train_df.apply(get_mean_item_jaccard, axis=1)
train_df['user_jaccard'] = train_df.apply(get_mean_user_jaccard, axis=1)

# Apply to Validation (Using Train History lookups to avoid leakage)
val_df['item_jaccard'] = val_df.apply(get_mean_item_jaccard, axis=1)
val_df['user_jaccard'] = val_df.apply(get_mean_user_jaccard, axis=1)

# --- 4. Train XGBoost (Layer 2) ---
print("Training XGBoost (Layer 2)...")

features = ['svd_feature', 'item_jaccard', 'user_jaccard', 'text_len']
target = 'hours_transformed'

xgb_model = xgb.XGBRegressor(
    n_estimators=100,
    learning_rate=0.05,
    max_depth=6,
    objective='reg:squarederror',
    random_state=42
)

xgb_model.fit(train_df[features], train_df[target])

# --- 5. Evaluation ---
train_preds = xgb_model.predict(train_df[features])
val_preds = xgb_model.predict(val_df[features])

train_mse = mean_squared_error(train_df[target], train_preds)
val_mse = mean_squared_error(val_df[target], val_preds)

print(f"\nResults:")
print(f"Train MSE: {train_mse:.4f}")
print(f"Val MSE:   {val_mse:.4f}")

# --- 6. Prediction on Test Set (Optional) ---
# When you run on the actual test set, remember:
# 1. Calculate 'svd_feature' using the SVD model trained on FULL train data
# 2. Calculate Jaccard features using histories from FULL train data

Splitting Data...
Training SVD (Layer 1)...
Calculating Jaccard Similarities...
Training XGBoost (Layer 2)...

Results:
Train MSE: 1.4400
Val MSE:   3.4097


In [13]:
print(df['userID'].isna().sum())

0


In [None]:
import pandas as pd
import numpy as np
import xgboost as xgb
from surprise import SVD, Dataset, Reader
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import mean_squared_error
from collections import defaultdict

# Load Data
df = pd.DataFrame(train_data)
# Ensure transformations exist
df['hours_transformed'] = np.log2(df['hours'] + 1)
df['text_len'] = df['text'].str.len().fillna(0)

# --- 1. Split Data ---
print("Splitting Data...")
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

# --- 2. NLP Pipeline: Creating Semantic Profiles ---
print("Vectorizing Text...")

# A. TF-IDF
tfidf = TfidfVectorizer(max_features=5000, stop_words='english')
tfidf_matrix = tfidf.fit_transform(train_df['text'].fillna(''))

# B. Dimensionality Reduction (LSA)
# We shrink 5000 words down to 15 dense features
n_text_factors = 15
svd_text = TruncatedSVD(n_components=n_text_factors, random_state=42)
text_latents = svd_text.fit_transform(tfidf_matrix)

# C. Map Latent Vectors to Users and Items
# Create dictionaries to store the sum of vectors and count
user_text_sum = defaultdict(lambda: np.zeros(n_text_factors))
user_text_cnt = defaultdict(int)
item_text_sum = defaultdict(lambda: np.zeros(n_text_factors))
item_text_cnt = defaultdict(int)

# Zip indices to iterate fast (avoiding .apply)
u_ids = train_df['userID'].values
i_ids = train_df['gameID'].values

for idx, (u, i) in enumerate(zip(u_ids, i_ids)):
    vec = text_latents[idx]
    user_text_sum[u] += vec
    user_text_cnt[u] += 1
    item_text_sum[i] += vec
    item_text_cnt[i] += 1

# D. Create Average Profiles
user_profile = {u: user_text_sum[u] / c for u, c in user_text_cnt.items()}
item_profile = {i: item_text_sum[i] / c for i, c in item_text_cnt.items()}

# E. Feature Extraction Function
def get_semantic_affinity(uid, iid):
    # If user or item is new (cold start), return 0 affinity
    if uid not in user_profile or iid not in item_profile:
        return 0.0
    
    u_vec = user_profile[uid]
    i_vec = item_profile[iid]
    
    # Dot product as similarity score
    return np.dot(u_vec, i_vec)

# --- 3. Train Collaborative Filtering (Surprise SVD) ---
print("Training Rating SVD...")
reader = Reader(rating_scale=(0, df['hours_transformed'].max()))
train_data_surprise = Dataset.load_from_df(
    train_df[['userID', 'gameID', 'hours_transformed']], 
    reader
)
trainset = train_data_surprise.build_full_trainset()

# Increased regularization to prevent overfitting on residuals
rating_svd = SVD(n_factors=20, n_epochs=20, lr_all=0.005, reg_all=0.02)
rating_svd.fit(trainset)

def get_rating_svd_score(uid, iid):
    return rating_svd.predict(uid, iid).est

# --- 4. Assemble Features for XGBoost ---
print("Assembling Features...")

def build_features(dataframe):
    # Vectorized operations are preferred, but for lookups, list comprehension is acceptable
    # 1. Collaborative Filtering Score
    cf_scores = [get_rating_svd_score(u, i) for u, i in zip(dataframe['userID'], dataframe['gameID'])]
    
    # 2. Semantic Affinity (Text Match)
    text_scores = [get_semantic_affinity(u, i) for u, i in zip(dataframe['userID'], dataframe['gameID'])]
    
    # 3. Simple Global/Bias Stats (Much faster than Jaccard)
    # (Optional: Add specific user/item bias lookups here if SVD misses them)
    
    X = pd.DataFrame({
        'svd_rating': cf_scores,
        'semantic_affinity': text_scores,
        # 'text_len': dataframe['text_len'] # Be careful: Test set does NOT have this!
    })
    return X

X_train = build_features(train_df)
y_train = train_df['hours_transformed']

X_val = build_features(val_df)
y_val = val_df['hours_transformed']

# --- 5. Train XGBoost ---
print("Training XGBoost...")

xgb_model = xgb.XGBRegressor(
    n_estimators=150,
    learning_rate=0.003, # Lower LR for better generalization
    max_depth=5,
    subsample=0.8,      # Reduce overfitting
    colsample_bytree=0.8,
    objective='reg:squarederror',
    random_state=42,
    n_jobs=-1
)

xgb_model.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=False)

# --- 6. Results ---
train_preds = xgb_model.predict(X_train)
val_preds = xgb_model.predict(X_val)

print(f"\nFinal Train MSE: {mean_squared_error(y_train, train_preds):.4f}")
print(f"Final Val MSE:   {mean_squared_error(y_val, val_preds):.4f}")

Splitting Data...
Vectorizing Text...
Training Rating SVD...
Assembling Features...
Training XGBoost...

Final Train MSE: 1.5840
Final Val MSE:   3.2767


In [9]:
import pandas as pd
import numpy as np
import xgboost as xgb
import matplotlib.pyplot as plt
import seaborn as sns
from surprise import SVD, Dataset, Reader
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, confusion_matrix, classification_report

# --- 1. Load & Split Data ---
if 'train_data' not in locals():
    # Placeholder for safety
    print("Warning: Loading dummy data.")
    train_data = [{'hours': 12.0, 'text': "A", 'gameID': 'g1', 'hours_transformed': 3.6, 'userID': 'u1'}] # ... (rest of dummy data)

df = pd.DataFrame(train_data)
df['text_len_norm'] = df['text'].str.len().fillna(0) / df['text'].str.len().max()

# Split
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

# --- 2. Layer 1: SVD Feature ---
print("Training SVD Layer...")
reader = Reader(rating_scale=(df['hours_transformed'].min(), df['hours_transformed'].max()))
train_data_surprise = Dataset.load_from_df(train_df[['userID', 'gameID', 'hours_transformed']], reader)
trainset = train_data_surprise.build_full_trainset()

# Using your best parameters from earlier
svd_model = SVD(n_factors=20, n_epochs=30, lr_all=0.005, reg_all=0.05)
svd_model.fit(trainset)

def get_svd_est(uid, iid):
    return svd_model.predict(uid, iid).est

# Generate Feature
train_df['svd_feature'] = train_df.apply(lambda x: get_svd_est(x['userID'], x['gameID']), axis=1)
val_df['svd_feature'] = val_df.apply(lambda x: get_svd_est(x['userID'], x['gameID']), axis=1)

# --- 3. Feature Engineering: Jaccard Similarity ---
print("Calculating Jaccard Features...")
# Adjacency Lists (From Training Data ONLY)
user_history = train_df.groupby('userID')['gameID'].apply(set).to_dict()
game_history = train_df.groupby('gameID')['userID'].apply(set).to_dict()

def calculate_jaccard(set_a, set_b):
    if not set_a or not set_b: return 0.0
    intersection = len(set_a.intersection(set_b))
    union = len(set_a.union(set_b))
    return intersection / union if union > 0 else 0.0

def get_jaccard_feats(row):
    u, g = row['userID'], row['gameID']
    
    # Item-Based
    u_games = user_history.get(u, set())
    target_users = game_history.get(g, set())
    
    item_sims = []
    for other_g in u_games:
        if other_g == g: continue
        other_users = game_history.get(other_g, set())
        item_sims.append(calculate_jaccard(target_users, other_users))
    mean_item = np.mean(item_sims) if item_sims else 0.0
    
    # User-Based
    g_users = game_history.get(g, set())
    target_games = user_history.get(u, set())
    
    user_sims = []
    for other_u in g_users:
        if other_u == u: continue
        other_games = user_history.get(other_u, set())
        user_sims.append(calculate_jaccard(target_games, other_games))
    mean_user = np.mean(user_sims) if user_sims else 0.0
    
    return mean_item, mean_user

# Apply
train_df[['item_jaccard', 'user_jaccard']] = train_df.apply(get_jaccard_feats, axis=1, result_type='expand')
val_df[['item_jaccard', 'user_jaccard']] = val_df.apply(get_jaccard_feats, axis=1, result_type='expand')

# --- 4. Layer 2: XGBoost ---
print("Training XGBoost Layer...")
features = ['svd_feature', 'item_jaccard', 'user_jaccard', 'text_len_norm']
target = 'hours_transformed'

xgb_model = xgb.XGBRegressor(
    n_estimators=100, learning_rate=0.05, max_depth=6, 
    objective='reg:squarederror', random_state=42
)
xgb_model.fit(train_df[features], train_df[target])

# Predict
val_preds = xgb_model.predict(val_df[features])
mse = mean_squared_error(val_df[target], val_preds)
print(f"\nValidation MSE: {mse:.4f}")

# --- 5. THE DIAGNOSTIC: Binned Confusion Matrix ---
print("\n--- Diagnostic Report ---")

# A. Create Bins based on Quantiles (Low / Medium / High)
# We define "Low" as bottom 33%, "High" as top 33%
bins = [-1, np.percentile(df[target], 33), np.percentile(df[target], 66), float('inf')]
labels = ['Low', 'Medium', 'High']

# B. Bin the Actuals and Predictions
val_df['bin_actual'] = pd.cut(val_df[target], bins=bins, labels=labels)
val_df['bin_pred'] = pd.cut(val_preds, bins=bins, labels=labels)

# C. Generate Matrix
cm = confusion_matrix(val_df['bin_actual'], val_df['bin_pred'], labels=labels)

# Visualizing (Text format)
print("\nConfusion Matrix (Rows=Actual, Cols=Predicted):")
print(f"{'':<10} {'Low':<10} {'Med':<10} {'High':<10}")
for i, label in enumerate(labels):
    print(f"{label:<10} {cm[i][0]:<10} {cm[i][1]:<10} {cm[i][2]:<10}")

print("\nClassification Report (Accuracy per Bin):")
print(classification_report(val_df['bin_actual'], val_df['bin_pred']))

# D. Feature Importance (Why did it make those decisions?)
print("\nFeature Importance:")
for name, score in zip(features, xgb_model.feature_importances_):
    print(f"  {name}: {score:.4f}")

Training SVD Layer...
Calculating Jaccard Features...
Training XGBoost Layer...

Validation MSE: 3.5627

--- Diagnostic Report ---

Confusion Matrix (Rows=Actual, Cols=Predicted):
           Low        Med        High      
Low        6739       3721       1105      
Medium     2937       6519       2101      
High       676        4580       6622      

Classification Report (Accuracy per Bin):
              precision    recall  f1-score   support

        High       0.67      0.56      0.61     11878
         Low       0.65      0.58      0.61     11565
      Medium       0.44      0.56      0.49     11557

    accuracy                           0.57     35000
   macro avg       0.59      0.57      0.57     35000
weighted avg       0.59      0.57      0.57     35000


Feature Importance:
  svd_feature: 0.9897
  item_jaccard: 0.0036
  user_jaccard: 0.0024
  text_len_norm: 0.0043


In [14]:
import pandas as pd
import numpy as np
import xgboost as xgb
from surprise import SVD, Dataset, Reader
from sklearn.model_selection import KFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from collections import defaultdict

# --- 1. Setup & Text Processing (Same as before) ---
df = pd.DataFrame(train_data)
df['hours_transformed'] = np.log2(df['hours'] + 1)

# NLP Pipeline (Keep this, it was good)
print("Vectorizing Text...")
tfidf = TfidfVectorizer(max_features=5000, stop_words='english')
tfidf_matrix = tfidf.fit_transform(df['text'].fillna(''))
svd_text = TruncatedSVD(n_components=15, random_state=42)
text_latents = svd_text.fit_transform(tfidf_matrix)

# Map Latents (Keep existing logic here)
user_text_sum = defaultdict(lambda: np.zeros(15))
user_text_cnt = defaultdict(int)
item_text_sum = defaultdict(lambda: np.zeros(15))
item_text_cnt = defaultdict(int)

for idx, (u, i) in enumerate(zip(df['userID'], df['gameID'])):
    vec = text_latents[idx]
    user_text_sum[u] += vec
    user_text_cnt[u] += 1
    item_text_sum[i] += vec
    item_text_cnt[i] += 1

user_profile = {u: user_text_sum[u] / c for u, c in user_text_cnt.items()}
item_profile = {i: item_text_sum[i] / c for i, c in item_text_cnt.items()}

def get_semantic_affinity(uid, iid):
    if uid not in user_profile or iid not in item_profile: return 0.0
    return np.dot(user_profile[uid], item_profile[iid])

# --- 2. Out-of-Fold SVD Feature Generation ---
print("Generating OOF SVD Features...")

# We need a column to store the SVD predictions for the training data
df['svd_feature'] = 0.0

# 5-Fold Cross Validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)
reader = Reader(rating_scale=(0, df['hours_transformed'].max()))

for fold_i, (train_idx, val_idx) in enumerate(kf.split(df)):
    print(f"  Processing Fold {fold_i + 1}/5...")
    
    # Create temporary Train/Val sets for this fold
    fold_train = df.iloc[train_idx]
    fold_val = df.iloc[val_idx]
    
    # Train SVD on the Train part
    data_train = Dataset.load_from_df(fold_train[['userID', 'gameID', 'hours_transformed']], reader)
    trainset = data_train.build_full_trainset()
    
    # Regularization is key here
    model = SVD(n_factors=20, n_epochs=20, lr_all=0.005, reg_all=0.05)
    model.fit(trainset)
    
    # Predict on the Val part (This mimics test time!)
    preds = [model.predict(row['userID'], row['gameID']).est for _, row in fold_val.iterrows()]
    
    # Store these "clean" predictions in the main dataframe
    df.loc[val_idx, 'svd_feature'] = preds

# --- 3. Train Final SVD for the Actual Test/Validation Set ---
# Now we need a model trained on ALL data to use for the final validation split
print("Training Final SVD on all data...")
full_data = Dataset.load_from_df(df[['userID', 'gameID', 'hours_transformed']], reader)
full_trainset = full_data.build_full_trainset()
final_svd = SVD(n_factors=20, n_epochs=20, lr_all=0.005, reg_all=0.05)
final_svd.fit(full_trainset)

# --- 4. Feature Assembly & XGBoost ---

# Now we do the Train/Val split for XGBoost
# Note: We use the 'svd_feature' column we generated via OOF for training
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

# Global Averages for Fallback
global_mean = df['hours_transformed'].mean()
user_means = df.groupby('userID')['hours_transformed'].mean().to_dict()
item_means = df.groupby('gameID')['hours_transformed'].mean().to_dict()

def build_features(dataframe, is_training=True):
    # 1. Semantic Affinity
    text_scores = [get_semantic_affinity(u, i) for u, i in zip(dataframe['userID'], dataframe['gameID'])]
    
    # 2. SVD Score
    if is_training:
        # Use the OOF predictions we calculated earlier
        svd_scores = dataframe['svd_feature'].values
    else:
        # For validation/test, use the Final SVD model
        svd_scores = [final_svd.predict(u, i).est for u, i in zip(dataframe['userID'], dataframe['gameID'])]

    # 3. Explicit Bias Features (Helps XGBoost correct SVD mistakes)
    u_bias = [user_means.get(u, global_mean) for u in dataframe['userID']]
    i_bias = [item_means.get(i, global_mean) for i in dataframe['gameID']]

    return pd.DataFrame({
        'svd_rating': svd_scores,
        'semantic_affinity': text_scores,
        'user_avg': u_bias,
        'item_avg': i_bias
    })

X_train = build_features(train_df, is_training=True)
y_train = train_df['hours_transformed']

X_val = build_features(val_df, is_training=False)
y_val = val_df['hours_transformed']

print("Training XGBoost...")
xgb_model = xgb.XGBRegressor(
    n_estimators=200,
    learning_rate=0.02, # Slow learner
    max_depth=4,        # Shallow trees to prevent overfitting
    subsample=0.7,      # Random sampling
    colsample_bytree=0.7,
    reg_lambda=1.0,     # L2 Regularization
    objective='reg:squarederror',
    random_state=42,
    n_jobs=-1
)

xgb_model.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=False)

train_preds = xgb_model.predict(X_train)
val_preds = xgb_model.predict(X_val)

print(f"\nFinal Train MSE: {mean_squared_error(y_train, train_preds):.4f}")
print(f"Final Val MSE:   {mean_squared_error(y_val, val_preds):.4f}")

Vectorizing Text...
Generating OOF SVD Features...
  Processing Fold 1/5...
  Processing Fold 2/5...
  Processing Fold 3/5...
  Processing Fold 4/5...
  Processing Fold 5/5...
Training Final SVD on all data...
Training XGBoost...

Final Train MSE: 2.8835
Final Val MSE:   2.4813
