In [None]:
import gzip
from collections import defaultdict
import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import Ridge
import datetime

def readJSON(path):
    """
    Stream records from a gzip-compressed text file holding one dict literal per line.

    Parameters:
    -----------
    path : str
        Path of the gzip file to read (opened in text mode).

    Yields:
    -------
    (userID, gameID, record) : tuple
        `record` is the parsed dict for the line, `userID` is record['userID'],
        and `gameID` is record['gameID'] or None when the record has no such key.
    """
    # The context manager guarantees the handle is closed, including when the
    # consumer stops iterating early or a line fails to parse.
    with gzip.open(path, 'rt') as f:
        for l in f:
            # SECURITY NOTE(review): eval() executes whatever expression the line
            # contains. Only use this on trusted files; if every record is a plain
            # literal, ast.literal_eval would be the safe replacement -- TODO confirm.
            d = eval(l)
            u = d['userID']
            # Records without a game (e.g. user-only lines) map to None.
            g = d.get('gameID')
            yield u, g, d

### read train and test data

In [4]:
# Raw training records plus two interaction lookups built in the same pass:
# user -> games played, and game -> users who played it.
train_data = []
user_games = defaultdict(list)
game_users = defaultdict(list)

for user_id, game_id, record in readJSON("train.json.gz"):
    train_data.append(record)
    user_games[user_id].append(game_id)
    game_users[game_id].append(user_id)

# 'user_id' is dropped from the frame; 'userID' is the key used everywhere else.
df_train_data = pd.DataFrame(train_data).drop(columns='user_id')

# Missing vote counts / compensation markers are treated as zero.
for sparse_column in ('found_funny', 'compensation'):
    df_train_data[sparse_column] = df_train_data[sparse_column].fillna(0)

# Collapse every remaining (non-zero) compensation value to the flag 1.
has_compensation = df_train_data['compensation'].ne(0)
df_train_data.loc[has_compensation, 'compensation'] = 1

# Pairs to predict for the two tasks.
df_test_hours_data = pd.read_csv("pairs_Hours.csv")
df_test_play_data = pd.read_csv("pairs_Played.csv")

### define biased matrix factorization (regularized SVD)

In [5]:
class BiasedMF:
    """
    Biased Matrix Factorization (Regularized SVD) optimized via SGD.

    Model: r_hat = mu + b_u + b_i + dot(p_u, q_i)
    """
    def __init__(self, n_factors=10, learning_rate=0.01, 
                 reg_bu=0.0, reg_bi=0.0, reg_pu=0.0, reg_qi=0.0, n_epochs=10,
                 random_state=None):
        """
        Parameters:
        -----------
        n_factors : int
            Number of latent factors (dimension of vectors p_u and q_i)
        learning_rate : float
            Step size for Gradient Descent (eta)
        reg_bu, reg_bi, reg_pu, reg_qi : float
            Regularization parameters (lambda) to prevent overfitting
        n_epochs : int
            Number of passes over the training data
        random_state : int or None
            Seed for factor initialisation and sample shuffling. None (default)
            draws from the global numpy random state, as before.
        """
        self.n_factors = n_factors
        self.lr = learning_rate
        self.reg_bu = reg_bu
        self.reg_bi = reg_bi
        self.reg_pu = reg_pu
        self.reg_qi = reg_qi
        self.n_epochs = n_epochs
        self.random_state = random_state
        
        # Internal state (populated by fit)
        self.mu = 0.0
        self.n_users = 0
        self.n_games = 0
        self.user_bias = None  # numpy array, one entry per user
        self.item_bias = None  # numpy array, one entry per game
        self.P = None          # User factors (n_users x n_factors)
        self.Q = None          # Item factors (n_items x n_factors)
        
        # Mappings between raw IDs and row indices of the arrays above
        self.user2idx = {}
        self.idx2user = {}
        self.game2idx = {}
        self.idx2game = {}

    def fit(self, df):
        """
        Trains the model using Stochastic Gradient Descent.

        df columns: ['userID', 'gameID', 'hours_transformed']

        Raises:
        -------
        ValueError
            If df has no rows (there is nothing to estimate mu or the MSE from).
        """
        print("Initializing BiasedMF...")
        
        n_samples = len(df)
        if n_samples == 0:
            raise ValueError("Cannot fit BiasedMF on an empty DataFrame.")
        
        # 1. Create Mappings (forward and reverse)
        unique_users = df['userID'].unique()
        unique_games = df['gameID'].unique()
        
        self.n_users = len(unique_users)
        self.n_games = len(unique_games)
        
        self.user2idx = {u: i for i, u in enumerate(unique_users)}
        self.game2idx = {g: i for i, g in enumerate(unique_games)}
        self.idx2user = dict(enumerate(unique_users))
        self.idx2game = dict(enumerate(unique_games))
        
        # 2. Initialize Parameters
        self.mu = df['hours_transformed'].mean()
        
        # Biases initialized to zero
        self.user_bias = np.zeros(self.n_users)
        self.item_bias = np.zeros(self.n_games)
        
        # With no seed, keep drawing from the global numpy state so existing
        # notebooks that call np.random.seed(...) beforehand behave as before.
        if self.random_state is None:
            rng = np.random
        else:
            rng = np.random.RandomState(self.random_state)
        
        # Latent factors start as small Gaussian noise (std 0.1)
        self.P = rng.normal(0, 0.1, (self.n_users, self.n_factors))
        self.Q = rng.normal(0, 0.1, (self.n_games, self.n_factors))
        
        # Map raw IDs to integer indices once, so the hot loop only touches arrays
        users_idx = df['userID'].map(self.user2idx).values
        games_idx = df['gameID'].map(self.game2idx).values
        ratings = df['hours_transformed'].values
        
        # 3. Training Loop (SGD)
        print(f"Starting training on {n_samples} samples for {self.n_epochs} epochs.")
        
        for epoch in range(self.n_epochs):
            # Visit the samples in a fresh random order every epoch
            indices = rng.permutation(n_samples)
            
            total_error = 0
            
            for i in indices:
                u = users_idx[i]
                g = games_idx[i]
                r = ratings[i]
                
                # --- Prediction Step ---
                # r_hat = mu + b_u + b_i + P_u . Q_i
                dot_prod = np.dot(self.P[u], self.Q[g])
                pred = self.mu + self.user_bias[u] + self.item_bias[g] + dot_prod
                
                # --- Error Calculation ---
                err = r - pred
                total_error += err**2
                
                # --- Update Rules (Gradient Descent) ---
                # b <- b + lr * (err - reg * b)
                self.user_bias[u] += self.lr * (err - self.reg_bu * self.user_bias[u])
                self.item_bias[g] += self.lr * (err - self.reg_bi * self.item_bias[g])
                
                # Snapshot P[u] so the Q[g] update uses the pre-update user factors
                p_u_current = self.P[u].copy()
                
                # P_u <- P_u + lr * (err * Q_i - reg * P_u)
                self.P[u] += self.lr * (err * self.Q[g] - self.reg_pu * self.P[u])
                
                # Q_i <- Q_i + lr * (err * P_u - reg * Q_i)
                self.Q[g] += self.lr * (err * p_u_current - self.reg_qi * self.Q[g])
            
            # Running MSE of the predictions made during this epoch (before each update)
            mse = total_error / n_samples
            print(f"Epoch {epoch+1}/{self.n_epochs} - MSE: {mse:.4f}")

    def predict(self, user_id, game_id):
        """
        Predict rating for user item pair.
        Handles Cold Start by falling back to global mean.
        """
        # Cold Start Check: an unseen user OR an unseen game yields the global mean
        if user_id not in self.user2idx or game_id not in self.game2idx:
            return self.mu
        
        u = self.user2idx[user_id]
        g = self.game2idx[game_id]
        
        pred = self.mu + self.user_bias[u] + self.item_bias[g] + np.dot(self.P[u], self.Q[g])
        return pred

    def get_components(self, user_id, game_id):
        """
        Show breakdown of prediction components for analysis.

        Returns a dict with the global mean, both biases, the latent interaction
        and the total prediction, or {"error": ...} for an unseen user or game.
        """
        if user_id not in self.user2idx or game_id not in self.game2idx:
            return {"error": "User or Game not found in training data"}
            
        u = self.user2idx[user_id]
        g = self.game2idx[game_id]
        
        components = {
            "Global Mean": self.mu,
            "User Bias": self.user_bias[u],
            "Game Bias": self.item_bias[g],
            "Latent Interaction": np.dot(self.P[u], self.Q[g]),
            "Total Prediction": self.predict(user_id, game_id)
        }
        return components

    def predict_batch(self, df):
        """
        Batch prediction for efficiency.

        df: dataframe containing 'userID' and 'gameID' columns
        Returns: list of predictions (one float per row, in row order)

        Rows whose user or game was not seen in fit() get the global mean, exactly
        like predict(); all other rows are scored in one vectorised pass.
        """
        user_pos = df['userID'].map(self.user2idx)
        game_pos = df['gameID'].map(self.game2idx)
        
        # IDs missing from the mappings come back as NaN -> cold start rows
        known = (user_pos.notna() & game_pos.notna()).to_numpy()
        
        predictions = np.full(len(df), self.mu, dtype=float)
        if known.any():
            u = user_pos.to_numpy()[known].astype(np.intp)
            g = game_pos.to_numpy()[known].astype(np.intp)
            # Row-wise dot product of the selected user and game factor vectors
            interaction = (self.P[u] * self.Q[g]).sum(axis=1)
            predictions[known] = self.mu + self.user_bias[u] + self.item_bias[g] + interaction
        return predictions.tolist()

# --- Example Usage (Commented out) ---
# model = BiasedMF(n_factors=5, learning_rate=0.005, reg_bu=0.02, reg_bi=0.02, n_epochs=10)
# model.fit(train_df)
# preds = model.predict_batch(test_df)

### define hybrid model
1. biased matrix factorization (regularized SVD)
2. gradient boosting machine on features

A "Hybrid" approach, specifically a Residual Learning pipeline, is often the "secret sauce" in winning machine learning competitions. By letting the Matrix Factorization model handle the broad patterns (latent factors) and using a Gradient Boosting Machine (GBM) to fix the errors using specific content features (like review length or release date), you get the best of both worlds.

In [6]:
class HybridMF:
    """
    Two-stage hybrid:
    1. BiasedMF learns from user/game IDs (Latent Factors)
    2. Residual model (GBM/Ridge) learns from content features to correct MF errors
    """
    def __init__(self, mf_model, residual_model_type='gbm'):
        """
        Parameters:
        -----------
        mf_model : object
            Base model exposing fit(df) and predict_batch(df) (e.g. BiasedMF).
        residual_model_type : str
            'gbm' for GradientBoostingRegressor or 'ridge' for Ridge.

        Raises:
        -------
        ValueError
            If residual_model_type is neither 'gbm' nor 'ridge'.
        """
        self.mf_model = mf_model
        
        if residual_model_type == 'gbm':
            self.residual_model = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)
        elif residual_model_type == 'ridge':
            self.residual_model = Ridge(alpha=1.0)
        else:
            raise ValueError("residual_model_type must be 'gbm' or 'ridge'")
            
        self.is_fitted = False
        # Feature list used in fit(); predict_batch() falls back to it
        self.feature_columns = None

    def fit(self, train_df, feature_columns):
        """
        Stage 1: Train MF model
        Stage 2: Train residual model on (True - MF_Pred) using features

        Note that stage 1 calls mf_model.fit(train_df), so a base model that was
        already trained is trained again from scratch.
        """
        # Materialise as a list: a tuple would be read by pandas as ONE column key
        feature_columns = list(feature_columns)
        
        print("--- Stage 1: Training BiasedMF ---")
        self.mf_model.fit(train_df)
        
        # Get MF predictions for training set
        print("--- Calculating Residuals ---")
        mf_preds = np.array(self.mf_model.predict_batch(train_df))
        y_true = train_df['hours_transformed'].values
        
        # Residual = Actual - Predicted
        residuals = y_true - mf_preds
        
        # Missing feature values are treated as 0
        X = train_df[feature_columns].fillna(0)
        
        print(f"--- Stage 2: Training Residual Model ({type(self.residual_model).__name__}) ---")
        self.residual_model.fit(X, residuals)
        self.feature_columns = feature_columns
        self.is_fitted = True
        print("Hybrid Training Complete.")

        # Optional: Print Feature Importance if the residual model exposes it (GBM does)
        if hasattr(self.residual_model, 'feature_importances_'):
            importances = self.residual_model.feature_importances_
            indices = np.argsort(importances)[::-1]
            print("\nTop 5 Feature Importances (Residual Model):")
            for f in range(min(5, len(feature_columns))):
                print(f"{feature_columns[indices[f]]}: {importances[indices[f]]:.4f}")

    def predict_batch(self, df, feature_columns=None):
        """
        Hybrid prediction: MF Prediction + Residual Correction

        feature_columns defaults to the columns used in fit(), which keeps the
        column set and order identical between training and prediction.

        Raises:
        -------
        RuntimeError
            If called before fit().
        """
        if not self.is_fitted:
            raise RuntimeError("Model not fitted yet.")
        
        if feature_columns is None:
            feature_columns = self.feature_columns
        feature_columns = list(feature_columns)
            
        # 1. MF Prediction (Base)
        mf_preds = np.array(self.mf_model.predict_batch(df))
        
        # 2. Residual Prediction (Correction)
        X = df[feature_columns].fillna(0)
        res_preds = self.residual_model.predict(X)
        
        # 3. Combine
        final_preds = mf_preds + res_preds
        
        # Predictions are floored at 0 (per the original note, the target is
        # log2(hours+1) and therefore non-negative)
        final_preds = np.maximum(final_preds, 0)
        
        return final_preds



### create new features

In [7]:
def _add_entity_aggregates(df_feat, stats, id_col, count_col, mean_col, unseen_mean):
    """Attach, per `id_col`, the interaction count and mean target computed on `stats`."""
    if id_col not in df_feat.columns or id_col not in stats.columns:
        # No ID to aggregate on: emit neutral placeholders so the schema is stable
        df_feat[count_col] = 0
        df_feat[mean_col] = 0
        return

    df_feat[count_col] = df_feat[id_col].map(stats[id_col].value_counts())

    if 'hours_transformed' in stats.columns:
        entity_means = stats.groupby(id_col)['hours_transformed'].mean()
        mean_values = df_feat[id_col].map(entity_means)
        if unseen_mean is not None:
            # IDs absent from `stats` fall back to the global mean rather than 0
            mean_values = mean_values.fillna(unseen_mean)
        df_feat[mean_col] = mean_values
    else:
        df_feat[mean_col] = 0  # Placeholder when no target is available


def create_new_features(df, stats_df=None, current_year=2024):
    """
    Create features for predicting 'hours_transformed'.

    Parameters:
    -----------
    df : DataFrame
        Rows to featurise. Never modified; a copy is returned.
    stats_df : DataFrame or None
        Frame the aggregate features (counts and target means per user/game, and
        the global mean) are computed from. None (default) computes them on `df`
        itself, which leaks each row's own target into its mean features; pass the
        training split here when featurising validation or test rows.
    current_year : int
        Reference year for 'years_since_review'.

    Returns:
    --------
    DataFrame
        Copy of `df` with the added columns: user_game_count, user_mean_hours,
        game_user_count, game_mean_hours, review_length, day_of_week, is_weekend,
        month, years_since_review, user_deviation, plus date_dt (when 'date'
        exists) and compensation_flag (when 'compensation' exists).
    """
    df_feat = df.copy()
    stats = df_feat if stats_df is None else stats_df
    has_target = 'hours_transformed' in stats.columns
    global_mean = stats['hours_transformed'].mean() if has_target else 0
    # Only substitute the global mean for unseen IDs when stats come from elsewhere;
    # in self-aggregation mode the original "fill with 0" behaviour is kept.
    unseen_mean = global_mean if (stats_df is not None and has_target) else None
    
    print("Generating features...")
    
    # --- 1. User Features: activity level and mean target ---
    _add_entity_aggregates(df_feat, stats, 'userID', 'user_game_count', 'user_mean_hours', unseen_mean)

    # --- 2. Game Features: popularity and mean target ---
    _add_entity_aggregates(df_feat, stats, 'gameID', 'game_user_count', 'game_mean_hours', unseen_mean)

    # --- 3. Text Features ---
    if 'text' in df_feat.columns:
        # Length of review in characters; missing text counts as length 0
        df_feat['review_length'] = df_feat['text'].fillna('').apply(len)
    else:
        df_feat['review_length'] = 0

    # --- 4. Time Features ---
    if 'date' in df_feat.columns:
        # Unparseable dates become NaT; the derived numeric features then become 0
        df_feat['date_dt'] = pd.to_datetime(df_feat['date'], errors='coerce')
        
        # Day of week (0=Monday, 6=Sunday)
        df_feat['day_of_week'] = df_feat['date_dt'].dt.dayofweek
        
        # Saturday/Sunday flag (NaN day compares False -> 0)
        df_feat['is_weekend'] = (df_feat['day_of_week'] >= 5).astype(int)
        
        df_feat['month'] = df_feat['date_dt'].dt.month
        
        df_feat['years_since_review'] = current_year - df_feat['date_dt'].dt.year
    else:
        df_feat['day_of_week'] = 0
        df_feat['is_weekend'] = 0
        df_feat['month'] = 0
        df_feat['years_since_review'] = 0

    # --- 5. Engagement/Interaction Features ---
    if 'found_funny' in df_feat.columns:
        df_feat['found_funny'] = df_feat['found_funny'].fillna(0)
    
    if 'compensation' in df_feat.columns:
        # Flag any recorded compensation. This works for the raw string values and
        # for the 0/1 recoding applied by the loading cell, where comparing against
        # the literal 'Recorded Free' could never match.
        compensation = df_feat['compensation']
        df_feat['compensation_flag'] = (
            compensation.notna() & (compensation != 0) & (compensation != '')
        ).astype(int)

    # --- 6. User deviation from the global mean target ---
    has_user_means = 'userID' in df_feat.columns and 'userID' in stats.columns
    if has_target and has_user_means:
        df_feat['user_deviation'] = df_feat['user_mean_hours'] - global_mean
    else:
        df_feat['user_deviation'] = 0

    # Clean up NaNs created by mapping (cold start handling for features).
    # Datetime columns are left alone: writing 0 into them would turn NaT rows into
    # the integer 0 and degrade the column to object dtype.
    fill_cols = df_feat.select_dtypes(exclude=['datetime', 'datetimetz']).columns
    df_feat[fill_cols] = df_feat[fill_cols].fillna(0)
    
    return df_feat

### split train data into train/val split

In [8]:
def split_train_data(df, test_size=0.2, min_user_games=2, min_game_users=2, random_state=42):
    """
    Per-user train/validation split in which every validation user and game
    also occurs in the training part.

    Parameters:
    -----------
    df : DataFrame
        Interactions with columns ['userID', 'gameID', 'hours_transformed'].
    test_size : float
        Target fraction of each user's rows to hold out for validation.
    min_user_games : int
        Users with fewer rows are dropped; at least this many rows per user
        stay in training.
    min_game_users : int
        Games with fewer rows (after the user filter) are dropped.
    random_state : int
        Seed passed to np.random.seed (this reseeds the GLOBAL numpy state).

    Returns:
    --------
    train_df, val_df : DataFrames
    """
    np.random.seed(random_state)

    # --- Filter sparse users, then sparse games (single pass, not iterated) ---
    print("Filtering data for minimum interactions...")
    interactions = df.copy()

    rows_per_user = interactions['userID'].value_counts()
    kept_users = rows_per_user[rows_per_user >= min_user_games].index
    interactions = interactions[interactions['userID'].isin(kept_users)]

    rows_per_game = interactions['gameID'].value_counts()
    kept_games = rows_per_game[rows_per_game >= min_game_users].index
    interactions = interactions[interactions['gameID'].isin(kept_games)]

    print(f"Data filtered. Original: {len(df)}, Filtered: {len(interactions)}")

    # Positional labels 0..n-1 so sampled labels identify rows unambiguously
    interactions = interactions.reset_index(drop=True)

    # --- Sample held-out rows user by user ---
    print("Splitting data by user...")
    held_out_labels = []
    for _, user_rows in interactions.groupby('userID'):
        n_rows = len(user_rows)
        # Hold out the requested share, capped so min_user_games rows remain in train
        n_holdout = min(int(n_rows * test_size), n_rows - min_user_games)
        if n_holdout <= 0:
            continue
        held_out_labels.extend(np.random.choice(user_rows.index, n_holdout, replace=False))

    in_validation = interactions.index.isin(held_out_labels)
    train_df = interactions[~in_validation].copy()
    val_df = interactions[in_validation].copy()

    # --- Repair item cold start: games that ended up only in validation ---
    cold_start_items = set(val_df['gameID'].unique()) - set(train_df['gameID'].unique())
    if cold_start_items:
        print(f"Warning: {len(cold_start_items)} games appear in Validation but not Training. Moving them back to Train.")
        is_cold = val_df['gameID'].isin(cold_start_items)
        train_df = pd.concat([train_df, val_df[is_cold]], ignore_index=True)
        val_df = val_df[~is_cold].copy()

    # --- Final verification (reported via prints, never raised) ---
    if not set(val_df['userID'].unique()) <= set(train_df['userID'].unique()):
        print("Warning: Validation has users not in Train (Should not happen with user-group split).")

    if not set(val_df['gameID'].unique()) <= set(train_df['gameID'].unique()):
        print("Warning: Validation has items not in Train (Logic error in Step 4).")

    shared_pairs = (
        set(zip(train_df['userID'], train_df['gameID']))
        & set(zip(val_df['userID'], val_df['gameID']))
    )
    if shared_pairs:
        print(f"Critical Error: {len(shared_pairs)} overlapping pairs found between Train and Val.")
    else:
        print("Success: No pair overlap between Train and Val.")

    print(f"Final Split -- Train: {len(train_df)}, Val: {len(val_df)}")

    return train_df, val_df


### error analysis

In [9]:
def comprehensive_error_analysis(model, val_df, train_df, feature_columns=None, top_n_errors=50):
    """
    Print a diagnostic report of model performance on a validation set.

    Parameters:
    -----------
    model : object
        Either a base model exposing predict_batch(df) and get_components(u, g),
        or a hybrid wrapper exposing `mf_model` and predict_batch(df, feature_columns).
    val_df, train_df : DataFrame
        Frames with 'userID', 'gameID' and 'hours_transformed' columns.
    feature_columns : list or None
        Features for a hybrid model; when None they are inferred as the numeric
        columns of val_df (minus target/helper columns).
    top_n_errors : int
        Number of worst predictions to list.

    Returns:
    --------
    None. Everything is reported through print().
    """
    print("\n" + "="*60)
    print("COMPREHENSIVE ERROR ANALYSIS")
    print("="*60)
    
    # 1. Generate Predictions
    # Duck-typed on purpose: an isinstance check against the HybridMF class breaks as
    # soon as that class cell is re-executed (the model then belongs to the old class).
    is_hybrid = hasattr(model, 'mf_model')
    
    if is_hybrid:
        if feature_columns is None:
            print("Warning: feature_columns not provided for HybridMF. Using inferred numeric columns.")
            feature_columns = [c for c in val_df.columns if pd.api.types.is_numeric_dtype(val_df[c]) 
                               and c not in ['hours_transformed', 'prediction', 'user_idx', 'game_idx']]
        preds = model.predict_batch(val_df, feature_columns)
    else:
        preds = np.array(model.predict_batch(val_df))
        
    y_true = val_df['hours_transformed'].values
    residuals = y_true - preds
    abs_residuals = np.abs(residuals)
    
    # 2. Overall Metrics (Validation)
    mse = np.mean(residuals**2)
    rmse = np.sqrt(mse)
    mae = np.mean(abs_residuals)
    
    # Estimate Training Error for Overfitting Check (fixed-seed sample for speed)
    train_sample = train_df.sample(min(len(train_df), 10000), random_state=42)
    if is_hybrid:
        train_preds = model.predict_batch(train_sample, feature_columns)
    else:
        train_preds = np.array(model.predict_batch(train_sample))
    train_rmse = np.sqrt(np.mean((train_sample['hours_transformed'].values - train_preds)**2))
    
    print(f"\n--- PERFORMANCE METRICS ---")
    print(f"{'Metric':<10} | {'Validation':<10} | {'Train (Est)':<10}")
    print(f"{'-'*36}")
    print(f"{'RMSE':<10} | {rmse:<10.4f} | {train_rmse:<10.4f}")
    print(f"{'MAE':<10} | {mae:<10.4f} | {'-':<10}")
    
    # 3. Error Distribution Statistics
    print(f"\n--- ERROR DISTRIBUTION ---")
    print(f"Bias (Mean Residual): {np.mean(residuals):.4f} (Pos=Underpredict, Neg=Overpredict)")
    print(f"Std Deviation:        {np.std(residuals):.4f}")
    print(f"Quantiles [5, 25, 50, 75, 95]: {np.percentile(residuals, [5, 25, 50, 75, 95]).round(2)}")
    
    # 4. Cold Start Analysis
    train_users = set(train_df['userID'].unique())
    train_games = set(train_df['gameID'].unique())
    
    val_analysis = val_df.copy()
    val_analysis['prediction'] = preds
    val_analysis['error'] = residuals
    val_analysis['abs_error'] = abs_residuals
    val_analysis['is_new_user'] = ~val_analysis['userID'].isin(train_users)
    val_analysis['is_new_game'] = ~val_analysis['gameID'].isin(train_games)
    
    # A group with no rows prints 'nan'
    print(f"\n--- COLD START METRICS (MAE) ---")
    print(f"Known Users: {val_analysis[~val_analysis['is_new_user']]['abs_error'].mean():.4f}")
    print(f"New Users:   {val_analysis[val_analysis['is_new_user']]['abs_error'].mean():.4f} (Count: {val_analysis['is_new_user'].sum()})")
    print(f"Known Games: {val_analysis[~val_analysis['is_new_game']]['abs_error'].mean():.4f}")
    print(f"New Games:   {val_analysis[val_analysis['is_new_game']]['abs_error'].mean():.4f} (Count: {val_analysis['is_new_game'].sum()})")
    
    # 5. Prediction Range Analysis (Binning)
    print(f"\n--- BIAS BY TARGET VALUE (Playtime) ---")
    # include_lowest=True keeps targets of exactly 0 in the first bin (pd.cut bins are
    # right-closed by default, which would silently drop them); the open upper edge
    # makes '10+' really mean "everything above 10".
    bins = [0, 2, 4, 6, 8, 10, np.inf]
    labels = ['0-2', '2-4', '4-6', '6-8', '8-10', '10+']
    val_analysis['bin'] = pd.cut(val_analysis['hours_transformed'], bins=bins, labels=labels, include_lowest=True)
    # observed=False: list every bin, even empty ones, regardless of pandas default
    grouped = val_analysis.groupby('bin', observed=False)['error'].agg(['mean', 'count'])
    print(grouped)

    # 6. Worst Errors & Component Analysis
    print(f"\n--- TOP {top_n_errors} WORST PREDICTIONS ---")
    worst = val_analysis.sort_values('abs_error', ascending=False).head(top_n_errors)
    
    # The breakdown always comes from the base MF model, also for hybrids
    base_model = model.mf_model if is_hybrid else model
    
    print(f"{'UserID':<12} {'GameID':<12} {'True':<6} {'Pred':<6} {'Error':<6} {'Breakdown'}")
    for _, row in worst.iterrows():
        uid, gid = row['userID'], row['gameID']
        comps = base_model.get_components(uid, gid)
        
        # Simple string representation of breakdown
        if "error" in comps:
            breakdown = "Cold Start"
        else:
            breakdown = f"GAvg:{comps['Global Mean']:.1f} UB:{comps['User Bias']:.1f} GB:{comps['Game Bias']:.1f} Int:{comps['Latent Interaction']:.1f}"
            
        print(f"{uid:<12} {gid:<12} {row['hours_transformed']:<6.2f} {row['prediction']:<6.2f} {row['error']:<6.2f} {breakdown}")

    # 7. Actionable Suggestions (Heuristics)
    print(f"\n" + "="*30)
    print("ACTIONABLE SUGGESTIONS")
    print("="*30)
    
    # A. Regularization Check: train error more than 15% below validation error
    if train_rmse < rmse * 0.85:
        print("[!] OVERFITTING DETECTED")
        print("    -> Increase regularization terms (reg_bu, reg_bi, reg_pu, reg_qi).")
        print("    -> Decrease n_factors or n_epochs.")
        if is_hybrid:
            print("    -> Reduce GradientBoosting max_depth or n_estimators.")
    elif train_rmse > rmse:
         print("[!] UNDERFITTING DETECTED (or Val set is easier)")
         print("    -> Increase n_factors.")
         print("    -> Increase n_epochs.")
    
    # B. Bias Check
    mean_res = np.mean(residuals)
    if abs(mean_res) > 0.5:
        direction = "Underpredicting" if mean_res > 0 else "Overpredicting"
        print(f"[!] SYSTEMATIC BIAS: {direction}")
        print("    -> Check global mean calculation.")
        print("    -> Hybrid: Add a feature for 'global_trend' or adjust residual model intercept.")

    # C. High Value Underprediction (an empty '10+' bin has a NaN mean -> no message)
    high_bin_bias = grouped.loc['10+', 'mean'] if '10+' in grouped.index else 0
    if high_bin_bias > 1.0:
        print("[!] DIFFICULTY PREDICTING HARDCORE GAMERS (High hours)")
        print("    -> Feature Idea: Add 'is_hardcore_user' (top 10% playtime) as feature.")
        print("    -> Feature Idea: Add 'game_completion_time' (if external data available).")

    print("="*60 + "\n")

### everybody all together now: full workflow

In [None]:
# 1. Engineer features on the full training frame before splitting, so that
#    columns such as 'user_game_count' exist in both resulting sets.
# NOTE(review): create_new_features() derives 'user_mean_hours' and
# 'game_mean_hours' from 'hours_transformed' of every row it receives. Computing
# them here, before the split, lets validation targets (and each row's own
# target) flow into the features, so the hybrid model's validation metrics are
# likely optimistic -- TODO confirm and recompute aggregates from the train split only.
df_train_data_with_features = create_new_features(df_train_data)

# 2. Split into Train and Validation.
# split_train_data() moves games that would appear only in validation back to train.
# It also seeds the global numpy random state with random_state.
df_train, df_val = split_train_data(
    df_train_data_with_features, 
    test_size=0.2, 
    min_user_games=2, 
    min_game_users=2, 
    random_state=42
)

# 3. Numeric feature columns fed to the hybrid model's residual stage.
# ID columns and the target itself are deliberately left out.
feature_cols = [
    'review_length', 'user_game_count', 'game_user_count', 
    'user_mean_hours', 'game_mean_hours', 'day_of_week', 
    'is_weekend', 'user_deviation'
]

# 4. Initialize and train BiasedMF (base model)
svd_reg_model = BiasedMF(
    n_factors=10, 
    learning_rate=0.01, 
    reg_bu=0.02, reg_bi=0.2, reg_pu=0.01, reg_qi=0.1, 
    n_epochs=20
)
svd_reg_model.fit(df_train)

# 5. Error analysis for the base model.
# Evaluated on df_val so the report reflects generalization error;
# df_train is only used for the train-RMSE estimate and cold-start lookups.
print("--- BASE MODEL ANALYSIS ---")
comprehensive_error_analysis(svd_reg_model, val_df=df_val, train_df=df_train, top_n_errors=50)

# 6. Initialize and train HybridMF around the same base model object.
hybrid_model = HybridMF(svd_reg_model, residual_model_type='gbm')

# feature_columns is a required argument of HybridMF.fit.
# NOTE(review): HybridMF.fit calls mf_model.fit(train_df) itself, so svd_reg_model
# is trained a second time here and the weights analysed in step 5 are replaced.
hybrid_model.fit(df_train, feature_columns=feature_cols)

# 7. Error analysis for the hybrid model (same validation set, same feature list)
print("--- HYBRID MODEL ANALYSIS ---")
comprehensive_error_analysis(hybrid_model, val_df=df_val, train_df=df_train, feature_columns=feature_cols, top_n_errors=50)

In [None]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import Ridge

class HybridMF:
    """
    Two-stage hybrid (scaffold only -- the methods below are not implemented):
    1. BiasedMF learns from user/game IDs
    2. Residual model learns from features

    NOTE(review): this cell redefines HybridMF and therefore shadows the complete
    implementation defined earlier in the notebook once it is executed. The
    unimplemented methods raise instead of silently returning None so that such
    accidental use is noticed immediately.
    """
    def __init__(self,mf_model, residual_model='gbm'):
        self.mf_model = mf_model
        self.residual_model_type = residual_model

    def fit(self, train_df, feature_columns):
        """
        Stage 1: train MF model
        Stage 2: train residual model on MF errors using features
        """
        # Planned steps:
        #   - get MF predictions
        #   - calculate residuals
        #   - train residual model
        #   - feature importance for GBM
        raise NotImplementedError("HybridMF.fit is an unimplemented scaffold.")

    def predict_batch(self,df):
        """
        Hybrid prediction: MF + residual correction
        """
        # Planned steps:
        #   - MF prediction
        #   - residual prediction
        #   - combined
        raise NotImplementedError("HybridMF.predict_batch is an unimplemented scaffold.")

In [None]:
def create_new_features(df):
    """
    Create user-level features for predicting 'hours_transformed' for (userID,gameID) pairs.

    NOTE(review): this draft reuses the name of the fuller create_new_features
    defined earlier in the notebook and shadows it once this cell is executed.

    Parameters:
    -----------
    df : Dataframe
        original pandas DataFrame for training data with following columns:
        'userID' : str
        'gameID' : str 
        'hours_transformed' : float 
        'early_access' : bool {0,1} 
        'date' : str (format="%Y-%m-%d"), 
        'text' : str
        'found_funny' : int
        'compensation' : bool {0,1}

    Returns:
    --------
    df_with_features : Dataframe
        The same DataFrame object that was passed in, with the columns
        'user_games_count_log', 'user_avg_hours' and 'user_avg_text_len' added.
        The input is modified in place (kept for backward compatibility) and is
        now also returned, as this docstring always promised.
    """
    # --- user features ---
    # Games per user come from the module-level `user_games` dict (built while
    # loading the training file), not from `df`; users absent from it count as 0.
    user_games_count = pd.Series({u:len(g) for u,g in user_games.items()})
    df['user_games_count_log'] = np.log1p(df['userID'].map(user_games_count).fillna(0))
    # Average transformed hours per user, computed on `df` itself
    user_avg_hours = df.groupby('userID')['hours_transformed'].mean()
    df['user_avg_hours'] = df['userID'].map(user_avg_hours)
    # Average review length (characters) per user; users without text get 0
    user_avg_text_len = df.groupby('userID')['text'].apply(lambda x: x.str.len().mean())
    df['user_avg_text_len'] = df['userID'].map(user_avg_text_len).fillna(0)

    # TODO: game features (e.g. user count per game) are not implemented yet
    return df
    ## create new game features
# Number of users recorded for each game, taken from the game_users lookup
game_users_count = pd.Series({game: len(users) for game, users in game_users.items()})

# Quick look at the training frame (only the last expression is rendered)
df_train_data.head()
df_train_data.describe()


Unnamed: 0,hours,hours_transformed,found_funny
count,175000.0,175000.0,175000.0
mean,66.408189,3.717845,1.356497
std,275.203113,2.297882,24.119107
min,0.0,0.0,0.0
25%,3.0,2.0,0.0
50%,10.1,3.472488,0.0
75%,33.5,5.108524,0.0
max,16539.9,14.01375,4013.0


In [8]:
def split_train_data(df, test_size=0.2, min_user_games=2, min_game_users=2, random_state=42):
    """
    Per-user random hold-out split of the interaction data.

    Every user and every game that ends up in the validation part is also
    meant to be present in the training part (mimicking test data made of
    known users/games). The reverse is not enforced: a user or game may occur
    in training only.

    Parameters:
    -----------
    df : DataFrame
        pandas DataFrame with columns ['userID', 'gameID', 'hours_transformed'];
        it is copied, never modified
    test_size : float
        fraction of each user's rows to hold out for validation
    min_user_games : int
        users with fewer rows than this are dropped, and at least this many
        rows per remaining user are kept in training
    min_game_users : int
        games with fewer rows than this (after the user filter) are dropped
    random_state : int
        seed passed to np.random.seed (this reseeds NumPy's global generator)

    Returns:
    --------
    train_df, val_df : DataFrames
    """
    # The per-user draws below use NumPy's global generator, so seed it first.
    np.random.seed(random_state)

    # ---- 1. sparsity filter: users first, then games (one pass, not iterated) ----
    print("Filtering data for minimum interactions...")
    filtered = df.copy()

    rows_per_user = filtered['userID'].value_counts()
    kept_users = rows_per_user.loc[rows_per_user >= min_user_games].index
    filtered = filtered[filtered['userID'].isin(kept_users)]

    rows_per_game = filtered['gameID'].value_counts()
    kept_games = rows_per_game.loc[rows_per_game >= min_game_users].index
    filtered = filtered[filtered['gameID'].isin(kept_games)]

    print(f"Data filtered. Original: {len(df)}, Filtered: {len(filtered)}")

    # ---- 2. per-user hold-out ----
    # With a fresh 0..n-1 index, row labels double as positions in the mask.
    filtered = filtered.reset_index(drop=True)

    print("Splitting data by user...")
    held_out = []
    for _, user_rows in filtered.groupby('userID'):
        total = len(user_rows)
        # Hold out test_size of the user's rows, but never so many that fewer
        # than min_user_games of them remain for training.
        quota = min(int(total * test_size), total - min_user_games)
        if quota > 0:
            held_out.extend(np.random.choice(user_rows.index, quota, replace=False))

    in_train = np.ones(len(filtered), dtype=bool)  # True = train, False = validation
    in_train[held_out] = False

    train_df = filtered[in_train].copy()
    val_df = filtered[~in_train].copy()

    # ---- 3. repair item cold-start: games that landed only in validation ----
    orphan_games = set(val_df['gameID'].unique()) - set(train_df['gameID'].unique())
    if orphan_games:
        print(f"Warning: {len(orphan_games)} games appear in Validation but not Training. Moving them back to Train.")
        is_orphan = val_df['gameID'].isin(orphan_games)
        # Hand every validation row of such a game back to the training part.
        train_df = pd.concat([train_df, val_df[is_orphan]], ignore_index=True)
        val_df = val_df[~is_orphan].copy()

    # ---- 4. sanity checks (report only, nothing is raised) ----
    if not set(val_df['userID'].unique()) <= set(train_df['userID'].unique()):
        print("Warning: Validation has users not in Train (Should not happen with user-group split).")

    if not set(val_df['gameID'].unique()) <= set(train_df['gameID'].unique()):
        print("Warning: Validation has items not in Train (Logic error in Step 4).")

    shared_pairs = (
        set(zip(train_df['userID'], train_df['gameID']))
        & set(zip(val_df['userID'], val_df['gameID']))
    )
    if shared_pairs:
        print(f"Critical Error: {len(shared_pairs)} overlapping pairs found between Train and Val.")
    else:
        print("Success: No pair overlap between Train and Val.")

    print(f"Final Split -- Train: {len(train_df)}, Val: {len(val_df)}")

    return train_df, val_df

# Build the train/validation split from the full training frame with a fixed seed.
train_df,val_df = split_train_data(df_train_data, test_size=0.2, min_user_games=2, min_game_users=2, random_state=42)

Filtering data for minimum interactions...
Data filtered. Original: 175000, Filtered: 174993
Splitting data by user...
Success: No pair overlap between Train and Val.
Final Split -- Train: 142675, Val: 32318


In [None]:
import numpy as np
import pandas as pd

class BiasedMF:
    """
    Biased Matrix Factorization (Regularized SVD) optimized via SGD.

    Model: r_hat = mu + b_u + b_i + dot(p_u, q_i)
    """

    def __init__(self, n_factors=10, learning_rate=0.01,
                 reg_bu=0.0, reg_bi=0.0, reg_pu=0.0, reg_qi=0.0, n_epochs=10,
                 random_state=None):
        """
        Store the hyper-parameters; no training happens here.

        Parameters:
        -----------
        n_factors : int
            Number of latent factors (dimension of vectors p_u and q_i)
        learning_rate : float
            Step size for Gradient Descent (eta)
        reg_bu, reg_bi, reg_pu, reg_qi : float
            Regularization parameters (lambda) to prevent overfitting
        n_epochs : int
            Number of passes over the training data
        random_state : int or None
            Seed for a private random generator used by fit() for factor
            initialization and per-epoch shuffling. None (the default) keeps
            drawing from NumPy's global generator.
        """
        self.n_factors = n_factors
        self.lr = learning_rate
        self.reg_bu = reg_bu
        self.reg_bi = reg_bi
        self.reg_pu = reg_pu
        self.reg_qi = reg_qi
        self.n_epochs = n_epochs
        self.random_state = random_state

        # Internal state
        self.mu = 0.0
        self.user_bias = None  # numpy array
        self.item_bias = None  # numpy array
        self.P = None          # User factors (n_users x n_factors)
        self.Q = None          # Item factors (n_items x n_factors)

        # Mappings (filled by fit)
        self.user2idx = {}
        self.idx2user = {}
        self.game2idx = {}
        self.idx2game = {}

    def fit(self, df):
        """
        Trains the model using Stochastic Gradient Descent.
        df columns: ['userID', 'gameID', 'hours_transformed']

        Prints the mean squared training error after every epoch.
        """
        print("Initializing BiasedMF...")

        # Random source: the global NumPy generator unless a seed was given.
        rng = np.random if self.random_state is None else np.random.RandomState(self.random_state)

        # 1. Create Mappings (forward and reverse)
        unique_users = df['userID'].unique()
        unique_games = df['gameID'].unique()

        self.n_users = len(unique_users)
        self.n_games = len(unique_games)

        self.user2idx = {u: i for i, u in enumerate(unique_users)}
        self.game2idx = {g: i for i, g in enumerate(unique_games)}
        # The reverse mappings were declared in __init__ but never filled.
        self.idx2user = dict(enumerate(unique_users))
        self.idx2game = dict(enumerate(unique_games))

        # 2. Initialize Parameters
        self.mu = df['hours_transformed'].mean()

        # Biases initialized to zero
        self.user_bias = np.zeros(self.n_users)
        self.item_bias = np.zeros(self.n_games)

        # Latent factors start as small Gaussian noise (std 0.1)
        self.P = rng.normal(0, 0.1, (self.n_users, self.n_factors))
        self.Q = rng.normal(0, 0.1, (self.n_games, self.n_factors))

        # Integer-encode the IDs once so the inner loop works on plain arrays
        users_idx = df['userID'].map(self.user2idx).values
        games_idx = df['gameID'].map(self.game2idx).values
        ratings = df['hours_transformed'].values

        n_samples = len(df)

        # 3. Training Loop (SGD)
        print(f"Starting training on {n_samples} samples for {self.n_epochs} epochs.")

        for epoch in range(self.n_epochs):
            # Visit the samples in a fresh random order every epoch
            indices = rng.permutation(n_samples)

            total_error = 0

            for i in indices:
                u = users_idx[i]
                g = games_idx[i]
                r = ratings[i]

                # --- Prediction Step ---
                # r_hat = mu + b_u + b_i + P_u . Q_i
                dot_prod = np.dot(self.P[u], self.Q[g])
                pred = self.mu + self.user_bias[u] + self.item_bias[g] + dot_prod

                # --- Error Calculation ---
                err = r - pred
                total_error += err**2

                # --- Update Rules (Gradient Descent) ---
                # b_u <- b_u + lr * (err - reg * b_u)
                self.user_bias[u] += self.lr * (err - self.reg_bu * self.user_bias[u])
                self.item_bias[g] += self.lr * (err - self.reg_bi * self.item_bias[g])

                # Keep the pre-update P[u] so Q[g] is updated against the same
                # user vector that produced `err`.
                p_u_current = self.P[u].copy()

                # P_u <- P_u + lr * (err * Q_i - reg * P_u)
                self.P[u] += self.lr * (err * self.Q[g] - self.reg_pu * self.P[u])

                # Q_i <- Q_i + lr * (err * P_u - reg * Q_i)
                self.Q[g] += self.lr * (err * p_u_current - self.reg_qi * self.Q[g])

            mse = total_error / n_samples
            print(f"Epoch {epoch+1}/{self.n_epochs} - MSE: {mse:.4f}")

    def predict(self, user_id, game_id):
        """
        Predict rating for user item pair.
        Handles Cold Start by falling back to global mean.
        """
        # Cold Start Check
        if user_id not in self.user2idx or game_id not in self.game2idx:
            # If we haven't seen the user or game, return global mean
            # (A more advanced approach would use just user_bias or item_bias if available)
            return self.mu

        u = self.user2idx[user_id]
        g = self.game2idx[game_id]

        pred = self.mu + self.user_bias[u] + self.item_bias[g] + np.dot(self.P[u], self.Q[g])
        return pred

    def get_components(self, user_id, game_id):
        """
        Show breakdown of prediction components for analysis.

        Returns a dict with the global mean, both biases, the latent
        interaction term and the total prediction, or a dict with a single
        "error" key when the user or the game was not seen during fit.
        """
        if user_id not in self.user2idx or game_id not in self.game2idx:
            return {"error": "User or Game not found in training data"}

        u = self.user2idx[user_id]
        g = self.game2idx[game_id]

        components = {
            "Global Mean": self.mu,
            "User Bias": self.user_bias[u],
            "Game Bias": self.item_bias[g],
            "Latent Interaction": np.dot(self.P[u], self.Q[g]),
            "Total Prediction": self.predict(user_id, game_id)
        }
        return components

    def predict_batch(self, df):
        """
        Batch prediction for efficiency.
        df: dataframe containing 'userID' and 'gameID' columns
        Returns: list of predictions (one per row, in row order)

        Applies the same rule as predict(): rows whose user or game is unknown
        get the global mean. Known rows are scored with array operations
        instead of a per-row Python call.
        """
        n_rows = len(df)
        # Start every row at the cold-start fallback, then overwrite known rows.
        preds = np.full(n_rows, self.mu, dtype=float)

        # Encode IDs to row indices; -1 marks an ID not seen during fit.
        u_idx = np.fromiter((self.user2idx.get(u, -1) for u in df['userID']),
                            dtype=np.intp, count=n_rows)
        g_idx = np.fromiter((self.game2idx.get(g, -1) for g in df['gameID']),
                            dtype=np.intp, count=n_rows)

        known = (u_idx >= 0) & (g_idx >= 0)
        if known.any():
            u = u_idx[known]
            g = g_idx[known]
            # Row-wise dot product of the selected user and item factor vectors.
            interaction = np.einsum('ij,ij->i', self.P[u], self.Q[g])
            preds[known] = self.mu + self.user_bias[u] + self.item_bias[g] + interaction

        return list(preds)

# --- Example Usage (Commented out) ---
# model = BiasedMF(n_factors=5, learning_rate=0.005, reg_bu=0.02, reg_bi=0.02, n_epochs=10)
# model.fit(train_df)
# preds = model.predict_batch(test_df)