In [None]:
import numpy as np
import pandas as pd
import gzip
import json
from tqdm import tqdm # Import tqdm for progress bars
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import HistGradientBoostingRegressor, VotingRegressor
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer, make_column_transformer

# Register tqdm with pandas so that Series/DataFrame objects expose
# `progress_apply`, which prepare_content_features relies on for its
# review-length progress bar.
tqdm.pandas()

# ==========================================
# 1. PURE CONTENT FEATURE ENGINEERING
# ==========================================
def prepare_content_features(df, is_train=True, tfidf_model=None, scaler_model=None):
    """
    Build the dense content feature matrix (TF-IDF text + scaled metadata).

    Fitting (train) is kept separate from transforming (validation/test) so
    the vocabulary and the scaling statistics come from training rows only.

    Args:
        df: DataFrame with a 'text' column. The 'date', 'found_funny' and
            'compensation' columns are used when present and replaced by
            constant per-row columns otherwise.
        is_train: When True, fit a new TfidfVectorizer and StandardScaler on
            `df`. When False, reuse `tfidf_model` and `scaler_model`.
        tfidf_model: Fitted TfidfVectorizer (required when is_train=False).
        scaler_model: Fitted StandardScaler (required when is_train=False).

    Returns:
        Tuple (X_final, tfidf_model, scaler_model). X_final is a dense array
        holding the TF-IDF columns followed by the 7 scaled metadata columns.

    Raises:
        ValueError: If is_train is False and a fitted model is missing.
    """
    print(f"   -> Feature Engineering (Train Context: {is_train})...")

    if not is_train and (tfidf_model is None or scaler_model is None):
        raise ValueError(
            "tfidf_model and scaler_model must be provided when is_train=False"
        )

    n_rows = len(df)

    # --- A. Text Features (TF-IDF) ---
    # A modest vocabulary keeps the densified matrix and MLP training cheap.
    txt = df['text'].fillna('none').astype(str)

    if is_train:
        print("      -> Fitting TF-IDF (This may take a moment)...")
        # No tqdm here: fit_transform is a single call with no hook for a bar.
        tfidf_model = TfidfVectorizer(max_features=2000, stop_words='english', ngram_range=(1, 2))
        X_text = tfidf_model.fit_transform(txt).toarray()  # Dense for MLP
    else:
        print("      -> Transforming Text...")
        X_text = tfidf_model.transform(txt).toarray()

    # --- B. Metadata Features ---
    if 'date' in df.columns:
        dt = pd.to_datetime(df['date'], errors='coerce')
        # Unparseable dates fall back to fixed defaults.
        year = dt.dt.year.fillna(2015) - 2000
        month = dt.dt.month.fillna(1)
        day_of_week = dt.dt.dayofweek.fillna(0)
        is_weekend = (day_of_week >= 5).astype(float)
    else:
        # Use full-length zero columns: bare scalars cannot be column-stacked
        # next to length-n columns.
        year = np.zeros(n_rows)
        month = np.zeros(n_rows)
        day_of_week = np.zeros(n_rows)
        is_weekend = np.zeros(n_rows)

    # progress_apply (registered by tqdm.pandas()) shows a bar for this pass.
    print("      -> Calculating Review Lengths...")
    review_len = df['text'].fillna('').progress_apply(len)

    # Optional columns degrade to all-zero features when absent.
    if 'found_funny' in df.columns:
        found_funny = df['found_funny'].fillna(0)
    else:
        found_funny = np.zeros(n_rows)

    if 'compensation' in df.columns:
        compensation = (df['compensation'] == 'Recorded Free').astype(float)
    else:
        compensation = np.zeros(n_rows)

    # Stack the 7 metadata columns in a fixed order.
    X_meta_raw = np.column_stack([
        year, month, day_of_week, is_weekend,
        review_len, found_funny, compensation
    ])

    # Scale metadata with statistics learned from the training rows.
    if is_train:
        scaler_model = StandardScaler()
        X_meta = scaler_model.fit_transform(X_meta_raw)
    else:
        X_meta = scaler_model.transform(X_meta_raw)

    # --- Combine ---
    # Text columns first, then the 7 metadata columns.
    X_final = np.hstack([X_text, X_meta])

    return X_final, tfidf_model, scaler_model

# ==========================================
# 2. MODEL DEFINITION
# ==========================================
def build_ensemble_model():
    """
    Create the (unfitted) two-member regression ensemble.

    Members:
      * 'mlp' - MLPRegressor with hidden layers (512, 256, 64)
      * 'gbm' - HistGradientBoostingRegressor
    A VotingRegressor averages their predictions with weights
    0.4 (mlp) and 0.6 (gbm).

    Returns:
        The unfitted VotingRegressor.
    """
    print("   -> Building Ensemble (MLP + Gradient Boosting)...")

    # Member 1: feed-forward network trained with Adam.
    mlp_params = dict(
        hidden_layer_sizes=(512, 256, 64),
        activation='relu',
        solver='adam',
        alpha=0.0001,               # L2 penalty strength
        batch_size=128,
        # NOTE(review): per the sklearn docs this schedule is only used
        # with solver='sgd'; confirm whether it is intended here.
        learning_rate='adaptive',
        learning_rate_init=0.001,
        early_stopping=True,        # hold out part of the data to stop early
        validation_fraction=0.1,
        n_iter_no_change=5,
        max_iter=50,                # upper bound on training epochs
        random_state=42,
        verbose=True,               # log progress per iteration
    )

    # Member 2: histogram-based gradient boosting.
    gbm_params = dict(
        max_iter=200,
        learning_rate=0.1,
        max_depth=10,
        l2_regularization=0.1,
        early_stopping=True,
        random_state=42,
        verbose=1,                  # log progress per iteration
    )

    members = [
        ('mlp', MLPRegressor(**mlp_params)),
        ('gbm', HistGradientBoostingRegressor(**gbm_params)),
    ]

    # Weighted average: the boosting member gets the larger share.
    return VotingRegressor(estimators=members, weights=[0.4, 0.6])

# ==========================================
# 3. EXECUTION UTILITIES
# ==========================================
def readJSON(path):
    """
    Lazily yield one parsed record per line of a gzip-compressed text file.

    Args:
        path: Path to the gzip file, opened in text mode.

    Yields:
        The object obtained by evaluating each line.

    Raises:
        FileNotFoundError: If `path` does not exist (raised on first iteration).
    """
    # The context manager guarantees the handle is closed once the generator
    # is exhausted or closed, instead of leaking it.
    with gzip.open(path, 'rt') as fh:
        for line in fh:
            # SECURITY: eval() executes arbitrary code. Only feed this trusted
            # data files; consider ast.literal_eval if every record is a
            # plain Python literal.
            yield eval(line)

if __name__ == "__main__":
    # End-to-end run of the content-only pipeline:
    # load -> split -> featurize -> fit ensemble -> report RMSE and bias.
    print("Starting Sklearn Content-Based Pipeline...")

    # 1. Load Data
    try:
        train_data = []
        # We strictly ignore ID logic here as requested
        # tqdm visualizes data loading speed
        print("Loading Data from disk...")
        for d in tqdm(readJSON("train.json.gz"), desc="Reading Lines"):
            train_data.append(d)

        df = pd.DataFrame(train_data)

        # Basic cleanup: make sure optional columns exist and hold no NaNs.
        if 'found_funny' not in df.columns:
            df['found_funny'] = 0
        if 'compensation' not in df.columns:
            df['compensation'] = 'None'
        df['found_funny'] = df['found_funny'].fillna(0)
        df['compensation'] = df['compensation'].fillna('None')
        df['hours'] = df['hours'].fillna(0)
        # Regression target: log2(hours + 1).
        df['hours_transformed'] = np.log2(df['hours'] + 1)

    except FileNotFoundError:
        # Fallback so the pipeline can still be smoke-tested without the data.
        print("Error: train.json.gz not found. Creating dummy data...")
        df = pd.DataFrame({
            'hours_transformed': np.random.uniform(0, 14, 5000),
            'text': ['this game is amazing and addictive' if i%2==0 else 'boring refund' for i in range(5000)],
            'date': ['2020-01-01']*5000,
            'found_funny': [0]*5000,
            'compensation': ['None']*5000
        })

    # 2. Split
    # Split BEFORE feature engineering so TF-IDF/scaler are fit on train only.
    y = df['hours_transformed'].values
    train_df, val_df, y_train, y_val = train_test_split(df, y, test_size=0.1, random_state=42)

    print(f"Train samples: {len(train_df)}, Val samples: {len(val_df)}")

    # 3. Feature Engineering
    # Fit on Train
    X_train, tfidf, scaler = prepare_content_features(train_df, is_train=True)
    # Transform Val with the models fitted on Train
    X_val, _, _ = prepare_content_features(val_df, is_train=False, tfidf_model=tfidf, scaler_model=scaler)

    # 4. Train Ensemble
    model = build_ensemble_model()

    print("\nTraining Ensemble Model (Logs will appear for each iteration)...")
    # Both ensemble members are configured verbose, so progress is logged.
    model.fit(X_train, y_train)

    # 5. Evaluation
    print("\n--- Evaluation ---")
    val_preds = model.predict(X_val)

    # The target is log2(hours + 1), so negative predictions are invalid.
    val_preds = np.clip(val_preds, 0, None)

    rmse = np.sqrt(np.mean((y_val - val_preds)**2))
    print(f"FINAL RMSE: {rmse:.4f}")

    # Bias Check: mean signed error per target bin.
    val_df_res = val_df.copy()
    val_df_res['pred'] = val_preds
    val_df_res['error'] = val_df_res['hours_transformed'] - val_df_res['pred']
    # include_lowest=True keeps rows whose target is exactly 0; with the
    # default right-closed bins they would fall outside (0, 2] and be
    # dropped from the report.
    val_df_res['bin'] = pd.cut(
        val_df_res['hours_transformed'],
        bins=[0, 2, 4, 6, 8, 10, 20],
        include_lowest=True,
    )

    print("\nBias by Target Bin:")
    print(val_df_res.groupby('bin', observed=False)['error'].mean())

    print("\n" + "="*40)
    print("       MODEL CONFIGURATION SUMMARY       ")
    print("="*40)
    print(f"Input Features:      {X_train.shape[1]} (TF-IDF + Metadata)")
    print("-" * 40)
    print("Algorithm 1: MLPRegressor (Deep Learning)")
    print("  - Layers: (512, 256, 64)")
    print("  - Activation: ReLU")
    print("  - Solver: Adam")
    print("-" * 40)
    print("Algorithm 2: HistGradientBoostingRegressor")
    print("  - Max Depth: 10")
    print("  - Learning Rate: 0.1")
    print("-" * 40)
    print("Ensemble Weight:     40% MLP / 60% Gradient Boosting")
    print(f"Final RMSE:          {rmse:.4f}")
    print("="*40 + "\n")

In [7]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, Flatten, Concatenate, Dense, Dropout, BatchNormalization, Activation
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam, Nadam
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.regularizers import l2
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
import gzip
import json

# ==========================================
# 1. ADVANCED FEATURE ENGINEERING (NO LEAKAGE)
# ==========================================
def prepare_data(df, max_features=5000, n_components=64, ngram_range=(1, 1), random_state=42):
    """
    Add text-SVD, date and length features to a copy of `df`.

    The TF-IDF vectorizer and the SVD are fit on every row of `df`, so a
    caller that splits train/validation afterwards has exposed the
    validation text to these (unsupervised) fits.

    Args:
        df: DataFrame with a 'text' column and, optionally, a 'date' column.
        max_features: Vocabulary size cap for the TF-IDF vectorizer.
        n_components: Number of SVD components kept as dense text features.
        ngram_range: N-gram range passed to the TF-IDF vectorizer.
        random_state: Seed for the SVD.

    Returns:
        Tuple (df, text_cols): a new DataFrame (the input is not modified)
        with columns text_svd_*, year, month, days_timeline and review_len
        added, and the list of text_svd_* column names.
    """
    print("--- 1. Text Processing (TF-IDF + SVD) ---")
    # Sparse TF-IDF over the review text, later compressed to dense features.
    tfidf = TfidfVectorizer(max_features=max_features, stop_words='english', ngram_range=ngram_range)
    # Missing text is replaced by the token 'none'.
    txt = df['text'].fillna('none').astype(str)
    tfidf_matrix = tfidf.fit_transform(txt)

    print("   -> Compressing Text features via SVD...")
    svd = TruncatedSVD(n_components=n_components, random_state=random_state)
    text_features = svd.fit_transform(tfidf_matrix)

    # Name the columns from the actual output width so names and data
    # always stay in sync with n_components.
    text_cols = [f'text_svd_{i}' for i in range(text_features.shape[1])]
    df_text = pd.DataFrame(text_features, columns=text_cols, index=df.index)

    # concat returns a new frame, so the assignments below do not touch
    # the caller's DataFrame.
    df = pd.concat([df, df_text], axis=1)

    print("--- 2. Date Processing ---")
    if 'date' in df.columns:
        dt = pd.to_datetime(df['date'], errors='coerce')
        # Unparseable dates fall back to fixed defaults.
        df['year'] = dt.dt.year.fillna(2015)
        df['month'] = dt.dt.month.fillna(1)
        # Days since a fixed baseline (captures trends over time).
        df['days_timeline'] = (dt - pd.Timestamp('2000-01-01')).dt.days.fillna(0)
    else:
        df['year'] = 2015
        df['month'] = 1
        df['days_timeline'] = 0

    print("--- 3. Basic Counts ---")
    df['review_len'] = df['text'].fillna('').apply(len)

    return df, text_cols

# ==========================================
# 2. TARGET ENCODING (THE "PEER" TRICK)
# ==========================================
def add_target_stats(train_df, val_df, target_col='hours_transformed'):
    """
    Add per-user and per-game mean-target encodings to both frames.

    The means are computed on `train_df` only and then mapped onto both
    frames, so no validation target is ever used. Users or games unseen in
    training receive the global training mean.

    NOTE(review): each training row's encoding includes that row's own
    target, so the training features are more informative than the
    validation ones; a leave-one-out or out-of-fold encoding would remove
    that optimism.

    Args:
        train_df: Training rows with 'userID', 'gameID' and `target_col`.
        val_df: Validation rows with 'userID' and 'gameID'.
        target_col: Name of the target column in `train_df`.

    Returns:
        Tuple (train_df, val_df): copies of the inputs with the columns
        'user_target_enc' and 'game_target_enc' added. The inputs themselves
        are left untouched.
    """
    print("--- 4. Computing Target Statistics (No Leakage) ---")

    # Copy first: the inputs are typically slices produced by
    # train_test_split, and writing columns onto them mutates caller state
    # and triggers pandas chained-assignment warnings.
    train_df = train_df.copy()
    val_df = val_df.copy()

    # Fallback value for cold-start users/games.
    global_mean = train_df[target_col].mean()

    encodings = (
        ('userID', 'user_target_enc'),
        ('gameID', 'game_target_enc'),
    )
    for key_col, enc_col in encodings:
        # Statistics come from the training rows only.
        means = train_df.groupby(key_col)[target_col].mean()
        for frame in (train_df, val_df):
            frame[enc_col] = frame[key_col].map(means).fillna(global_mean)

    return train_df, val_df

# ==========================================
# 3. WIDE & DEEP MODEL ARCHITECTURE
# ==========================================
def build_wide_and_deep(n_users, n_items, n_dense):
    """
    Build and compile the embedding + dense-feature regression network.

    User and item embeddings, their dot product and the dense feature
    vector are concatenated and passed through an MLP (512 -> 256 -> 128)
    that ends in a single linear output.

    Args:
        n_users: Size of the user embedding table.
        n_items: Size of the item embedding table.
        n_dense: Width of the dense feature input.

    Returns:
        A compiled Keras Model taking [user_id, item_id, dense_features].
    """
    emb_dim = 64

    # --- Inputs ---
    inputs = [
        Input(shape=(1,), name='user_id'),
        Input(shape=(1,), name='item_id'),
        Input(shape=(n_dense,), name='dense_features'),
    ]
    user_in, item_in, dense_in = inputs

    # --- Latent factors (L2-regularised embedding tables) ---
    user_emb = Embedding(
        n_users, emb_dim, embeddings_regularizer=l2(1e-5), name='user_emb'
    )(user_in)
    item_emb = Embedding(
        n_items, emb_dim, embeddings_regularizer=l2(1e-5), name='item_emb'
    )(item_in)

    u_vec = Flatten()(user_emb)
    i_vec = Flatten()(item_emb)

    # --- Explicit user-item interaction fed to the MLP as one more feature ---
    dot = tf.keras.layers.Dot(axes=1)([u_vec, i_vec])

    # --- [UserVec, ItemVec, DotProduct, DenseFeatures] ---
    hidden = Concatenate()([u_vec, i_vec, dot, dense_in])

    # --- MLP: two Dense/BatchNorm/ReLU/Dropout stages, then a plain stage ---
    for units, drop_rate in ((512, 0.3), (256, 0.2)):
        hidden = Dense(units)(hidden)
        hidden = BatchNormalization()(hidden)
        hidden = Activation('relu')(hidden)
        hidden = Dropout(drop_rate)(hidden)

    hidden = Dense(128)(hidden)
    hidden = Activation('relu')(hidden)

    # --- Output ---
    prediction = Dense(1, activation='linear', name='output')(hidden)

    network = Model(inputs=inputs, outputs=prediction)

    # Nadam optimizer, MSE loss, MAE tracked as a metric.
    network.compile(optimizer=Nadam(learning_rate=0.0005), loss='mse', metrics=['mae'])
    return network

# ==========================================
# 4. PIPELINE ORCHESTRATION
# ==========================================
import gzip
from collections import defaultdict
import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import Ridge
import datetime

def readJSON(path):
    """
    Lazily yield (userID, gameID, record) for each line of a gzip text file.

    Args:
        path: Path to the gzip file, opened in text mode.

    Yields:
        Tuple (u, g, d): d is the evaluated line, u is d['userID'] and g is
        d's 'gameID' value, or None when the record has no such key.

    Raises:
        FileNotFoundError: If `path` does not exist (raised on first iteration).
        KeyError: If a record has no 'userID'.
    """
    # The context manager closes the handle once the generator finishes,
    # instead of leaking it.
    with gzip.open(path, 'rt') as fh:
        for line in fh:
            # SECURITY: eval() executes arbitrary code. Only feed this trusted
            # data files; consider ast.literal_eval if every record is a
            # plain Python literal.
            d = eval(line)
            # .get() covers the missing-'gameID' case without a broad except.
            yield d['userID'], d.get('gameID'), d
# 1. Load Data
train_data = []
user_games = defaultdict(list)  # userID -> gameIDs seen in the training file
game_users = defaultdict(list)  # gameID -> userIDs seen in the training file

for u, g, d in readJSON("train.json.gz"):
    user_games[u].append(g)
    game_users[g].append(u)
    train_data.append(d)

df_train_data = pd.DataFrame(train_data)
# Guarded drop: an unconditional drop raises KeyError when the column is absent.
if 'user_id' in df_train_data.columns:
    df_train_data.drop('user_id', axis=1, inplace=True)
df_train_data['found_funny'] = df_train_data['found_funny'].fillna(0)
# Binarize compensation: missing -> 0, anything else -> 1.
df_train_data['compensation'] = df_train_data['compensation'].fillna(0)
df_train_data.loc[df_train_data['compensation'] != 0, 'compensation'] = 1
df_test_hours_data = pd.read_csv("pairs_Hours.csv")
df_test_play_data = pd.read_csv("pairs_Played.csv")

# Bind the frame loaded above. Without this, `df` is whatever an earlier
# notebook cell left behind (or undefined in a fresh kernel).
df = df_train_data

# 2. Global Feature Engineering (Text, Date)
df, text_cols = prepare_data(df)

# 3. Encoding IDs as contiguous integers for the embedding tables
user_enc = LabelEncoder()
game_enc = LabelEncoder()
df['user_idx'] = user_enc.fit_transform(df['userID'])
df['game_idx'] = game_enc.fit_transform(df['gameID'])

n_users = df['user_idx'].max() + 1
n_items = df['game_idx'].max() + 1

# 4. Strict Train/Val Split (Before Target Stats!)
# This is critical. We cannot calculate user_mean on validation data.
train_df, val_df = train_test_split(df, test_size=0.15, random_state=42)
print(f"Train size: {len(train_df)}, Val size: {len(val_df)}")

# 5. Add Target Statistics (computed on the training split only)
train_df, val_df = add_target_stats(train_df, val_df)

# 6. Prepare Dense Matrices
# Features: Text SVD (64) + TargetStats (2) + ReviewLen (1) + Timeline (1)
dense_cols = text_cols + ['user_target_enc', 'game_target_enc', 'review_len', 'days_timeline']

# Scale numericals with statistics from the training split
scaler = StandardScaler()
X_train_dense = scaler.fit_transform(train_df[dense_cols])
X_val_dense = scaler.transform(val_df[dense_cols])

# 7. Inputs (order must match the model's [user_id, item_id, dense_features])
X_train = [train_df['user_idx'].values, train_df['game_idx'].values, X_train_dense]
y_train = train_df['hours_transformed'].values

X_val = [val_df['user_idx'].values, val_df['game_idx'].values, X_val_dense]
y_val = val_df['hours_transformed'].values

# 8. Build & Train
model = build_wide_and_deep(n_users, n_items, len(dense_cols))
model.summary()

early_stop = EarlyStopping(monitor='val_loss', patience=4, restore_best_weights=True)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=2)

print("\nTraining Wide & Deep Model...")
# No sample weights are passed to fit().
history = model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    epochs=25,
    batch_size=512,
    callbacks=[early_stop, reduce_lr],
    verbose=1
)

# 9. Evaluation
print("\n--- Evaluation ---")
val_preds = model.predict(X_val).flatten()
# Predictions are clipped at 0, the lower bound used for the target bins below.
val_preds = np.clip(val_preds, 0, None)

rmse = np.sqrt(np.mean((y_val - val_preds)**2))
print(f"FINAL RMSE: {rmse:.4f}")

# Detailed Analysis: mean signed error per target bin.
# Work on a copy so report columns are not written onto a split slice.
val_df = val_df.copy()
val_df['pred'] = val_preds
val_df['error'] = val_df['hours_transformed'] - val_df['pred']
# include_lowest=True keeps rows whose target is exactly 0; with the default
# right-closed bins they would fall outside (0, 2] and vanish from the report.
val_df['bin'] = pd.cut(
    val_df['hours_transformed'],
    bins=[0, 2, 4, 6, 8, 10, 20],
    include_lowest=True,
)
print("\nBias by Target Bin:")
print(val_df.groupby('bin', observed=False)['error'].mean())

--- 1. Text Processing (TF-IDF + SVD) ---
   -> Compressing Text features via SVD...
--- 2. Date Processing ---
--- 3. Basic Counts ---
Train size: 148750, Val size: 26250
--- 4. Computing Target Statistics (No Leakage) ---



Training Wide & Deep Model...
Epoch 1/25
[1m291/291[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 38ms/step - loss: 3.4805 - mae: 1.4115 - val_loss: 4.0669 - val_mae: 1.5555 - learning_rate: 5.0000e-04
Epoch 2/25
[1m291/291[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 37ms/step - loss: 2.8766 - mae: 1.2871 - val_loss: 3.3672 - val_mae: 1.3924 - learning_rate: 5.0000e-04
Epoch 3/25
[1m291/291[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 37ms/step - loss: 2.7601 - mae: 1.2552 - val_loss: 3.4669 - val_mae: 1.4175 - learning_rate: 5.0000e-04
Epoch 4/25
[1m291/291[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 41ms/step - loss: 2.6627 - mae: 1.2293 - val_loss: 3.3281 - val_mae: 1.3790 - learning_rate: 5.0000e-04
Epoch 5/25
[1m291/291[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 38ms/step - loss: 2.5422 - mae: 1.1981 - val_loss: 3.2938 - val_mae: 1.3711 - learning_rate: 5.0000e-04
Epoch 6/25
[1m291/291[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[3

In [9]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, Flatten, Concatenate, Dense, Dropout, BatchNormalization, Activation, Add
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam, Nadam
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.regularizers import l2
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
import gzip
import json
from collections import defaultdict

# ==========================================
# 1. ADVANCED FEATURE ENGINEERING (NO LEAKAGE)
# ==========================================
def prepare_data(df, max_features=8000, n_components=128, ngram_range=(1, 2), random_state=42):
    """
    Add text-SVD, date and length features to a copy of `df`.

    The TF-IDF vectorizer and the SVD are fit on every row of `df`, so a
    caller that splits train/validation afterwards has exposed the
    validation text to these (unsupervised) fits.

    Args:
        df: DataFrame with a 'text' column and, optionally, a 'date' column.
        max_features: Vocabulary size cap for the TF-IDF vectorizer.
        n_components: Number of SVD components kept as dense text features.
        ngram_range: N-gram range passed to the TF-IDF vectorizer.
        random_state: Seed for the SVD.

    Returns:
        Tuple (df, text_cols): a new DataFrame (the input is not modified)
        with columns text_svd_*, year, month, days_timeline and review_len
        added, and the list of text_svd_* column names.
    """
    print("--- 1. Text Processing (TF-IDF + SVD) ---")
    # Unigram+bigram TF-IDF by default, later compressed to dense features.
    tfidf = TfidfVectorizer(max_features=max_features, stop_words='english', ngram_range=ngram_range)
    # Missing text is replaced by the token 'none'.
    txt = df['text'].fillna('none').astype(str)
    tfidf_matrix = tfidf.fit_transform(txt)

    print("   -> Compressing Text features via SVD...")
    svd = TruncatedSVD(n_components=n_components, random_state=random_state)
    text_features = svd.fit_transform(tfidf_matrix)

    # Name the columns from the actual output width so names and data
    # always stay in sync with n_components.
    text_cols = [f'text_svd_{i}' for i in range(text_features.shape[1])]
    df_text = pd.DataFrame(text_features, columns=text_cols, index=df.index)

    # concat returns a new frame, so the assignments below do not touch
    # the caller's DataFrame.
    df = pd.concat([df, df_text], axis=1)

    print("--- 2. Date Processing ---")
    if 'date' in df.columns:
        dt = pd.to_datetime(df['date'], errors='coerce')
        # Unparseable dates fall back to fixed defaults.
        df['year'] = dt.dt.year.fillna(2015)
        df['month'] = dt.dt.month.fillna(1)
        # Days since a fixed baseline (captures trends over time).
        df['days_timeline'] = (dt - pd.Timestamp('2000-01-01')).dt.days.fillna(0)
    else:
        df['year'] = 2015
        df['month'] = 1
        df['days_timeline'] = 0

    print("--- 3. Basic Counts ---")
    df['review_len'] = df['text'].fillna('').apply(len)

    return df, text_cols

# ==========================================
# 2. TARGET ENCODING (THE "PEER" TRICK)
# ==========================================
def add_target_stats(train_df, val_df, target_col='hours_transformed'):
    """
    Add mean-target encodings and their product to both frames.

    Per-user and per-game target means are computed on `train_df` only and
    mapped onto both frames, so no validation target is ever used. Users or
    games unseen in training receive the global training mean.

    NOTE(review): each training row's encoding includes that row's own
    target, so the training features are more informative than the
    validation ones; a leave-one-out or out-of-fold encoding would remove
    that optimism.

    Args:
        train_df: Training rows with 'userID', 'gameID' and `target_col`.
        val_df: Validation rows with 'userID' and 'gameID'.
        target_col: Name of the target column in `train_df`.

    Returns:
        Tuple (train_df, val_df): copies of the inputs with the columns
        'user_target_enc', 'game_target_enc' and 'interaction_mean' added.
        The inputs themselves are left untouched.
    """
    print("--- 4. Computing Target Statistics (No Leakage) ---")

    # Copy first: the inputs are typically slices produced by
    # train_test_split, and writing columns onto them mutates caller state
    # and triggers pandas chained-assignment warnings.
    train_df = train_df.copy()
    val_df = val_df.copy()

    # Fallback value for cold-start users/games.
    global_mean = train_df[target_col].mean()

    encodings = (
        ('userID', 'user_target_enc'),
        ('gameID', 'game_target_enc'),
    )
    for key_col, enc_col in encodings:
        # Statistics come from the training rows only.
        means = train_df.groupby(key_col)[target_col].mean()
        for frame in (train_df, val_df):
            frame[enc_col] = frame[key_col].map(means).fillna(global_mean)

    # Explicit multiplicative interaction: high-mean user x high-mean game.
    for frame in (train_df, val_df):
        frame['interaction_mean'] = frame['user_target_enc'] * frame['game_target_enc']

    return train_df, val_df

# ==========================================
# 3. PASSTHROUGH RESIDUAL ARCHITECTURE
# ==========================================
def build_passthrough_model(n_users, n_items, n_scaled, n_passthrough):
    """
    Build and compile a two-branch network whose branch outputs are summed.

    Branches:
      1. Passthrough (wide): the raw passthrough features go through a
         single linear Dense(1) layer.
      2. Deep: embeddings, their dot product, the scaled features and the
         passthrough features go through an MLP (512 -> 256 -> 64) ending
         in a linear Dense(1).
    Because the two outputs are added, the deep branch acts as an additive
    correction on top of the linear branch.

    Args:
        n_users: Size of the user embedding table.
        n_items: Size of the item embedding table.
        n_scaled: Width of the scaled feature input.
        n_passthrough: Width of the raw passthrough feature input.

    Returns:
        A compiled Keras Model taking
        [user_id, item_id, scaled_features, passthrough_features].
    """
    emb_dim = 128

    # --- Inputs ---
    user_in = Input(shape=(1,), name='user_id')
    item_in = Input(shape=(1,), name='item_id')
    scaled_in = Input(shape=(n_scaled,), name='scaled_features')
    passthrough_in = Input(shape=(n_passthrough,), name='passthrough_features')  # RAW features

    # ==========================
    # BRANCH 1: PASSTHROUGH (WIDE)
    # ==========================
    # A single linear unit over the passthrough features. The 'ones'
    # initializer starts every kernel weight at 1.0 (not 0.5), so the
    # initial wide output is the plain sum of the passthrough features.
    wide_layer = Dense(
        1, activation='linear', name='wide_output', kernel_initializer='ones'
    )
    wide_out = wide_layer(passthrough_in)

    # ==========================
    # BRANCH 2: DEEP
    # ==========================
    # --- Embeddings (lightly L2-regularised) ---
    user_emb = Embedding(
        n_users, emb_dim, embeddings_regularizer=l2(1e-6), name='user_emb'
    )(user_in)
    item_emb = Embedding(
        n_items, emb_dim, embeddings_regularizer=l2(1e-6), name='item_emb'
    )(item_in)

    u_vec = Flatten()(user_emb)
    i_vec = Flatten()(item_emb)

    # --- Explicit user-item interaction ---
    dot = tf.keras.layers.Dot(axes=1)([u_vec, i_vec])

    # --- The deep branch sees every feature, including the passthrough ones ---
    hidden = Concatenate()([u_vec, i_vec, dot, scaled_in, passthrough_in])

    # --- MLP: two Dense/BatchNorm/ReLU/Dropout stages, then a plain stage ---
    for units in (512, 256):
        hidden = Dense(units)(hidden)
        hidden = BatchNormalization()(hidden)
        hidden = Activation('relu')(hidden)
        hidden = Dropout(0.2)(hidden)

    hidden = Dense(64)(hidden)
    hidden = Activation('relu')(hidden)

    deep_out = Dense(1, activation='linear', name='deep_output')(hidden)

    # ==========================
    # COMBINE
    # ==========================
    combined = Add(name='final_output')([wide_out, deep_out])

    network = Model(
        inputs=[user_in, item_in, scaled_in, passthrough_in], outputs=combined
    )
    network.compile(optimizer=Nadam(learning_rate=0.0005), loss='mse', metrics=['mae'])
    return network

# ==========================================
# 4. EXECUTION
# ==========================================
def readJSON(path):
    """
    Lazily yield (userID, gameID, record) for each line of a gzip text file.

    Args:
        path: Path to the gzip file, opened in text mode.

    Yields:
        Tuple (u, g, d): d is the evaluated line, u is d['userID'] and g is
        d's 'gameID' value, or None when the record has no such key.

    Raises:
        FileNotFoundError: If `path` does not exist (raised on first iteration).
        KeyError: If a record has no 'userID'.
    """
    # The context manager closes the handle once the generator finishes,
    # instead of leaking it.
    with gzip.open(path, 'rt') as fh:
        for line in fh:
            # SECURITY: eval() executes arbitrary code. Only feed this trusted
            # data files; consider ast.literal_eval if every record is a
            # plain Python literal.
            d = eval(line)
            # .get() covers the missing-'gameID' case without a broad except.
            yield d['userID'], d.get('gameID'), d

if __name__ == "__main__":
    # End-to-end run of the passthrough/residual pipeline:
    # load -> featurize -> encode IDs -> split -> target stats -> fit -> report.
    print("Starting Passthrough Residual Pipeline...")

    # 1. Load
    try:
        train_data = []
        user_games = defaultdict(list)  # userID -> gameIDs seen in the file
        game_users = defaultdict(list)  # gameID -> userIDs seen in the file

        for u, g, d in readJSON("train.json.gz"):
            user_games[u].append(g)
            game_users[g].append(u)
            train_data.append(d)

        df_train_data = pd.DataFrame(train_data)

        if 'user_id' in df_train_data.columns:
            df_train_data.drop('user_id', axis=1, inplace=True)

        df_train_data['found_funny'] = df_train_data['found_funny'].fillna(0)
        # Binarize compensation: missing -> 0, anything else -> 1.
        df_train_data['compensation'] = df_train_data['compensation'].fillna(0)
        df_train_data.loc[df_train_data['compensation'] != 0, 'compensation'] = 1

        df = df_train_data

    except FileNotFoundError:
        # Fallback so the pipeline can still be smoke-tested without the data.
        print("Error: train.json.gz not found. Creating dummy data...")
        df = pd.DataFrame({
            'userID': ['u'+str(i%100) for i in range(10000)],
            'gameID': ['g'+str(i%10) for i in range(10000)],
            'hours_transformed': np.random.uniform(0, 14, 10000),
            'text': ['game was addictive ' + str(i) for i in range(10000)],
            'date': ['2020-01-01']*10000,
            'found_funny': [0]*10000,
            'compensation': [0]*10000
        })

    # 2. Global Feature Engineering
    df, text_cols = prepare_data(df)

    # 3. Encoding IDs as contiguous integers for the embedding tables
    user_enc = LabelEncoder()
    game_enc = LabelEncoder()
    df['user_idx'] = user_enc.fit_transform(df['userID'])
    df['game_idx'] = game_enc.fit_transform(df['gameID'])

    n_users = df['user_idx'].max() + 1
    n_items = df['game_idx'].max() + 1

    # 4. Strict Train/Val Split (before any target statistics are computed)
    train_df, val_df = train_test_split(df, test_size=0.10, random_state=42)
    print(f"Train size: {len(train_df)}, Val size: {len(val_df)}")

    # 5. Add Target Statistics (computed on the training split only)
    train_df, val_df = add_target_stats(train_df, val_df)

    # 6. Feature Groups
    # Group A: Passthrough (Target Stats - DO NOT SCALE)
    passthrough_cols = ['user_target_enc', 'game_target_enc', 'interaction_mean']

    # Group B: Scaled Features (Text, Dates, Counts)
    scaled_cols = text_cols + ['review_len', 'days_timeline']

    # 7. Prepare Matrices
    # Scale Group B only, with statistics from the training split
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(train_df[scaled_cols])
    X_val_scaled = scaler.transform(val_df[scaled_cols])

    # Passthrough Group A (Raw)
    X_train_pass = train_df[passthrough_cols].values
    X_val_pass = val_df[passthrough_cols].values

    # Model inputs: order must match
    # [user_id, item_id, scaled_features, passthrough_features]
    X_train = [train_df['user_idx'].values, train_df['game_idx'].values, X_train_scaled, X_train_pass]
    y_train = train_df['hours_transformed'].values

    X_val = [val_df['user_idx'].values, val_df['game_idx'].values, X_val_scaled, X_val_pass]
    y_val = val_df['hours_transformed'].values

    # 8. Build & Train
    model = build_passthrough_model(n_users, n_items, len(scaled_cols), len(passthrough_cols))
    model.summary()

    # Callbacks
    early_stop = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
    reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=2, min_lr=1e-5)

    print("\nTraining Passthrough Residual Model...")
    history = model.fit(
        X_train, y_train,
        validation_data=(X_val, y_val),
        epochs=30,
        batch_size=512,
        callbacks=[early_stop, reduce_lr],
        verbose=1
    )

    # 9. Evaluation
    print("\n--- Evaluation ---")
    val_preds = model.predict(X_val).flatten()
    # Predictions are clipped at 0, the lower bound used for the bins below.
    val_preds = np.clip(val_preds, 0, None)

    rmse = np.sqrt(np.mean((y_val - val_preds)**2))
    print(f"FINAL RMSE: {rmse:.4f}")

    # Detailed Analysis: mean signed error per target bin.
    # Work on a copy so report columns are not written onto a split slice.
    val_df = val_df.copy()
    val_df['pred'] = val_preds
    val_df['error'] = val_df['hours_transformed'] - val_df['pred']
    # include_lowest=True keeps rows whose target is exactly 0; with the
    # default right-closed bins they would fall outside (0, 2] and vanish
    # from the report.
    val_df['bin'] = pd.cut(
        val_df['hours_transformed'],
        bins=[0, 2, 4, 6, 8, 10, 20],
        include_lowest=True,
    )
    print("\nBias by Target Bin:")
    print(val_df.groupby('bin', observed=False)['error'].mean())

Starting Passthrough Residual Pipeline...
--- 1. Text Processing (TF-IDF + SVD) ---
   -> Compressing Text features via SVD...
--- 2. Date Processing ---
--- 3. Basic Counts ---
Train size: 157500, Val size: 17500
--- 4. Computing Target Statistics (No Leakage) ---



Training Passthrough Residual Model...
Epoch 1/30
[1m308/308[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 58ms/step - loss: 27.7669 - mae: 3.2891 - val_loss: 5.0371 - val_mae: 1.7577 - learning_rate: 5.0000e-04
Epoch 2/30
[1m308/308[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 52ms/step - loss: 4.6682 - mae: 1.6688 - val_loss: 3.9983 - val_mae: 1.5768 - learning_rate: 5.0000e-04
Epoch 3/30
[1m308/308[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 47ms/step - loss: 4.1994 - mae: 1.5758 - val_loss: 3.7635 - val_mae: 1.5225 - learning_rate: 5.0000e-04
Epoch 4/30
[1m308/308[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 55ms/step - loss: 3.9272 - mae: 1.5224 - val_loss: 3.5865 - val_mae: 1.4715 - learning_rate: 5.0000e-04
Epoch 5/30
[1m308/308[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 55ms/step - loss: 3.6950 - mae: 1.4738 - val_loss: 3.8078 - val_mae: 1.5193 - learning_rate: 5.0000e-04
Epoch 6/30
[1m308/308[0m [32m━━━━━━━━━━━━━━━━━