In [15]:
import joblib
import numpy as np

# Load direct customer embeddings
id2_to_vec = joblib.load("../TransEncoding/customer_id_to_embedding.pkl")

# Load fallback strategy components from single file
fallbacks = joblib.load("../TransEncoding/customer_fallbacks.pkl")
customer_centroids = fallbacks["centroids"]
global_mean_vector = fallbacks["global_mean"]

# Optional: check fallback stats
print(f"📊 Customer Fallback Clusters: {customer_centroids.shape[0]}")
print(f"📈 Global mean vector shape: {global_mean_vector.shape}")

# Embedding resolver function
def get_customer_embedding(id2, id2_to_vec, centroids, global_mean):
    """
    Hybrid strategy to resolve customer embedding.

    1. If ID found, use direct embedding.
    2. Else, fall back to random cluster center.
    3. Else, fall back to global mean.
    """
    if id2 in id2_to_vec:
        return id2_to_vec[id2]
    elif len(centroids) > 0:
        cluster_idx = np.random.randint(len(centroids))
        return centroids[cluster_idx]
    else:
        return global_mean


📊 Customer Fallback Clusters: 50
📈 Global mean vector shape: (16,)


In [16]:
import joblib
import numpy as np

# -------- Load Offer Embedding + Fallbacks -------- #
id3_to_vec = joblib.load("../OfferEncoding/offer_id_to_embedding.pkl")
offer_centroids = joblib.load("../OfferEncoding/offer_cluster_centers.pkl")
global_offer_mean = np.load("../OfferEncoding/global_mean_offer_vector.npy")

print(f"📊 Offer fallback clusters: {offer_centroids.shape[0]}")
print(f"📈 Global mean offer vector shape: {global_offer_mean.shape}")

# -------- Resolver Function -------- #
def get_offer_embedding(id3, id3_to_vec, centroids, global_mean):
    """
    Hybrid strategy to resolve offer embedding.

    1. If ID found, use direct embedding.
    2. Else, use random cluster center.
    3. Else, use global mean.
    """
    if id3 in id3_to_vec:
        return id3_to_vec[id3]
    elif len(centroids) > 0:
        cluster_idx = np.random.randint(len(centroids))
        return centroids[cluster_idx]
    else:
        return global_mean


📊 Offer fallback clusters: 10
📈 Global mean offer vector shape: (16,)


In [17]:
# Load offer hybrid data
offer_id_to_vec = joblib.load("../OfferEncoding/offer_id_to_embedding.pkl")
offer_clusters = joblib.load("../OfferEncoding/offer_cluster_centers.pkl")
global_offer_vec = np.load("../OfferEncoding/global_mean_offer_vector.npy")

# Load customer hybrid data
customer_fallback = joblib.load("../TransEncoding/customer_fallbacks.pkl")
customer_id_to_vec = joblib.load("../TransEncoding/customer_id_to_embedding.pkl")
customer_centroids = customer_fallback["centroids"]
global_customer_vec = customer_fallback["global_mean"]

# Load event data
event_pair = pd.read_parquet("../EventsEncoding/event_pair_agg.parquet").set_index(['id2', 'id3'])
event_customer = pd.read_parquet("../EventsEncoding/event_customer_agg.parquet").set_index('id2')
event_offer = pd.read_parquet("../EventsEncoding/event_offer_agg.parquet").set_index('id3')
event_cols = ['clicks', 'views', 'click_rate', 'avg_click_delay',
              'min_click_delay', 'max_click_delay', 'std_click_delay']


In [18]:
import pandas as pd
import numpy as np
import joblib
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder

def closest_vector(centroids, global_vec):
    dists = np.linalg.norm(centroids - global_vec, axis=1)
    return centroids[np.argmin(dists)]

def get_customer_embedding(id2, id2_to_vec, centroids, global_mean):
    vec = id2_to_vec.get(id2)
    if vec is not None:
        return vec
    else:
        return closest_vector(centroids, global_mean)

def build_features(data_path, is_train=True):
    print(f"\n📦 Loading {'train' if is_train else 'test'} data")
    df = pd.read_parquet(data_path)
    ids = df[['id1', 'id2', 'id3', 'id5']].copy()

    df["id2"] = df["id2"].astype(str)
    df["id3"] = df["id3"].astype(str)

    if is_train:
        if 'y' not in df.columns:
            raise ValueError("Train data must contain a 'y' column.")
        labels = df['y'].values

    # ----------------- Load Offer Hybrid Embeddings ------------------ #
    offer_id_to_vec = joblib.load("../OfferEncoding/offer_id_to_embedding.pkl")
    offer_clusters = joblib.load("../OfferEncoding/offer_cluster_centers.pkl")
    global_offer_vec = np.load("../OfferEncoding/global_mean_offer_vector.npy")
    offer_vec_dim = global_offer_vec.shape[0]

    # ----------------- Load Customer Hybrid Embeddings ------------------ #
    customer_id_to_vec = joblib.load("../TransEncoding/customer_id_to_embedding.pkl")
    customer_fallback = joblib.load("../TransEncoding/customer_fallbacks.pkl")
    customer_centroids = customer_fallback["centroids"]
    global_customer_vec = customer_fallback["global_mean"]
    behavior_dim = global_customer_vec.shape[0]

    # ----------------- Load Event Aggregates ------------------ #
    event_pair = pd.read_parquet("../EventsEncoding/event_pair_agg.parquet").set_index(['id2', 'id3'])
    event_customer = pd.read_parquet("../EventsEncoding/event_customer_agg.parquet").set_index('id2')
    event_offer = pd.read_parquet("../EventsEncoding/event_offer_agg.parquet").set_index('id3')
    event_cols = ['clicks', 'views', 'click_rate', 'avg_click_delay',
                  'min_click_delay', 'max_click_delay', 'std_click_delay']

    # ----------------- Fix f-columns ------------------ #
    all_f_cols = [f"f{i}" for i in range(1, 367)]
    f_categorical_cols = []
    f_numeric_cols = []

    for col in all_f_cols:
        if df[col].dtype == 'object':
            try:
                df[col] = pd.to_numeric(df[col], errors='raise')
                f_numeric_cols.append(col)
            except:
                le = LabelEncoder()
                df[col] = le.fit_transform(df[col].astype(str))
                f_categorical_cols.append(col)
        else:
            f_numeric_cols.append(col)

    numeric_f_cols = f_numeric_cols + f_categorical_cols
    df[numeric_f_cols] = df[numeric_f_cols].fillna(0)

    print(f"✅ Using {len(numeric_f_cols)} total f-features ({len(f_numeric_cols)} numeric + {len(f_categorical_cols)} encoded categorical)")

    # ----------------- Build Feature Matrix ------------------ #
    features = []
    missing_offer = 0
    missing_event_pair = 0
    missing_event_customer = 0
    missing_event_offer = 0
    missing_behavior = 0

    for _, row in tqdm(df.iterrows(), total=len(df)):
        base_feats = row[numeric_f_cols].astype(np.float32).values

        # --- OFFER EMBEDDING (Hybrid) ---
        offer_vec = offer_id_to_vec.get(row["id3"])
        if offer_vec is None:
            offer_vec = closest_vector(offer_clusters, global_offer_vec)
            missing_offer += 1

        # --- EVENT EMBEDDING (3-level fallback) ---
        key_pair = (row["id2"], row["id3"])
        if key_pair in event_pair.index:
            event_vec = event_pair.loc[key_pair].values
        elif row["id2"] in event_customer.index:
            event_vec = event_customer.loc[row["id2"]].values
            missing_event_pair += 1
        elif row["id3"] in event_offer.index:
            event_vec = event_offer.loc[row["id3"]].values
            missing_event_pair += 1
            missing_event_customer += 1
        else:
            event_vec = np.zeros(len(event_cols))
            missing_event_pair += 1
            missing_event_customer += 1
            missing_event_offer += 1

        # --- CUSTOMER EMBEDDING (Hybrid) ---
        behavior_vec = get_customer_embedding(row["id2"], customer_id_to_vec, customer_centroids, global_customer_vec)
        if row["id2"] not in customer_id_to_vec:
            missing_behavior += 1

        # Combine
        features.append(np.concatenate([base_feats, offer_vec, event_vec, behavior_vec]))

    X = np.vstack(features)

    print(f"⚠️ Missing offers: {missing_offer}")
    print(f"⚠️ Missing event pair: {missing_event_pair}")
    print(f"↪️ Fallback to customer-level: {missing_event_customer}")
    print(f"↪️ Fallback to offer-level: {missing_event_offer}")
    print(f"⚠️ Missing customer behavior: {missing_behavior}")
    print(f"✅ Final feature matrix shape: {X.shape}")

    if is_train:
        return X, labels, ids.values
    else:
        return X, ids.values


In [27]:
import numpy as np
import joblib
from sklearn.metrics import roc_auc_score, log_loss
from sklearn.model_selection import train_test_split
from lightgbm import LGBMClassifier


# ----------------------- Training Pipeline ----------------------- #
def train_model(train_path="../../Dataset/train_data.parquet"):
    print("🚀 Starting training...")

    # Step 1: Build Features
    X, y, ids = build_features(train_path, is_train=True)

    # Step 2: Split Data
    X_train, X_val, y_train, y_val = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    print(f"🧪 Training samples: {len(X_train)}, Validation samples: {len(X_val)}")

    # Step 3: Define LGBMClassifier
    from lightgbm import LGBMClassifier, early_stopping, log_evaluation

    model = LGBMClassifier(
        objective='binary',
        learning_rate=0.05,
        num_leaves=64,
        max_depth=-1,
        boosting_type='gbdt',
        subsample=0.8,
        colsample_bytree=0.8,
        subsample_freq=5,
        n_estimators=500,
        random_state=42,
        n_jobs=-1
    )
    
    model.fit(
        X_train, y_train,
        eval_set=[(X_val, y_val)],
        eval_metric='auc',
        callbacks=[
            early_stopping(stopping_rounds=25),
            log_evaluation(period=50)
        ]
    )

    # Step 5: Evaluate
    y_val_pred = model.predict_proba(X_val)[:, 1]
    auc = roc_auc_score(y_val, y_val_pred)
    ll = log_loss(y_val, y_val_pred)

    print(f"\n✅ Validation AUC: {auc:.4f}")
    print(f"📉 Validation LogLoss: {ll:.4f}")

    # Step 6: Save artifacts
    joblib.dump(model, "lightgbm_model.pkl")
    np.save("val_preds.npy", y_val_pred)
    print("💾 Saved: model & validation predictions.")

    return model


In [28]:
model = train_model()

🚀 Starting training...

📦 Loading train data
✅ Using 366 total f-features (357 numeric + 9 encoded categorical)


100%|█████████████████████████████████████████████████████████████████████████| 770164/770164 [14:39<00:00, 875.48it/s]


⚠️ Missing offers: 100
⚠️ Missing event pair: 770164
↪️ Fallback to customer-level: 770164
↪️ Fallback to offer-level: 0
⚠️ Missing customer behavior: 770164
✅ Final feature matrix shape: (770164, 405)
🧪 Training samples: 616131, Validation samples: 154033
[LightGBM] [Info] Number of positive: 29641, number of negative: 586490
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 1.031562 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 52719
[LightGBM] [Info] Number of data points in the train set: 616131, number of used features: 323
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.048108 -> initscore=-2.984997
[LightGBM] [Info] Start training from score -2.984997
Training until validation scores don't improve for 25 rounds
[50]	valid_0's auc: 0.936016	valid_0's binary_logloss: 0.0899051
[100]	valid_0's auc: 0.944888	valid_0's binary_logloss: 0.0820331
[150]	valid_0's auc: 0.948765	valid_0's binary_loglos




✅ Validation AUC: 0.9534
📉 Validation LogLoss: 0.0741
💾 Saved: model & validation predictions.


In [35]:
import pandas as pd

def predict_and_save(
    test_path="../../Dataset/test_data.parquet",
    model_path="lightgbm_model.pkl",
    output_dir=r"C:\Users\dhruv\OneDrive\Desktop\AMEX\Code\Model"
):
    print("🔮 Generating predictions on test set...")

    # 1. Load features from test set
    X_test, id_matrix = build_features(test_path, is_train=False)

    # 2. Load the trained model
    model = joblib.load(model_path)

    # 3. Predict probabilities
    y_test_pred = model.predict_proba(X_test)[:, 1]

    # 4. Save raw prediction + IDs
    np.save(f"{output_dir}/test_preds.npy", y_test_pred)
    np.save(f"{output_dir}/test_ids.npy", id_matrix)
    print(f"✅ Saved raw predictions to {output_dir}/test_preds.npy")

    # 5. Save CSV for submission
    sub_df = pd.DataFrame({
        "id1": id_matrix[:, 0],
        "id2": id_matrix[:, 1],
        "id3": id_matrix[:, 2],
        "id5": id_matrix[:, 3],
        "prediction": y_test_pred
    })
    sub_df.to_csv(f"{output_dir}/submission.csv", index=False)
    print(f"📤 Submission CSV saved to {output_dir}/submission.csv")

    return sub_df


In [36]:
predict_and_save()

🔮 Generating predictions on test set...

📦 Loading test data
✅ Using 366 total f-features (357 numeric + 9 encoded categorical)


100%|████████████████████████████████████████████████████████████████████████| 369301/369301 [04:56<00:00, 1244.67it/s]


⚠️ Missing offers: 53
⚠️ Missing event pair: 369301
↪️ Fallback to customer-level: 369301
↪️ Fallback to offer-level: 2
⚠️ Missing customer behavior: 369301
✅ Final feature matrix shape: (369301, 405)




✅ Saved raw predictions to C:\Users\dhruv\OneDrive\Desktop\AMEX\Code\Model/test_preds.npy
📤 Submission CSV saved to C:\Users\dhruv\OneDrive\Desktop\AMEX\Code\Model/submission.csv


Unnamed: 0,id1,id2,id3,id5,prediction
0,1362907_91950_16-23_2023-11-04 18:56:26.000794,1362907,91950,2023-11-04,0.002842
1,1082599_88356_16-23_2023-11-04 06:08:53.373,1082599,88356,2023-11-04,0.032592
2,1888466_958700_16-23_2023-11-05 10:07:28.000725,1888466,958700,2023-11-05,0.965038
3,1888971_795739_16-23_2023-11-04 12:25:28.244,1888971,795739,2023-11-04,0.003186
4,1256369_82296_16-23_2023-11-05 06:45:26.657,1256369,82296,2023-11-05,0.003625
...,...,...,...,...,...
369296,1874443_95537_16-23_2023-11-05 09:21:24.182,1874443,95537,2023-11-05,0.014164
369297,1541978_5718_16-23_2023-11-05 00:56:43.946,1541978,5718,2023-11-05,0.004318
369298,1887841_85905_16-23_2023-11-05 20:40:43.312,1887841,85905,2023-11-05,0.015275
369299,1569367_944713_16-23_2023-11-05 00:43:04.335,1569367,944713,2023-11-05,0.022691


In [3]:
import matplotlib.pyplot as plt
import lightgbm as lgb
import joblib

model = joblib.load("../../Code/FinalModel/lightgbm_model.pkl")

lgb.plot_importance(model, max_num_features=20, importance_type='gain')
plt.title("Top 20 Feature Importances")
plt.tight_layout()
plt.show()


In [11]:
import os
os.getcwd()

'C:\\Users\\dhruv\\OneDrive\\Desktop\\AMEX\\Code\\FinalModel'

In [None]:
2. "Unknown" Token During Encoder Training
If you're training an embedding model (e.g., an MLP), you can include a dummy "__UNKNOWN__" sample during training. That way, the model learns how to represent and handle unseen IDs.


3. Similarity-Based Approximation (Advanced)
You can:

Cluster embeddings

Assign new IDs to the nearest cluster center
This is more complex and works well when you expect lots of cold-starts.

3. Optional: Use FAISS for High-Speed Approximate Search (Advanced)
If cluster centers aren't enough and you need fine-grained fallback:

Use Facebook’s FAISS library for fast similarity search.

Handles millions of embeddings with sub-second latency.

But start with KMeans → scale later.

In [None]:
✅ 3. Hybrid Fallbacks (Best in Practice)
The best production systems usually combine:

Order	Method	Fallback To
1	Direct match	Use if id2 seen
2	Regression model	Predict if metadata seen
3	Cluster similarity	Nearest center
4	Global avg vector	Use final dummy vec

This reduces blind zero vectors to almost zero and makes the model very stable.

In [31]:
import os
print(os.getcwd())

C:\Users\dhruv\OneDrive\Desktop\AMEX\Code\Model
