In [1]:
import os
# Move the working directory up one level; the relative 'Datasets/...' and
# 'Submissions/...' paths used later are resolved against it.
os.chdir(os.pardir)

In [2]:
import numpy as np
import pandas as pd
# Load the analysis dataset (path is relative to the current working directory).
data = pd.read_csv('Datasets/analysis_data.csv')

In [3]:
from sklearn.model_selection import train_test_split, KFold
from sklearn.experimental import enable_iterative_imputer  # noqa: F401
from sklearn.impute import IterativeImputer
from sklearn.linear_model import BayesianRidge, ElasticNetCV
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import mean_squared_error, root_mean_squared_error

# =========================================================
# 1. Setup: target and features
# =========================================================
df = data.copy()  # work on a copy so the loaded `data` frame is left untouched
target_col = "monthly_spend"

y = df[target_col].reset_index(drop=True)
X = df.drop(columns=[target_col]).reset_index(drop=True)

# Identify column types.
# NOTE(review): the numeric selection is purely dtype-based, so identifier-like
# numeric columns (e.g. a customer id) are used as features — confirm intended.
cat_cols = X.select_dtypes(include=["object", "category"]).columns.tolist()
num_cols = X.select_dtypes(include=["number"]).columns.tolist()

print("Categorical columns:", cat_cols)
print("Numeric columns:", num_cols)

# =========================================================
# 2. Train-test split
# =========================================================
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Make copies for encoding; a fresh RangeIndex keeps features and target
# aligned by position in the steps that follow.
X_train_enc = X_train.copy().reset_index(drop=True)
X_test_enc = X_test.copy().reset_index(drop=True)
y_train = y_train.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)

# Label missing categoricals, then cast to string.
# The fill has to run BEFORE the string cast: astype(str) turns NaN into the
# literal text "nan", after which fillna() has nothing left to fill.
# Going through object dtype first lets "Missing" be inserted into columns
# that are stored as pandas categoricals.
if cat_cols:
    X_train_enc[cat_cols] = (
        X_train_enc[cat_cols].astype(object).fillna("Missing").astype(str)
    )
    X_test_enc[cat_cols] = (
        X_test_enc[cat_cols].astype(object).fillna("Missing").astype(str)
    )

# =========================================================
# 3. Out-of-fold Target Encoding (LEAK-PROOF)
#    - For train: out-of-fold encodings
#    - For test: maps fitted on full training
# =========================================================
def _smoothed_category_means(keys, target, prior, smoothing):
    """Return a Series mapping each category in `keys` to its smoothed target mean.

    The per-category mean is blended with `prior` using a sigmoid weight of
    (category count - smoothing), so rare categories are pulled toward the prior.
    `keys` and `target` must share the same index.
    """
    stats = target.groupby(keys).agg(["mean", "count"])
    weight = 1 / (1 + np.exp(-(stats["count"] - smoothing)))
    return prior * (1 - weight) + stats["mean"] * weight


def target_encode_train_test(
    X_train_df, y_train_ser, X_test_df, cat_columns, n_splits=5, smoothing=10
):
    """
    Smoothed target encoding of categorical columns for a train/test pair.

    Train rows receive out-of-fold encodings (each fold is encoded with
    statistics computed on the other folds); test rows are encoded with
    statistics computed on the whole training set. Categories without a
    mapped value fall back to the training target mean. Note that the
    smoothing prior is the mean of the *whole* training target, so the prior
    itself is not fold-specific.

    Rows of `X_train_df` and `y_train_ser` are matched by position, not by
    index label, so the two may carry different (or non-default) indexes as
    long as they have the same row order.

    Args:
        X_train_df:  training features (must contain `cat_columns`).
        y_train_ser: training target, same length and row order as X_train_df.
        X_test_df:   test features (must contain `cat_columns`).
        cat_columns: list of categorical column names to encode.
        n_splits:    number of KFold splits for the out-of-fold encoding.
        smoothing:   count at which a category's own mean gets weight 0.5.

    Returns:
        X_train_te: DataFrame of target-encoded train cols (one col per cat),
                    indexed like X_train_df
        X_test_te:  DataFrame of target-encoded test cols (one col per cat),
                    indexed like X_test_df
    """
    X_train_te = pd.DataFrame(index=X_train_df.index)
    X_test_te = pd.DataFrame(index=X_test_df.index)

    if not cat_columns:
        return X_train_te, X_test_te

    # Give features and target the same RangeIndex: groupby(by=Series) aligns
    # on index labels, so a mismatch would silently pair the wrong rows.
    y_pos = y_train_ser.reset_index(drop=True)
    X_pos = X_train_df[cat_columns].reset_index(drop=True)
    global_mean = y_pos.mean()

    # With a fixed integer random_state every split() call yields the same
    # folds, so they are computed once and shared by all columns.
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    folds = list(kf.split(X_pos))

    # ---------- Out-of-fold encoding for TRAIN ----------
    for col in cat_columns:
        keys = X_pos[col]
        oof = np.full(len(keys), np.nan)

        for tr_idx, val_idx in folds:
            te_values = _smoothed_category_means(
                keys.iloc[tr_idx], y_pos.iloc[tr_idx], global_mean, smoothing
            )
            oof[val_idx] = keys.iloc[val_idx].map(te_values).to_numpy(dtype=float)

        # Categories unseen in the fitting folds have no mapped value.
        X_train_te[col + "_te"] = np.where(np.isnan(oof), global_mean, oof)

    # ---------- Full-data encoding for TEST ----------
    for col in cat_columns:
        te_values_full = _smoothed_category_means(
            X_pos[col], y_pos, global_mean, smoothing
        )
        X_test_te[col + "_te"] = (
            X_test_df[col].map(te_values_full).fillna(global_mean).to_numpy(dtype=float)
        )

    return X_train_te, X_test_te

# Encode the categoricals: out-of-fold values for train, full-train maps for test.
X_train_te, X_test_te = target_encode_train_test(
    X_train_df=X_train_enc,
    y_train_ser=y_train,
    X_test_df=X_test_enc,
    cat_columns=cat_cols,
    n_splits=5,
    smoothing=10,
)

# =========================================================
# 4. Build final numeric design matrices (num + target-encoded cats)
#    The original categorical columns are dropped; only their encodings remain.
# =========================================================
X_train_num = X_train_enc[num_cols].copy()
X_test_num = X_test_enc[num_cols].copy()

# Reset both sides before the column-wise concat so rows pair up by position.
X_train_final = pd.concat(
    [X_train_num.reset_index(drop=True), X_train_te.reset_index(drop=True)],
    axis="columns",
)
X_test_final = pd.concat(
    [X_test_num.reset_index(drop=True), X_test_te.reset_index(drop=True)],
    axis="columns",
)

print("Final train shape (before MICE):", X_train_final.shape)
print("Final test shape  (before MICE):", X_test_final.shape)

# =========================================================
# 5. MICE Imputation (IterativeImputer with BayesianRidge)
# =========================================================
mice = IterativeImputer(
    estimator=BayesianRidge(),
    initial_strategy="median",
    max_iter=10,
    random_state=42,
)

# Fit on the training matrix only; the test matrix is transformed with the
# already-fitted imputer. The array outputs are wrapped back into DataFrames
# so the column names survive.
X_train_imputed = pd.DataFrame(
    mice.fit_transform(X_train_final), columns=X_train_final.columns
)
X_test_imputed = pd.DataFrame(
    mice.transform(X_test_final), columns=X_test_final.columns
)

print("After MICE - any nulls train?", X_train_imputed.isna().sum().sum())
print("After MICE - any nulls test? ", X_test_imputed.isna().sum().sum())

# =========================================================
# 6. Polynomial Features
# =========================================================
degree = 2  # you can try 2, 3, etc.

# interaction_only=True keeps products of distinct features and skips pure
# powers; include_bias=False omits the constant column.
poly = PolynomialFeatures(degree=degree, interaction_only=True, include_bias=False)
poly.fit(X_train_imputed)

X_train_poly = poly.transform(X_train_imputed)
X_test_poly = poly.transform(X_test_imputed)

poly_feature_names = poly.get_feature_names_out(X_train_imputed.columns)
print("Polynomial feature count:", len(poly_feature_names))

# =========================================================
# 7. ElasticNetCV on polynomial-expanded, target-encoded, imputed data
# =========================================================
# NOTE(review): the recorded run picked alpha=10 (the largest value in the
# grid) and emitted coordinate-descent warnings; the features are not scaled
# before fitting. Consider widening the alpha grid and standardizing inputs.
enet = ElasticNetCV(
    alphas=[0.0001, 0.001, 0.01, 0.1, 1, 10],
    l1_ratio=[0.1, 0.3, 0.5, 0.7, 0.9],
    cv=5,
    max_iter=5000,
    random_state=42,
).fit(X_train_poly, y_train)

print("Best alpha from ElasticNetCV:", enet.alpha_)
print("Best l1_ratio from ElasticNetCV:", enet.l1_ratio_)

# =========================================================
# 8. Evaluation on held-out test set
# =========================================================
y_pred_test = enet.predict(X_test_poly)
rmse_test = root_mean_squared_error(y_test, y_pred_test)

print(f"\n🔥 Test RMSE (TE + MICE + Poly + ElasticNet): {rmse_test:.4f}")

# Optional: rank the fitted coefficients by absolute size.
coef_df = pd.DataFrame({
    "feature": poly_feature_names,
    "coef": enet.coef_
})
coef_df["abs_coef"] = coef_df["coef"].abs()
coef_df = coef_df.sort_values("abs_coef", ascending=False)

print("\nTop 20 features by |coef|:")
print(coef_df.head(20)[["feature", "coef"]])


Categorical columns: ['gender', 'marital_status', 'education_level', 'region', 'employment_status', 'card_type']
Numeric columns: ['customer_id', 'age', 'owns_home', 'has_auto_loan', 'annual_income', 'credit_score', 'credit_limit', 'tenure', 'num_transactions', 'avg_transaction_value', 'online_shopping_freq', 'reward_points_balance', 'travel_frequency', 'utility_payment_count', 'num_children', 'num_credit_cards']
Final train shape (before MICE): (32000, 22)
Final test shape  (before MICE): (8000, 22)
After MICE - any nulls train? 0
After MICE - any nulls test?  0
Polynomial feature count: 253


  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descen

Best alpha from ElasticNetCV: 10.0
Best l1_ratio from ElasticNetCV: 0.9

ðŸ”¥ Test RMSE (TE + MICE + Poly + ElasticNet): 250.5222

Top 20 features by |coef|:
                                      feature      coef
162    num_transactions avg_transaction_value  0.469540
188     online_shopping_freq travel_frequency -0.327376
21                               card_type_te  0.282414
154                       tenure num_children -0.214990
97                    has_auto_loan region_te -0.149258
237             num_credit_cards card_type_te  0.148222
44                          age has_auto_loan  0.127644
232                num_credit_cards gender_te -0.122655
79                        owns_home region_te -0.110324
96           has_auto_loan education_level_te  0.101881
87        has_auto_loan avg_transaction_value -0.083725
94                    has_auto_loan gender_te  0.079396
148                   tenure num_transactions -0.079049
65                     owns_home credit_score  0.078172
17

  model = cd_fast.enet_coordinate_descent(


# Train on full dataset

In [4]:
import numpy as np
import pandas as pd

from sklearn.experimental import enable_iterative_imputer  # noqa
from sklearn.impute import IterativeImputer
from sklearn.linear_model import BayesianRidge, ElasticNetCV
from sklearn.preprocessing import PolynomialFeatures

# =========================================================
# 1. Setup: full X, y
# =========================================================
target_col = "monthly_spend"
df = data.copy()

X_full = df.drop(columns=[target_col]).reset_index(drop=True)
y_full = df[target_col].reset_index(drop=True)

# Split the feature names by dtype.
num_cols = list(X_full.select_dtypes(include=["number"]).columns)
cat_cols = list(X_full.select_dtypes(include=["object", "category"]).columns)

# Cast categoricals to string, then fill.
# NOTE: because astype(str) runs first, a missing value becomes the literal
# text "nan" and the fillna() that follows finds nothing to fill. The scoring
# cell applies the same order, so train and scoring labels agree; change both
# together if the "Missing" label is wanted.
if cat_cols:
    X_full[cat_cols] = X_full[cat_cols].astype(str).fillna("Missing")

# =========================================================
# 2. Target Encoding on FULL DATASET
# =========================================================
def target_encode_full(X, y, cat_columns, smoothing=10):
    """Target-encode `cat_columns` of X using statistics from all rows of (X, y).

    Each category is replaced by a blend of its own target mean and the
    overall target mean; the blend weight is a sigmoid of
    (category count - smoothing). Values with no mapped category (including
    missing values) receive the overall mean.

    X and y are matched on their index labels.

    Returns:
        DataFrame indexed like X with one "<col>_te" column per entry in
        `cat_columns`.
    """
    prior = y.mean()
    encoded = pd.DataFrame(index=X.index)

    for name in cat_columns:
        per_category = y.groupby(X[name]).agg(["mean", "count"])

        # Weight on the category's own mean grows with its row count.
        weight = 1 / (1 + np.exp(-(per_category["count"] - smoothing)))
        lookup = prior * (1 - weight) + per_category["mean"] * weight

        encoded[name + "_te"] = X[name].map(lookup).fillna(prior)

    return encoded

# Encode every categorical column with statistics from the full dataset.
X_te_full = target_encode_full(X=X_full, y=y_full, cat_columns=cat_cols)

# =========================================================
# 3. Combine numeric + TE features
# =========================================================
X_num_full = X_full[num_cols].copy()

# Both parts get a fresh RangeIndex so the column-wise concat pairs rows by position.
X_final_full = pd.concat(
    [X_num_full.reset_index(drop=True), X_te_full.reset_index(drop=True)],
    axis="columns",
)

# =========================================================
# 4. MICE Imputation on FULL dataset
# =========================================================
mice_full = IterativeImputer(
    estimator=BayesianRidge(),
    initial_strategy="median",
    max_iter=10,
    random_state=42,
)

# fit_transform returns a plain array; wrap it to keep the column names.
X_full_imputed = pd.DataFrame(
    mice_full.fit_transform(X_final_full),
    columns=X_final_full.columns,
)

# =========================================================
# 5. Polynomial Expansion on FULL dataset
# =========================================================
degree = 2  # or what you found best earlier

# Pairwise interaction terms only: no pure powers, no constant column.
poly_full = PolynomialFeatures(degree=degree, interaction_only=True, include_bias=False)
poly_full.fit(X_full_imputed)

X_full_poly = poly_full.transform(X_full_imputed)
poly_feature_names_full = poly_full.get_feature_names_out(X_full_imputed.columns)

# =========================================================
# 6. Fit FINAL ElasticNet model on FULL dataset
# =========================================================
# Same hyper-parameter grid as the train/test experiment, now cross-validated
# on every available row.
enet_full = ElasticNetCV(
    alphas=[0.0001, 0.001, 0.01, 0.1, 1, 10],
    l1_ratio=[0.1, 0.3, 0.5, 0.7, 0.9],
    cv=5,
    max_iter=5000,
    random_state=42,
).fit(X_full_poly, y_full)

print("FINAL MODEL FITTED")
print("Best alpha:", enet_full.alpha_)
print("Best l1_ratio:", enet_full.l1_ratio_)
print("Number of features:", X_full_poly.shape[1])

# =========================================================
# OPTIONAL: Show top coefficients
# =========================================================
# One row per expanded feature, plus its absolute coefficient for ranking.
coef_df = pd.DataFrame(
    {"feature": poly_feature_names_full, "coef": enet_full.coef_}
).assign(abs_coef=lambda table: table["coef"].abs())

print("\nTop 20 FINAL features:")
print(coef_df.sort_values("abs_coef", ascending=False).head(20))


  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descen

FINAL MODEL FITTED
Best alpha: 10.0
Best l1_ratio: 0.9
Number of features: 253

Top 20 FINAL features:
                                      feature      coef  abs_coef
162    num_transactions avg_transaction_value  0.461052  0.461052
21                               card_type_te  0.216846  0.216846
188     online_shopping_freq travel_frequency -0.170801  0.170801
154                       tenure num_children -0.136621  0.136621
237             num_credit_cards card_type_te  0.131579  0.131579
97                    has_auto_loan region_te -0.119725  0.119725
126             credit_score num_credit_cards -0.104363  0.104363
96           has_auto_loan education_level_te  0.099017  0.099017
153              tenure utility_payment_count -0.075553  0.075553
55                           age num_children -0.074899  0.074899
148                   tenure num_transactions -0.064858  0.064858
179        avg_transaction_value num_children -0.053205  0.053205
51                   age online_shoppin

  model = cd_fast.enet_coordinate_descent(


In [6]:
from sklearn.metrics import root_mean_squared_error

# In-sample check: predictions are made on the same matrix enet_full was
# fitted on, so this RMSE is not an out-of-sample estimate.
y_hat_full = enet_full.predict(X_full_poly)

root_mean_squared_error(y_full, y_hat_full)

250.67539874379463

In [8]:
# -----------------------------------------------------------
# Build TE mapping from FULL TRAINING DATA (X_full, y_full)
# -----------------------------------------------------------

def build_te_mapping_full(X_full, y_full, cat_cols, smoothing=10):
    """Build per-column category -> smoothed target mean lookups.

    For each column in `cat_cols`, every category's target mean is blended
    with the overall target mean using a sigmoid weight of
    (category count - smoothing).

    X_full and y_full are matched on their index labels.

    Returns:
        (te_mapping, global_mean) where te_mapping maps each column name to a
        Series indexed by category, and global_mean is the mean of y_full
        (the fallback for categories absent from the mapping).
    """
    global_mean = y_full.mean()
    te_mapping = {}

    for column in cat_cols:
        per_category = y_full.groupby(X_full[column]).agg(["mean", "count"])

        # Sigmoid weight: categories with many rows lean on their own mean.
        weight = 1 / (1 + np.exp(-(per_category["count"] - smoothing)))
        te_mapping[column] = (
            global_mean * (1 - weight) + per_category["mean"] * weight
        )

    return te_mapping, global_mean


# Rebuild the category lookups (and the fallback mean) from the full training data.
te_mapping, target_global_mean = build_te_mapping_full(
    X_full=X_full,
    y_full=y_full,
    cat_cols=cat_cols,
)

print("Target encoding mappings ready.")


Target encoding mappings ready.


For scoring

In [9]:
# ===============================================
# Load Scoring Data
# ===============================================
scoring = pd.read_csv("Datasets/scoring_data.csv")

# Work on a copy so the raw frame (used later for customer_id) stays untouched.
sc = scoring.copy()

# Ensure categorical columns are strings.
# The cast order deliberately mirrors the one used when the full training
# frame was prepared (string cast first, then fillna): after astype(str) a
# missing value is the literal text "nan", and the target-encoding lookups
# were built from labels produced the same way.
# Guarded like the training cells so an empty cat_cols list is a no-op.
if cat_cols:
    sc[cat_cols] = sc[cat_cols].astype(str).fillna("Missing")

# ===============================================
# Apply Target Encoding (same as training)
# ===============================================
# Look each category up in the precomputed full-training mapping; categories
# without an entry fall back to the training target mean.
scoring_te = pd.DataFrame(
    {
        col + "_te": sc[col].map(te_mapping[col]).fillna(target_global_mean)
        for col in cat_cols
    },
    index=sc.index,
)

# ===============================================
# Combine Numeric + Target Encoded Features
# ===============================================
scoring_num = sc[num_cols].copy()
scoring_final = pd.concat(
    [scoring_num.reset_index(drop=True), scoring_te.reset_index(drop=True)],
    axis="columns",
)

# ===============================================
# Apply MICE Imputation (trained imputer)
# ===============================================
# transform() only: the imputer fitted on the full training data is reused.
X_scoring_imputed = mice_full.transform(scoring_final)
X_scoring_imputed_df = pd.DataFrame(
    X_scoring_imputed,
    columns=scoring_final.columns,
)

# ===============================================
# Polynomial Features (trained transformer)
# ===============================================
X_scoring_poly = poly_full.transform(X_scoring_imputed_df)

# ===============================================
# Predict using FINAL ElasticNet model
# ===============================================
scoring_pred = enet_full.predict(X_scoring_poly)

# ===============================================
# Export Submission File
# ===============================================
submission = pd.DataFrame({
    "customer_id": scoring["customer_id"],
    "monthly_spend": scoring_pred
})

# Single source of truth for the output path, so the confirmation message
# always names the file that was actually written.
submission_path = "Submissions/submission_file_15.csv"
submission.to_csv(submission_path, index=False)

print(f"Scoring completed. Saved to {submission_path}")


Scoring completed. Saved to Submissions/submission_file_final.csv
