In [1]:
import os
os.chdir('../')

In [2]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.linear_model import BayesianRidge

# Load data
data = pd.read_csv("Datasets/analysis_data.csv")

y = data["monthly_spend"]
X = data.drop(columns=["monthly_spend"])

# Identify column types
categorical_columns = X.select_dtypes(include=['object', 'category']).columns.tolist()
numeric_columns = X.select_dtypes(include=['number']).columns.tolist()

# OneHot Encode
preprocessor = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore", sparse_output=False), categorical_columns)
    ],
    remainder="passthrough"
)

X_encoded = preprocessor.fit_transform(X)
encoded_cat_cols = preprocessor.named_transformers_["cat"].get_feature_names_out(categorical_columns)
new_cols = list(encoded_cat_cols) + numeric_columns

X_encoded_df = pd.DataFrame(X_encoded, columns=new_cols)

# MICE Imputation using BayesianRidge
mice = IterativeImputer(
    estimator=BayesianRidge(),
    max_iter=10,
    initial_strategy='mean',
    random_state=42
)

X_imputed = mice.fit_transform(X_encoded_df)
X_imputed_df = pd.DataFrame(X_imputed, columns=new_cols)


In [3]:
from sklearn.preprocessing import PolynomialFeatures

# Polynomial features (degree=2 interactions only)
poly = PolynomialFeatures(
    degree=2,
    interaction_only=True,   # avoids squared terms if you want
    include_bias=False
)

X_poly = poly.fit_transform(X_imputed_df)
poly_feature_names = poly.get_feature_names_out(X_imputed_df.columns)


In [4]:
from sklearn.linear_model import ElasticNetCV

# Elastic Net without scaling X or y
enet = ElasticNetCV(
    l1_ratio=[0.1, 0.3, 0.5, 0.7, 0.9],
    alphas=[0.0001, 0.001, 0.01, 0.1, 1, 10],
    cv=5,
    random_state=42,
    max_iter=5000
)

enet.fit(X_poly, y)

print("Best alpha:", enet.alpha_)
print("Best l1_ratio:", enet.l1_ratio_)


  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descen

Best alpha: 10.0
Best l1_ratio: 0.9


  model = cd_fast.enet_coordinate_descent(


In [5]:
from sklearn.metrics import root_mean_squared_error

y_hat=enet.predict(X_poly)
root_mean_squared_error(y,y_hat)

249.84511327037154

In [6]:
# Load scoring data
scoring = pd.read_csv("Datasets/scoring_data.csv")

# Apply same OneHot encoder
X_scoring_encoded = preprocessor.transform(scoring)
numeric_scoring_cols = scoring.select_dtypes(include=['number']).columns.tolist()
X_scoring_encoded_df = pd.DataFrame(X_scoring_encoded, columns=new_cols)

# MICE imputation
X_scoring_imputed = mice.transform(X_scoring_encoded_df)
X_scoring_imputed_df = pd.DataFrame(X_scoring_imputed, columns=new_cols)

# Polynomial features
X_scoring_poly = poly.transform(X_scoring_imputed_df)

# Predict
pred = enet.predict(X_scoring_poly)

# Export submission
submission = pd.DataFrame({
    "customer_id": scoring["customer_id"],
    "monthly_spend": pred
})
submission.to_csv("Submissions/submission_file_12.csv", index=False)
