In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joblib

from i import svg_dir, data_dir, model_dir, submissions_dir, model_pred_train_dir, model_pred_test_dir
from i import FeaturesToLinearPipeline
from sklearn.model_selection import StratifiedKFold, cross_val_predict, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegressionCV

# Shared cross-validation splitter: 5 stratified folds, shuffled with a fixed seed
# so every model in this notebook is evaluated on the same splits.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# The first CSV column is used as the row index of each frame.
train = pd.read_csv(data_dir + "train.csv", index_col=0)
test = pd.read_csv(data_dir + "test.csv", index_col=0)

# Cast every object-typed column to pandas' "category" dtype, frame by frame.
# NOTE(review): train and test are cast independently, so the category sets of
# a column may differ between the two frames - confirm the downstream feature
# pipeline tolerates that.
train = train.astype(
    {name: "category" for name in train.select_dtypes(include="object").columns}
)
test = test.astype(
    {name: "category" for name in test.select_dtypes(include="object").columns}
)

# The last column of `train` is taken as the target; everything before it is a feature.
X, y = train.iloc[:, :-1], train.iloc[:, -1]
X_pred = test

In [None]:
def _linear_pipeline(step_name, **penalty_options):
    """Build a feature-prep + LogisticRegressionCV pipeline.

    Parameters
    ----------
    step_name : str
        Name given to the classifier step inside the pipeline.
    **penalty_options
        Penalty-specific keyword arguments (``penalty``, ``solver``,
        ``l1_ratios``) forwarded to ``LogisticRegressionCV``.

    Returns
    -------
    Pipeline
        Unfitted two-step pipeline: ``"feature"`` followed by ``step_name``.
    """
    classifier = LogisticRegressionCV(
        random_state=42,
        cv=cv,
        class_weight="balanced",
        max_iter=2000,
        n_jobs=-1,
        **penalty_options,
    )
    # NOTE(review): the same FeaturesToLinearPipeline object is placed in every
    # pipeline built here. If it is an estimator instance, each .fit() refits
    # that one shared object - consider cloning it per pipeline.
    return Pipeline([("feature", FeaturesToLinearPipeline), (step_name, classifier)])


# NOTE(review): the inner C search uses LogisticRegressionCV's default `scoring`,
# while the outer evaluation below uses roc_auc - confirm the mismatch is intended.
lasso = _linear_pipeline("lasso", penalty="l1", solver="saga")    # saga supports L1
ridge = _linear_pipeline("ridge", penalty="l2", solver="lbfgs")   # default solver, supports L2
elast = _linear_pipeline(
    "elastic",
    penalty="elasticnet",
    solver="saga",
    l1_ratios=[0.5],  # mix between L1 (lasso) and L2 (ridge)
)

# Fit all three on the full training data first, then persist them.
# The fit-then-dump order is kept deliberately: the pipelines share their
# feature step, so every dump happens after the last fit touched it.
for model in (lasso, ridge, elast):
    model.fit(X, y)

for model, filename in ((lasso, "lasso.joblib"), (ridge, "ridge.joblib"), (elast, "elast.joblib")):
    joblib.dump(model, model_dir + filename)


def _cv_auc(model):
    """Return the per-fold ROC-AUC of ``model`` on (X, y) using the shared ``cv`` splitter."""
    return cross_val_score(model, X, y, cv=cv, scoring="roc_auc")


score_lasso = _cv_auc(lasso)
print(f"lasso - Score per fold: {score_lasso} - Average Score: {score_lasso.mean():0.4f}")
score_ridge = _cv_auc(ridge)
print(f"ridge - Score per fold: {score_ridge} - Average Score: {score_ridge.mean():0.4f}")
score_elast = _cv_auc(elast)
print(f"elast - Score per fold: {score_elast} - Average Score: {score_elast.mean():0.4f}")

In [None]:
def _scaled_abs_coef(pipeline, step_name):
    """Return the flattened |coef_| of ``step_name``, divided by its largest entry.

    NOTE(review): if every coefficient is zero the maximum is 0 and the result
    is NaN - not guarded here, same as before.
    """
    magnitude = np.abs(pipeline.named_steps[step_name].coef_.ravel())
    return magnitude / magnitude.max()


# One column of relative coefficient magnitudes per model, indexed by the
# feature names recorded on the fitted lasso classifier.
linearFrame = pd.DataFrame(
    {
        "lasso": _scaled_abs_coef(lasso, "lasso"),
        "ridge": _scaled_abs_coef(ridge, "ridge"),
        "elast": _scaled_abs_coef(elast, "elastic"),
    },
    index=lasso.named_steps["lasso"].feature_names_in_,
)

# Consensus score: row-wise mean of the three per-model columns.
linearFrame["AverageImportance"] = linearFrame.mean(axis=1)
linearFrame.sort_values("AverageImportance", ascending=False)

In [None]:
# Ten rows with the highest consensus importance; the index is moved into a
# regular "Feature" column so seaborn can use it as the categorical axis.
top10 = (
    linearFrame
    .sort_values("AverageImportance", ascending=False)
    .head(10)
    .reset_index()
    .rename(columns={"index": "Feature"})
)

# Theme is applied before the figure is created.
sns.set_theme(style="whitegrid")

# Horizontal bars: importance on x, one feature per row on y.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(
    data=top10,
    x="AverageImportance",
    y="Feature",
    hue="Feature",      # hue mirrors y so the palette is applied per bar
    palette="icefire",
    ax=ax,
)

ax.set_title(
    "Top 10 Features - Consensus Linear Importance\n(Lasso + Ridge + ElasticNet)",
    fontsize=13,
    pad=12,
)

# Write each bar's value at its tip.
for bars in ax.containers:
    ax.bar_label(bars, fmt="%.3f", label_type="edge", fontsize=9, padding=2)

fig.tight_layout()
plt.show()

In [None]:
# Output column name -> pipeline, in the order the columns are produced.
_linear_models = {"lasso": lasso, "ridge": ridge, "elast": elast}

# Cross-validated probabilities on the training rows: column 1 of predict_proba
# from cross_val_predict, using the shared `cv` splitter.
prediction_train = pd.DataFrame(
    {
        name: cross_val_predict(model, X, y, cv=cv, method="predict_proba")[:, 1]
        for name, model in _linear_models.items()
    },
    index=X.index,
)

# Test-set probabilities from the pipelines as currently fitted.
prediction_test = pd.DataFrame(
    {name: model.predict_proba(X_pred)[:, 1] for name, model in _linear_models.items()},
    index=X_pred.index,
)

prediction_train.to_parquet(model_pred_train_dir + "train_prob_linear.parquet")
# NOTE(review): the test predictions are written under a file name starting with
# "train_" - looks like a copy-paste slip. Left unchanged because other notebooks
# may load this exact path; rename together with its readers if confirmed.
prediction_test.to_parquet(model_pred_test_dir + "train_prob_linear.parquet")