---
# 🔰 Out of the Box Performance

In [None]:
### Train and evaluate
def train_and_score_model(X_train, X_val, y_train, y_val, model, target, 
                          task = "regression", 
                          verbose = False, 
                          TargetTransformer = None):
    """
    Fit `model` on the training split, predict on the validation split and score it.

    Parameters
    ----------
    X_train, y_train : training features / target, passed unchanged to `model.fit`.
    X_val, y_val : validation features / target used for prediction and scoring.
        `y_val` may be a pandas object, a numpy array or a plain sequence.
    model : estimator exposing `fit` and `predict` (and `predict_proba` when
        `task` is "classification_probability").
    target : not used inside this function; kept so existing callers keep working.
    task : "regression", "classification" or "classification_probability".
        The same string is forwarded to `calculate_score` as the metric name.
    verbose : when truthy, draw diagnostics with `plot_training_results`;
        otherwise print the score.
    TargetTransformer : optional object with an `inverse_transform` method. When
        given, truth and predictions are both mapped back through it before scoring.

    Returns
    -------
    (model, score) : the fitted model and its validation score. For an
        unrecognized `task` a message is printed and `(model, None)` is returned
        (the model has still been fitted at that point).
    """
    model.fit(X_train, y_train)
    if task in ("regression", "classification"):
        y_predict = model.predict(X_val)
    elif task == "classification_probability":
        # keep only the second probability column
        y_predict = model.predict_proba(X_val)[:, 1]
    else:
        print(f"Unknown task {task}")
        return model, None

    # Column vectors for both truth and prediction; np.array accepts Series,
    # ndarrays and lists alike, so y_val does not have to be a pandas object.
    y_v = np.array(y_val).reshape(-1, 1)
    y_p = np.array(y_predict).reshape(-1, 1)
    if TargetTransformer is not None:
        # score in the original target units
        y_v = TargetTransformer.inverse_transform(y_v)
        y_p = TargetTransformer.inverse_transform(y_p)

    score = calculate_score(y_v, y_p, metric = task)
    if verbose:
        plot_training_results(X_train, X_val, y_train, y_v, y_p,
                              task=task, TargetTransformer=TargetTransformer)
    else:
        print(f"***  model score:  {score:.4f}  ***")
    return model, score

def get_feature_importance(X_train, X_val, y_train, y_val, target, verbose = False, task = "regression"):
    """
    Fit a default LightGBM model and rank the training columns by feature importance.

    An LGBMRegressor is used for "regression" and an LGBMClassifier for
    "classification" / "classification_probability"; the model is fitted through
    `train_and_score_model`. The result is a pandas Series named "importance",
    indexed by `X_train.columns` and sorted in descending order.

    The top feature is always printed and a horizontal bar chart of the first ten
    entries is drawn. With `verbose` set, the first and last 12 entries and the
    number of zero-importance features are printed as well.

    Returns None (after printing a message) when `task` is not recognized.
    """
    if task == "regression":
        estimator = lgb.LGBMRegressor(verbose = -1)
    elif task in ("classification", "classification_probability"):
        estimator = lgb.LGBMClassifier(verbose = -1)
    else:
        print("!!!  Task not recognized   !!!")
        return None

    # only the fitted model is needed here; the score is discarded
    fitted_model, _ = train_and_score_model(X_train, X_val, y_train, y_val,
                                            model = estimator, target = target,
                                            verbose = False, task = task)

    ranking = pd.Series(fitted_model.feature_importances_, name="importance", index=X_train.columns)
    ranking = ranking.sort_values(ascending=False)
    n_features = len(ranking)
    rule = "=" * 69

    print(rule)
    print(f"  ***  Top feature is: {ranking.index[0]}  *** \n")
    ranking[:10].plot(kind = 'barh', title = f"Top {min(10, n_features)} of {n_features} Features")

    if not verbose:
        return ranking

    print(rule)
    print("  Top Features:")
    print(ranking.head(12))
    print(rule)
    print("  Bottom Features:")
    print(rule)
    print(ranking.tail(12))
    print(rule)
    print(f"Zero importance features: {(ranking == 0).sum()} of {n_features}")
    return ranking


def plot_training_results(X_t, X_v, y_t, y_v, y_p, task = 'regression', TargetTransformer = None):
    """
    Plot the trained model's validation predictions next to a simple baseline model.

    A baseline (Ridge for "regression", GaussianNB for anything else) is fitted here
    on the non-object / non-string columns of `X_t` against `y_t` and used to predict
    on `X_v`. The already-computed predictions `y_p` are then compared with the
    truth `y_v` and with the baseline in a 2x3 grid:

    - "regression": actual-vs-predicted scatter (model and baseline), value
      distribution, residual histogram.
    - "classification": confusion matrix of the model, value distribution,
      confusion matrix of the baseline.
    - "classification_probability": ROC curves (model and baseline), value
      distribution, confusion matrix of the rounded probabilities.

    Any other `task` produces an empty figure.

    Parameters
    ----------
    X_t, y_t : training features / target used only to fit the baseline.
    X_v : validation features the baseline predicts on.
    y_v, y_p : validation truth and trained-model predictions to visualize.
        NOTE(review): the visible caller passes both as (n, 1) arrays already
        mapped back to original units - confirm for other callers.
    task : "regression", "classification" or "classification_probability".
    TargetTransformer : optional object with `inverse_transform`; applied to the
        baseline's `predict` output (not to `predict_proba` output).
    """
    if task == "regression": base_model = skl.linear_model.Ridge()
    else: base_model = skl.naive_bayes.GaussianNB()
    # the baseline models cannot consume text columns, so drop them
    numeric_features = [f for f in X_t.columns.tolist() if X_t[f].dtype != "object" and X_t[f].dtype != "string"]

    base_model.fit(X_t[numeric_features], y_t)
    
    if task == "classification_probability":
        # second probability column, as a column vector
        y_base = base_model.predict_proba(X_v[numeric_features])[:, 1].reshape(-1, 1)
    else:
        y_base = base_model.predict(X_v[numeric_features]).reshape(-1, 1)
        if TargetTransformer is not None:
            # bring the baseline predictions back to the units of y_v / y_p
            y_base = TargetTransformer.inverse_transform(y_base).reshape(-1, 1)
    
    def plot_regression_resid(ax):
        # Only the first 1000 points are drawn; baseline first, trained model on top.
        skl.metrics.PredictionErrorDisplay.from_predictions(y_v[:1000], y_base[:1000], kind = 'actual_vs_predicted',
                                                            scatter_kwargs={"color":'xkcd:gold', "alpha":0.8},
                                                            ax = ax)
        skl.metrics.PredictionErrorDisplay.from_predictions(y_v[:1000], y_p[:1000], kind = 'actual_vs_predicted', 
                                                            scatter_kwargs={"alpha":0.8},
                                                            line_kwargs={"color":'xkcd:dusty rose'},
                                                            ax = ax)
        ax.set_title(f"Trained Model {calculate_score(y_v, y_p, metric = task):.4f} vs Ridge {calculate_score(y_v, y_base, metric = task):.4f} RMSE")

    def plot_classification_cm(ax, predictions=y_p, title = "Trained"):
        # Confusion matrix normalized over all cells; defaults to the trained model.
        skl.metrics.ConfusionMatrixDisplay.from_predictions(y_v, predictions, cmap='bone_r', 
                                                            normalize='all', colorbar=False, ax=ax)
        ax.invert_yaxis()
        ax.set_title(f"{title} Model Accuracy {100*calculate_score(y_v, predictions, metric = 'accuracy'):.1f}%")

    def plot_classification_roc(ax):
        # ROC curves of the trained model and the baseline on the same axes.
        skl.metrics.RocCurveDisplay.from_predictions(y_v, y_p, ax=ax, name="Trained Model")
        skl.metrics.RocCurveDisplay.from_predictions(y_v, y_base, name="GaussianNB", ax=ax)
        ax.set_title("ROC Curve")

    def plot_distribution(ax):
        # Both histograms use the same bin count, derived from the truth values.
        n_bins = min(50, 2 + len(np.unique(y_v)))
        ax.hist(y_v, bins=n_bins, color='xkcd:silver', alpha=0.8, density = True)
        ax.hist(y_p, bins=n_bins, color='xkcd:ocean blue', alpha=0.9, density = True)
        ax.set_title("Prediction Distribution vs Training Distribution")
        ax.set_yticks([])
        ax.set_ylabel("Probability Density")

    def plot_residuals(ax):
        # Histogram of prediction minus truth.
        residuals = y_p - y_v
        ax.hist(residuals, bins=min(50,2+len(np.unique(residuals))), color='xkcd:dull green', alpha=0.9)
        ax.set_title("Residual Distribution")
        ax.set_yticks([])
        ax.set_ylabel("Count")

    fig = plt.figure(figsize=(9, 6))
    gs = mpl.gridspec.GridSpec(2, 3, figure=fig)

    # Main panel spans both rows of the first two columns; two small panels
    # are stacked in the third column.
    if task == "regression":
        plot_regression_resid(fig.add_subplot(gs[:, :2]))
        plot_distribution(fig.add_subplot(gs[0, 2]))
        plot_residuals(fig.add_subplot(gs[1, 2]))
            
    elif task == "classification":
        plot_classification_cm(fig.add_subplot(gs[:, :2]))
        plot_distribution(fig.add_subplot(gs[0, 2]))
        plot_classification_cm(fig.add_subplot(gs[1, 2]), 
                               predictions = y_base,
                               title = "GaussianNB")
    
    elif task == "classification_probability":
        plot_classification_roc(fig.add_subplot(gs[:, :2]))
        plot_distribution(fig.add_subplot(gs[0, 2]))
        # round the probabilities to hard labels for the confusion matrix
        plot_classification_cm(fig.add_subplot(gs[1, 2]), predictions=np.round(y_p))

    plt.tight_layout()
    plt.show()

In [None]:
### Training features: every column that is neither categorical nor a target
training_features = [
    column
    for column in XY.columns.tolist()
    if not (XY[column].dtype == "category" or column in targets)
]

### Split into train / validation / test sets (validation_size=0.2)
X_train, y_train, X_val, y_val, X_test, y_test = split_training_data(
    XY, training_features, target, validation_size=0.2
)

### Rank the features with an untuned model
feature_importance = get_feature_importance(
    X_train, X_val, y_train, y_val, target, task="regression"
)

In [None]:
### Out-of-the-box (untuned) model performance on the validation set
model = lgb.LGBMRegressor(n_jobs = CORES, verbose = -1)
_, score = train_and_score_model(
    X_train, X_val, y_train, y_val, model, target,
    task="regression",
    verbose=True,
)

#### 👀 Initial Model Observations and Notes
- Untuned LGBM (RMSE = 9.42) slightly outperforms Ridge Linear Regression (9.52)
- Model does a poor job of predicting at the target extremes <- improvement opportunity!!
- Residuals are normally distributed
- Model is predictive, but lots of noise!
---