In [1]:
from sklearn.model_selection import GridSearchCV

def hyperparameter_tuning(model, param_grid, X_train, y_train, cv=5):
    """Grid-search ``param_grid`` for ``model`` and return the best combination.

    Parameters
    ----------
    model : estimator
        Estimator handed to ``GridSearchCV``.
    param_grid : dict or list of dict
        Candidate values, forwarded unchanged to ``GridSearchCV``.
    X_train, y_train
        Data the search is fitted on.
    cv : int or splitter, default 5
        Cross-validation setting forwarded to ``GridSearchCV``.

    Returns
    -------
    dict
        The ``best_params_`` attribute of the fitted search.
    """
    # n_jobs=-1 and verbose=1 are fixed here; callers only control ``cv``.
    searcher = GridSearchCV(
        estimator=model,
        param_grid=param_grid,
        cv=cv,
        n_jobs=-1,
        verbose=1,
    )
    return searcher.fit(X_train, y_train).best_params_

In [2]:
from sklearn.model_selection import cross_val_score

def cross_validate(model, X, y, cv=5):
    """Return the per-fold scores of ``model`` on ``(X, y)``.

    Thin wrapper around ``cross_val_score``; ``cv`` is forwarded unchanged and
    no ``scoring`` is passed, so ``cross_val_score``'s own default applies.
    """
    return cross_val_score(estimator=model, X=X, y=y, cv=cv)

In [3]:
from sklearn.metrics import classification_report

def evaluate_model(model, X_test, y_test):
    """Print the classification report of ``model`` on the given test data.

    ``model`` must already be fitted: it is only asked to ``predict``.
    Nothing is returned; the report is written to stdout.
    """
    predictions = model.predict(X_test)
    print(classification_report(y_test, predictions))

In [4]:
from sklearn.metrics import roc_curve, roc_auc_score

def plot_roc_curve(model, X_test, y_test):
    """Plot the ROC curve of a fitted classifier and show it.

    The scores are taken from column 1 of ``model.predict_proba(X_test)``.
    The legend entry carries the ROC AUC rounded to two decimals, and a dashed
    diagonal from (0, 0) to (1, 1) is drawn for reference.

    Relies on ``plt`` (matplotlib.pyplot) being available in the notebook
    namespace. Returns nothing.
    """
    positive_scores = model.predict_proba(X_test)[:, 1]
    false_pos_rate, true_pos_rate, _thresholds = roc_curve(y_test, positive_scores)

    plt.figure(figsize=(10, 6))

    # Curve of the model, labelled with its AUC
    auc_label = f'ROC AUC: {roc_auc_score(y_test, positive_scores):.2f}'
    plt.plot(false_pos_rate, true_pos_rate, label=auc_label)

    # Dashed diagonal reference line
    plt.plot([0, 1], [0, 1], 'k--')

    plt.xlabel('False Positive Rate (FPR)')
    plt.ylabel('True Positive Rate (TPR)')
    plt.title('ROC Curve')
    plt.legend()
    plt.show()

In [6]:
# Functions
def train_and_predict(model, X_train, y_train, X_test, cv=25, n_jobs=8, scoring='roc_auc'):
    """Fit ``model``, print training diagnostics, and return its predictions.

    Three lines are printed:

    1. the number of training points the fitted model mislabels,
    2. the corresponding error rate on the training set,
    3. the mean cross-validated score on the training data.

    Parameters
    ----------
    model : estimator
        Fitted in place on ``(X_train, y_train)``.
    X_train, y_train
        Training data; ``X_train`` must expose ``.shape``.
    X_test
        Data to predict after fitting.
    cv : int or splitter, default 25
        Forwarded to ``cross_val_score``. The default keeps the fold count this
        function has always used.
    n_jobs : int or None, default 8
        Forwarded to ``cross_val_score``.
    scoring : str or callable, default 'roc_auc'
        Forwarded to ``cross_val_score``.

    Returns
    -------
    tuple
        ``(y_pred_train, y_pred_test)`` — predictions on the training and test data.
    """
    # Fit the model on the training data
    model.fit(X_train, y_train)

    # Predictions on the data the model was fitted on, and on the held-out data
    y_pred_train = model.predict(X_train)
    y_pred_test = model.predict(X_test)

    n_train = X_train.shape[0]
    n_mislabeled = (y_train != y_pred_train).sum()

    # Number of mislabeled points in the training set
    print("Number of mislabeled points out of a total of %d points : %d" % (n_train, n_mislabeled))

    # Error on the training set itself. No folds are involved here, so the
    # message no longer claims it was computed "over 10 folds".
    print("Empirical error on the training set: {:.2%}".format(n_mislabeled / n_train))

    # Cross-validated score; cross_val_score works on clones of ``model``.
    scores = cross_val_score(model, X_train, y_train, cv=cv, n_jobs=n_jobs, scoring=scoring)

    # Report the fold count and metric actually used rather than a hard-coded "10 folds".
    print("Cross-validation score ({}) over {} folds : {:.3%}".format(scoring, len(scores), scores.mean()))

    return y_pred_train, y_pred_test

In [7]:
# Example usage
from sklearn.ensemble import RandomForestClassifier

# Hyperparameter Tuning
# NOTE: X_train, y_train, X_test and y_test are not created in this cell; they
# must already exist in the kernel, otherwise the cell stops with a NameError.
rf_params = {
    'n_estimators': [10, 50, 100],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10]
}
# A fixed random_state makes the forests built during the search reproducible,
# so the selected parameters do not change from one run to the next.
best_params = hyperparameter_tuning(RandomForestClassifier(random_state=42), rf_params, X_train, y_train)
print(f"Best Parameters: {best_params}")

# Instantiate model with best parameters (same seed as used during the search)
rf_model = RandomForestClassifier(random_state=42, **best_params)
rf_model.fit(X_train, y_train)

# Cross-Validation on the training data
scores = cross_validate(rf_model, X_train, y_train)
print(f"Cross-Validation Scores: {scores}")

# Model Evaluation on the test data (prints a classification report)
evaluate_model(rf_model, X_test, y_test)

# Plotting ROC Curve
plot_roc_curve(rf_model, X_test, y_test)

NameError: name 'X_train' is not defined

In [8]:
# Display names for the two classes; presumably passed as the `classes` argument
# of plot_confusion_matrix_and_report, which uses them as tick labels.
# NOTE(review): the order should match the row/column order of the confusion
# matrix — confirm against how the labels are encoded.
classes = ['Spam', 'Ham']

def plot_confusion_matrix_and_report(y_test, y_pred, classes, ax):
    """Draw the confusion matrix of ``y_pred`` against ``y_test`` as a heatmap on ``ax``.

    Parameters
    ----------
    y_test, y_pred
        True and predicted labels, passed to ``confusion_matrix``.
    classes : sequence of str
        Tick labels for both axes; one tick is placed per entry.
    ax : matplotlib Axes
        Axes the heatmap is drawn on and whose labels/title are set.

    Despite the function name, only the plot is produced: no textual report
    is printed or returned.
    """
    matrix = confusion_matrix(y_test, y_pred)

    # Annotated integer heatmap drawn on the caller's axes
    sns.heatmap(matrix, annot=True, fmt='d', cmap='Blues', ax=ax)

    # One tick per class, offset by 0.5 from the class index
    tick_positions = np.arange(len(classes)) + 0.5
    ax.set_xticks(tick_positions)
    ax.set_xticklabels(classes, rotation=45)
    ax.set_yticks(tick_positions)
    ax.set_yticklabels(classes, rotation=45)

    # Axis labels and title
    ax.set(xlabel='Predicted', ylabel='True', title='Confusion Matrix')

In [9]:
# # Defining learning curve plot

# def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None, n_jobs=None, train_sizes=np.linspace(.1, 1.0, 5)):
#     plt.figure()
#     plt.title(title)
#     if ylim is not None:
#         plt.ylim(*ylim)
#     plt.xlabel("Training examples")
#     plt.ylabel("Score")
#     train_sizes, train_scores, test_scores = learning_curve(estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)
#     train_scores_mean = np.mean(train_scores, axis=1)
#     train_scores_std = np.std(train_scores, axis=1)
#     test_scores_mean = np.mean(test_scores, axis=1)
#     test_scores_std = np.std(test_scores, axis=1)
#     plt.grid()
#     plt.fill_between(train_sizes, train_scores_mean - train_scores_std, train_scores_mean + train_scores_std, alpha=0.1, color="r")
#     plt.fill_between(train_sizes, test_scores_mean - test_scores_std, test_scores_mean + test_scores_std, alpha=0.1, color="g")
#     plt.plot(train_sizes, train_scores_mean, 'o-', color="r", label="Training score")
#     plt.plot(train_sizes, test_scores_mean, 'o-', color="g", label="Cross-validation score")
#     plt.legend(loc="best")
#     return plt

In [10]:
# # Define a function to plot the calibration curve for a given classifier
# def plot_calibration_curve(classifier, clf_name, X_train, y_train, X_test, y_test, ax):
#     # Create a calibrated version of the classifier
#     clf_calibrated = CalibratedClassifierCV(classifier, method='sigmoid', cv=5)
#     clf_calibrated.fit(X_train, y_train)
    
#     # Predict probabilities
#     probs = clf_calibrated.predict_proba(X_test)[:, 1]
    
#     # Plot
#     fraction_of_positives, mean_predicted_value = calibration_curve(y_test, probs, n_bins=10)
#     ax.plot(mean_predicted_value, fraction_of_positives, 's-', label=f'Calibrated ({clf_name})')
#     ax.plot([0, 1], [0, 1], 'k--', label='Perfectly calibrated')
#     ax.set_ylabel('Fraction of positives')  # Corrected here
#     ax.set_xlabel('Mean predicted probability')
#     ax.legend(loc='best')
#     ax.set_title(f'Calibration Curve for {clf_name}')

In [None]:
# Example of the manual tune -> fit -> evaluate workflow that the next cell is
# meant to automate.
# NOTE: X_train, y_train, X_test and y_test are not created in this cell; they
# must already exist in the kernel.

# Hyperparameter Tuning
rf_params = {
    'n_estimators': [10, 50, 100],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10]
}
# A fixed random_state makes the forests built during the search reproducible,
# so the selected parameters do not change from one run to the next.
best_params = hyperparameter_tuning(RandomForestClassifier(random_state=42), rf_params, X_train, y_train)
print(f"Best Parameters: {best_params}")

# Instantiate model with best parameters (same seed as used during the search)
rf_model = RandomForestClassifier(random_state=42, **best_params)
rf_model.fit(X_train, y_train)

# Cross-Validation on the training data
scores = cross_validate(rf_model, X_train, y_train)
print(f"Cross-Validation Scores: {scores}")

# Model Evaluation on the test data (prints a classification report)
evaluate_model(rf_model, X_test, y_test)

# Plotting ROC Curve
plot_roc_curve(rf_model, X_test, y_test)

In [11]:
# from sklearn.model_selection import GridSearchCV

# def train_and_predict(model, X_train, y_train, X_test):
#     """
#     Trains the model with the training data and returns the predictions on the test data.
#     """
#     model.fit(X_train, y_train)
#     y_pred = model.predict(X_test)
#     return y_pred

# def hyperparameter_tuning(model, param_grid, X_train, y_train, cv=5):
#     """
#     Tune hyperparameters of the given model and return the best parameters.
#     """
#     grid_search = GridSearchCV(model, param_grid, cv=cv, n_jobs=-1, verbose=1)
#     grid_search.fit(X_train, y_train)
#     return grid_search.best_params_

# def tune_train_predict(classifier, param_grid, X_train, y_train, X_test):
#     """
#     Tune the classifier's hyperparameters, train it with the best parameters,
#     and then make predictions on the test set.
#     Returns the predictions.
#     """
#     best_params = hyperparameter_tuning(classifier, param_grid, X_train, y_train)
#     print(f"Best Parameters for {classifier.__class__.__name__}: {best_params}")
    
#     classifier.set_params(**best_params)
#     classifier.fit(X_train, y_train)
    
#     return classifier.predict(X_test)