# APA Styling Functions

In [1]:
def set_styles():
    """Apply the APA plotting style to matplotlib and seaborn.

    Loads the APA matplotlib style sheet, sets ``mathtext.default`` to
    ``'regular'``, enables the inline backend with retina figures when an
    IPython shell is available, and applies the seaborn ``"white"`` style
    with black axes, labels and ticks.
    """
    # Select the APA style sheet
    plt.style.use('/kaggle/input/apa-mplstyle/apa.mplstyle')
    rc('mathtext', default='regular')

    # ``%matplotlib inline`` / ``%config ...`` are IPython-only syntax and are
    # not valid Python inside a function body.  Run the same magics through
    # the shell object, and skip them when no IPython shell is available.
    try:
        shell = get_ipython()
    except NameError:
        shell = None
    if shell is not None:
        shell.run_line_magic('matplotlib', 'inline')
        shell.run_line_magic('config', "InlineBackend.figure_format = 'retina'")

    # Grayscale colours, line styles, line widths and hatch patterns are
    # combined element-wise into one property cycle (all lists have 8 entries).
    grayscale_color_cycle = cycler('color', ['black', '0.8', '0.6', '0.4', '0.2', '0.1', '0.7', '0.3'])
    linestyle_cycle = cycler('linestyle', ['-', '--', '-.', ':', '-', '--', '-.', ':'])
    linewidth_cycle = cycler('linewidth', [1.2, 1.2, 1, 0.7, 0.5, 1, 0.8, 0.6])
    hatch_cycle = cycler('hatch', ['/', '\\', '|', '-', '+', 'x', 'o', 'O'])
    combined_cycle = grayscale_color_cycle + linestyle_cycle + linewidth_cycle + hatch_cycle

    # NOTE(review): seaborn's set_style appears to apply only the rc keys that
    # belong to its own style definition, so "axes.labelweight",
    # "axes.titleweight" and "axes.prop_cycle" may be ignored here -- confirm
    # against the seaborn documentation before relying on them.
    sns.set_style("white", rc={"font.family": "sans-serif",
                               "axes.facecolor": "white",
                               "axes.edgecolor": "black",
                               "axes.labelcolor": "black",
                               "xtick.color": "black",
                               "ytick.color": "black",
                               "axes.labelweight": "bold",
                               "axes.titleweight": "bold",
                               "axes.prop_cycle": combined_cycle})

In [2]:
def create_countplot(df, column, title):
    """Show a count plot of ``column`` with grayscale, hatched bars.

    Args:
        df: Data passed to ``sns.countplot`` as ``data``.
        column: Name of the column whose values are counted (x axis).
        title: Title placed above the plot.
    """
    # Each bar gets a (shade, hatch) pair; the pairs repeat after eight bars.
    shades = ['black', '0.8', '0.6', '0.4', '0.2', '0.1', '0.7', '0.3']
    hatches = ['/', '\\', '|', '-', '+', 'x', 'o', 'O']
    bar_styles = list(zip(shades, hatches))

    plt.figure(figsize=(10, 6))
    axes = sns.countplot(data=df, x=column)

    for index, patch in enumerate(axes.patches):
        shade, hatch = bar_styles[index % len(bar_styles)]
        patch.set_facecolor(shade)
        patch.set_linestyle('-')
        patch.set_linewidth(1.2)
        patch.set_hatch(hatch)

    plt.title(title)
    plt.show()

def create_lineplot(df, x, y, title):
    """Show a seaborn line plot of ``y`` against ``x``.

    Args:
        df: Data passed to ``sns.lineplot`` as ``data``.
        x: Column used for the x axis.
        y: Column used for the y axis.
        title: Title placed above the plot.
    """
    figure_size = (10, 6)
    plt.figure(figsize=figure_size)
    sns.lineplot(x=x, y=y, data=df)
    plt.title(title)
    plt.show()
    
def create_scatterplot(df, x, y, title):
    """Show a scatter plot of ``y`` against ``x`` with solid black markers.

    Args:
        df: Data passed to ``sns.scatterplot`` as ``data``.
        x: Column used for the x axis; also used as the x-axis label.
        y: Column used for the y axis; also used as the y-axis label.
        title: Title placed above the plot.
    """
    # Black markers without an outline; ``s`` is the marker size.
    marker_options = {'color': 'black', 'edgecolor': 'none', 's': 50}

    plt.figure(figsize=(8, 6))
    sns.scatterplot(data=df, x=x, y=y, **marker_options)
    plt.xlabel(x)
    plt.ylabel(y)
    plt.title(title)
    plt.show()

def create_boxplot(df, x, y, title):
    """Show a black seaborn box plot of ``y`` grouped by ``x``.

    Args:
        df: Data passed to ``sns.boxplot`` as ``data``.
        x: Column that defines the groups (x axis).
        y: Column whose distribution is drawn (y axis).
        title: Title placed above the plot.
    """
    figure_size = (10, 6)
    plt.figure(figsize=figure_size)
    sns.boxplot(x=x, y=y, data=df, color='black')
    plt.title(title)
    plt.show()
    
def create_boxplot_with_count(df, x, y, title):
    """Show a black box plot of ``y`` by ``x`` with an ``n: <count>`` label per box.

    Each label is drawn 0.5 data units above the median of its group.

    Args:
        df: DataFrame containing the ``x`` and ``y`` columns.
        x: Column that defines the groups (x axis); also the x-axis label.
        y: Column whose distribution is drawn; also the y-axis label.
        title: Title placed above the plot.
    """
    plt.figure(figsize=(10, 6))
    ax = sns.boxplot(data=df, y=y, x=x, color='black')

    # One groupby pass yields both statistics, indexed by group key.
    stats = df.groupby(x)[y].agg(['median', 'count'])
    medians = stats['median'].values
    counts = stats['count'].values
    row_by_key = {str(key): row for row, key in enumerate(stats.index)}

    # groupby sorts the group keys, while the boxes follow the order chosen by
    # seaborn, so the two orders can differ.  Match each box to its statistics
    # through the tick label text; if the texts cannot be matched (for example
    # when they are not populated yet), fall back to positional order.
    tick_texts = [label.get_text() for label in ax.get_xticklabels()]
    if all(text in row_by_key for text in tick_texts):
        rows = [row_by_key[text] for text in tick_texts]
    else:
        rows = range(min(len(tick_texts), len(stats)))

    for position, row in enumerate(rows):
        ax.text(position, medians[row] + 0.5, f"n: {counts[row]}",
                horizontalalignment='center', size='x-small',
                color='white', weight='semibold')

    plt.title(title)
    plt.xlabel(x)
    plt.ylabel(y)
    plt.show()

def create_heatmap(df, title, annotations=True):
    """Show a grayscale heatmap of ``df``.

    Args:
        df: Table passed to ``sns.heatmap``; the names of its columns and
            index are used as the axis labels.
        title: Title placed above the plot.
        annotations: Forwarded to ``sns.heatmap`` as ``annot``.
    """
    # Gray colour map, one-decimal annotations, thin black cell borders.
    heatmap_options = {
        'cmap': 'gray',
        'annot': annotations,
        'fmt': '.1f',
        'linewidths': 0.5,
        'linecolor': 'black',
    }

    plt.figure(figsize=(10, 6))
    sns.heatmap(df, **heatmap_options)
    plt.xlabel(df.columns.name)
    plt.ylabel(df.index.name)
    plt.title(title)
    plt.show()

In [3]:
def plot_confusion_matrix(y_true, y_pred, title=None, labels=None):
    """
    Plot a confusion matrix annotated with counts and row percentages.

    Each cell shows ``(count)`` and, on a second line, the share of that
    count within its true-label row.

    Args:
        y_true (array-like): True labels.
        y_pred (array-like): Predicted labels.
        title (str): Title for the confusion matrix plot; omitted when falsy.
        labels (list): Tick label names for both axes. They are only used
            for display and are not passed to ``confusion_matrix``. Defaults
            to ``'0'``, ``'1'``, ... for each row of the matrix.
    """
    cm = confusion_matrix(y_true, y_pred)

    # A class that never occurs in y_true has an all-zero row; dividing by its
    # total would emit a warning and render "nan%", so such rows stay at 0.
    row_totals = cm.sum(axis=1, keepdims=True)
    fractions = np.divide(cm, row_totals,
                          out=np.zeros(cm.shape, dtype=float),
                          where=row_totals != 0)
    percentages = fractions * 100

    annotations = [
        [f'({count})\n{percent:.2f}%' for count, percent in zip(count_row, percent_row)]
        for count_row, percent_row in zip(cm, percentages)
    ]
    if labels is None:
        labels = [str(i) for i in range(len(cm))]

    # fmt='' because the annotations are already formatted strings.
    sns.heatmap(cm, annot=annotations, fmt='', cmap='binary', cbar=False,
                xticklabels=labels, yticklabels=labels)
    plt.xlabel('Predicted Labels')
    plt.ylabel('True Labels')
    if title:
        plt.title(title)
    plt.show()

def plot_confusion_matrices_side_by_side(model_name, data, titles, labels=None):
    """Plot several binary confusion matrices next to each other.

    Every cell is annotated with its group name (True Neg, False Pos,
    False Neg, True Pos), its count and its share of all samples.

    Args:
        model_name: Text used as the figure-level title.
        data: Sequence of ``(y_true, y_pred)`` pairs, one per matrix.
        titles: Sequence of subplot titles, indexed like ``data``.
        labels: Optional list forwarded to ``confusion_matrix`` as ``labels``
            and used for the tick labels. The group names assume that the
            first label is the negative class. Defaults to ``None``, in
            which case the ticks read ``'0'`` and ``'1'``.

    Raises:
        ValueError: If a confusion matrix is not 2x2.
    """
    num_matrices = len(data)
    # squeeze=False keeps the axes in a 2-D array, so indexing also works
    # when only a single matrix is plotted.
    fig, axes = plt.subplots(1, num_matrices, figsize=(7 * num_matrices, 7),
                             squeeze=False)
    fig.subplots_adjust(wspace=0.5)

    tick_labels = ['0', '1'] if labels is None else [str(label) for label in labels]
    group_names = ['True Neg', 'False Pos', 'False Neg', 'True Pos']

    def add_labels_and_percentages(conf_matrix, axis):
        # The four group names only make sense for a binary problem.
        if conf_matrix.shape != (2, 2):
            raise ValueError(
                f"Expected a 2x2 confusion matrix, got shape {conf_matrix.shape}")
        total_samples = np.sum(conf_matrix)
        denominator = total_samples if total_samples else 1  # avoid 0/0
        cell_text = [f'{name}\n{count}\n{count/denominator:.2%}'
                     for name, count in zip(group_names, conf_matrix.flatten())]
        cell_text = np.asarray(cell_text).reshape(2, 2)
        sns.heatmap(conf_matrix, annot=cell_text, fmt='', cmap='binary', ax=axis,
                    annot_kws={'size': 14, 'fontweight': 'normal'}, cbar=False,
                    xticklabels=tick_labels, yticklabels=tick_labels)

    for i, (y_true, y_pred) in enumerate(data):
        axis = axes[0, i]
        conf_matrix = confusion_matrix(y_true, y_pred, labels=labels)
        axis.set_title(titles[i], fontsize=14, fontweight='bold')
        add_labels_and_percentages(conf_matrix, axis)

        axis.set_xlabel('Predicted labels', fontsize=14)
        axis.set_ylabel('True labels', fontsize=14)
        # Tick.label is deprecated and removed in recent Matplotlib;
        # tick_params sets the same font size.
        axis.tick_params(axis='both', which='major', labelsize=10)

    fig.suptitle(model_name, fontsize=18, fontweight='bold', y=1.05, ha='center')
    plt.show()


# Modeling Functions

In [4]:
# Function to perform randomized hyperparameter search
def perform_randomized_search(clf, param_grid, X_train, y_train, X_test, y_test,
                              *, scoring=None, n_iter=10, cv=3):
    """Run a randomized hyperparameter search and evaluate the refitted model.

    Args:
        clf: Estimator handed to ``RandomizedSearchCV``.
        param_grid: Parameter distributions to sample from.
        X_train, y_train: Data used for the search and for the train metrics.
        X_test, y_test: Data used for the test metrics.
        scoring: Single scoring metric forwarded to ``RandomizedSearchCV``.
            The default ``None`` keeps the estimator's own ``score`` method.
        n_iter: Number of parameter settings to sample (default 10).
        cv: Cross-validation setting forwarded to the search (default 3).

    Returns:
        tuple: ``(best_params, train_metrics, test_metrics, elapsed_seconds)``
        where the metrics come from ``get_metrics`` and ``elapsed_seconds``
        is the wall-clock duration of the search itself.
    """
    print("Starting hyperparameter search...")
    search = RandomizedSearchCV(estimator=clf, param_distributions=param_grid,
                                n_iter=n_iter, cv=cv, scoring=scoring,
                                random_state=42, n_jobs=-1, verbose=2)

    # Time only the real work.  The former post-fit "progress" loop slept for
    # ten seconds after the search had already finished and inflated the
    # reported duration; progress is reported by the search via verbose=2.
    start = time.perf_counter()
    search.fit(X_train, y_train)
    elapsed_total = time.perf_counter() - start

    train_metrics = get_metrics(y_train, search.predict(X_train))
    test_metrics = get_metrics(y_test, search.predict(X_test))
    return search.best_params_, train_metrics, test_metrics, elapsed_total


In [5]:
# Function to calculate evaluation metrics
def get_metrics(true_labels, predicted_labels, predicted_scores=None):
    """Compute binary classification metrics.

    Args:
        true_labels: Ground-truth labels.
        predicted_labels: Hard class predictions.
        predicted_scores: Optional continuous scores (for example the
            probability of the positive class) used for ROC AUC. When
            omitted, ROC AUC is computed from ``predicted_labels``.

    Returns:
        dict: Values under the keys ``'accuracy'``, ``'precision'``,
        ``'recall'``, ``'f1'`` and ``'roc_auc'``.
    """
    # ROC AUC ranks samples by score; hard labels are used only as a fallback.
    auc_input = predicted_labels if predicted_scores is None else predicted_scores
    return {
        'accuracy': accuracy_score(true_labels, predicted_labels),
        'precision': precision_score(true_labels, predicted_labels, average='binary'),
        'recall': recall_score(true_labels, predicted_labels, average='binary'),
        'f1': f1_score(true_labels, predicted_labels, average='binary'),
        'roc_auc': roc_auc_score(true_labels, auc_input),
    }

def display_metrics(model_name, train_metrics, test_metrics):
    """Print a headline and show train/test metrics as a table.

    The table has one row per split (``Train``, ``Test``) and one column per
    metric, with human-readable column names and values rounded to four
    decimals.

    Args:
        model_name: Text printed as the headline between two rules.
        train_metrics: Mapping of metric name to value for the training set.
        test_metrics: Mapping of metric name to value for the test set.
    """
    pretty_names = {'accuracy': 'Accuracy', 'precision': 'Precision',
                    'recall': 'Recall', 'f1': 'F1 Score', 'roc_auc': 'ROC AUC'}

    # After the transpose the metric names are the columns, so the rename has
    # to target the columns; renaming the index would leave them untouched.
    metrics_df = pd.DataFrame({'Train': train_metrics, 'Test': test_metrics}).T
    metrics_df = metrics_df.rename(columns=pretty_names).round(4)

    # Add model name as headline
    print(f"\n{'='*20}\n{model_name}\n{'='*20}\n")

    # ``display`` is only defined inside IPython/Jupyter; use print elsewhere.
    try:
        show = display
    except NameError:
        show = print
    show(metrics_df)
    