In [1]:
# Import necessary libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve, auc, confusion_matrix
import matplotlib.pyplot as plt
from ipywidgets import interact, FloatSlider
import seaborn as sns

In [2]:
def generate_species_data(n_samples=1000, presence_ratio=0.3):
    # Calculate number of samples for each class
    n_present = int(n_samples * presence_ratio)
    n_absent = n_samples - n_present
    
    # Generate features for presence sites 
    # Green crabs prefer warmer temps (between 64 and 79 degrees Fahrenheit) and  salinity between 26 and 39 ppt
    temp_present = np.random.normal(loc=71, scale= 4, size=n_present)
    salinity_present = np.random.normal(loc=32, scale=3, size=n_present)
    X_present = np.column_stack([temp_present, salinity_present])
    y_present = np.ones(n_present)
    
    # Generate features for absence sites
    # Sites with warmer temps or lower salinity
    temp_absent = np.random.normal(loc=26, scale=3, size=n_absent)
    salinity_absent = np.random.normal(loc=28, scale=2, size=n_absent)
    X_absent = np.column_stack([temp_absent, salinity_absent])
    y_absent = np.zeros(n_absent)
    
    # Combine and shuffle the data
    X = np.concatenate([X_present, X_absent])
    y = np.concatenate([y_present, y_absent])
    
    # Shuffle the data
    shuffle_idx = np.random.permutation(n_samples)
    X = X[shuffle_idx]
    y = y[shuffle_idx]
    
    return X, y

## Function2 create function

In [6]:
def plot_class_distribution(y):
    plt.figure(figsize = (8, 4))
    # Counr values in each cat
    class_counts = pd.Series(y).value_counts().sort_index()
    #creat a barplot of absent and present species
    
    sns.barplot(x = ['absent', 'present'], y = class_counts, color = 'blue')
    plt.title('Dist of species present/absent')
    plt.ylabel('Number of sampling sites')
    
    # Add percent over each bar
    total = len(y)
    for i, count in enumerate(class_counts):
        percentage = count/total * 100
        plt.text(i, count, f'{percentage:.1f}%', ha = 'center', va = 'bottom')
    plt.show()
    

In [19]:
def plot_confusion_matrix(y_true, y_pred):
    
    # Create confusion matric
    cm = confusion_matrix(y_true, y_pred)
    
    # Create confusion matrix plot
    
    plt.figure(figsize = (8, 6))
    sns.heatmap(cm, annot = True, fmt = 'd', cmap = 'GnBu',
               xticklabels = ['Absent', 'Present'],
               yticklabels = ['Absent', 'Present'])
    plt.title('Confusion Matrix')
    plt.ylabel('True label')
    plt.xlabel('prediction')
    plt.show()
    
    # Calculate metrix 
    TP = cm[1,1]
    TN = cm[0,0]
    FP = cm[0,1]
    FN = cm[1,0]
    
    print("\n Metrics from confusion matrix:")
    print(f"True Positives: {TP}")
    print(f"True Negative: {TN}")
    print(f"False Positives: {FP}")
    print(f"False Negative: {TP}")
    
    
    # calculate accuracy
    accuracy = (TP + TN)/ (TP + TN + FP + FN)
    majority_baseline = max(np.mean(y_true), 1-np.mean(y_true))
    sensitivity = TP/(TP + FN)
    specificity = TN/(TN + FP)
    
    # Print metrics
    print(f"\n Model Performance:")
    print(f"Accuracys: {accuracy:.3f}")
    print(f" sensitivity: { sensitivity:.3f}")
    print(f" specificity: { specificity:.3f}")
    
    

In [13]:
# The underscore ignores thresholds
def plot_roc_curve(y_test, y_pred_prob):
    fpr, tpr, _ = roc_curve(y_test, y_pred_prob)
    roc_auc = auc(fpr, tpr)
    
    plt.figure(figsize = (8, 6))
    plt.plot(fpr, tpr, color = 'darkorange', lw=2,
            label = f'ROC Curve (AUC = {roc_auc:.2f})')
    plt.plot([0,1], [0,1], color = 'navy', lw = 2, linestyle = '--',
            label = 'Random Classifier (AUC = 0.5)')
    plt.xlabel('False Pos rate')
    plt.ylabel('True pos rate')
    plt.title('ROC curve: species presences prediction')
    plt.legend()
    plt.grid(True)
    plt.show()

In [15]:
def interactive_logistic_regression(presence_ratio = 0.3):
    X,y = generate_species_data(presence_ratio = presence_ratio)
    
    # Plot claqss distribution
    print("\class distribution")
    plot_class_distribution(y)
    
    # split data
    X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.3, random_state = 42)
    
    # Train model
    model = LogisticRegression()
    model.fit(X_train, y_train)
    
    # Make my predictions
    y_pred = model.predict(X_test)
    y_pred_prob = model.predict_proba(X_test)[:,1]
    
    # Plot confussion matrix
    print("\n Confussion Matrix")
    plot_confusion_matrix(y_test, y_pred)
    
    #plot ROC curve
    print("\n ROC curve:")
    plot_roc_curve(y_test, y_pred_prob)

In [20]:
# Create an interactive widget

def generate_log_regression():
    interact(interactive_logistic_regression, 
            precense_ratio = FloatSlider(min = 0.1, max = 0.9, step = .1, value = 0.3,
                                        description = "% present")
            )
generate_log_regression()

interactive(children=(FloatSlider(value=0.3, description='presence_ratio', max=0.8999999999999999, min=-0.3), …