In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

TEST_SIZE = 0.4

In [2]:
def load_data(filename):
    """
    Load the shopping CSV and apply simple preprocessing.

    - Reads `filename` into a DataFrame.
    - Encodes the `Month` column with LabelEncoder.
    - Converts `VisitorType` to a binary flag (Returning_Visitor -> 1 else 0).
    - Ensures `Weekend` is an integer (0/1).

    Returns
    -------
    evidence : pandas.DataFrame
        Feature matrix (all columns except the `Revenue` label).
    labels : pandas.Series
        Binary labels (Revenue converted to int).
    """
    df = pd.read_csv(filename)

    encoder = LabelEncoder()
    df['Month'] = encoder.fit_transform(df['Month'])
    df['VisitorType'] = (df['VisitorType'] == 'Returning_Visitor').astype(int)
    df['Weekend'] = df['Weekend'].astype(int)

    labels = df['Revenue'].astype(int)
    evidence = df.drop('Revenue', axis=1)

    return evidence, labels

In [3]:
def balance_classes(evidence, labels, alpha=0.25):
    """
    Balance the classes by oversampling the minority class.

    Notes on `alpha` (no behavior changes performed here):
    - Implementation oversamples the minority class to a target size computed as
      round((1 - alpha) * len(majority_class)).
    - alpha = 0.0 -> minority is oversampled up to the full majority size (fully balanced)
    - alpha > 0.0 -> minority is oversampled to a fraction of the majority size
      (e.g. alpha=0.25 -> minority oversampled to 75% of majority size)

    Parameters
    ----------
    evidence : pandas.DataFrame
        Feature matrix.
    labels : pandas.Series or array-like
        Binary labels (0/1).
    alpha : float, optional
        Controls the target oversample size (see notes above). Default 0.25.

    Returns
    -------
    balanced_evidence, balanced_labels
        DataFrame and Series with classes re-balanced according to `alpha`.
    """
    # Combine evidence and labels into a single DataFrame
    data = evidence.copy()
    data['Label'] = labels

    # Separate majority and minority classes
    majority_class = data[data['Label'] == 0]
    minority_class = data[data['Label'] == 1]

    # Undersample majority class (old approach - now using oversampling)
    # Oversample minority class to the target size computed from alpha
    minority_class_oversampled = minority_class.sample(n=round((1-alpha)*len(majority_class)), replace=True, random_state=42)

    # Combine majority class with oversampled minority class
    balanced_data = pd.concat([majority_class, minority_class_oversampled])

    # Shuffle the balanced dataset
    balanced_data = balanced_data.sample(frac=1, random_state=42).reset_index(drop=True)

    # Separate evidence and labels
    balanced_labels = balanced_data['Label']
    balanced_evidence = balanced_data.drop('Label', axis=1)

    return balanced_evidence, balanced_labels

In [4]:
def preprocess_data():
    """
    Create and return a ColumnTransformer preprocessor.

    - Currently applies StandardScaler to a set of numeric duration / count features.
    - Keeps the transformer separate so it can be reused in a Pipeline.

    Returns
    -------
    preprocessor : sklearn.compose.ColumnTransformer
        Transformer that scales numeric columns.
    """
    preprocessor = ColumnTransformer(
        transformers=[
            ('numeric', StandardScaler(), [ 'Administrative_Duration',
                                            'Informational_Duration',
                                            'ProductRelated',
                                            'ProductRelated_Duration',
                                            'PageValues'
                                                                                    ])

        ]
    )

    return preprocessor

In [5]:
def pipline():
    """
    Build and return two training pipelines:
      - KNN pipeline: preprocessing -> KNeighborsClassifier
      - LogisticRegression pipeline: preprocessing -> LogisticRegression

    Note: function name is `pipline` (typo) — keep it as-is to avoid breaking existing calls.
    Returns
    -------
    (knn_pipe, logreg_pipe) : tuple
        Two sklearn Pipeline objects ready for GridSearchCV or fitting.
    """
    preprocessor = preprocess_data()
    knn_pipe = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', KNeighborsClassifier()),
    ])

    # Add class_weight='balanced' to handle imbalanced data
    logreg_pipe = Pipeline(
        steps=[
            ('preprocessor', preprocessor), 

            # it changes the decision boundary to account for class imbalance
            # penalize mistakes on the minority class more heavily
            ('logistic', LogisticRegression(class_weight='balanced', max_iter=1000))
        ]
    )
    return knn_pipe, logreg_pipe

In [6]:
def tune_with_gridsearch(knn_pipe: Pipeline, logreg_pipe: Pipeline, X_train, y_train):
    """
    Prepare GridSearchCV objects for KNN and Logistic Regression.

    This function returns configured (but not fitted) GridSearchCV objects that
    can be fit on different training sets. Returning the objects this way lets
    callers inspect `cv_results_` after fitting.

    Parameters
    ----------
    knn_pipe : sklearn.pipeline.Pipeline
        Pipeline containing preprocessor and a KNeighborsClassifier placeholder.
    logreg_pipe : sklearn.pipeline.Pipeline
        Pipeline containing preprocessor and a LogisticRegression placeholder.
    X_train, y_train : used for compatibility (not used to fit here)

    Returns
    -------
    grid_search_knn, grid_search_logreg : tuple
        Configured GridSearchCV objects (not fitted).
    """
    param_grid_knn = {
        'classifier__n_neighbors': [3, 5, 7, 9, 11, 13, 15],
        'classifier__weights': ['uniform', 'distance'],
        'classifier__metric': ['euclidean', 'manhattan', 'minkowski'],
    }
    param_grid_logreg = {
        'logistic__C': [0.01, 0.1, 1, 10, 100],
    }
    
    grid_search_knn = GridSearchCV(knn_pipe, param_grid_knn, cv=5, scoring='balanced_accuracy')
    grid_search_logreg = GridSearchCV(logreg_pipe, param_grid_logreg, cv=5, scoring='balanced_accuracy')

    return grid_search_knn, grid_search_logreg

In [7]:
def train_model(evidence, labels):
    """
    Train models using GridSearch over KNN hyperparameters and alpha-based balancing.

    Process:
    - For each alpha value, the training data is rebalanced via `balance_classes`.
    - GridSearchCV is run for the KNN pipeline on the balanced data.
    - The best performing alpha (by CV score) is reported.
    - A separate GridSearchCV is fit for LogisticRegression on the original data.

    Returns
    -------
    best_knn_model, best_logreg_model
        The best estimators (from the GridSearchCV runs) for KNN and LogisticRegression.
    """
    knn_pipe, logreg_pipe = pipline()
    
    # Define parameter grid including alpha values for balancing
    param_grid_knn = {
        'classifier__n_neighbors': [3, 5, 7, 9, 11, 13, 15],
        'classifier__weights': ['uniform', 'distance'],
        'classifier__metric': ['euclidean', 'manhattan', 'minkowski'],
    }
    param_grid_logreg = {
        'logistic__C': [0.01, 0.1, 1, 10, 100],
    }
    
    # Alpha values to try for balancing (0 = fully balanced, higher = more minority bias)
    alpha_values = [0.1, 0.15, 0.2, 0.25, 0.3]
    
    best_score = -1
    best_alpha = None
    best_knn_model = None
    
    # Try each alpha value
    for alpha in alpha_values:
        # Rebalance training data for this alpha
        balanced_evidence, balanced_labels = balance_classes(evidence, labels, alpha=alpha)

        # Grid search for KNN on the balanced data
        grid_search_knn = GridSearchCV(knn_pipe, param_grid_knn, cv=5, scoring='balanced_accuracy')
        grid_search_knn.fit(balanced_evidence, balanced_labels)
        
        if grid_search_knn.best_score_ > best_score:
            best_score = grid_search_knn.best_score_
            best_alpha = alpha
            best_knn_model = grid_search_knn.best_estimator_
    
    print(f"Best alpha for KNN: {best_alpha} (CV score: {best_score:.4f})")
    
    # Train logistic regression without balancing
    grid_search_logreg = GridSearchCV(logreg_pipe, param_grid_logreg, cv=5, scoring='balanced_accuracy')
    grid_search_logreg.fit(evidence, labels)
    
    return best_knn_model, grid_search_logreg.best_estimator_

In [8]:
def evaluate(labels, predictions):
    """
    Compute sensitivity (true positive rate) and specificity (true negative rate).

    Parameters
    ----------
    labels : array-like
        Ground-truth binary labels (0/1).
    predictions : array-like
        Predicted binary labels (0/1).

    Returns
    -------
    (sensitivity, specificity) : tuple of floats
        sensitivity = TP / (TP + FN)
        specificity = TN / (TN + FP)
    """
    tn, fp, fn, tp = confusion_matrix(labels, predictions).ravel()
    sensitivity = tp / (tp + fn) if (tp + fn) > 0 else 0
    specificity = tn / (tn + fp) if (tn + fp) > 0 else 0
    return sensitivity, specificity

In [9]:
def main():
    """
    High-level runner:
    - Loads data
    - Splits train / test
    - Trains models (KNN with alpha tuning, LogisticRegression)
    - Evaluates on the test set and prints summary metrics

    Note: main currently prints results and does not persist models to disk.
    """

    # Load data from spreadsheet and split into train and test sets
    evidence, labels = load_data("shopping.csv")
    X_train, X_test, y_train, y_test = train_test_split(
        evidence, labels, test_size=TEST_SIZE, random_state=420, stratify=labels
    )

    # Train model and make predictions
    knn_model, logreg_model= train_model(X_train, y_train)
    predictions_knn = knn_model.predict(X_test) # type: ignore
    predictions_logreg = logreg_model.predict(X_test) 

      
    sensitivity, specificity = evaluate(y_test, predictions_knn)
    sensitivity_logreg, specificity_logreg = evaluate(y_test, predictions_logreg)

    print("K-Nearest Neighbors Results:")
    print(f"Correct: {(y_test == predictions_knn).sum()}")
    print(f"Incorrect: {(y_test != predictions_knn).sum()}")
    print(f"True Positive Rate: {100 * sensitivity:.2f}%")
    print(f"True Negative Rate: {100 * specificity:.2f}%")


    print("\nLogistic Regression Results:")
    print(f"Correct: {(y_test == predictions_logreg).sum()}")
    print(f"Incorrect: {(y_test != predictions_logreg).sum()}")
    print(f"True Positive Rate: {100 * sensitivity_logreg:.2f}%")
    print(f"True Negative Rate: {100 * specificity_logreg:.2f}%")

    # Return nothing (prints summary). If you want the fitted models or metrics,
    # modify this function to return them for downstream visualization.
    

In [10]:
if __name__ == "__main__":
    main()







Best alpha for KNN: 0.3 (CV score: 0.9274)
K-Nearest Neighbors Results:
Correct: 4190
Incorrect: 742
True Positive Rate: 72.35%
True Negative Rate: 87.26%

Logistic Regression Results:
Correct: 4316
Incorrect: 616
True Positive Rate: 72.48%
True Negative Rate: 90.26%
K-Nearest Neighbors Results:
Correct: 4190
Incorrect: 742
True Positive Rate: 72.35%
True Negative Rate: 87.26%

Logistic Regression Results:
Correct: 4316
Incorrect: 616
True Positive Rate: 72.48%
True Negative Rate: 90.26%
