In [61]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

TEST_SIZE = 0.4

In [62]:
def load_data(filename):
    """
  
    """
    df = pd.read_csv(filename)

    encoder = LabelEncoder()
    df['Month'] = encoder.fit_transform(df['Month'])
    df['VisitorType'] = (df['VisitorType'] == 'Returning_Visitor').astype(int)
    df['Weekend'] = df['Weekend'].astype(int)

    labels = df['Revenue'].astype(int)
    evidence = df.drop('Revenue', axis=1)

    return evidence, labels





In [63]:
def balance_classes(evidence, labels, alpha=0.25):
    """
    Balance the classes in the dataset by oversampling the minority class.

    Returns a tuple (balanced_evidence, balanced_labels).
    """
    # Combine evidence and labels into a single DataFrame
    data = evidence.copy()
    data['Label'] = labels

    # Separate majority and minority classes
    majority_class = data[data['Label'] == 0]
    minority_class = data[data['Label'] == 1]

    # Undersample majority class (old approach - now using oversampling)
    # Oversample minority class
    minority_class_oversampled = minority_class.sample(n=round((1-alpha)*len(majority_class)), replace=True, random_state=42)

    # Combine majority class with oversampled minority class
    balanced_data = pd.concat([majority_class, minority_class_oversampled])

    # Shuffle the balanced dataset
    balanced_data = balanced_data.sample(frac=1, random_state=42).reset_index(drop=True)

    # Separate evidence and labels
    balanced_labels = balanced_data['Label']
    balanced_evidence = balanced_data.drop('Label', axis=1)

    return balanced_evidence, balanced_labels

In [64]:

def preprocess_data():
    preprocessor = ColumnTransformer(
        transformers=[
            ('numeric', StandardScaler(), [ 'Administrative_Duration',
                                            'Informational_Duration',
                                            'ProductRelated',
                                            'ProductRelated_Duration',
                                            'PageValues'
                                                                                    ])

        ]
    )

    return preprocessor

In [65]:
def pipline():
    preprocessor = preprocess_data()
    knn_pipe = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', KNeighborsClassifier()),
    ])

    # Add class_weight='balanced' to handle imbalanced data
    logreg_pipe = Pipeline(
        steps=[
            ('preprocessor', preprocessor), 

            # it changes the decision boundary to account for class imbalance
            #penlize mistakes on the minority class more heavily
            # Without balanced weights (default):
            # weight_class_0 = 1.0
            # weight_class_1 = 1.0
            # # With balanced weights:
            # weight_class_0 = total_samples / (2 * count_class_0)  = 0.588
            # weight_class_1 = total_samples / (2 * count_class_1)   = 3.333 
            ('logistic', LogisticRegression(class_weight='balanced', max_iter=1000))
        ]
    )
    return knn_pipe, logreg_pipe


In [66]:
def tune_with_gridsearch(knn_pipe: Pipeline, logreg_pipe: Pipeline, X_train, y_train):
    """
    Full GridSearchCV over k, weights, and metric for KNN, and C for LogisticRegression.
    We use 'balanced_accuracy' to reflect sensitivity/specificity balance.
    """
    param_grid_knn = {
        'classifier__n_neighbors': [3, 5, 7, 9, 11, 13, 15],
        'classifier__weights': ['uniform', 'distance'],
        'classifier__metric': ['euclidean', 'manhattan', 'minkowski'],
    }
    param_grid_logreg = {
        'logistic__C': [0.01, 0.1, 1, 10, 100],
    }
    
    grid_search_knn = GridSearchCV(knn_pipe, param_grid_knn, cv=5, scoring='balanced_accuracy')
    grid_search_logreg = GridSearchCV(logreg_pipe, param_grid_logreg, cv=5, scoring='balanced_accuracy')

    return grid_search_knn, grid_search_logreg

    

In [67]:
def train_model(evidence, labels):
    """
    Given a list of evidence lists and a list of labels, return
    fitted KNN and LogisticRegression models trained on the data.
    """
    knn_pipe, logreg_pipe = pipline()
    
    # Define parameter grid including alpha values for balancing
    param_grid_knn = {
        'classifier__n_neighbors': [3, 5, 7, 9, 11, 13, 15],
        'classifier__weights': ['uniform', 'distance'],
        'classifier__metric': ['euclidean', 'manhattan', 'minkowski'],
    }
    param_grid_logreg = {
        'logistic__C': [0.01, 0.1, 1, 10, 100],
    }
    
    # Alpha values to try for balancing (0 = fully balanced, higher = more minority bias)
    alpha_values = [0.1, 0.15, 0.2, 0.25, 0.3]
    
    best_score = -1
    best_alpha = None
    best_knn_model = None
    
    # Try each alpha value
    for alpha in alpha_values:
        balanced_evidence, balanced_labels = balance_classes(evidence, labels, alpha=alpha)
        grid_search_knn = GridSearchCV(knn_pipe, param_grid_knn, cv=5, scoring='balanced_accuracy')
        grid_search_knn.fit(balanced_evidence, balanced_labels)
        
        if grid_search_knn.best_score_ > best_score:
            best_score = grid_search_knn.best_score_
            best_alpha = alpha
            best_knn_model = grid_search_knn.best_estimator_
    
    print(f"Best alpha for KNN: {best_alpha} (CV score: {best_score:.4f})")
    
    # Train logistic regression without balancing
    grid_search_logreg = GridSearchCV(logreg_pipe, param_grid_logreg, cv=5, scoring='balanced_accuracy')
    grid_search_logreg.fit(evidence, labels)
    
    return best_knn_model, grid_search_logreg.best_estimator_


In [68]:
def evaluate(labels, predictions):
    """
    Given a list of actual labels and a list of predicted labels,
    return a tuple (sensitivity, specificity).

    Assume each label is either a 1 (positive) or 0 (negative).

    `sensitivity` should be a floating-point value from 0 to 1
    representing the "true positive rate": the proportion of
    actual positive labels that were accurately identified.

    `specificity` should be a floating-point value from 0 to 1
    representing the "true negative rate": the proportion of
    actual negative labels that were accurately identified.
    """
    tn, fp, fn, tp = confusion_matrix(labels, predictions).ravel()
    sensitivity = tp / (tp + fn) if (tp + fn) > 0 else 0
    specificity = tn / (tn + fp) if (tn + fp) > 0 else 0
    return sensitivity, specificity
    

In [69]:
def main():

    # Load data from spreadsheet and split into train and test sets
    evidence, labels = load_data("shopping.csv")
    X_train, X_test, y_train, y_test = train_test_split(
        evidence, labels, test_size=TEST_SIZE, random_state=420, stratify=labels
    )

    # Train model and make predictions
    knn_model, logreg_model= train_model(X_train, y_train)
    predictions_knn = knn_model.predict(X_test) # type: ignore
    predictions_logreg = logreg_model.predict(X_test) 

      
    sensitivity, specificity = evaluate(y_test, predictions_knn)
    sensitivity_logreg, specificity_logreg = evaluate(y_test, predictions_logreg)

    print("K-Nearest Neighbors Results:")
    print(f"Correct: {(y_test == predictions_knn).sum()}")
    print(f"Incorrect: {(y_test != predictions_knn).sum()}")
    print(f"True Positive Rate: {100 * sensitivity:.2f}%")
    print(f"True Negative Rate: {100 * specificity:.2f}%")


    print("\nLogistic Regression Results:")
    print(f"Correct: {(y_test == predictions_logreg).sum()}")
    print(f"Incorrect: {(y_test != predictions_logreg).sum()}")
    print(f"True Positive Rate: {100 * sensitivity_logreg:.2f}%")
    print(f"True Negative Rate: {100 * specificity_logreg:.2f}%")

    

In [70]:
if __name__ == "__main__":
    main()







Best alpha for KNN: 0.3 (CV score: 0.9274)
K-Nearest Neighbors Results:
Correct: 4190
Incorrect: 742
True Positive Rate: 72.35%
True Negative Rate: 87.26%

Logistic Regression Results:
Correct: 4316
Incorrect: 616
True Positive Rate: 72.48%
True Negative Rate: 90.26%


[np.float64(0.9274405077738201), np.float64(0.8130901224871631)]