In [None]:
import pandas as pd
import numpy as np
from sklearn.utils import resample
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import matplotlib.pyplot as plt

# Load the data
df = pd.read_csv('unseen.csv')

# Mapping for 'Month' column.
# Values are reduced to their title-cased first three letters before the lookup,
# so spelled-out or differently-cased names (e.g. 'June', 'JUL', ' jan') are
# mapped too instead of silently turning into NaN (and then into 0 below).
month_numbers = {
    'Jan': 1, 'Feb': 2, 'Mar': 3, 'Apr': 4, 'May': 5, 'Jun': 6,
    'Jul': 7, 'Aug': 8, 'Sep': 9, 'Oct': 10, 'Nov': 11, 'Dec': 12
}
df['Month'] = df['Month'].astype(str).str.strip().str[:3].str.title().map(month_numbers)

# Mapping for 'VisitorType' column
df['VisitorType'] = df['VisitorType'].map({
    'Returning_Visitor': 1, 'New_Visitor': 2, 'Other': 3
})

# Mapping for 'Weekend' and 'Revenue' columns: normalise to stripped upper-case
# text first so booleans and strings such as ' true ' are all recognised.
boolean_codes = {'TRUE': 1, 'FALSE': 0}
for flag_column in ('Revenue', 'Weekend'):
    df[flag_column] = df[flag_column].astype(str).str.strip().str.upper().map(boolean_codes)

# Report anything the mappings did not recognise (or that was missing) before
# the zero fill below hides it. For 'Revenue' this means such rows end up
# labelled as class 0.
encoded_columns = ['Month', 'VisitorType', 'Weekend', 'Revenue']
unmapped_counts = df[encoded_columns].isna().sum()
unmapped_counts = unmapped_counts[unmapped_counts > 0]
if not unmapped_counts.empty:
    print(f"Warning: unmapped or missing values that will be filled with 0: \n{unmapped_counts}")

# Handle missing values by filling with 0 (if any)
df = df.fillna(0)

# Check class distribution before resampling
print(f"Class distribution before resampling: \n{df['Revenue'].value_counts()}")

# Separate majority and minority classes
df_majority = df[df.Revenue == 0]
df_minority = df[df.Revenue == 1]

# Upsample the minority class
df_minority_upsampled = resample(df_minority,
                                 replace=True,  # Sample with replacement
                                 n_samples=len(df_majority),  # Match majority class size
                                 random_state=42)

# Combine the majority class with the upsampled minority class and shuffle the rows
df_balanced = pd.concat([df_majority, df_minority_upsampled]).sample(frac=1, random_state=42)

# Confirm the new class balance
print(f"Balanced class distribution: \n{df_balanced['Revenue'].value_counts()}")


In [None]:
# Proceed with train-test split.
# The split is made on the cleaned, NOT yet oversampled frame `df`, and the
# minority class is upsampled inside the training split only. Oversampling
# before splitting would put copies of the same minority rows in both the
# training and the test set, which inflates every test metric.
X = df.drop(columns='Revenue').values  # Features (all columns except the target 'Revenue')
y = df['Revenue'].values  # Target column ('Revenue')

# stratify keeps the class ratio of the full data in both splits
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Upsample the minority class (Revenue == 1) of the training split to the majority size
is_minority = y_train == 1
n_majority = int((~is_minority).sum())
X_minority_upsampled, y_minority_upsampled = resample(
    X_train[is_minority], y_train[is_minority],
    replace=True,  # Sample with replacement
    n_samples=n_majority,  # Match majority class size
    random_state=42
)
X_train = np.vstack([X_train[~is_minority], X_minority_upsampled])
y_train = np.concatenate([y_train[~is_minority], y_minority_upsampled])

# Shuffle so the two classes are interleaved again
shuffle_order = np.random.RandomState(42).permutation(len(y_train))
X_train, y_train = X_train[shuffle_order], y_train[shuffle_order]

print("Train-test split successful!")
print(f"Class distribution in training set: \n{pd.Series(y_train).value_counts()}")


In [310]:
# Sigmoid function
def sigmoid(z):
    """Return the logistic function 1 / (1 + exp(-z)) of `z`, element-wise.

    `z` is limited to the interval [-500, 500] first so that `np.exp` cannot
    overflow for very large negative inputs.
    """
    bounded = np.clip(z, -500, 500)
    denominator = 1 + np.exp(-bounded)
    return 1 / denominator

# Cost function with regularization
def cost_function(X, y, theta, lambd=0):
    """Return the L2-regularised logistic-regression cost (mean cross-entropy).

    Parameters:
        X: feature matrix; `np.dot(X, theta)` must yield one score per sample.
        y: array of 0/1 labels with one entry per sample.
        theta: parameter vector; `theta[0]` is left out of the penalty.
        lambd: regularisation strength (0 disables the penalty).

    Returns:
        The cross-entropy averaged over `len(y)` samples plus
        `lambd / (2 * len(y))` times the sum of squares of `theta[1:]`.
    """
    n_samples = len(y)
    tiny = 1e-10
    # Keep probabilities strictly inside (0, 1) so both logarithms stay finite
    probabilities = np.clip(sigmoid(np.dot(X, theta)), tiny, 1 - tiny)
    log_likelihood = y.T @ np.log(probabilities) + (1 - y).T @ np.log(1 - probabilities)
    l2_penalty = (lambd / (2 * n_samples)) * np.sum(np.square(theta[1:]))
    return (-1/n_samples) * log_likelihood + l2_penalty


In [311]:
# Gradient descent with regularization
def gradient_descent(X, y, theta, learning_rate=0.001, lambd=0, iterations=1000, tolerance=1e-4, verbose=False):
    """Fit logistic-regression parameters with batch gradient descent.

    Parameters:
        X: feature matrix, one row per sample.
        y: array of 0/1 labels, one per sample.
        theta: initial parameter vector. It is copied, so the caller's array is
            never modified; integer vectors are accepted.
        learning_rate: step size applied to the gradient.
        lambd: L2 regularisation strength; `theta[0]` is not penalised.
        iterations: maximum number of update steps.
        tolerance: stop as soon as the cost changes by less than this between
            two consecutive steps.
        verbose: when True, print the cost every 100 iterations.

    Returns:
        (theta, costs): the fitted parameters and the list of costs recorded
        after each step. The cost of the step that triggers early stopping is
        not appended.
    """
    m = len(y)
    # Work on a float copy: the in-place update below would otherwise overwrite
    # the caller's array and would fail for an integer-typed theta.
    theta = np.array(theta, dtype=float)
    costs = []
    for i in range(iterations):
        h = sigmoid(np.dot(X, theta))
        # The leading 0 keeps theta[0] out of the penalty term
        penalty_gradient = (lambd/m) * np.concatenate([[0], theta[1:]])
        gradient = (1/m) * np.dot(X.T, (h - y)) + penalty_gradient
        theta -= learning_rate * gradient

        cost = cost_function(X, y, theta, lambd)
        if i > 0 and abs(costs[-1] - cost) < tolerance:  # Early stopping based on tolerance
            print(f"Early stopping at iteration {i}")
            break
        costs.append(cost)
        if verbose and i % 100 == 0:
            print(f"Iteration {i}: cost = {cost:.6f}")
    return theta, costs



In [312]:
# Predict function using trained theta
def predict(X, theta, threshold=0.5):
    """Return a boolean array: True where sigmoid(X . theta) is at least `threshold`."""
    probabilities = sigmoid(np.dot(X, theta))
    return probabilities >= threshold

# Evaluate the model
def evaluate_model(X_train, X_test, y_train, y_test, learning_rate=0.01, iterations=5000, lambd=100, threshold=0.5):
    """Train logistic regression on the training split and score it on the test split.

    A column of ones is prepended to both feature matrices so that `theta[0]`
    is a real intercept. `cost_function` and `gradient_descent` already leave
    `theta[0]` out of the L2 penalty; without the bias column that exemption
    would fall on the first feature and the model would have no intercept.

    Parameters:
        X_train, X_test: 2-D feature matrices without a bias column.
        y_train, y_test: arrays of 0/1 labels.
        learning_rate, iterations, lambd: forwarded to `gradient_descent`
            (which is run with tolerance=1e-6).
        threshold: probability cut-off used by `predict`.

    Returns:
        (accuracy, precision, recall, f1, costs): test-set metrics and the
        training cost history. Metrics that would be undefined are reported as
        1 (`zero_division=1`).
    """
    # Add the intercept (bias) column
    X_train_bias = np.column_stack([np.ones(X_train.shape[0]), X_train])
    X_test_bias = np.column_stack([np.ones(X_test.shape[0]), X_test])

    # Initialize parameters (one per feature plus the intercept)
    theta = np.zeros(X_train_bias.shape[1])

    # Train the model using gradient descent
    trained_theta, costs = gradient_descent(X_train_bias, y_train, theta, learning_rate, lambd, iterations, tolerance=1e-6, verbose=False)

    # Make predictions on the test set
    y_pred = predict(X_test_bias, trained_theta, threshold)
    y_pred = y_pred.astype(int)

    # Calculate evaluation metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, zero_division=1)
    recall = recall_score(y_test, y_pred, zero_division=1)
    f1 = f1_score(y_test, y_pred, zero_division=1)

    return accuracy, precision, recall, f1, costs


In [313]:
# MinMax Scaling
def min_max_scaling(X, fit_on=None):
    """Rescale each column with (X - min) / (max - min).

    Parameters:
        X: array to scale; statistics are taken along axis 0.
        fit_on: optional array the minimum and range are computed from.
            Defaults to `X` itself. Pass the training matrix here when scaling
            a test split so both splits share the same transformation.

    Returns:
        The scaled array. Columns whose range is 0 are divided by 1 instead,
        so constant columns do not produce a division by zero.
    """
    reference = X if fit_on is None else fit_on
    min_vals = reference.min(axis=0)
    ranges = reference.max(axis=0) - min_vals
    # np.where (rather than item assignment) also works when the statistics
    # are scalars, i.e. for 1-D input.
    ranges = np.where(ranges == 0, 1, ranges)  # Prevent division by zero
    return (X - min_vals) / ranges

# Mean Normalization
def mean_normalization(X, fit_on=None):
    """Rescale each column with (X - mean) / (max - min).

    Parameters:
        X: array to scale; statistics are taken along axis 0.
        fit_on: optional array the mean and range are computed from. Defaults
            to `X` itself. Pass the training matrix here when scaling a test
            split so both splits share the same transformation.

    Returns:
        The scaled array. Columns whose range is 0 are divided by 1 instead,
        so constant columns do not produce a division by zero.
    """
    reference = X if fit_on is None else fit_on
    mean_vals = reference.mean(axis=0)
    denominator = reference.max(axis=0) - reference.min(axis=0)
    # np.where (rather than item assignment) also works when the statistics
    # are scalars, i.e. for 1-D input.
    denominator = np.where(denominator == 0, 1, denominator)  # Prevent division by zero
    return (X - mean_vals) / denominator

# Z-Score Normalization
def z_score_normalization(X, fit_on=None):
    """Standardise each column with (X - mean) / std (population std, ddof=0).

    Parameters:
        X: array to scale; statistics are taken along axis 0.
        fit_on: optional array the mean and standard deviation are computed
            from. Defaults to `X` itself. Pass the training matrix here when
            scaling a test split so both splits share the same transformation.

    Returns:
        The scaled array. Columns whose standard deviation is 0 are divided by
        1 instead, so constant columns do not produce a division by zero.
    """
    reference = X if fit_on is None else fit_on
    mean_vals = reference.mean(axis=0)
    std_vals = reference.std(axis=0)
    # np.where (rather than item assignment) also works when the statistics
    # are scalars, i.e. for 1-D input.
    std_vals = np.where(std_vals == 0, 1, std_vals)  # Prevent division by zero
    return (X - mean_vals) / std_vals


In [None]:
scaling_methods = {
    'No Scaling': lambda X: X,  # No scaling is applied here, raw data is used
    'MinMax Scaling': min_max_scaling,
    'Mean Normalization': mean_normalization,
    'Z-Score Normalization': z_score_normalization
}

# Column statistics (offset, spread) that each method above derives from the
# data it is fitted on, such that scaled = (X - offset) / spread.
# Keep the keys and formulas in step with `scaling_methods`.
scaling_statistics = {
    'No Scaling': lambda X: (0.0, 1.0),
    'MinMax Scaling': lambda X: (X.min(axis=0), X.max(axis=0) - X.min(axis=0)),
    'Mean Normalization': lambda X: (X.mean(axis=0), X.max(axis=0) - X.min(axis=0)),
    'Z-Score Normalization': lambda X: (X.mean(axis=0), X.std(axis=0))
}

results = {}

for method, scaling_function in scaling_methods.items():
    # Scale the training data
    X_train_scaled = scaling_function(X_train)

    # Scale the test data with the statistics of the TRAINING data. Scaling the
    # test split with its own min/max/mean/std would apply a different
    # transformation than the one the model was trained on.
    train_offset, train_spread = scaling_statistics[method](X_train)
    train_spread = np.where(np.equal(train_spread, 0), 1, train_spread)  # Prevent division by zero
    X_test_scaled = (X_test - train_offset) / train_spread

    # Evaluate the model
    accuracy, precision, recall, f1, costs = evaluate_model(X_train_scaled, X_test_scaled, y_train, y_test)

    # Store results for comparison
    results[method] = {
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1,
        'Costs': costs
    }

    # Output results
    print(f'{method} -> Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1 Score: {f1:.4f}')



In [None]:
# Choose the best scaling method based on results (example: MinMax Scaling)
best_scaling_method = 'MinMax Scaling'
X_train_scaled = scaling_methods[best_scaling_method](X_train)

# Scale the test split with the min/range of the TRAINING split so both splits
# go through the same transformation. These are min-max statistics: the
# assertion is a reminder to replace them if another method is chosen above.
assert best_scaling_method == 'MinMax Scaling', "update the test-set statistics to match the chosen scaling method"
train_min = X_train.min(axis=0)
train_range = X_train.max(axis=0) - train_min
train_range = np.where(train_range == 0, 1, train_range)  # Prevent division by zero
X_test_scaled = (X_test - train_min) / train_range

# Apply regularization (e.g., lambda = 10)
lambd = 10
accuracy_reg, precision_reg, recall_reg, f1_reg, costs_reg = evaluate_model(X_train_scaled, X_test_scaled, y_train, y_test, lambd=lambd)

# No regularization: evaluate with lambda = 0 and display that accuracy
# (a bare `accuracy` here would show the stale value left over from the
# scaling-comparison loop).
accuracy_no_reg, precision_no_reg, recall_no_reg, f1_no_reg, costs_no_reg = evaluate_model(X_train_scaled, X_test_scaled, y_train, y_test, lambd=0)
accuracy_no_reg


In [None]:
# Train once with the L2 penalty ...
lambd = 100  # Adjust the lambda value if needed
accuracy_reg, precision_reg, recall_reg, f1_reg, costs_reg = evaluate_model(
    X_train_scaled, X_test_scaled, y_train, y_test, lambd=lambd
)

# ... and once without it (lambda = 0)
accuracy_no_reg, precision_no_reg, recall_no_reg, f1_no_reg, costs_no_reg = evaluate_model(
    X_train_scaled, X_test_scaled, y_train, y_test, lambd=0
)

# Side-by-side summary of the test-set metrics of both models
print("\n===== Final Results =====")
print(f"Without Regularization -> Accuracy: {accuracy_no_reg:.4f}, Precision: {precision_no_reg:.4f}, Recall: {recall_no_reg:.4f}, F1 Score: {f1_no_reg:.4f}")
print(f"With Regularization (lambda={lambd}) -> Accuracy: {accuracy_reg:.4f}, Precision: {precision_reg:.4f}, Recall: {recall_reg:.4f}, F1 Score: {f1_reg:.4f}")

# Training cost per iteration for both models on one set of axes
fig, ax = plt.subplots()
ax.plot(costs_no_reg, label='Without Regularization', color='blue')
ax.plot(costs_reg, label='With Regularization', color='orange')
ax.set_xlabel('Iterations')
ax.set_ylabel('Cost')
ax.set_title('Cost Function over Iterations: Regularized vs Non-Regularized')
ax.legend()
plt.show()
