In [2]:
from pymongo import MongoClient
import numpy as np
import pandas as pd
from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv()
import os

In [3]:
# Get MongoDB connection details from environment variables
username = os.getenv("MONGODB_USERNAME")
password = os.getenv("MONGODB_PASSWORD")
cluster = os.getenv("MONGODB_CLUSTER")
database = os.getenv("MONGODB_DATABASE")

In [4]:
# Create connection string
connection_string = f"mongodb+srv://{username}:{password}@{cluster}/"


In [5]:
# Create a client connection
client = MongoClient(connection_string)



In [6]:
# Connect to your database
db = client.get_database(database)
collection = db.processed_retail_data  # This is your collection



In [7]:
# Fetch all documents into a DataFrame
cursor = collection.find({})
df = pd.DataFrame(list(cursor))



In [8]:
df.head()

Unnamed: 0,_id,user_id,age,platform,session_count,total_screens_viewed,used_search_feature,wrote_review,added_to_wishlist,purchase_24h,...,channel_social media,age_group_adult,age_group_senior,age_group_young,user_type_browser,user_type_buyer,user_type_power user,app_major_version,version_score,processing_version
0,6882dfedf9d88868d502a9b4,100000,66,0,5,10,1,1,0,1,...,False,False,True,False,False,True,False,2,2.08,retail_v20250725_070739_14e84f98
1,6882dfedf9d88868d502a9b5,100001,68,1,8,39,0,0,0,0,...,False,False,True,False,True,False,False,2,2.05,retail_v20250725_070739_14e84f98
2,6882dfedf9d88868d502a9b6,100002,25,1,9,19,1,0,1,0,...,True,True,False,False,True,False,False,2,2.06,retail_v20250725_070739_14e84f98
3,6882dfedf9d88868d502a9b7,100003,39,0,8,47,0,0,0,0,...,True,True,False,False,True,False,False,2,2.21,retail_v20250725_070739_14e84f98
4,6882dfedf9d88868d502a9b8,100004,28,1,9,29,0,1,1,0,...,False,True,False,False,True,False,False,2,2.28,retail_v20250725_070739_14e84f98


In [9]:
df.shape

(10000, 54)

In [10]:
# Fetch all documents into a DataFrame, excluding _id field
cursor = collection.find({}, {'_id': 0})
df = pd.DataFrame(list(cursor))


In [11]:
df.head()

Unnamed: 0,user_id,age,platform,session_count,total_screens_viewed,used_search_feature,wrote_review,added_to_wishlist,purchase_24h,hour,...,channel_social media,age_group_adult,age_group_senior,age_group_young,user_type_browser,user_type_buyer,user_type_power user,app_major_version,version_score,processing_version
0,100000,66,0,5,10,1,1,0,1,0,...,False,False,True,False,False,True,False,2,2.08,retail_v20250725_070739_14e84f98
1,100001,68,1,8,39,0,0,0,0,0,...,False,False,True,False,True,False,False,2,2.05,retail_v20250725_070739_14e84f98
2,100002,25,1,9,19,1,0,1,0,0,...,True,True,False,False,True,False,False,2,2.06,retail_v20250725_070739_14e84f98
3,100003,39,0,8,47,0,0,0,0,0,...,True,True,False,False,True,False,False,2,2.21,retail_v20250725_070739_14e84f98
4,100004,28,1,9,29,0,1,1,0,0,...,False,True,False,False,True,False,False,2,2.28,retail_v20250725_070739_14e84f98


In [12]:
df.shape

(10000, 53)

In [13]:
df.columns

Index(['user_id', 'age', 'platform', 'session_count', 'total_screens_viewed',
       'used_search_feature', 'wrote_review', 'added_to_wishlist',
       'purchase_24h', 'hour', 'dayofweek', 'is_weekend', 'productlist',
       'productdetail', 'categorybrowse', 'search', 'shoppingcart', 'checkout',
       'paymentmethods', 'deliveryoptions', 'wishlist', 'reviews',
       'promotions', 'account', 'addressbook', 'ordertracking',
       'shopping_count', 'cart_count', 'engagement_count', 'account_count',
       'other_screens', 'engagement_score', 'screen_diversity',
       'purchase_intent', 'region_asia pacific', 'region_europe',
       'region_latin america', 'region_middle east', 'region_north america',
       'channel_email', 'channel_organic search', 'channel_paid search',
       'channel_referral', 'channel_social media', 'age_group_adult',
       'age_group_senior', 'age_group_young', 'user_type_browser',
       'user_type_buyer', 'user_type_power user', 'app_major_version',
       

In [14]:
# Remove the specified columns and set up X and y
# Remove app_major_version, version_score, processing_version
# Set purchase_24h as target variable

# First, create a copy of the dataframe to avoid modifying the original
df_model = df.copy()

# Define the target variable
y = df_model['purchase_24h']

# Remove target variable and specified columns from features
columns_to_drop = ['purchase_24h', 'app_major_version', 'version_score', 'processing_version']
X = df_model.drop(columns=columns_to_drop)

# Verify shapes
print("X shape:", X.shape)
print("y shape:", y.shape)

X shape: (10000, 49)
y shape: (10000,)


In [15]:
# Import necessary libraries
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
import time

In [16]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42, stratify=y)

# Check the shapes of the splits
print("Training set shape:", X_train.shape, y_train.shape)
print("Testing set shape:", X_test.shape, y_test.shape)

Training set shape: (7500, 49) (7500,)
Testing set shape: (2500, 49) (2500,)


In [17]:
# Function to evaluate model performance
def evaluate_model(model, X_test, y_test):
    # Make predictions
    y_pred = model.predict(X_test)
    y_pred_proba = model.predict_proba(X_test)[:,1] if hasattr(model, "predict_proba") else None
    
    # Calculate metrics
    results = {
        'accuracy': accuracy_score(y_test, y_pred),
        'precision': precision_score(y_test, y_pred),
        'recall': recall_score(y_test, y_pred),
        'f1_score': f1_score(y_test, y_pred)
    }
    
    # Add ROC AUC if probability predictions are available
    if y_pred_proba is not None:
        results['roc_auc'] = roc_auc_score(y_test, y_pred_proba)
    
    return results

In [18]:
# Create pipelines for each model
pipelines = {
    'LogisticRegression': Pipeline([
        ('scaler', StandardScaler()),
        ('model', LogisticRegression( max_iter=1000))
    ]),
    
    'RandomForest': Pipeline([
        ('model', RandomForestClassifier())
    ]),
    
    'SVC': Pipeline([
        ('scaler', StandardScaler()),
        ('model', SVC( probability=True))
    ]),
    
    'KNN': Pipeline([
        ('scaler', StandardScaler()),
        ('model', KNeighborsClassifier())
    ])
    
    
}

# Define parameter grids for each model
param_grids = {
    'LogisticRegression': {
        'model__C': [0.01, 0.1, 1.0, 10.0],
        'model__solver': ['liblinear', 'saga']
    },
    
    'RandomForest': {
        'model__n_estimators': [50, 100, 200],
        'model__max_depth': [None, 10, 20],
        'model__min_samples_split': [2, 5, 10]
    },
    
    'SVC': {
        'model__C': [0.1, 1.0, 10.0],
        'model__kernel': ['linear', 'rbf'],
        'model__gamma': ['scale', 'auto']
    },
    
    'KNN': {
        'model__n_neighbors': [3, 5, 7, 10],
        'model__weights': ['uniform', 'distance']
    }
}

In [19]:
# Dictionary to store results
results = {}

# Perform grid search for each model
for model_name, pipeline in pipelines.items():
    print(f"\nPerforming grid search for {model_name}...")
    start_time = time.time()
    
    # Create grid search
    grid_search = GridSearchCV(
        pipeline,
        param_grids[model_name],
        cv=2,
        scoring='roc_auc',
        n_jobs=-1,
        verbose=1
    )
    
    # Fit the grid search
    grid_search.fit(X_train, y_train)
    
    # Get best model
    best_model = grid_search.best_estimator_
    
    # Evaluate the model
    eval_results = evaluate_model(best_model, X_test, y_test)
    
    # Add best parameters and execution time
    eval_results['best_params'] = grid_search.best_params_
    eval_results['execution_time'] = time.time() - start_time
    
    # Store results
    results[model_name] = eval_results
    
    print(f"{model_name} completed in {eval_results['execution_time']:.2f} seconds")
    print(f"Best parameters: {eval_results['best_params']}")
    print(f"Test set metrics: {eval_results}")


Performing grid search for LogisticRegression...
Fitting 2 folds for each of 8 candidates, totalling 16 fits
LogisticRegression completed in 18.06 seconds
Best parameters: {'model__C': 10.0, 'model__solver': 'liblinear'}
Test set metrics: {'accuracy': 0.9092, 'precision': 0.828030303030303, 'recall': 1.0, 'f1_score': 0.9059262329050974, 'roc_auc': 0.9214533787733663, 'best_params': {'model__C': 10.0, 'model__solver': 'liblinear'}, 'execution_time': 18.055121898651123}

Performing grid search for RandomForest...
Fitting 2 folds for each of 27 candidates, totalling 54 fits
RandomForest completed in 16.05 seconds
Best parameters: {'model__max_depth': 10, 'model__min_samples_split': 2, 'model__n_estimators': 50}
Test set metrics: {'accuracy': 0.9092, 'precision': 0.828030303030303, 'recall': 1.0, 'f1_score': 0.9059262329050974, 'roc_auc': 0.9201658678246463, 'best_params': {'model__max_depth': 10, 'model__min_samples_split': 2, 'model__n_estimators': 50}, 'execution_time': 16.051851511001

In [20]:
# Create a summary DataFrame to compare models
import pandas as pd

# Extract metrics for comparison
metrics = ['accuracy', 'precision', 'recall', 'f1_score', 'roc_auc', 'execution_time']
summary_data = []

for model_name, result in results.items():
    model_metrics = {metric: result.get(metric, "N/A") for metric in metrics}
    model_metrics['model'] = model_name
    summary_data.append(model_metrics)

# Create DataFrame and sort by ROC AUC (or another preferred metric)
summary_df = pd.DataFrame(summary_data)
summary_df = summary_df[['model'] + metrics]
summary_df = summary_df.sort_values(by='roc_auc', ascending=False)

# Display the summary
print("Model Comparison Summary:")
print(summary_df)

Model Comparison Summary:
                model  accuracy  precision    recall  f1_score   roc_auc  \
0  LogisticRegression    0.9092   0.828030  1.000000  0.905926  0.921453   
1        RandomForest    0.9092   0.828030  1.000000  0.905926  0.920166   
3                 KNN    0.8596   0.828319  0.856359  0.842105  0.913693   
2                 SVC    0.9092   0.828030  1.000000  0.905926  0.911590   

   execution_time  
0       18.055122  
1       16.051852  
3       36.729565  
2      119.570708  


In [21]:
# Identify the best model
best_model_name = summary_df.iloc[0]['model']
best_model = results[best_model_name]

print(f"\nBest performing model: {best_model_name}")
print(f"ROC AUC: {best_model.get('roc_auc', 'N/A')}")
print(f"F1 Score: {best_model.get('f1_score', 'N/A')}")

# Get feature importance if available (for interpretable models like Random Forest)
if best_model_name == 'RandomForest':
    best_estimator = pipelines[best_model_name].named_steps['model']
    feature_importances = best_estimator.feature_importances_
    features = X.columns
    
    # Create a DataFrame for feature importance
    importance_df = pd.DataFrame({
        'feature': features,
        'importance': feature_importances
    }).sort_values(by='importance', ascending=False)
    
    # Display top 10 most important features
    print("\nTop 10 most important features:")
    print(importance_df.head(10))


Best performing model: LogisticRegression
ROC AUC: 0.9214533787733663
F1 Score: 0.9059262329050974


In [22]:
# Check class distribution first
print("Target class distribution:")
print(y.value_counts())
print(f"Percentage of positive class: {y.mean()*100:.2f}%")

# Get the correctly fitted estimator from our grid search results
best_estimator = None
if best_model_name in pipelines:
    # We need to get the actual fitted model from grid search
    for model_name, pipeline in pipelines.items():
        if model_name == best_model_name:
            grid_search = GridSearchCV(
                pipeline,
                param_grids[model_name],
                cv=5,
                scoring='roc_auc',
                n_jobs=-1
            )
            grid_search.fit(X_train, y_train)
            best_estimator = grid_search.best_estimator_
            break

# Now use this fitted estimator to make predictions
if best_estimator is not None:
    from sklearn.metrics import confusion_matrix, classification_report
    
    y_pred = best_estimator.predict(X_test)
    
    # Create and display confusion matrix
    cm = confusion_matrix(y_test, y_pred)
    print("\nConfusion Matrix:")
    print(cm)
    print("\nFormat: [TN, FP]")
    print("        [FN, TP]")
    
    # Calculate metrics per class
    print("\nDetailed Classification Report:")
    print(classification_report(y_test, y_pred))
else:
    print("Could not retrieve the fitted model")

Target class distribution:
purchase_24h
0    5627
1    4373
Name: count, dtype: int64
Percentage of positive class: 43.73%

Confusion Matrix:
[[1180  227]
 [   0 1093]]

Format: [TN, FP]
        [FN, TP]

Detailed Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.84      0.91      1407
           1       0.83      1.00      0.91      1093

    accuracy                           0.91      2500
   macro avg       0.91      0.92      0.91      2500
weighted avg       0.92      0.91      0.91      2500

