In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
import joblib
from flask import Flask, request, jsonify
import numpy as np
import pickle

In [2]:
# Load the cleaned dataset from the directory
data_cleaned = pd.read_csv('../data/cleaned/data_cleaned.csv')

In [3]:
# Assuming data_cleaned is already loaded as a DataFrame
region_counts = data_cleaned[data_cleaned['Revenue'] == True]['Region'].value_counts().sort_index()

# Display result
print(region_counts)


Region
1    771
2    188
3    349
4    175
5     52
6    112
7    119
8     56
9     86
Name: count, dtype: int64


In [107]:
# Display the first 5 rows (head)
print("Head of the dataset:")
print(data_cleaned.head())

# Display the middle 5 rows
middle_index = len(data_cleaned) // 2
print("\nMiddle of the dataset:")
print(data_cleaned.iloc[middle_index-2:middle_index+3])  # Adjust the range to display 5 rows

# Display the last 5 rows (tail)
print("\nTail of the dataset:")
print(data_cleaned.tail())

Head of the dataset:
   Administrative  Administrative_Duration  Informational  \
0               0                      0.0              0   
1               0                      0.0              0   
2               0                      0.0              0   
3               0                      0.0              0   
4               0                      0.0              0   

   Informational_Duration  ProductRelated  ProductRelated_Duration  \
0                     0.0               1                 0.000000   
1                     0.0               2                64.000000   
2                     0.0               1                 0.000000   
3                     0.0               2                 2.666667   
4                     0.0              10               627.500000   

   BounceRates  ExitRates  PageValues  SpecialDay Month  OperatingSystems  \
0         0.20       0.20         0.0         0.0   Feb                 1   
1         0.00       0.10         0.0

In [108]:
# Encoding categorical variables
label_encoder = LabelEncoder()
data_cleaned['VisitorType'] = label_encoder.fit_transform(data_cleaned['VisitorType'].replace('Other', np.nan).fillna('Returning_Visitor'))
data_cleaned['Month'] = label_encoder.fit_transform(data_cleaned['Month'])

In [109]:
# Features and target variable
X = data_cleaned.drop(columns=['Revenue'])
y = data_cleaned['Revenue'].astype(int)

In [110]:
# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [111]:
# Scaling the data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [112]:
# Handle class imbalance using SMOTE
smote = SMOTE(sampling_strategy='auto', random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_scaled, y_train)

In [113]:
# Train the model with class_weight='balanced'
model = RandomForestClassifier(class_weight='balanced', random_state=42)
model.fit(X_train_resampled, y_train_resampled)

In [114]:
# Adjusting threshold (e.g., 0.3)
y_prob = model.predict_proba(X_test_scaled)[:, 1]  # Probability of the positive class
threshold = 0.3
y_pred_adjusted = (y_prob >= threshold).astype(int)

# Evaluate model after threshold adjustment
print("Adjusted Classification Report:\n", classification_report(y_test, y_pred_adjusted))
print("Adjusted Confusion Matrix:\n", confusion_matrix(y_test, y_pred_adjusted))

Adjusted Classification Report:
               precision    recall  f1-score   support

           0       0.96      0.87      0.91      2079
           1       0.52      0.81      0.63       362

    accuracy                           0.86      2441
   macro avg       0.74      0.84      0.77      2441
weighted avg       0.90      0.86      0.87      2441

Adjusted Confusion Matrix:
 [[1807  272]
 [  69  293]]


In [115]:
# Try Gradient Boosting Classifier
gb_model = GradientBoostingClassifier(random_state=42)
gb_model.fit(X_train_resampled, y_train_resampled)
y_pred_gb = gb_model.predict(X_test_scaled)

In [116]:
# Evaluate Gradient Boosting model
print("Gradient Boosting Classification Report:\n", classification_report(y_test, y_pred_gb))
print("Gradient Boosting Confusion Matrix:\n", confusion_matrix(y_test, y_pred_gb))

Gradient Boosting Classification Report:
               precision    recall  f1-score   support

           0       0.95      0.92      0.93      2079
           1       0.60      0.71      0.65       362

    accuracy                           0.89      2441
   macro avg       0.78      0.81      0.79      2441
weighted avg       0.90      0.89      0.89      2441

Gradient Boosting Confusion Matrix:
 [[1911  168]
 [ 106  256]]


In [117]:
# Cross-validation for RandomForest
from sklearn.model_selection import cross_val_score

cross_val_scores = cross_val_score(model, X_train_resampled, y_train_resampled, cv=5, scoring='accuracy')
print("Cross-validation accuracy scores:", cross_val_scores)
print("Mean cross-validation accuracy:", np.mean(cross_val_scores))

Cross-validation accuracy scores: [0.86770073 0.94219653 0.94949802 0.94736842 0.9555826 ]
Mean cross-validation accuracy: 0.9324692606796516


In [98]:
# Save the trained model under the "model" folder
joblib.dump(model, '../model/random_forest_model.pkl')

['../model/random_forest_model.pkl']

In [119]:
try:
    model = joblib.load('../model/random_forest_model.pkl')
    print("Model loaded successfully with joblib!")
except Exception as e:
    print(f"Failed to load with joblib: {e}")

Model loaded successfully with joblib!


In [121]:
# Example: Make a prediction
sample_input = [[0, 0.0, 0, 0.0, 1, 0.0, 2, 10.0, 0.2, 0.2, 0.0, 0.0, 2, 1, 1, 1, 1]]  # Replace with your actual input features
prediction = model.predict(sample_input)
print("Prediction:", prediction)

Prediction: [1]
