In [6]:
# Importing necessary libraries
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

# Loading the cleaned and encoded dataset
df = pd.read_csv('../Data/coupon_data_cleaned_encoded.csv')
df.head()

Unnamed: 0,temperature,has_children,toCoupon_GEQ5min,toCoupon_GEQ15min,toCoupon_GEQ25min,direction_same,direction_opp,Y,destination_No Urgent Place,destination_Work,...,CarryAway_More than 8,CarryAway_never,RestaurantLessThan20_4~8,RestaurantLessThan20_Less than 1,RestaurantLessThan20_More than 8,RestaurantLessThan20_never,Restaurant20To50_4~8,Restaurant20To50_Less than 1,Restaurant20To50_More than 8,Restaurant20To50_never
0,-0.43343,1,0.0,-1.131581,-0.367745,0,1,1,True,False,...,False,False,True,False,False,False,False,False,False,False
1,0.871799,1,0.0,-1.131581,-0.367745,0,1,0,True,False,...,False,False,True,False,False,False,False,False,False,False
2,0.871799,1,0.0,0.88372,-0.367745,0,1,1,True,False,...,False,False,True,False,False,False,False,False,False,False
3,0.871799,1,0.0,0.88372,-0.367745,0,1,0,True,False,...,False,False,True,False,False,False,False,False,False,False
4,0.871799,1,0.0,0.88372,-0.367745,0,1,0,True,False,...,False,False,True,False,False,False,False,False,False,False


In [7]:
# Separating features and target variable
X = df.drop(columns=['Y'])  # Features
y = df['Y']                 # Target variable

# Spliting the data into 80% training and 20% testing
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [8]:
# Initializing models
logreg = LogisticRegression(max_iter=1000, random_state=42)
tree = DecisionTreeClassifier(random_state=42)
forest = RandomForestClassifier(n_estimators=100, random_state=42)

# Training the models
logreg.fit(X_train, y_train)
tree.fit(X_train, y_train)
forest.fit(X_train, y_train)

In [9]:
# Prediction on the test set
y_pred_logreg = logreg.predict(X_test)
y_pred_tree = tree.predict(X_test)
y_pred_forest = forest.predict(X_test)

In [10]:
# Function to display performance metrics
def evaluate_model(name, y_true, y_pred):
    print(f"\n{name} Evaluation:")
    print(f"Accuracy : {accuracy_score(y_true, y_pred):.4f}")
    print(f"Precision: {precision_score(y_true, y_pred):.4f}")
    print(f"Recall   : {recall_score(y_true, y_pred):.4f}")
    print(f"F1 Score : {f1_score(y_true, y_pred):.4f}")
    print("\nClassification Report:\n", classification_report(y_true, y_pred))

# Evaluating all models
evaluate_model("Logistic Regression", y_test, y_pred_logreg)
evaluate_model("Decision Tree", y_test, y_pred_tree)
evaluate_model("Random Forest", y_test, y_pred_forest)


Logistic Regression Evaluation:
Accuracy : 0.6807
Precision: 0.6975
Recall   : 0.7739
F1 Score : 0.7337

Classification Report:
               precision    recall  f1-score   support

           0       0.65      0.56      0.60      1095
           1       0.70      0.77      0.73      1442

    accuracy                           0.68      2537
   macro avg       0.67      0.67      0.67      2537
weighted avg       0.68      0.68      0.68      2537


Decision Tree Evaluation:
Accuracy : 0.6657
Precision: 0.7015
Recall   : 0.7171
F1 Score : 0.7092

Classification Report:
               precision    recall  f1-score   support

           0       0.62      0.60      0.61      1095
           1       0.70      0.72      0.71      1442

    accuracy                           0.67      2537
   macro avg       0.66      0.66      0.66      2537
weighted avg       0.66      0.67      0.67      2537


Random Forest Evaluation:
Accuracy : 0.7355
Precision: 0.7445
Recall   : 0.8141
F1 Score : 

In [11]:
# Defining the parameter grid to search
param_grid = {
    'n_estimators': [100, 200, 300],       # Number of trees in the forest
    'max_depth': [None, 10, 20, 30],       # Maximum depth of each tree
    'min_samples_split': [2, 5, 10],       # Minimum samples to split an internal node
    'min_samples_leaf': [1, 2, 4],         # Minimum samples required at a leaf node
    'bootstrap': [True, False]             # Whether bootstrap samples are used
}

# Creating the base model
rf = RandomForestClassifier(random_state=42)

# Using GridSearchCV to search for the best parameters
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid,
                           cv=3, n_jobs=-1, verbose=2, scoring='f1')

# Fitting the grid search to the training data
grid_search.fit(X_train, y_train)

# Getting the best model from grid search
best_rf = grid_search.best_estimator_

# Printing best parameters
print("Best parameters found:\n", grid_search.best_params_)

Fitting 3 folds for each of 216 candidates, totalling 648 fits


[CV] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   3.3s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   3.4s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   3.4s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   6.3s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   5.9s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   5.9s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time=   2.0s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time=   8.4s
[CV] END bootstrap=True, max_depth=None, min_sam

In [12]:
# Prediction with the best estimator
best_model = grid_search.best_estimator_
y_pred_best = best_model.predict(X_test)

# Computing metrics
accuracy = accuracy_score(y_test, y_pred_best)
precision = precision_score(y_test, y_pred_best)
recall = recall_score(y_test, y_pred_best)
f1 = f1_score(y_test, y_pred_best)

# Printing results
print("Optimized Random Forest Evaluation:")
print(f"Accuracy : {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall   : {recall:.4f}")
print(f"F1 Score : {f1:.4f}\n")

print("Classification Report:")
print(classification_report(y_test, y_pred_best))

Optimized Random Forest Evaluation:
Accuracy : 0.7375
Precision: 0.7401
Recall   : 0.8294
F1 Score : 0.7822

Classification Report:
              precision    recall  f1-score   support

           0       0.73      0.62      0.67      1095
           1       0.74      0.83      0.78      1442

    accuracy                           0.74      2537
   macro avg       0.74      0.72      0.73      2537
weighted avg       0.74      0.74      0.73      2537



In [13]:
import joblib

joblib.dump(best_rf, '../Models/random_forest_model.pkl')
print("Model saved successfully!")

Model saved successfully!
