## <strong><span style="color:lightblue">Test-Train Split </span></strong>

### Libraries

In [21]:
# Sklearn libraries
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE
from sklearn.metrics import precision_score, recall_score
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report,confusion_matrix

#statmodel libraries
from statsmodels.stats.outliers_influence import variance_inflation_factor
import statsmodels.api as sm

In [8]:
leads = pd.read_pickle("leads.pkl")

In [9]:
leads

Unnamed: 0,Do Not Email,Converted,TotalVisits,Total Time Spent on Website,Page Views Per Visit,Free_copy,Lead Origin_Landing Page Submission,Lead Origin_Lead Add Form,Lead Origin_Lead Import,Lead Origin_Quick Add Form,...,Specialization_Services Excellence,Specialization_Supply Chain Management,Specialization_Travel and Tourism,Current_occupation_Housewife,Current_occupation_Other,Current_occupation_Student,Current_occupation_Unemployed,Current_occupation_Working Professional,City_Other Cities,City_Other_city_india
0,0,0,0.0,0,0.00,0,False,False,False,False,...,False,False,False,False,False,False,True,False,True,False
1,0,0,5.0,674,2.50,0,False,False,False,False,...,False,False,False,False,False,False,True,False,True,False
2,0,1,2.0,1532,2.00,1,True,False,False,False,...,False,False,False,False,False,True,False,False,False,False
3,0,0,1.0,305,1.00,0,True,False,False,False,...,False,False,False,False,False,False,True,False,False,False
4,0,1,2.0,1428,1.00,0,True,False,False,False,...,False,False,False,False,False,False,True,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9235,1,1,8.0,1845,2.67,0,True,False,False,False,...,False,False,False,False,False,False,True,False,False,False
9236,0,0,2.0,238,2.00,1,True,False,False,False,...,False,False,False,False,False,False,True,False,False,False
9237,1,0,2.0,199,2.00,1,True,False,False,False,...,False,False,False,False,False,False,True,False,False,False
9238,0,1,3.0,499,3.00,0,True,False,False,False,...,False,False,False,False,False,False,True,False,False,True


## Test train

In [10]:
# Predictor variables to x 
X = leads.drop("Converted",axis=1)

# targert variable to y
y = leads["Converted"]

In [11]:
# Splitting the data into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, test_size=0.2, random_state=100)

In [12]:
print("X_train shape:", X_train.shape,"\ny_train shape:",y_train.shape)
print("X_test shape:", X_test.shape,"\ny_test shape:",y_test.shape)

X_train shape: (7392, 50) 
y_train shape: (7392,)
X_test shape: (1848, 50) 
y_test shape: (1848,)


## Feature Scaling

In [13]:
# using standard scaler for scaling the features
scaler = StandardScaler()

# fetching int64 and float64 dtype columns from dataframe for scaling
num_cols=X_train.select_dtypes(include=['int64','float64']).columns

X_train[num_cols] = scaler.fit_transform(X_train[num_cols])

In [14]:
# X-train dataframe after standard scaling
X_train.head()

Unnamed: 0,Do Not Email,TotalVisits,Total Time Spent on Website,Page Views Per Visit,Free_copy,Lead Origin_Landing Page Submission,Lead Origin_Lead Add Form,Lead Origin_Lead Import,Lead Origin_Quick Add Form,Lead Source_Facebook,...,Specialization_Services Excellence,Specialization_Supply Chain Management,Specialization_Travel and Tourism,Current_occupation_Housewife,Current_occupation_Other,Current_occupation_Student,Current_occupation_Unemployed,Current_occupation_Working Professional,City_Other Cities,City_Other_city_india
7263,-0.293157,-1.069308,-0.884942,-1.186261,-0.673351,False,False,False,False,False,...,False,False,False,False,False,False,True,False,True,False
6468,-0.293157,-0.405873,-0.632201,-0.143407,1.48511,True,False,False,False,False,...,False,False,False,False,False,False,True,False,False,False
7833,-0.293157,-0.405873,-0.740257,-0.143407,1.48511,True,False,False,False,False,...,False,False,False,False,False,False,True,False,True,False
4461,-0.293157,0.257561,2.006923,0.899446,-0.673351,True,False,False,False,False,...,False,False,False,False,False,False,True,False,False,False
8453,-0.293157,-1.069308,-0.884942,-1.186261,-0.673351,False,False,False,False,False,...,False,False,False,False,False,False,True,False,True,False


In [15]:
lead_con_rate = (sum(leads["Converted"])/len(leads["Converted"].index))*100
lead_con_rate

38.53896103896104

In [17]:
X_test = X_test.drop(['Lead Origin_Lead Import','Lead Origin_Lead Add Form'],axis=1)
X_train = X_train.drop(['Lead Origin_Lead Import','Lead Origin_Lead Add Form'],axis=1)

## <strong><span style="color:lightblue">Model Building </span></strong>

## Feature Selection 

In [18]:
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB


import xgboost as xgb


### Logistic Regression

In [19]:
X = leads.drop("Converted", axis=1)
y = leads["Converted"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train and evaluate Logistic Regression
logistic_regression = LogisticRegression(max_iter=1000, random_state=42)
logistic_regression.fit(X_train, y_train)

# Predict on the test set and evaluate
y_pred_logistic = logistic_regression.predict(X_test)
accuracy_logistic = accuracy_score(y_test, y_pred_logistic)
class_report_logistic = classification_report(y_test, y_pred_logistic)

print(accuracy_logistic, class_report_logistic)

0.8181818181818182               precision    recall  f1-score   support

           0       0.82      0.90      0.86      1107
           1       0.82      0.70      0.76       741

    accuracy                           0.82      1848
   macro avg       0.82      0.80      0.81      1848
weighted avg       0.82      0.82      0.82      1848



### Decision Tree

In [22]:
# Train and evaluate Decision Tree
decision_tree = DecisionTreeClassifier(random_state=42)
decision_tree.fit(X_train, y_train)

# Predict on the test set and evaluate
y_pred_dt = decision_tree.predict(X_test)
accuracy_dt = accuracy_score(y_test, y_pred_dt)
class_report_dt = classification_report(y_test, y_pred_dt)
mqrt = np.sqrt(metrics.mean_squared_error(y_test, y_pred_dt))

print(class_report_dt)
print("Accuracy:",accuracy_dt)
print("Mean Square Error:", mqrt)


              precision    recall  f1-score   support

           0       0.81      0.81      0.81      1107
           1       0.72      0.72      0.72       741

    accuracy                           0.77      1848
   macro avg       0.76      0.77      0.76      1848
weighted avg       0.77      0.77      0.77      1848

Accuracy: 0.7738095238095238
Mean Square Error: 0.47559486560567094


### Random Forest Classifier

In [23]:

# Train and evaluate Random Forest
random_forest = RandomForestClassifier(random_state=42)
random_forest.fit(X_train, y_train)

# Predict on the test set and evaluate
y_pred_rf = random_forest.predict(X_test)
accuracy_rf = accuracy_score(y_test, y_pred_rf)
class_report_rf = classification_report(y_test, y_pred_rf)
mqrt = np.sqrt(metrics.mean_squared_error(y_test, y_pred_rf))

print(class_report_rf)
print("Accuracy:",accuracy_rf)
print("Mean Square Error:", mqrt)

              precision    recall  f1-score   support

           0       0.84      0.86      0.85      1107
           1       0.78      0.76      0.77       741

    accuracy                           0.82      1848
   macro avg       0.81      0.81      0.81      1848
weighted avg       0.82      0.82      0.82      1848

Accuracy: 0.8187229437229437
Mean Square Error: 0.42576643394830493


### XGBOOST

In [24]:
# Train and evaluate XGBoost
xgboost_model = xgb.XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss')
xgboost_model.fit(X_train, y_train)

# Predict on the test set and evaluate
y_pred_xgb = xgboost_model.predict(X_test)
accuracy_xgb = accuracy_score(y_test, y_pred_xgb)
class_report_xgb = classification_report(y_test, y_pred_xgb)
mqrt = np.sqrt(metrics.mean_squared_error(y_test, y_pred_xgb))

print(class_report_xgb)
print("Accuracy:",accuracy_xgb)
print("Mean Square Error:", mqrt)



              precision    recall  f1-score   support

           0       0.84      0.86      0.85      1107
           1       0.78      0.76      0.77       741

    accuracy                           0.82      1848
   macro avg       0.81      0.81      0.81      1848
weighted avg       0.82      0.82      0.82      1848

Accuracy: 0.8181818181818182
Mean Square Error: 0.4264014327112209


### SVM

In [None]:
# from sklearn.svm import SVC

# # Train and evaluate SVM with a linear kernel for faster training
# svm_model = SVC(kernel='linear', random_state=42)
# svm_model.fit(X_train, y_train)

# # Predict on the test set and evaluate
# y_pred_svm = svm_model.predict(X_test)
# accuracy_svm = accuracy_score(y_test, y_pred_svm)
# class_report_svm = classification_report(y_test, y_pred_svm)

# accuracy_svm, class_report_svm

### Naive Bayes

In [25]:

# Train and evaluate Naive Bayes
naive_bayes_model = GaussianNB()
naive_bayes_model.fit(X_train, y_train)

# Predict on the test set and evaluate
y_pred_nb = naive_bayes_model.predict(X_test)
accuracy_nb = accuracy_score(y_test, y_pred_nb)
class_report_nb = classification_report(y_test, y_pred_nb)
msqrt = np.sqrt(metrics.mean_squared_error(y_test, y_pred_nb))

print(class_report_nb)
print("Accuracy:",accuracy_nb)
print("Mean Square Error:", msqrt)

              precision    recall  f1-score   support

           0       0.71      0.93      0.81      1107
           1       0.80      0.44      0.57       741

    accuracy                           0.73      1848
   macro avg       0.76      0.68      0.69      1848
weighted avg       0.75      0.73      0.71      1848

Accuracy: 0.7316017316017316
Mean Square Error: 0.5180716826832639


### ADABOOST

In [26]:

# Train and evaluate AdaBoost
adaboost_model = AdaBoostClassifier(random_state=100)
adaboost_model.fit(X_train, y_train)

# Predict on the test set and evaluate
y_pred_adaboost = adaboost_model.predict(X_test)
accuracy_adaboost = accuracy_score(y_test, y_pred_adaboost)
class_report_adaboost = classification_report(y_test, y_pred_adaboost)
msqrt = np.sqrt(metrics.mean_squared_error(y_test, y_pred_adaboost))


print(class_report_adaboost)
print("Accuracy:",accuracy_adaboost)
print("Mean Square Error:", msqrt)



              precision    recall  f1-score   support

           0       0.83      0.89      0.86      1107
           1       0.81      0.72      0.76       741

    accuracy                           0.82      1848
   macro avg       0.82      0.80      0.81      1848
weighted avg       0.82      0.82      0.82      1848

Accuracy: 0.8208874458874459
Mean Square Error: 0.42321691142079154


### SGDClassifier

In [27]:


# Train and evaluate Stochastic Gradient Descent (SGD) Classifier
sgd_model = SGDClassifier(random_state=42)
sgd_model.fit(X_train, y_train)

# Predict on the test set and evaluate
y_pred_sgd = sgd_model.predict(X_test)
accuracy_sgd = accuracy_score(y_test, y_pred_sgd)
class_report_sgd = classification_report(y_test, y_pred_sgd)
msqrt = np.sqrt(metrics.mean_squared_error(y_test, y_pred_sgd))

print(class_report_sgd)
print("Accuracy:",accuracy_sgd)
print("Mean Square Error:", msqrt)

              precision    recall  f1-score   support

           0       0.63      0.99      0.77      1107
           1       0.87      0.14      0.25       741

    accuracy                           0.65      1848
   macro avg       0.75      0.56      0.51      1848
weighted avg       0.73      0.65      0.56      1848

Accuracy: 0.6482683982683982
Mean Square Error: 0.5930696432389722


### Gradient Boost

In [28]:

# Train and evaluate Gradient Boosting Machines (GBM)
gbm_model = GradientBoostingClassifier(random_state=42)
gbm_model.fit(X_train, y_train)

# Predict on the test set and evaluate
y_pred_gbm = gbm_model.predict(X_test)
accuracy_gbm = accuracy_score(y_test, y_pred_gbm)
class_report_gbm = classification_report(y_test, y_pred_gbm)
msqrt = np.sqrt(metrics.mean_squared_error(y_test, y_pred_gbm))

print(class_report_gbm)
print("Accuracy:",accuracy_gbm)
print("Mean Square Error:", msqrt)

              precision    recall  f1-score   support

           0       0.86      0.88      0.87      1107
           1       0.81      0.79      0.80       741

    accuracy                           0.84      1848
   macro avg       0.83      0.83      0.83      1848
weighted avg       0.84      0.84      0.84      1848

Accuracy: 0.8398268398268398
Mean Square Error: 0.40021639168474865


In [29]:
# Predict on the training set and evaluate for overfitting
y_pred_train_gbm = gbm_model.predict(X_train)
accuracy_train_gbm = accuracy_score(y_train, y_pred_train_gbm)
msqrt_train = np.sqrt(metrics.mean_squared_error(y_train, y_pred_train_gbm))

print("Accuracy test:",accuracy_gbm)
print("Accuracy train:",accuracy_train_gbm)
print("Mean Square Error test:", msqrt)
print("Mean Square Error train:", msqrt_train)

Accuracy test: 0.8398268398268398
Accuracy train: 0.8369859307359307
Mean Square Error test: 0.40021639168474865
Mean Square Error train: 0.4037500083765563


### Hyperparameter - GRID 

In [None]:
# from sklearn.model_selection import GridSearchCV

# # Define the parameter grid
# param_grid = {
#     'n_estimators': [100, 300, 500, 1000],
#     'learning_rate': [0.01, 0.1, 0.5, 1.0],
#     'max_depth': [3, 5, 9, 15],
#     'min_samples_split': [2, 4,8],
#     'min_samples_leaf': [1, 2, 4],
#     'subsample': [0.8, 1.0, 1.2]
# }

# # Initialize the GridSearchCV object
# grid_search = GridSearchCV(estimator=GradientBoostingClassifier(random_state=42),
#                            param_grid=param_grid,
#                            scoring='accuracy',
#                            cv=3,
#                            verbose=1,
#                            n_jobs=-1)

# # Fit to the training data
# grid_search.fit(X_train, y_train)

# # Get the best parameters and the corresponding accuracy
# best_params = grid_search.best_params_
# best_accuracy = grid_search.best_score_

# best_params, best_accuracy

In [30]:
best_gbm_model = GradientBoostingClassifier(learning_rate=0.1, max_depth=3, min_samples_leaf=2,
                                            min_samples_split=2, n_estimators=300, subsample=1.0, random_state=100)

# Train the model using the training data
best_gbm_model.fit(X_train, y_train)

# Predict on the test set and evaluate
y_pred_best_gbm = best_gbm_model.predict(X_test)
accuracy_best_gbm = accuracy_score(y_test, y_pred_best_gbm)
class_report_best_gbm = classification_report(y_test, y_pred_best_gbm)

print(accuracy_best_gbm, class_report_best_gbm)

0.8306277056277056               precision    recall  f1-score   support

           0       0.85      0.87      0.86      1107
           1       0.80      0.78      0.79       741

    accuracy                           0.83      1848
   macro avg       0.82      0.82      0.82      1848
weighted avg       0.83      0.83      0.83      1848



In [34]:
leads

Unnamed: 0,Do Not Email,Converted,TotalVisits,Total Time Spent on Website,Page Views Per Visit,Free_copy,Lead Origin_Landing Page Submission,Lead Origin_Lead Add Form,Lead Origin_Lead Import,Lead Origin_Quick Add Form,...,Specialization_Services Excellence,Specialization_Supply Chain Management,Specialization_Travel and Tourism,Current_occupation_Housewife,Current_occupation_Other,Current_occupation_Student,Current_occupation_Unemployed,Current_occupation_Working Professional,City_Other Cities,City_Other_city_india
0,0,0,0.0,0,0.00,0,False,False,False,False,...,False,False,False,False,False,False,True,False,True,False
1,0,0,5.0,674,2.50,0,False,False,False,False,...,False,False,False,False,False,False,True,False,True,False
2,0,1,2.0,1532,2.00,1,True,False,False,False,...,False,False,False,False,False,True,False,False,False,False
3,0,0,1.0,305,1.00,0,True,False,False,False,...,False,False,False,False,False,False,True,False,False,False
4,0,1,2.0,1428,1.00,0,True,False,False,False,...,False,False,False,False,False,False,True,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9235,1,1,8.0,1845,2.67,0,True,False,False,False,...,False,False,False,False,False,False,True,False,False,False
9236,0,0,2.0,238,2.00,1,True,False,False,False,...,False,False,False,False,False,False,True,False,False,False
9237,1,0,2.0,199,2.00,1,True,False,False,False,...,False,False,False,False,False,False,True,False,False,False
9238,0,1,3.0,499,3.00,0,True,False,False,False,...,False,False,False,False,False,False,True,False,False,True


In [51]:
# Predict the probabilities of the test set
proba = best_gbm_model.predict_proba(X_test)

# Print the predicted probabilities
proba

print("Predicted Probabilities:")
for probs in proba:
    formatted_probs = [f"{prob:.2f}" for prob in probs]
    print(formatted_probs)

Predicted Probabilities:
['0.82', '0.18']
['0.90', '0.10']
['0.99', '0.01']
['0.92', '0.08']
['0.87', '0.13']
['0.97', '0.03']
['0.00', '1.00']
['0.90', '0.10']
['0.78', '0.22']
['0.02', '0.98']
['0.18', '0.82']
['0.78', '0.22']
['0.36', '0.64']
['0.92', '0.08']
['0.80', '0.20']
['0.42', '0.58']
['0.02', '0.98']
['0.96', '0.04']
['0.96', '0.04']
['0.88', '0.12']
['0.05', '0.95']
['0.81', '0.19']
['0.81', '0.19']
['0.36', '0.64']
['0.08', '0.92']
['0.95', '0.05']
['0.15', '0.85']
['0.96', '0.04']
['0.55', '0.45']
['0.83', '0.17']
['0.96', '0.04']
['0.02', '0.98']
['0.98', '0.02']
['0.75', '0.25']
['0.92', '0.08']
['0.12', '0.88']
['0.24', '0.76']
['0.81', '0.19']
['0.96', '0.04']
['0.63', '0.37']
['0.32', '0.68']
['0.21', '0.79']
['0.80', '0.20']
['0.88', '0.12']
['0.96', '0.04']
['0.27', '0.73']
['0.04', '0.96']
['0.19', '0.81']
['0.96', '0.04']
['0.07', '0.93']
['0.23', '0.77']
['0.70', '0.30']
['0.81', '0.19']
['0.04', '0.96']
['0.97', '0.03']
['0.98', '0.02']
['0.54', '0.46']
['0.96

In [52]:
proba.shape

(1848, 2)

In [68]:
df_test = pd.DataFrame(X_test)
df_test['Convertion'] = y_test    
df_test['prob_0'] = proba[:,0] 
df_test['prob_1'] = proba[:,1]




In [69]:
df_test

Unnamed: 0,Do Not Email,TotalVisits,Total Time Spent on Website,Page Views Per Visit,Free_copy,Lead Origin_Landing Page Submission,Lead Origin_Lead Add Form,Lead Origin_Lead Import,Lead Origin_Quick Add Form,Lead Source_Facebook,...,Current_occupation_Housewife,Current_occupation_Other,Current_occupation_Student,Current_occupation_Unemployed,Current_occupation_Working Professional,City_Other Cities,City_Other_city_india,Convertion,prob_0,prob_1
4608,0,8.0,252,2.67,1,True,False,False,False,False,...,False,False,False,True,False,False,False,1,0.816819,0.183181
7935,1,2.0,929,2.00,1,True,False,False,False,False,...,False,False,False,True,False,True,False,0,0.895411,0.104589
4043,1,1.0,2,1.00,0,True,False,False,False,False,...,False,False,True,False,False,True,False,0,0.994419,0.005581
7821,0,2.0,323,2.00,0,True,False,False,False,False,...,False,False,False,False,True,False,False,0,0.918331,0.081669
856,0,3.0,201,3.00,0,True,False,False,False,False,...,False,False,False,True,False,False,False,0,0.874773,0.125227
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7387,0,3.0,1486,3.00,1,True,False,False,False,False,...,False,False,False,True,False,False,True,1,0.331985,0.668015
3063,0,8.0,1054,2.67,0,False,False,False,False,False,...,False,False,False,True,False,False,True,1,0.183658,0.816342
603,0,0.0,97,0.00,0,False,False,True,False,True,...,False,False,False,True,False,False,False,0,0.930144,0.069856
4210,0,4.0,409,4.00,0,True,False,False,False,False,...,False,False,False,True,False,False,False,1,0.941059,0.058941


In [70]:
# Rounding the values and converting them to percentages
df_test['prob_0'] = (df_test['prob_0'] * 100).round(2)
df_test['prob_1'] = (df_test['prob_1'] * 100).round(2)


In [71]:
df_test

Unnamed: 0,Do Not Email,TotalVisits,Total Time Spent on Website,Page Views Per Visit,Free_copy,Lead Origin_Landing Page Submission,Lead Origin_Lead Add Form,Lead Origin_Lead Import,Lead Origin_Quick Add Form,Lead Source_Facebook,...,Current_occupation_Housewife,Current_occupation_Other,Current_occupation_Student,Current_occupation_Unemployed,Current_occupation_Working Professional,City_Other Cities,City_Other_city_india,Convertion,prob_0,prob_1
4608,0,8.0,252,2.67,1,True,False,False,False,False,...,False,False,False,True,False,False,False,1,81.68,18.32
7935,1,2.0,929,2.00,1,True,False,False,False,False,...,False,False,False,True,False,True,False,0,89.54,10.46
4043,1,1.0,2,1.00,0,True,False,False,False,False,...,False,False,True,False,False,True,False,0,99.44,0.56
7821,0,2.0,323,2.00,0,True,False,False,False,False,...,False,False,False,False,True,False,False,0,91.83,8.17
856,0,3.0,201,3.00,0,True,False,False,False,False,...,False,False,False,True,False,False,False,0,87.48,12.52
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7387,0,3.0,1486,3.00,1,True,False,False,False,False,...,False,False,False,True,False,False,True,1,33.20,66.80
3063,0,8.0,1054,2.67,0,False,False,False,False,False,...,False,False,False,True,False,False,True,1,18.37,81.63
603,0,0.0,97,0.00,0,False,False,True,False,True,...,False,False,False,True,False,False,False,0,93.01,6.99
4210,0,4.0,409,4.00,0,True,False,False,False,False,...,False,False,False,True,False,False,False,1,94.11,5.89


In [72]:
from sklearn.ensemble import GradientBoostingClassifier
import joblib

In [73]:
# Define the Gradient Boosting Model again
best_gbm_model = GradientBoostingClassifier(learning_rate=0.1, max_depth=3, min_samples_leaf=2,
                                            min_samples_split=2, n_estimators=300, subsample=1.0, random_state=100)

# Save the model structure to a file
model_path = "my_model.pkl"
joblib.dump(best_gbm_model, model_path)

model_path

'my_model.pkl'