<a href="https://www.kaggle.com/code/adithyar3363/lead-score?scriptVersionId=197937156" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [None]:
# Suppress unnecessary warnings
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd

import time, warnings
import datetime as dt

from IPython.display import display
pd.options.display.max_columns = None

# Read and Understand the data

In [None]:
df_leads = pd.read_csv("/kaggle/input/leadscore/Leads.csv")

In [None]:
df_leads.head()

In [None]:
df_leads.shape

In [None]:
df_leads.columns

In [None]:
df_leads.describe()

In [None]:
# Checking the info to see the types of the feature variables and the null values present

df_leads.info()

## Data Cleaning and Preparation

In [None]:
# number of missing values in each column

df_leads.isnull().sum()

In [None]:
# Drop every column with more than 3000 missing values, i.e. roughly one third of all entries.
# The affected columns are collected first and removed in a single call. The `columns=` keyword
# is used because the positional `axis` argument of DataFrame.drop is rejected by pandas >= 2.0.

sparse_cols = [col for col in df_leads.columns if df_leads[col].isnull().sum() > 3000]
df_leads.drop(columns=sparse_cols, inplace=True)

In [None]:
df_leads.isnull().sum()

In [None]:
# City and Country are treated as irrelevant to the model, so both columns are removed

df_leads.drop(columns=['City', 'Country'], inplace=True)

# Percentage of missing values remaining in each column, rounded to two decimals
missing_pct = 100 * (df_leads.isnull().sum() / len(df_leads.index))
round(missing_pct, 2)

In [None]:
df_leads.isnull().sum()

In [None]:
# Print the frequency of every level, column by column, with a separator line after each column

for column in df_leads.columns:
    level_counts = df_leads[column].astype('category').value_counts()
    print(level_counts)
    print('___________________________________________________')

In [None]:
# Remove the columns that are irrelevant to model building

irrelevant_cols = [
    'Lead Profile',
    'How did you hear about X Education',
    'Do Not Call',
    'Search',
    'Magazine',
    'Newspaper Article',
    'X Education Forums',
    'Newspaper',
    'Digital Advertisement',
    'Through Recommendations',
    'Receive More Updates About Our Courses',
    'Update me on Supply Chain Content',
    'Get updates on DM Content',
    'I agree to pay the amount through cheque',
]
df_leads.drop(columns=irrelevant_cols, inplace=True)

df_leads.head()

In [None]:
df_leads['What matters most to you in choosing a course'].value_counts()

In [None]:
df_leads.drop(['What matters most to you in choosing a course'], axis = 1, inplace=True)

df_leads.head()

In [None]:
df_leads.isnull().sum()

In [None]:
# Keep only the rows in which the current occupation is present
occupation_missing = pd.isnull(df_leads['What is your current occupation'])
df_leads = df_leads[~occupation_missing]

df_leads.isnull().sum()

In [None]:
# Remove, one column after another, every row that has a missing value in the columns listed below

required_cols = ['Lead Source', 'TotalVisits', 'Page Views Per Visit', 'Last Activity', 'Specialization']
for required in required_cols:
    df_leads = df_leads[~pd.isnull(df_leads[required])]

df_leads.info()

In [None]:
# Number of rows left after the null-value filtering
print(len(df_leads.index))
# Fraction of rows retained. 9240 is hard-coded and presumably the row count of the raw file
# as loaded — confirm against the earlier df_leads.shape output
print(len(df_leads.index)/9240)

In [None]:
# 68% of data is still intact and is safe enough to go ahead with model building exercise

In [None]:
df_leads.drop(['Prospect ID', 'Lead Number'], axis = 1, inplace = True)

df_leads.head()

## Preparing data for modelling

In [None]:
from matplotlib import pyplot as plt
import seaborn as sns

In [None]:
sns.pairplot(df_leads, diag_kind='kde',hue='Converted')
plt.show()

In [None]:
sns.pairplot(df_leads,hue='Converted')
plt.show()

In [None]:
# Pair plot restricted to the three website-activity columns, coloured by the target
activity_and_target = ['TotalVisits', 'Total Time Spent on Website', 'Page Views Per Visit', 'Converted']
df_edu = df_leads[activity_and_target]
sns.pairplot(df_edu, hue='Converted', diag_kind='kde')
plt.show()

In [None]:
# Applying a power transform featurewise to make data more Gaussian-like and reduce high variances.
# Only the feature columns are transformed. The binary target 'Converted' is copied over untouched,
# so it keeps its original 0/1 labels when used as `hue` in the following plot.

from sklearn.preprocessing import PowerTransformer

feature_cols = [c for c in df_edu.columns if c != 'Converted']
pt = PowerTransformer()
transformed_df_edu = pd.DataFrame(pt.fit_transform(df_edu[feature_cols]),
                                  columns=feature_cols, index=df_edu.index)
# Index was preserved above, so this assignment aligns row by row
transformed_df_edu['Converted'] = df_edu['Converted']
transformed_df_edu.head()

In [None]:
sns.pairplot(transformed_df_edu, diag_kind='kde',hue='Converted')
plt.show()

## Dummy variable creation

In [None]:
# Checking the columns which are of type 'object'

temp = df_leads.loc[:, df_leads.dtypes == 'object']
temp.columns

In [None]:
# Creating dummy variables using the 'get_dummies' command.
# dtype=int keeps the indicator columns numeric: recent pandas versions default to bool columns,
# which produce an object-dtype design matrix once mixed with the float features and are then
# rejected by the statsmodels / VIF steps further down.
categorical_cols = ['Lead Origin', 'Lead Source', 'Do Not Email', 'Last Activity',
                    'What is your current occupation', 'A free copy of Mastering The Interview',
                    'Last Notable Activity']
dummy = pd.get_dummies(df_leads[categorical_cols], drop_first=True, dtype=int)

# Adding the results to the master dataframe
df_leads = pd.concat([df_leads, dummy], axis=1)

In [None]:
# Creating dummy variables separately for 'Specialization': its level 'Select' carries no
# information, so that level is dropped explicitly instead of relying on drop_first.
# dtype=int keeps the indicators numeric regardless of the pandas default, and the `columns=`
# keyword replaces the positional axis argument that pandas >= 2.0 no longer accepts.

dummy_spl = pd.get_dummies(df_leads['Specialization'], prefix='Specialization', dtype=int)
dummy_spl = dummy_spl.drop(columns=['Specialization_Select'])
df_leads = pd.concat([df_leads, dummy_spl], axis=1)

In [None]:
# Dropping the variables for which the dummy variables have been created.
# `columns=` is used because DataFrame.drop no longer accepts the axis as a positional argument
# in pandas >= 2.0.

df_leads = df_leads.drop(columns=['Lead Origin', 'Lead Source', 'Do Not Email', 'Last Activity',
                                  'Specialization', 'What is your current occupation',
                                  'A free copy of Mastering The Interview', 'Last Notable Activity'])

In [None]:
df_leads.head()

# Test-Train Split

In [None]:
# Importing the required library

from sklearn.model_selection import train_test_split

In [None]:
# Feature matrix: every column except the target 'Converted'
# (`columns=` instead of the positional axis, which pandas >= 2.0 rejects)
X = df_leads.drop(columns=['Converted'])
X.head()

In [None]:
y = df_leads['Converted']
y.head()

In [None]:
# Splitting the dataset into 70% train and 30% test

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, test_size=0.3, random_state=100)

## Scaling

In [None]:
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Fit the scaler on the training split and rescale the three continuous features in place
scaler = MinMaxScaler()

scaled_cols = ['TotalVisits', 'Page Views Per Visit', 'Total Time Spent on Website']
X_train[scaled_cols] = scaler.fit_transform(X_train[scaled_cols])

X_train.head()

In [None]:
# Looking at the correlation table
plt.figure(figsize = (25,15))
sns.heatmap(df_leads.corr())
plt.show()

## Model Building

In [None]:
# Import 'LogisticRegression' and create a LogisticRegression object

from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression()

In [None]:
# Import RFE and select 15 variables

from sklearn.feature_selection import RFE
rfe = RFE(estimator = logreg, n_features_to_select = 15)             # running RFE with 15 variables as output
rfe = rfe.fit(X_train, y_train)

In [None]:
list(zip(X_train.columns, rfe.support_, rfe.ranking_))

In [None]:
# Put all the columns selected by RFE in the variable 'col'
col = X_train.columns[rfe.support_]

In [None]:
# Select only the columns selected by RFE

X_train = X_train[col]

In [None]:
# Import statsmodels

import statsmodels.api as sm

In [None]:
# Fit a logistic Regression model on X_train after adding a constant and output the summary

X_train_sm = sm.add_constant(X_train)
logm2 = sm.GLM(y_train, X_train_sm, family = sm.families.Binomial())
res = logm2.fit()
res.summary()

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [None]:
# Build a table with the variance inflation factor of every feature currently in X_train,
# sorted from the highest VIF to the lowest

feature_matrix = X_train.values
vif_scores = [variance_inflation_factor(feature_matrix, idx) for idx in range(feature_matrix.shape[1])]
vif = pd.DataFrame({'Features': X_train.columns, 'VIF': vif_scores})
vif['VIF'] = round(vif['VIF'], 2)
vif = vif.sort_values(by="VIF", ascending=False)
vif

In [None]:
# Dropping high VIF and P-value categories one by one

X_train.drop('Lead Source_Reference', axis = 1, inplace = True)

In [None]:
logm1 = sm.GLM(y_train,(sm.add_constant(X_train)), family = sm.families.Binomial())
logm1.fit().summary()

In [None]:
# Recompute the VIF table for the features that remain in X_train
feature_matrix = X_train.values
vif_scores = [variance_inflation_factor(feature_matrix, idx) for idx in range(feature_matrix.shape[1])]
vif = pd.DataFrame({'Features': X_train.columns, 'VIF': vif_scores})
vif['VIF'] = round(vif['VIF'], 2)
vif = vif.sort_values(by="VIF", ascending=False)
vif

In [None]:
X_train.drop('Last Notable Activity_Had a Phone Conversation', axis = 1, inplace = True)

In [None]:
logm1 = sm.GLM(y_train,(sm.add_constant(X_train)), family = sm.families.Binomial())
logm1.fit().summary()

In [None]:
# Dropping high P-value data

X_train.drop('What is your current occupation_Housewife', axis = 1, inplace = True)

In [None]:
logm1 = sm.GLM(y_train,(sm.add_constant(X_train)), family = sm.families.Binomial())
logm1.fit().summary()

In [None]:
X_train.drop('What is your current occupation_Working Professional', axis = 1, inplace = True)

In [None]:
logm1 = sm.GLM(y_train,(sm.add_constant(X_train)), family = sm.families.Binomial())
res = logm1.fit()
res.summary()

In [None]:
# Recompute the VIF table for the features that remain in X_train
feature_matrix = X_train.values
vif_scores = [variance_inflation_factor(feature_matrix, idx) for idx in range(feature_matrix.shape[1])]
vif = pd.DataFrame({'Features': X_train.columns, 'VIF': vif_scores})
vif['VIF'] = round(vif['VIF'], 2)
vif = vif.sort_values(by="VIF", ascending=False)
vif

In [None]:
## All entries are within acceptable VIF and P-value range. So we can go ahead and evaluate the model

## Model Evaluation

In [None]:
y_train_pred = res.predict(sm.add_constant(X_train))
y_train_pred[:10]

In [None]:
y_train_pred = y_train_pred.values.reshape(-1)
y_train_pred[:10]

In [None]:
# Create a new dataframe containing the actual conversion flag and the probabilities predicted by the model

y_train_pred_final = pd.DataFrame({'Converted':y_train.values, 'Conversion_Prob':y_train_pred})
y_train_pred_final.head()

## Creating new column 'Predicted' with 1 if Conversion_Prob > 0.5 else 0

In [None]:
y_train_pred_final['Predicted'] = y_train_pred_final.Conversion_Prob.map(lambda x: 1 if x > 0.5 else 0)

# Let's see the head
y_train_pred_final.head()

In [None]:
# Import metrics from sklearn for evaluation

from sklearn import metrics

In [None]:
# Create confusion matrix 

confusion = metrics.confusion_matrix(y_train_pred_final.Converted, y_train_pred_final.Predicted )
print(confusion)

In [None]:
# Predicted     not_converted    converted
# Actual
# not_converted        2543      463
# converted            692       1652  

In [None]:
print(metrics.accuracy_score(y_train_pred_final.Converted, y_train_pred_final.Predicted))

In [None]:
# Let's evaluate the other metrics as well

TP = confusion[1,1] # true positive 
TN = confusion[0,0] # true negatives
FP = confusion[0,1] # false positives
FN = confusion[1,0] # false negatives

In [None]:
# Calculating the sensitivity

TP/(TP+FN)

In [None]:
# Calculating the specificity

TN/(TN+FP)

## Finding the Optimal Cutoff

In [None]:
# ROC function

def draw_roc( actual, probs ):
    """Plot the ROC curve for the given labels and predicted scores.

    actual -- true labels, forwarded to metrics.roc_curve / metrics.roc_auc_score
    probs  -- predicted scores for the same observations

    The area under the curve is shown in the legend. The figure is displayed
    with plt.show() and the function returns None.
    """
    false_pos_rate, true_pos_rate, _ = metrics.roc_curve( actual, probs,
                                                          drop_intermediate = False )
    area = metrics.roc_auc_score( actual, probs )

    plt.figure(figsize=(5, 5))
    plt.plot( false_pos_rate, true_pos_rate, label='ROC curve (area = %0.2f)' % area )
    # Dashed diagonal from (0, 0) to (1, 1) as a visual reference
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate or [1 - True Negative Rate]')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic example')
    plt.legend(loc="lower right")
    plt.show()

In [None]:
fpr, tpr, thresholds = metrics.roc_curve( y_train_pred_final.Converted, y_train_pred_final.Conversion_Prob, drop_intermediate = False )


In [None]:
draw_roc(y_train_pred_final.Converted, y_train_pred_final.Conversion_Prob)

In [None]:
# Add one 0/1 prediction column per probability cutoff 0.0, 0.1, ..., 0.9;
# each column is named after its cutoff value

numbers = [step / 10 for step in range(10)]
for cutoff in numbers:
    y_train_pred_final[cutoff] = y_train_pred_final.Conversion_Prob.map(
        lambda prob, limit=cutoff: 1 if prob > limit else 0
    )
y_train_pred_final.head()

In [None]:
# Creating a dataframe to see the values of accuracy, sensitivity, and specificity at different values of probability cutoffs

cutoff_df = pd.DataFrame( columns = ['prob','accuracy','sensi','speci'])
from sklearn.metrics import confusion_matrix

# Cell layout of the confusion matrix used below:
#   [0,0] true negatives   [0,1] false positives
#   [1,0] false negatives  [1,1] true positives

num = [0.0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9]
for i in num:
    cm1 = metrics.confusion_matrix(y_train_pred_final.Converted, y_train_pred_final[i] )
    true_neg, false_pos = cm1[0,0], cm1[0,1]
    false_neg, true_pos = cm1[1,0], cm1[1,1]

    total1 = sum(sum(cm1))
    accuracy = (true_neg + true_pos)/total1
    speci = true_neg/(true_neg + false_pos)
    sensi = true_pos/(false_neg + true_pos)

    # One row per cutoff, indexed by the cutoff itself
    cutoff_df.loc[i] = [i, accuracy, sensi, speci]
print(cutoff_df)

In [None]:
cutoff_df.plot.line(x='prob', y=['accuracy','sensi','speci'])
plt.show()

In [None]:
# The optimal point for the accuracy, sensitivity and specificity trade-off seems to be at 0.42, so 0.42 is used as the next probability cut-off point

y_train_pred_final['final_predicted'] = y_train_pred_final.Conversion_Prob.map( lambda x: 1 if x > 0.42 else 0)

y_train_pred_final.head()

In [None]:
#Checking accuracy 

metrics.accuracy_score(y_train_pred_final.Converted, y_train_pred_final.final_predicted)

In [None]:
# Creating confusion matrix

confusion2 = metrics.confusion_matrix(y_train_pred_final.Converted, y_train_pred_final.final_predicted )
print(confusion2)

In [None]:
# Evaluate the other metrics as well

TP = confusion2[1,1] # true positive 
TN = confusion2[0,0] # true negatives
FP = confusion2[0,1] # false positives
FN = confusion2[1,0] # false negatives

In [None]:
# Sensitivity

TP/(TP+FN)

In [None]:
# Specificity

TN/(TN+FP)

## Making Predictions on the Test Set

In [None]:
# Scale the test set as well using 'transform'

X_test[['TotalVisits', 'Page Views Per Visit', 'Total Time Spent on Website']] = scaler.transform(X_test[['TotalVisits', 'Page Views Per Visit', 'Total Time Spent on Website']])

In [None]:
X_test = X_test[col]
X_test.head()

In [None]:
X_test_sm = sm.add_constant(X_test[col])

X_test_sm

In [None]:
# Remove the same features that were dropped from X_train during model refinement, so the test
# matrix has exactly the columns the final model was fitted on. `columns=` replaces the positional
# axis argument, which DataFrame.drop no longer accepts in pandas >= 2.0.
X_test.drop(columns=['Lead Source_Reference', 'What is your current occupation_Housewife',
                     'What is your current occupation_Working Professional',
                     'Last Notable Activity_Had a Phone Conversation'], inplace=True)

In [None]:
y_test_pred = res.predict(sm.add_constant(X_test))

y_test_pred[:20]

In [None]:
y_pred_1 = pd.DataFrame(y_test_pred)

y_pred_1.head()

In [None]:
y_test_df = pd.DataFrame(y_test)

In [None]:
# Removing index for both dataframes to append them side by side 

y_pred_1.reset_index(drop=True, inplace=True)
y_test_df.reset_index(drop=True, inplace=True)

In [None]:
y_pred_final = pd.concat([y_test_df, y_pred_1],axis=1)

y_pred_final.head()

In [None]:
y_pred_final= y_pred_final.rename(columns = {0 : 'Conversion_Prob'})

In [None]:
# Making predictions on the test set using 0.42 as the cutoff

y_pred_final['final_predicted'] = y_pred_final.Conversion_Prob.map(lambda x: 1 if x > 0.42 else 0)

y_pred_final.head()

In [None]:
# Accuracy

metrics.accuracy_score(y_pred_final['Converted'], y_pred_final.final_predicted)

In [None]:
# Confusion matrix

confusion2 = metrics.confusion_matrix(y_pred_final['Converted'], y_pred_final.final_predicted )
confusion2

In [None]:
TP = confusion2[1,1] # true positive 
TN = confusion2[0,0] # true negatives
FP = confusion2[0,1] # false positives
FN = confusion2[1,0] # false negatives

In [None]:
# Sensitivity

TP / float(TP+FN)

In [None]:
# Specificity

TN / float(TN+FP)

## Precision-Recall View

In [None]:
confusion = metrics.confusion_matrix(y_train_pred_final.Converted, y_train_pred_final.Predicted )
confusion

In [None]:
# Precision
confusion[1,1]/(confusion[0,1]+confusion[1,1])

In [None]:
#Recall
confusion[1,1]/(confusion[1,0]+confusion[1,1])

In [None]:
from sklearn.metrics import precision_recall_curve

In [None]:
y_train_pred_final.Converted, y_train_pred_final.Predicted

In [None]:
p, r, thresholds = precision_recall_curve(y_train_pred_final.Converted, y_train_pred_final.Conversion_Prob)


In [None]:
plt.plot(thresholds, p[:-1], "g-")
plt.plot(thresholds, r[:-1], "r-")
plt.show()

In [None]:
# Checking if there are any other optimal cutoff greater than 0.42

y_train_pred_final['final_predicted'] = y_train_pred_final.Conversion_Prob.map(lambda x: 1 if x > 0.44 else 0)

y_train_pred_final.head()

In [None]:
# Accuracy

metrics.accuracy_score(y_train_pred_final.Converted, y_train_pred_final.final_predicted)

In [None]:
# Confusion Matrix
confusion2 = metrics.confusion_matrix(y_train_pred_final.Converted, y_train_pred_final.final_predicted )
confusion2

In [None]:
TP = confusion2[1,1] # true positive 
TN = confusion2[0,0] # true negatives
FP = confusion2[0,1] # false positives
FN = confusion2[1,0] # false negatives

In [None]:
# Precision: share of predicted conversions that are actual conversions
TP/(TP+FP)

In [None]:
# Recall (sensitivity): share of actual conversions that were predicted as conversions
TP/(TP+FN)

In [None]:
y_test_pred = res.predict(sm.add_constant(X_test))

In [None]:
y_pred_1 = pd.DataFrame(y_test_pred)

In [None]:
y_test_df = pd.DataFrame(y_test)

In [None]:
y_pred_1.reset_index(drop=True, inplace=True)
y_test_df.reset_index(drop=True, inplace=True)

In [None]:
y_pred_final = pd.concat([y_test_df, y_pred_1],axis=1)
y_pred_final.head()

In [None]:
y_pred_final= y_pred_final.rename(columns = {0 : 'Conversion_Prob'})

In [None]:
# Making predictions on the test set using 0.44 as the cutoff

y_pred_final['final_predicted'] = y_pred_final.Conversion_Prob.map(lambda x: 1 if x > 0.44 else 0)

In [None]:
y_pred_final.head()

In [None]:
# Accuracy 
metrics.accuracy_score(y_pred_final['Converted'], y_pred_final.final_predicted)

In [None]:
confusion2 = metrics.confusion_matrix(y_pred_final['Converted'], y_pred_final.final_predicted )
confusion2

In [None]:
TP = confusion2[1,1] # true positive 
TN = confusion2[0,0] # true negatives
FP = confusion2[0,1] # false positives
FN = confusion2[1,0] # false negatives

In [None]:
# Precision: share of predicted conversions that are actual conversions
TP/(TP+FP)

In [None]:
# Recall (sensitivity): share of actual conversions that were predicted as conversions
TP/(TP+FN)

There are a lot of leads generated in the initial stage, but only a few of them come out as paying customers. In the middle stage, one needs to nurture the potential leads well (i.e. educating the leads about the product, constantly communicating, etc.) in order to get a higher lead conversion. Firstly, sort out the best prospects from the leads generated. 'TotalVisits', 'Total Time Spent on Website' and 'Page Views Per Visit' are the variables which contribute most towards the probability of a lead getting converted. Then, keep a list of leads to inform them about new courses, services, job offers and future higher studies. Monitor each lead carefully to tailor the information to be sent. Provide job offerings, information or courses that best suit the interests of the leads. Hold question-answer sessions with leads to extract the right information. Make further inquiries and appointments with the leads to determine their intention and mentality to join online courses.