In [5]:
# Import libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

In [6]:
# Load the dataset
 file_path = "Leads.csv"
df = pd.read_csv(file_path)

In [9]:
# Display basic information
df

Unnamed: 0,Prospect ID,Lead Number,Lead Origin,Lead Source,Do Not Email,Do Not Call,Converted,TotalVisits,Total Time Spent on Website,Page Views Per Visit,...,Get updates on DM Content,Lead Profile,City,Asymmetrique Activity Index,Asymmetrique Profile Index,Asymmetrique Activity Score,Asymmetrique Profile Score,I agree to pay the amount through cheque,A free copy of Mastering The Interview,Last Notable Activity
0,7927b2df-8bba-4d29-b9a2-b6e0beafe620,660737,API,Olark Chat,No,No,0,0.0,0,0.00,...,No,Select,Select,02.Medium,02.Medium,15.0,15.0,No,No,Modified
1,2a272436-5132-4136-86fa-dcc88c88f482,660728,API,Organic Search,No,No,0,5.0,674,2.50,...,No,Select,Select,02.Medium,02.Medium,15.0,15.0,No,No,Email Opened
2,8cc8c611-a219-4f35-ad23-fdfd2656bd8a,660727,Landing Page Submission,Direct Traffic,No,No,1,2.0,1532,2.00,...,No,Potential Lead,Mumbai,02.Medium,01.High,14.0,20.0,No,Yes,Email Opened
3,0cc2df48-7cf4-4e39-9de9-19797f9b38cc,660719,Landing Page Submission,Direct Traffic,No,No,0,1.0,305,1.00,...,No,Select,Mumbai,02.Medium,01.High,13.0,17.0,No,No,Modified
4,3256f628-e534-4826-9d63-4a8b88782852,660681,Landing Page Submission,Google,No,No,1,2.0,1428,1.00,...,No,Select,Mumbai,02.Medium,01.High,15.0,18.0,No,No,Modified
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9235,19d6451e-fcd6-407c-b83b-48e1af805ea9,579564,Landing Page Submission,Direct Traffic,Yes,No,1,8.0,1845,2.67,...,No,Potential Lead,Mumbai,02.Medium,01.High,15.0,17.0,No,No,Email Marked Spam
9236,82a7005b-7196-4d56-95ce-a79f937a158d,579546,Landing Page Submission,Direct Traffic,No,No,0,2.0,238,2.00,...,No,Potential Lead,Mumbai,02.Medium,01.High,14.0,19.0,No,Yes,SMS Sent
9237,aac550fe-a586-452d-8d3c-f1b62c94e02c,579545,Landing Page Submission,Direct Traffic,Yes,No,0,2.0,199,2.00,...,No,Potential Lead,Mumbai,02.Medium,01.High,13.0,20.0,No,Yes,SMS Sent
9238,5330a7d1-2f2b-4df4-85d6-64ca2f6b95b9,579538,Landing Page Submission,Google,No,No,1,3.0,499,3.00,...,No,,Other Metro Cities,02.Medium,02.Medium,15.0,16.0,No,No,SMS Sent


In [11]:
# Define the columns that are not useful for modeling and drop them
drop_cols = ['Prospect ID', 'Lead Number', 'I agree to pay the amount through cheque', 
             'A free copy of Mastering The Interview', 'Magazine', 'Newspaper Article',
             'Newspaper', 'Digital Advertisement', 'Receive More Updates About Our Courses']
df.drop(columns=drop_cols, inplace=True)

In [13]:
# Handle missing values for categorical features:
# Fill missing values in categorical columns with 'Unknown'
categorical_cols = df.select_dtypes(include=['object']).columns
df[categorical_cols] = df[categorical_cols].replace('Select Specialization', 'Unknown')
df[categorical_cols] = df[categorical_cols].fillna('Unknown')

# Fill missing values in numerical columns with the median value

numerical_cols = df.select_dtypes(include=['number']).columns
df[numerical_cols] = df[numerical_cols].fillna(df[numerical_cols].median())

In [15]:
# Convert categorical variables into numerical dummy variables using one-hot encoding
# 'drop_first=True' prevents multicollinearity by dropping the first category of each variable
df = pd.get_dummies(df, drop_first=True)

In [17]:
# Separate the dataset into features (X) and target variable (y)
X = df.drop(columns=['Converted'])
y = df['Converted']

In [23]:
# Split the data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [25]:
# Standardize features: convert them to have mean = 0 and standard deviation = 1
scaler = StandardScaler()
X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns)
X_test = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)

In [27]:
# Initialize and train the Logistic Regression model
model = LogisticRegression(max_iter=1000, random_state=42)
model.fit(X_train, y_train)

In [29]:
# Make predictions on the test set
y_pred = model.predict(X_test)

In [31]:
# Evaluate model performance using various metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

# Print evaluation metrics

print(f'Accuracy: {accuracy:.4f}')
print(f'Precision: {precision:.4f}')
print(f'Recall: {recall:.4f}')
print(f'F1 Score: {f1:.4f}')
print(f'Confusion Matrix:\n{conf_matrix}')

Accuracy: 0.9416
Precision: 0.9402
Recall: 0.9123
F1 Score: 0.9260
Confusion Matrix:
[[1064   43]
 [  65  676]]


In [33]:
# Generate predicted probabilities for the positive class (conversion = 1)
y_prob = model.predict_proba(X_test)[:, 1]
# Convert probabilities to a lead score on a scale of 0-100

lead_scores = (y_prob * 100).astype(int)

# Create a DataFrame to display lead scores alongside actual conversion outcomes
lead_score_df = pd.DataFrame({'Lead_Score': lead_scores, 'Converted': y_test.values})
print(lead_score_df)

      Lead_Score  Converted
0             96          1
1              0          0
2              0          0
3              0          0
4              1          0
...          ...        ...
1843          99          1
1844          99          1
1845           0          0
1846          89          1
1847           1          0

[1848 rows x 2 columns]


In [35]:
df

Unnamed: 0,Converted,TotalVisits,Total Time Spent on Website,Page Views Per Visit,Asymmetrique Activity Score,Asymmetrique Profile Score,Lead Origin_Landing Page Submission,Lead Origin_Lead Add Form,Lead Origin_Lead Import,Lead Origin_Quick Add Form,...,Last Notable Activity_Form Submitted on Website,Last Notable Activity_Had a Phone Conversation,Last Notable Activity_Modified,Last Notable Activity_Olark Chat Conversation,Last Notable Activity_Page Visited on Website,Last Notable Activity_Resubscribed to emails,Last Notable Activity_SMS Sent,Last Notable Activity_Unreachable,Last Notable Activity_Unsubscribed,Last Notable Activity_View in browser link Clicked
0,0,0.0,0,0.00,15.0,15.0,False,False,False,False,...,False,False,True,False,False,False,False,False,False,False
1,0,5.0,674,2.50,15.0,15.0,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,1,2.0,1532,2.00,14.0,20.0,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,0,1.0,305,1.00,13.0,17.0,True,False,False,False,...,False,False,True,False,False,False,False,False,False,False
4,1,2.0,1428,1.00,15.0,18.0,True,False,False,False,...,False,False,True,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9235,1,8.0,1845,2.67,15.0,17.0,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False
9236,0,2.0,238,2.00,14.0,19.0,True,False,False,False,...,False,False,False,False,False,False,True,False,False,False
9237,0,2.0,199,2.00,13.0,20.0,True,False,False,False,...,False,False,False,False,False,False,True,False,False,False
9238,1,3.0,499,3.00,15.0,16.0,True,False,False,False,...,False,False,False,False,False,False,True,False,False,False
