In [85]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score # for measuring model
from sklearn.metrics import roc_curve
from sklearn.metrics import auc
from sklearn.metrics import roc_auc_score

import random
from collections import Counter

In [86]:
# import data
df = pd.read_df = pd.read_csv("https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv", sep = ",")
df

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
0,paid_ads,,1,79450.0,unemployed,south_america,4,0.94,1
1,social_media,retail,1,46992.0,employed,south_america,1,0.80,0
2,events,healthcare,5,78796.0,unemployed,australia,3,0.69,1
3,paid_ads,retail,2,83843.0,,australia,1,0.87,0
4,referral,education,3,85012.0,self_employed,europe,3,0.62,1
...,...,...,...,...,...,...,...,...,...
1457,referral,manufacturing,1,,self_employed,north_america,4,0.53,1
1458,referral,technology,3,65259.0,student,europe,2,0.24,1
1459,paid_ads,technology,1,45688.0,student,north_america,3,0.02,1
1460,referral,,5,71016.0,self_employed,north_america,0,0.25,1


In [89]:
# check for null values and data types
null_summary = pd.DataFrame({
    'column': df.columns,
    'dtype': df.dtypes.values,
    'null_count': df.isnull().sum().values
})
null_summary

Unnamed: 0,column,dtype,null_count
0,lead_source,object,0
1,industry,object,0
2,number_of_courses_viewed,int64,0
3,annual_income,float64,0
4,employment_status,object,0
5,location,object,0
6,interaction_count,int64,0
7,lead_score,float64,0
8,converted,int64,0


In [88]:
# Separate categorical and numerical columns
categorical_cols = df.select_dtypes(include=['object', 'category']).columns
numerical_cols = df.select_dtypes(include=['number', 'float64', 'int64']).columns
numerical_cols_no_conv = numerical_cols.difference(['converted']) #converted removed for training
all_feat_cols = numerical_cols_no_conv.tolist() + categorical_cols.tolist()

# replace nulls with 'NA' for cat and 0 for num
df[categorical_cols] = df[categorical_cols].fillna('NA')
df[numerical_cols] = df[numerical_cols].fillna(0.0)

In [90]:
# separate feature and target
# x = df[numerical_cols_no_conv]  # numeric feature columns target excluded
x = df[all_feat_cols]
y = df['converted']  # target

# 0.25 * 0.8 = 0.2 → validation is 20% of total
x_temp, x_test, y_temp, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
x_train, x_val, y_train, y_val = train_test_split(x_temp, y_temp, test_size=0.25, random_state=42)

In [91]:
print(len(x), len(y))  # Should be identical
print(x.shape, y.shape)
print(x.index.equals(y.index))  # Must be True

1462 1462
(1462, 8) (1462,)
True


In [78]:
# 1 AUC
## ONLY RUN WHEN ABOVE IS SPLIT ON NUMERIC ONLY. COMMENT OUT ALL FEATURE ROW
# Dictionary to store AUCs
auc_scores = {}

for col in numerical_cols_no_conv:
    score = df_train[col]
    auc = roc_auc_score(y_train, score)
    
    # If AUC < 0.5, invert the variable
    if auc < 0.5:
        score = -score
        auc = roc_auc_score(y_train, score)
    
    auc_scores[col] = auc

# Find the variable with the highest AUC
# print("AUC scores:", auc_scores)
df_auc = pd.DataFrame(list(auc_scores.items()), columns=['Variable', 'AUC'])
df_auc['AUC'] = df_auc['AUC'].round(3)
df_auc = df_auc.sort_values(by='AUC', ascending=False).reset_index(drop=True)
print(df_auc)

print("Best variable:", best_var)

ValueError: Found input variables with inconsistent numbers of samples: [876, 1169]

In [92]:
# 2 Model Training
# Convert dicts from split data
train_dicts = x_train.to_dict(orient='records')
test_dicts  = x_test.to_dict(orient='records')
val_dicts   = x_val.to_dict(orient='records')

# One-hot encode
dv = DictVectorizer(sparse=False)
x_train_encoded = dv.fit_transform(train_dicts)
x_test_encoded  = dv.transform(test_dicts)
x_val_encoded   = dv.transform(val_dicts)

In [93]:
# logistic regression
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000)
model.fit(x_train_encoded, y_train)

y_val_pred = model.predict_proba(x_val_encoded)[:, 1]
auc_val = roc_auc_score(y_val, y_val_pred)
print("Validation ROC AUC:", round(auc_val, 3))

Validation ROC AUC: 0.855


In [None]:
# 3