# Lead Scoring Analysis 


This notebook walks through the following steps:

- filling missing values (categorical → 'NA', numerical → 0.0)
- stratified train/val/test split (60/20/20, seed 4)
- mutual information on categorical features (train set only)
- logistic regression with one-hot encoding (solver='liblinear', C=1.0, max_iter=1000, random_state=42)
- leave-one-out style feature elimination for three features
- tuning `C` over [0.01, 0.1, 1, 10, 100]




In [1]:
# Imports
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import mutual_info_classif
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
%matplotlib inline


In [2]:
# 1. Load data
# The CSV is read straight from the remote URL, so this cell needs network access.
url = 'https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv'
df = pd.read_csv(filepath_or_buffer=url)
print('Loaded rows,cols:', df.shape)
# Last expression of the cell: preview of the first five rows.
df.head(n=5)


Loaded rows,cols: (1462, 9)


Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
0,paid_ads,,1,79450.0,unemployed,south_america,4,0.94,1
1,social_media,retail,1,46992.0,employed,south_america,1,0.8,0
2,events,healthcare,5,78796.0,unemployed,australia,3,0.69,1
3,paid_ads,retail,2,83843.0,,australia,1,0.87,0
4,referral,education,3,85012.0,self_employed,europe,3,0.62,1


In [3]:
# 2. Data preparation per instructions
target = 'converted'

# Partition the columns by dtype: object columns are the categorical features,
# numeric columns (minus the target) are the numerical features.
categorical = list(df.select_dtypes(include=['object']).columns)
numerical = [
    name
    for name in df.select_dtypes(include=[np.number]).columns
    if name != target
]
print('Categorical:', categorical)
print('Numerical:', numerical)

# One fill value per column: 'NA' for categorical features, 0.0 for numerical ones.
# The target is not part of the mapping, so it is left exactly as loaded.
fill_values = {
    **dict.fromkeys(categorical, 'NA'),
    **dict.fromkeys(numerical, 0.0),
}
# fillna returns a new frame, so the raw `df` stays untouched.
df_prep = df.fillna(value=fill_values)

print('\nMissing values after fill:')
print(df_prep.isna().sum())


Categorical: ['lead_source', 'industry', 'employment_status', 'location']
Numerical: ['number_of_courses_viewed', 'annual_income', 'interaction_count', 'lead_score']

Missing values after fill:
lead_source                 0
industry                    0
number_of_courses_viewed    0
annual_income               0
employment_status           0
location                    0
interaction_count           0
lead_score                  0
converted                   0
dtype: int64


In [4]:
# Q1: mode of industry
# Computed on the filled frame, so the 'NA' placeholder counts as a regular category.
industry_modes = df_prep['industry'].mode()
# mode() always returns a Series; take its first entry as the answer.
industry_mode = industry_modes.iloc[0]
print('Q1 — mode of industry:', industry_mode)


Q1 — mode of industry: retail


In [5]:
# Correlation matrix among numerical features and Q2 candidates
corr = df_prep[numerical].corr()
display(corr)

# Feature pairs whose pairwise correlation is looked up for Q2.
candidates = [
    ('interaction_count', 'lead_score'),
    ('number_of_courses_viewed', 'lead_score'),
    ('number_of_courses_viewed', 'interaction_count'),
    ('annual_income', 'interaction_count'),
]
# Print each pair next to its entry in the correlation matrix.
for row_feature, col_feature in candidates:
    print((row_feature, col_feature), corr.at[row_feature, col_feature])


Unnamed: 0,number_of_courses_viewed,annual_income,interaction_count,lead_score
number_of_courses_viewed,1.0,0.00977,-0.023565,-0.004879
annual_income,0.00977,1.0,0.027036,0.01561
interaction_count,-0.023565,0.027036,1.0,0.009888
lead_score,-0.004879,0.01561,0.009888,1.0


('interaction_count', 'lead_score') 0.009888182496913131
('number_of_courses_viewed', 'lead_score') -0.004878998354681276
('number_of_courses_viewed', 'interaction_count') -0.023565222882888037
('annual_income', 'interaction_count') 0.02703647240481443


In [6]:
# Split data (60/20/20) with stratify and seed=4
# The same seed drives both splitting stages; keep it in one place so the
# header comment and the calls cannot drift apart.
SPLIT_SEED = 4

X = df_prep.drop(columns=[target])
y = df_prep[target].astype(int)

# Stage 1: hold out 40% of the rows, stratified on the target.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.4, random_state=SPLIT_SEED, stratify=y
)
# Stage 2: cut the held-out 40% in half -> 20% validation and 20% test,
# again stratified on the target.
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=SPLIT_SEED, stratify=y_temp
)

# The three parts must cover every row exactly once.
assert len(X_train) + len(X_val) + len(X_test) == len(X), 'split sizes do not add up'
print('Sizes: train, val, test =', X_train.shape[0], X_val.shape[0], X_test.shape[0])


Sizes: train, val, test = 877 292 293


In [7]:
# Q3: mutual information on categorical variables (train only)
cat_features = categorical

# Integer-encode every categorical column of the training split;
# the index is kept so the rows stay aligned with y_train.
X_train_cat = pd.DataFrame(
    {name: X_train[name].astype('category').cat.codes for name in cat_features},
    index=X_train.index,
)

# discrete_features=True: all encoded columns are treated as discrete variables.
mi = mutual_info_classif(X_train_cat, y_train, discrete_features=True, random_state=42)

# Map each feature name to its score, rounded to two decimals.
mi_scores = {}
for name, score in zip(cat_features, mi):
    mi_scores[name] = round(float(score), 2)

print('Mutual information scores (train set):')
print(mi_scores)


Mutual information scores (train set):
{'lead_source': 0.02, 'industry': 0.01, 'employment_status': 0.02, 'location': 0.0}


In [8]:
# Q4: logistic regression with one-hot encoding
# One-hot encode the training and validation splits independently ...
X_train_full, X_val_full = (
    pd.get_dummies(part, drop_first=False) for part in (X_train, X_val)
)
# ... then force the validation frame onto the training columns: dummy columns
# missing from validation are added as 0, columns unseen in training are dropped.
X_train_full, X_val_full = X_train_full.align(X_val_full, join='left', axis=1, fill_value=0)

# fit() returns the estimator itself, so construction and training can be chained.
model = LogisticRegression(
    solver='liblinear', C=1.0, max_iter=1000, random_state=42
).fit(X_train_full, y_train)

# Score the model on the validation split.
y_val_pred = model.predict(X_val_full)
acc_val = accuracy_score(y_val, y_val_pred)
print('Q4 — validation accuracy:', round(acc_val, 2))


Q4 — validation accuracy: 0.75


In [9]:
# Q5: leave-one-out style feature elimination for selected features
# For each feature under test: drop it, re-encode, retrain with the same
# hyper-parameters, and compare validation accuracy with the full-feature
# baseline (acc_val from the Q4 cell).
baseline_acc = acc_val
features_to_test = ['industry', 'employment_status', 'lead_score']
diffs = {}
for feat in features_to_test:
    # Remove the feature from both splits before encoding.
    Xtr = X_train.drop(columns=[feat])
    Xv = X_val.drop(columns=[feat])
    Xtr_enc = pd.get_dummies(Xtr, drop_first=False)
    Xv_enc = pd.get_dummies(Xv, drop_first=False)
    # Validation must expose exactly the training columns (missing ones -> 0).
    Xtr_enc, Xv_enc = Xtr_enc.align(Xv_enc, join='left', axis=1, fill_value=0)
    m = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
    m.fit(Xtr_enc, y_train)
    yv_pred = m.predict(Xv_enc)
    acc = accuracy_score(y_val, yv_pred)
    # Positive diff: accuracy dropped when the feature was removed.
    diffs[feat] = baseline_acc - acc
    print(f'Removed {feat}: val_acc={acc:.6f}, diff={diffs[feat]:.6f}')
print('\nFeature with smallest absolute diff:', min(diffs.items(), key=lambda x: abs(x[1])))

# min() returns only the first of several equal minima, so say so explicitly
# when more than one feature shares the smallest absolute difference.
smallest_abs_diff = min(abs(d) for d in diffs.values())
tied_features = [f for f, d in diffs.items() if np.isclose(abs(d), smallest_abs_diff)]
if len(tied_features) > 1:
    print('Note: tie for the smallest absolute diff between:', tied_features)


Removed industry: val_acc=0.750000, diff=0.003425
Removed employment_status: val_acc=0.746575, diff=0.006849
Removed lead_score: val_acc=0.750000, diff=0.003425

Feature with smallest absolute diff: ('industry', 0.003424657534246589)


In [10]:
# Q6: try different C values
# Reuses the one-hot encoded train/validation frames built for Q4.
Cs = [0.01, 0.1, 1, 10, 100]
accs_by_C = {}
for C in Cs:
    # Same model configuration as before; only the regularisation parameter C varies.
    m = LogisticRegression(
        solver='liblinear', C=C, max_iter=1000, random_state=42
    ).fit(X_train_full, y_train)
    yv_pred = m.predict(X_val_full)
    acc = accuracy_score(y_val, yv_pred)
    # Accuracies are compared after rounding to three decimals.
    accs_by_C[C] = round(acc, 3)
    print(f'C={C}: val_acc={accs_by_C[C]}')

# Best rounded accuracy, computed once; ties go to the smallest C.
top_acc = max(accs_by_C.values())
best_C = min(c_value for c_value, rounded_acc in accs_by_C.items() if rounded_acc == top_acc)
print('\nBest C (smallest among best):', best_C)


C=0.01: val_acc=0.743
C=0.1: val_acc=0.753
C=1: val_acc=0.753
C=10: val_acc=0.753
C=100: val_acc=0.753

Best C (smallest among best): 0.1


Bad pipe message: %s [b'"Microsoft Edge";v="141", "Not?A_Brand";v="8", "Chromium"']
Bad pipe message: %s [b'="141"\r\nsec-ch-ua-mobile: ?0\r\nsec-ch-ua-platform: "Windows']
Bad pipe message: %s [b'\nUpgrade-Insecure-Requests: 1\r\nUs', b'-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/141.']
Bad pipe message: %s [b'0.0 Safari/537.36 Edg/141.0.0.0\r\nAccept: text/h']
Bad pipe message: %s [b'ol: max-age=0\r\nsec-ch-ua: "Microsoft Edge";v="141", "Not?A_Brand";v="8", "Chromium";v="141"\r\nsec-ch-ua-mobile: ?0\r\n']
Bad pipe message: %s [b'c-ch-ua-platform: "Windows"\r\nUpgrade-Insecure-Requests: 1\r\nUser-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) A', b'leWebKit/537.36 (KHTML, like Gecko) Chrome/141.0.0.0 Safari/537.36 Edg/141.0.0.0\r\nAccept: text/html,application']
Bad pipe message: %s [b'html+xml,application/xml;q=0.9,image/avif,imag']
Bad pipe message: %s [b'"Microsoft Edge";v="141", "Not?A_Brand";v="8", "Chromium"']
Bad pipe messag

***END