In [26]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import mutual_info_classif
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Fetch the course lead-scoring CSV over HTTP (needs network access).
url = "https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv"
df = pd.read_csv(filepath_or_buffer=url)

# Preview the first five rows to confirm the load worked.
df.head(n=5)


Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
0,paid_ads,,1,79450.0,unemployed,south_america,4,0.94,1
1,social_media,retail,1,46992.0,employed,south_america,1,0.8,0
2,events,healthcare,5,78796.0,unemployed,australia,3,0.69,1
3,paid_ads,retail,2,83843.0,,australia,1,0.87,0
4,referral,education,3,85012.0,self_employed,europe,3,0.62,1


In [27]:
# Partition the columns by dtype: object columns are treated as categorical,
# every other dtype as numerical.
cat_cols = df.select_dtypes(include='object').columns
num_cols = df.select_dtypes(exclude='object').columns

# Impute one column at a time:
# - categorical gaps become the literal string 'NA'
# - numerical gaps become 0.0
for column in cat_cols:
    df[column] = df[column].fillna('NA')
for column in num_cols:
    df[column] = df[column].fillna(0.0)

# Total number of missing cells left in the frame (expected to be 0).
df.isna().sum().sum()


0

In [28]:
# Most frequent value of `industry`; .mode() returns a Series, so take its first entry.
df['industry'].mode().iloc[0]


'retail'

In [33]:
# Pairwise Pearson correlation matrix, restricted to the numeric columns.
correlation = df.corr(method='pearson', numeric_only=True)
correlation


Unnamed: 0,number_of_courses_viewed,annual_income,interaction_count,lead_score,converted
number_of_courses_viewed,1.0,0.00977,-0.023565,-0.004879,0.435914
annual_income,0.00977,1.0,0.027036,0.01561,0.053131
interaction_count,-0.023565,0.027036,1.0,0.009888,0.374573
lead_score,-0.004879,0.01561,0.009888,1.0,0.193673
converted,0.435914,0.053131,0.374573,0.193673,1.0


# Feature pairs whose correlation coefficients we want to compare side by side.
pairs = [
    ('interaction_count', 'lead_score'),
    ('number_of_courses_viewed', 'lead_score'),
    ('number_of_courses_viewed', 'interaction_count'),
    ('annual_income', 'interaction_count')
]

# Read each coefficient from the `correlation` matrix built with df.corr()
# in the preceding cell (row label = first feature, column label = second).
for a, b in pairs:
    print(f"{a} vs {b}: {correlation.loc[a, b]:.3f}")


In [35]:
# Name of the label column.
target = 'converted'

# Two-stage split into 60% train / 20% validation / 20% test:
# hold out 20% for test first, then take a quarter of the remaining 80%
# (0.25 * 0.8 = 0.2 of the whole) for validation.
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=42)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=42)

# Give each subset a fresh 0..n-1 index, discarding the shuffled original one
# (df_full_train is left with its original index).
df_train, df_val, df_test = (
    part.reset_index(drop=True) for part in (df_train, df_val, df_test)
)

len(df_train), len(df_val), len(df_test)


(876, 293, 293)

In [37]:
from sklearn.preprocessing import LabelEncoder

# Training features and label.
X_train = df_train.drop(columns=[target])
y_train = df_train[target]

# Only the object-dtype (categorical) features are scored here.
cat_cols = X_train.select_dtypes(['object']).columns

# Mutual information between each categorical feature and the target,
# rounded to two decimals and keyed by column name.
mi_scores = {}
label_encoder = LabelEncoder()

for col in cat_cols:
    # mutual_info_classif needs a numeric 2-D array, so map the string
    # categories to integer codes and reshape to a single-column matrix.
    codes = label_encoder.fit_transform(X_train[col]).reshape(-1, 1)

    # One feature in, one score out; discrete_features=True marks the
    # integer codes as categories rather than continuous values.
    score, = mutual_info_classif(codes, y_train, discrete_features=True)

    mi_scores[col] = round(score, 2)

mi_scores


{'lead_source': 0.04,
 'industry': 0.01,
 'employment_status': 0.01,
 'location': 0.0}

In [39]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline

# Feature matrices and label vectors for the train and validation splits.
X_train, y_train = df_train.drop(columns=[target]), df_train[target]
X_val, y_val = df_val.drop(columns=[target]), df_val[target]

# Object-dtype columns are categorical; everything else is numerical.
cat_cols = X_train.select_dtypes(['object']).columns
num_cols = X_train.select_dtypes(exclude=['object']).columns

# Numerical columns pass through unchanged; categorical columns are one-hot
# encoded, with categories unseen during fit encoded as all zeros.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', 'passthrough', num_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols),
    ]
)

# Preprocessing followed by a liblinear logistic regression.
classifier = LogisticRegression(
    solver='liblinear',
    C=1.0,
    max_iter=1000,
    random_state=42,
)
model = make_pipeline(preprocessor, classifier)

# Fit on the training split and report validation accuracy to 3 decimals.
model.fit(X_train, y_train)
y_pred = model.predict(X_val)
val_acc = accuracy_score(y_true=y_val, y_pred=y_pred)
round(val_acc, 3)


0.7

In [41]:
# Reference point: validation accuracy of the model trained on all features.
baseline_acc = accuracy_score(y_val, model.predict(X_val))
print("Baseline:", baseline_acc)

# Leave-one-feature-out: retrain without each column in turn and record
# baseline accuracy minus the reduced model's accuracy. A positive value means
# dropping the feature hurt; a negative value means it helped.
# NOTE(review): the recorded output shows accuracy rising by ~0.15 when
# annual_income is dropped. Numeric columns are passed through unscaled, which
# may be the cause -- confirm before drawing conclusions from these numbers.
diffs = {}
for col in X_train.columns:
    X_train_sub = X_train.drop(columns=[col])
    X_val_sub = X_val.drop(columns=[col])

    # Recompute the column groups, since the dropped column may be of either kind.
    remaining_num = X_train_sub.select_dtypes(exclude=['object']).columns
    remaining_cat = X_train_sub.select_dtypes(['object']).columns

    encoder_sub = ColumnTransformer([
        ('num', 'passthrough', remaining_num),
        ('cat', OneHotEncoder(handle_unknown='ignore'), remaining_cat),
    ])
    classifier_sub = LogisticRegression(
        solver='liblinear', C=1.0, max_iter=1000, random_state=42
    )
    model_sub = make_pipeline(encoder_sub, classifier_sub)

    model_sub.fit(X_train_sub, y_train)
    acc = accuracy_score(y_val, model_sub.predict(X_val_sub))
    diffs[col] = baseline_acc - acc

diffs


Baseline: 0.6996587030716723


{'lead_source': -0.0034129692832765013,
 'industry': 0.0,
 'number_of_courses_viewed': 0.14334470989761094,
 'annual_income': -0.15358361774744034,
 'employment_status': 0.0034129692832763903,
 'location': -0.010238907849829393,
 'interaction_count': 0.14334470989761094,
 'lead_score': -0.0068259385665528916}

In [43]:
from sklearn.base import clone

# Inverse regularization strengths to compare (smaller C = stronger penalty).
C_values = [0.01, 0.1, 1, 10, 100]

# Validation accuracy (rounded to 3 decimals) keyed by C.
results = {}

for c in C_values:
    # make_pipeline stores the step objects it is given without copying them,
    # so passing `preprocessor` directly would refit the very instance held by
    # the baseline `model` pipeline on every iteration. clone() gives each
    # pipeline its own unfitted copy with identical parameters.
    model_c = make_pipeline(
        clone(preprocessor),
        LogisticRegression(solver='liblinear', C=c, max_iter=1000, random_state=42)
    )
    model_c.fit(X_train, y_train)
    acc = accuracy_score(y_val, model_c.predict(X_val))
    results[c] = round(acc, 3)

results


{0.01: 0.7, 0.1: 0.7, 1: 0.7, 10: 0.7, 100: 0.7}