In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import mutual_info_score

In [2]:
!wget https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv

--2025-10-12 19:58:35--  https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 80876 (79K) [text/plain]
Saving to: ‘course_lead_scoring.csv’


2025-10-12 19:58:35 (4.89 MB/s) - ‘course_lead_scoring.csv’ saved [80876/80876]



In [4]:
df = pd.read_csv("course_lead_scoring.csv")

target_col = "converted"
assert target_col in df.columns, f"Expected target column '{target_col}' not found."

# Every column except the target is a feature; split them by dtype.
feature_cols = df.columns[df.columns != target_col].tolist()
num_cols = df[feature_cols].select_dtypes(include="number").columns.tolist()
numeric_names = set(num_cols)
cat_cols = [name for name in feature_cols if name not in numeric_names]

# Missing-value policy:
#   - categorical features -> the literal string 'NA' (empty strings count as missing)
#   - numerical features   -> 0.0
cat_filled = df[cat_cols].fillna('NA').replace('', 'NA')
num_filled = df[num_cols].fillna(0.0)
df[cat_cols] = cat_filled
df[num_cols] = num_filled

In [5]:
# Q1: most frequent value of 'industry'.
# mode() returns every tied value in sorted order; the first one is reported.
industry_modes = df['industry'].mode(dropna=False)
mode_industry = industry_modes.iloc[0]
print("Q1 — Mode of 'industry':", mode_industry)

Q1 — Mode of 'industry': retail


In [6]:
# Q2: which of the candidate pairs is most strongly correlated?

# Candidate (row, column) pairs looked up in the correlation matrix.
pairs_q2 = [
    ("interaction_count", "lead_score"),
    ("number_of_courses_viewed", "lead_score"),
    ("number_of_courses_viewed", "interaction_count"),
    ("annual_income", "interaction_count"),
]

# Pairwise correlation between all numerical features.
corr = df[num_cols].corr(numeric_only=True)

# Strength is compared by absolute value; a pair whose columns are absent from
# the matrix gets -inf so it can never be selected over a real value.
pair_corrs = {
    (left, right): (
        abs(corr.loc[left, right])
        if left in corr.index and right in corr.columns
        else -np.inf
    )
    for left, right in pairs_q2
}

best_pair, best_corr = max(pair_corrs.items(), key=lambda item: item[1])
print("Q2 — Largest correlation pair among options:", best_pair, "with corr:", best_corr)

Q2 — Largest correlation pair among options: ('annual_income', 'interaction_count') with corr: 0.02703647240481443


In [7]:
# Split data: 60/20/20 with seed=42
X = df.loc[:, feature_cols].copy()
y = df[target_col].astype(int).to_numpy()  # integer class labels

# Step 1: hold out 20% of all rows as the test set (stratified on the label).
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Step 2: a quarter of the remaining 80% becomes validation (20% of the total),
# which leaves 60% of the rows for training.
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val,
    y_train_val,
    test_size=0.25,
    stratify=y_train_val,
    random_state=42,
)


# Q3: Mutual information between y and each categorical variable (training only)
cat_candidates = ["industry", "location", "lead_source", "employment_status"]

# Unrounded scores drive the ranking: rounding first can turn close scores into
# ties, and max() would then pick whichever tied name comes first in the list.
mi_raw = {}
for col in cat_candidates:
    if col not in X_train.columns:
        mi_raw[col] = -np.inf  # a missing column can never be the winner
        continue
    # Fill before casting: astype(str) turns NaN into the literal 'nan', after
    # which fillna has nothing left to fill.
    labels = X_train[col].fillna('NA').astype(str)
    mi_raw[col] = mutual_info_score(y_train, labels)

# Rounded copy, used only for the report (2 decimals).
mi_scores = {col: round(score, 2) for col, score in mi_raw.items()}

print("Q3 — Mutual information (train only):", mi_scores)
best_mi_var = max(mi_raw, key=mi_raw.get)
print("Q3 — Highest MI variable:", best_mi_var)

Q3 — Mutual information (train only): {'industry': np.float64(0.01), 'location': np.float64(0.0), 'lead_source': np.float64(0.03), 'employment_status': np.float64(0.01)}
Q3 — Highest MI variable: lead_source


In [9]:
# Encoder for the categorical columns: dense array output, and categories that
# were not seen during fit are ignored at transform time instead of raising.
ohe = OneHotEncoder(
    sparse_output=False,
    handle_unknown='ignore',
)

def prepare_X_matrix(df_part):
    """Turn a raw feature frame into the numeric design matrix.

    Works on a copy of ``df_part``: categorical columns (``cat_cols``) get
    missing/empty values replaced by 'NA', numerical columns (``num_cols``)
    get missing values replaced by 0.0.

    Returns the numerical columns followed by the one-hot encoded categorical
    columns. If the module-level encoder ``ohe`` has not been fitted yet (it
    has no ``categories_`` attribute), only the numerical columns are returned.
    """
    frame = df_part.copy()
    frame[cat_cols] = frame[cat_cols].fillna('NA').replace('', 'NA')
    frame[num_cols] = frame[num_cols].fillna(0.0)

    numeric_block = frame[num_cols].values

    # Unfitted encoder: fall back to the numerical features alone.
    if not hasattr(ohe, "categories_"):
        return numeric_block

    encoded_block = ohe.transform(frame[cat_cols])
    return np.hstack([numeric_block, encoded_block])

# Fit OHE on training categoricals only. The frame is imputed exactly the way
# prepare_X_matrix imputes before transform, so fit and transform see the same
# labels: a raw NaN or '' learned at fit time would never match the 'NA' that
# transform receives, and with handle_unknown='ignore' that row would silently
# encode as all zeros.
ohe.fit(X_train[cat_cols].fillna('NA').replace('', 'NA'))

# Design matrices for the three splits (numerical columns, then one-hot columns).
Xtr = prepare_X_matrix(X_train)
Xva = prepare_X_matrix(X_val)
Xte = prepare_X_matrix(X_test)


# Q4: Train LogisticRegression with given params, report val accuracy (2 decimals)
logreg = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
logreg.fit(Xtr, y_train)
val_pred = logreg.predict(Xva)
val_acc = round(accuracy_score(y_val, val_pred), 2)
print("Q4 — Validation accuracy:", val_acc)

Q4 — Validation accuracy: 0.73


In [11]:

# Q5: leave-one-feature-out — which of these features matters least?
loo_features = ['industry', 'employment_status', 'lead_score']

# Reference point: validation accuracy of the full model, kept unrounded so
# that small differences between runs are not lost.
baseline_pred = logreg.predict(Xva)
baseline_acc = accuracy_score(y_val, baseline_pred)

def fit_acc_without(feature_to_drop):
    """Retrain the logistic-regression model without one raw feature.

    The feature is removed from the raw frames before imputation and encoding.
    A fresh OneHotEncoder is fitted on the remaining categorical columns of
    X_train, and a LogisticRegression (liblinear, C=1.0, max_iter=1000,
    random_state=42) is fitted on the resulting training matrix.

    Parameters
    ----------
    feature_to_drop : str
        Name of the column in ``feature_cols`` to leave out. A name that is not
        in ``feature_cols`` leaves the feature set unchanged.

    Returns
    -------
    float
        Accuracy on (X_val, y_val), unrounded.

    Raises
    ------
    ValueError
        If no feature remains after dropping.
    """
    cols_keep = [c for c in feature_cols if c != feature_to_drop]
    rem_cat = [c for c in cat_cols if c in cols_keep]
    rem_num = [c for c in num_cols if c in cols_keep]

    if not rem_cat and not rem_num:
        raise ValueError(
            f"No features left after dropping {feature_to_drop!r}; cannot fit a model."
        )

    def impute(dfp):
        # Same missing-value policy as the main pipeline, applied to a copy.
        d = dfp[cols_keep].copy()
        if rem_cat:
            d[rem_cat] = d[rem_cat].fillna('NA').replace('', 'NA')
        if rem_num:
            d[rem_num] = d[rem_num].fillna(0.0)
        return d

    train_part = impute(X_train)
    val_part = impute(X_val)

    # Fit the encoder on the *imputed* training frame so the categories learned
    # here are the same labels transform() receives. Skip the encoder entirely
    # when no categorical column remains (fitting on zero columns would fail).
    ohe2 = None
    if rem_cat:
        ohe2 = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
        ohe2.fit(train_part[rem_cat])

    def build(d):
        # Numerical columns first, then the one-hot block.
        blocks = []
        if rem_num:
            blocks.append(d[rem_num].values)
        if ohe2 is not None:
            blocks.append(ohe2.transform(d[rem_cat]))
        return np.hstack(blocks)

    Xtr2 = build(train_part)
    Xva2 = build(val_part)

    m = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
    m.fit(Xtr2, y_train)
    return accuracy_score(y_val, m.predict(Xva2))

# Accuracy lost by removing each candidate feature. A negative value means the
# model scored higher on validation without that feature.
diffs = {feat: baseline_acc - fit_acc_without(feat) for feat in loo_features}
print("Q5 — Accuracy drop (baseline - without feature):", diffs)

# Smallest signed difference wins; on ties the first feature in loo_features is kept.
min_drop_feature = min(diffs.items(), key=lambda kv: kv[1])[0]
print("Q5 — Least useful (smallest difference):", min_drop_feature)

Q5 — Accuracy drop (baseline - without feature): {'industry': 0.0, 'employment_status': -0.0034129692832765013, 'lead_score': 0.0}
Q5 — Least useful (smallest difference): employment_status


In [12]:
# Q6: sweep the inverse regularization strength C over [0.01, 0.1, 1, 10, 100]
C_list = [0.01, 0.1, 1, 10, 100]

# Validation accuracy per C, rounded to 3 decimals.
acc_by_C = {}
for C in C_list:
    mdl = LogisticRegression(solver='liblinear', C=C, max_iter=1000, random_state=42)
    mdl.fit(Xtr, y_train)
    val_accuracy = accuracy_score(y_val, mdl.predict(Xva))
    acc_by_C[C] = round(val_accuracy, 3)

print("Q6 — Validation accuracy by C:", acc_by_C)

# Highest rounded accuracy wins; when several C values share it, take the smallest C.
best_acc = max(acc_by_C.values())
best_Cs = [c_val for c_val in acc_by_C if acc_by_C[c_val] == best_acc]
best_C = min(best_Cs)
print("Q6 — Best C:", best_C, "with val acc:", best_acc)


Q6 — Validation accuracy by C: {0.01: 0.734, 0.1: 0.73, 1: 0.73, 10: 0.73, 100: 0.73}
Q6 — Best C: 0.01 with val acc: 0.734
