In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mutual_info_score, accuracy_score
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.metrics import precision_score, recall_score
from sklearn.model_selection import KFold

In [2]:
# Load and prepare data

df = pd.read_csv('data/course_lead_scoring.csv')

# Fill missing values
for column in df.columns:
    if df[column].dtype == 'object':
        df[column] = df[column].fillna('NA')
    else:
        df[column] = df[column].fillna(0.0)

In [3]:
# Split the data

X = df.drop(columns=['converted'])
y = df['converted']

# 80% train+val, 20% test
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1
)

# 60% train, 20% val
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val, y_train_val, test_size=0.25, random_state=1
)

print(f"Training set size: {X_train.shape[0]}")
print(f"Validation set size: {X_val.shape[0]}")
print(f"Test set size: {X_test.shape[0]}\n")

Training set size: 876
Validation set size: 293
Test set size: 293



### Question 1: ROC AUC feature importance

In [4]:
# Select numerical variables
numerical_vars = X_train.select_dtypes(include='number').columns.tolist()

auc_scores = {}

for var in numerical_vars:
    auc = roc_auc_score(y_train, X_train[var])
    if auc < 0.5:
        auc = roc_auc_score(y_train, -X_train[var])
    auc_scores[var] = auc

# Print all AUCs
for var, auc in auc_scores.items():
    print(f"{var}: {auc:.4f}")

# Find variable with highest AUC
best_var = max(auc_scores, key=auc_scores.get)
print(f"\nVariable with the highest AUC: {best_var} ({auc_scores[best_var]:.4f})")

number_of_courses_viewed: 0.7636
annual_income: 0.5520
interaction_count: 0.7383
lead_score: 0.6145

Variable with the highest AUC: number_of_courses_viewed (0.7636)


### Question 2: Training the model

In [13]:
# Define categorical and numerical columns
categorical = X_train.select_dtypes(include=['object']).columns.tolist()
numerical = X_train.select_dtypes(include=['number']).columns.tolist()

# Convert categorical columns to string
for col in categorical:
    X_train[col] = X_train[col].astype(str)
    X_val[col] = X_val[col].astype(str)

# Combine categorical and numerical features
features = categorical + numerical

# Convert data to dictionary format
train_dict = X_train[features].to_dict(orient='records')
val_dict = X_val[features].to_dict(orient='records')

# One-hot encode with DictVectorizer
dv = DictVectorizer(sparse=False)
X_train_encoded = dv.fit_transform(train_dict)
X_val_encoded = dv.transform(val_dict)

# Train logistic regression
model = LogisticRegression(solver='lbfgs', C=1.0, max_iter=10000)
model.fit(X_train_encoded, y_train)

# Predict probabilities for AUC
y_val_pred = model.predict_proba(X_val_encoded)[:, 1]

# Compute AUC
val_auc = roc_auc_score(y_val, y_val_pred)
print(f"Validation AUC: {val_auc:.3f}")

Validation AUC: 0.920


### Question 3: Precision and Recall

In [15]:
# Probabilities from your logistic regression model
y_scores = y_val_pred

# Define thresholds from 0.0 to 1.0 with step 0.01
thresholds = np.arange(0.0, 1.01, 0.01)

precisions = []
recalls = []

# Compute precision and recall for each threshold
for t in thresholds:
    preds = (y_scores >= t).astype(int)
    precisions.append(precision_score(y_val, preds))
    recalls.append(recall_score(y_val, preds))

# Convert to numpy arrays for easy computation
precisions = np.array(precisions)
recalls = np.array(recalls)

# Find where precision and recall are approximately equal
diff = np.abs(precisions - recalls)
intersection_idx = np.argmin(diff)
intersection_threshold = thresholds[intersection_idx]

print(f"Precision and Recall intersect at threshold: {intersection_threshold:.3f}")

Precision and Recall intersect at threshold: 0.550


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


### Question 4: F1 score

In [16]:
thresholds = np.arange(0.0, 1.01, 0.01)
f1_scores = []

for t in thresholds:
    preds = (y_scores >= t).astype(int)
    p = precision_score(y_val, preds, zero_division=0)
    r = recall_score(y_val, preds)
    
    if (p + r) == 0:
        f1 = 0
    else:
        f1 = 2 * (p * r) / (p + r)
    
    f1_scores.append(f1)

f1_scores = np.array(f1_scores)

# Find threshold where F1 is maximal
best_idx = np.argmax(f1_scores)
best_threshold = thresholds[best_idx]
best_f1 = f1_scores[best_idx]

print(f"F1 is maximal at threshold: {best_threshold:.3f} (F1 = {best_f1:.3f})")

F1 is maximal at threshold: 0.530 (F1 = 0.878)


### Question 5: 5-Fold CV

In [20]:
# Prepare data
df_full_train = df.copy()

X_full = df_full_train.drop(columns=['converted'])
y_full = df_full_train['converted']

# Ensure categorical columns are strings
categorical = X_full.select_dtypes(include=['object']).columns.tolist()
for col in categorical:
    X_full[col] = X_full[col].astype(str)

# Numerical columns
numerical = X_full.select_dtypes(include=['int64', 'float64']).columns.tolist()

# Combine categorical + numerical
features = categorical + numerical

# Initialize KFold
kfold = KFold(n_splits=5, shuffle=True, random_state=1)

scores = []

for train_idx, val_idx in kfold.split(X_full):
    X_train = X_full.iloc[train_idx]
    X_val = X_full.iloc[val_idx]
    y_train = y_full.iloc[train_idx]
    y_val = y_full.iloc[val_idx]

    # One-hot encoding with DictVectorizer
    dv = DictVectorizer(sparse=False)
    train_dict = X_train[features].to_dict(orient='records')
    val_dict = X_val[features].to_dict(orient='records')

    X_train_encoded = dv.fit_transform(train_dict)
    X_val_encoded = dv.transform(val_dict)

    # Train Logistic Regression
    model = LogisticRegression(solver='lbfgs', C=1.0, max_iter=10000)
    model.fit(X_train_encoded, y_train)

    # Predict probabilities
    y_val_pred = model.predict_proba(X_val_encoded)[:, 1]

    # Compute AUC
    auc = roc_auc_score(y_val, y_val_pred)
    scores.append(auc)

# Compute mean and std
mean_auc = np.mean(scores)
std_auc = np.std(scores)

print("AUC scores per fold:", [round(s, 3) for s in scores])
print(f"Mean AUC: {mean_auc:.3f}")
print(f"Standard deviation: {std_auc:.3f}")

AUC scores per fold: [0.923, 0.911, 0.935, 0.929, 0.915]
Mean AUC: 0.923
Standard deviation: 0.009


### Question 6: Hyperparameter Tuning

In [21]:
# Prepare data
df_full_train = df.copy()

X_full = df_full_train.drop(columns=['converted'])
y_full = df_full_train['converted']

# Ensure categorical columns are strings
categorical = X_full.select_dtypes(include=['object']).columns.tolist()
for col in categorical:
    X_full[col] = X_full[col].astype(str)

numerical = X_full.select_dtypes(include=['int64', 'float64']).columns.tolist()
features = categorical + numerical

# Initialize KFold
kfold = KFold(n_splits=5, shuffle=True, random_state=1)

# Values of C to test
C_values = [0.000001, 0.001, 1]

results = {}

for C in C_values:
    scores = []
    
    for train_idx, val_idx in kfold.split(X_full):
        X_train = X_full.iloc[train_idx]
        X_val = X_full.iloc[val_idx]
        y_train = y_full.iloc[train_idx]
        y_val = y_full.iloc[val_idx]

        # One-hot encoding
        dv = DictVectorizer(sparse=False)
        train_dict = X_train[features].to_dict(orient='records')
        val_dict = X_val[features].to_dict(orient='records')

        X_train_encoded = dv.fit_transform(train_dict)
        X_val_encoded = dv.transform(val_dict)

        # Train logistic regression with specific C
        model = LogisticRegression(solver='lbfgs', C=C, max_iter=10000)
        model.fit(X_train_encoded, y_train)

        # Predict and compute AUC
        y_val_pred = model.predict_proba(X_val_encoded)[:, 1]
        auc = roc_auc_score(y_val, y_val_pred)
        scores.append(auc)

    # Compute mean and std for this C
    mean_auc = np.mean(scores)
    std_auc = np.std(scores)

    results[C] = (round(mean_auc, 3), round(std_auc, 3))

# Print results
for C, (mean_auc, std_auc) in results.items():
    print(f"C={C}: mean AUC={mean_auc}, std={std_auc}")

# Find best C
best_C = max(results, key=lambda c: (results[c][0], -results[c][1], -c))
print(f"\nBest C: {best_C}")

C=1e-06: mean AUC=0.55, std=0.031
C=0.001: mean AUC=0.87, std=0.012
C=1: mean AUC=0.923, std=0.009

Best C: 1
