In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

url ="https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv"
df = pd.read_csv(url)


In [None]:
categorical_columns = df.select_dtypes(include=['object']).columns
numerical_columns = df.select_dtypes(include=[np.number]).columns

for col in categorical_columns:
    df[col] = df[col].fillna('NA')
    
for col in numerical_columns:
    df[col] = df[col].fillna(0.0)
    df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=1)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=1)  # 0.25 * 0.8 = 0.2

print(f"Train: {len(df_train)} ({len(df_train)/len(df):.1%})")
print(f"Val: {len(df_val)} ({len(df_val)/len(df):.1%})")
print(f"Test: {len(df_test)} ({len(df_test)/len(df):.1%})")
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)
y_train = df_train['converted']
numerical_vars = ['lead_score', 'number_of_courses_viewed', 'interaction_count', 'annual_income']

auc_scores = {}

for var in numerical_vars:
    if var in df_train.columns:
        auc = roc_auc_score(y_train, df_train[var])
        
        if auc < 0.5:
            auc = roc_auc_score(y_train, -df_train[var])
            print(f"{var}: Original AUC < 0.5, using inverted variable")
        
        auc_scores[var] = auc
        print(f"{var}: AUC = {auc:.4f}")

best_variable = max(auc_scores, key=auc_scores.get)
highest_auc = auc_scores[best_variable]

print(f"\nThe numerical variable with the highest AUC is: {best_variable} (AUC = {highest_auc:.4f})")

Train: 876 (59.9%)
Val: 293 (20.0%)
Test: 293 (20.0%)
lead_score: AUC = 0.6145
number_of_courses_viewed: AUC = 0.7636
interaction_count: AUC = 0.7383
annual_income: AUC = 0.5520

The numerical variable with the highest AUC is: number_of_courses_viewed (AUC = 0.7636)


In [4]:

from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

categorical_columns = df.select_dtypes(include=['object']).columns
numerical_columns = df.select_dtypes(include=[np.number]).columns

for col in categorical_columns:
    df[col] = df[col].fillna('NA')
    
for col in numerical_columns:
    df[col] = df[col].fillna(0.0)

df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=1)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=1)

print(f"Train: {len(df_train)}")
print(f"Val: {len(df_val)}")
print(f"Test: {len(df_test)}")

df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

y_train = df_train['converted'].values
y_val = df_val['converted'].values
y_test = df_test['converted'].values

del df_train['converted']
del df_val['converted']
del df_test['converted']

train_dict = df_train.to_dict(orient='records')
val_dict = df_val.to_dict(orient='records')

dv = DictVectorizer(sparse=False)
X_train = dv.fit_transform(train_dict)
X_val = dv.transform(val_dict)

model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000)
model.fit(X_train, y_train)

y_pred_val = model.predict_proba(X_val)[:, 1]

auc_val = roc_auc_score(y_val, y_pred_val)
print(f"\nValidation AUC: {auc_val:.3f}")

auc_rounded = round(auc_val, 3)
print(f"Rounded AUC: {auc_rounded}")

Train: 876
Val: 293
Test: 293

Validation AUC: 0.817
Rounded AUC: 0.817


In [None]:
from sklearn.metrics import precision_score, recall_score
import numpy as np
import pandas as pd

y_pred_val = model.predict_proba(X_val)[:, 1]

thresholds = np.arange(0.0, 1.01, 0.01)

precisions = []
recalls = []

for t in thresholds:
    y_pred_binary = (y_pred_val >= t).astype(int)
    precision = precision_score(y_val, y_pred_binary, zero_division=0)
    recall = recall_score(y_val, y_pred_binary, zero_division=0)
    precisions.append(precision)
    recalls.append(recall)

metrics_df = pd.DataFrame({
    'threshold': thresholds,
    'precision': precisions,
    'recall': recalls
})

metrics_df['diff'] = np.abs(metrics_df['precision'] - metrics_df['recall'])
intersection_point = metrics_df.loc[metrics_df['diff'].idxmin()]

print(f"Precision and recall intersect at threshold: {intersection_point['threshold']:.3f}")
print(f"At this threshold - Precision: {intersection_point['precision']:.3f}, Recall: {intersection_point['recall']:.3f}")

options = [0.145, 0.345, 0.545, 0.745]
closest_option = min(options, key=lambda x: abs(x - intersection_point['threshold']))
print(f"\nClosest option to {intersection_point['threshold']:.3f} is: {closest_option}")

print("\nNear the intersection point:")
start_idx = max(0, metrics_df['diff'].idxmin() - 2)
end_idx = min(len(metrics_df), metrics_df['diff'].idxmin() + 3)
for idx in range(start_idx, end_idx):
    row = metrics_df.iloc[idx]
    print(f"Threshold: {row['threshold']:.3f}, Precision: {row['precision']:.3f}, Recall: {row['recall']:.3f}, Diff: {row['diff']:.3f}")

Precision and recall intersect at threshold: 0.980
At this threshold - Precision: 0.000, Recall: 0.000

Closest option to 0.980 is: 0.745

Near the intersection point:
Threshold: 0.960, Precision: 1.000, Recall: 0.018, Diff: 0.982
Threshold: 0.970, Precision: 1.000, Recall: 0.006, Diff: 0.994
Threshold: 0.980, Precision: 0.000, Recall: 0.000, Diff: 0.000
Threshold: 0.990, Precision: 0.000, Recall: 0.000, Diff: 0.000
Threshold: 1.000, Precision: 0.000, Recall: 0.000, Diff: 0.000


In [7]:

y_pred_val = model.predict_proba(X_val)[:, 1]

thresholds = np.arange(0.0, 1.01, 0.01)

precisions = []
recalls = []
f1_scores = []

for t in thresholds:
    y_pred_binary = (y_pred_val >= t).astype(int)
    precision = precision_score(y_val, y_pred_binary, zero_division=0)
    recall = recall_score(y_val, y_pred_binary, zero_division=0)
    
    precisions.append(precision)
    recalls.append(recall)
    
    if precision + recall > 0:
        f1 = 2 * (precision * recall) / (precision + recall)
    else:
        f1 = 0
    f1_scores.append(f1)

metrics_df = pd.DataFrame({
    'threshold': thresholds,
    'precision': precisions,
    'recall': recalls,
    'f1': f1_scores
})

max_f1_row = metrics_df.loc[metrics_df['f1'].idxmax()]

print(f"Maximum F1 score: {max_f1_row['f1']:.4f}")
print(f"Threshold at maximum F1: {max_f1_row['threshold']:.3f}")
print(f"At this threshold - Precision: {max_f1_row['precision']:.3f}, Recall: {max_f1_row['recall']:.3f}")

options = [0.14, 0.34, 0.54, 0.74]
closest_option = min(options, key=lambda x: abs(x - max_f1_row['threshold']))
print(f"\nClosest option to {max_f1_row['threshold']:.3f} is: {closest_option}")

print("\nTop F1 scores:")
top_f1 = metrics_df.nlargest(5, 'f1')[['threshold', 'f1', 'precision', 'recall']]
for idx, row in top_f1.iterrows():
    print(f"Threshold: {row['threshold']:.3f}, F1: {row['f1']:.4f}, Precision: {row['precision']:.3f}, Recall: {row['recall']:.3f}")

Maximum F1 score: 0.8125
Threshold at maximum F1: 0.570
At this threshold - Precision: 0.732, Recall: 0.912

Closest option to 0.570 is: 0.54

Top F1 scores:
Threshold: 0.570, F1: 0.8125, Precision: 0.732, Recall: 0.912
Threshold: 0.550, F1: 0.8112, Precision: 0.719, Recall: 0.930
Threshold: 0.560, F1: 0.8093, Precision: 0.724, Recall: 0.918
Threshold: 0.590, F1: 0.8085, Precision: 0.741, Recall: 0.889
Threshold: 0.580, F1: 0.8084, Precision: 0.733, Recall: 0.901


In [None]:
from sklearn.model_selection import KFold
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

df_full_train = pd.concat([df_train, df_val], ignore_index=True)
y_full_train = pd.concat([pd.Series(y_train), pd.Series(y_val)], ignore_index=True)

# Prepare the features as dictionaries
df_full_train_features = df_full_train.copy()
full_train_dict = df_full_train_features.to_dict(orient='records')

# Initialize KFold
kf = KFold(n_splits=5, shuffle=True, random_state=1)

# Store AUC scores for each fold
auc_scores = []

# Perform 5-fold cross-validation
for train_idx, val_idx in kf.split(full_train_dict):
    # Split the data
    X_train_fold = [full_train_dict[i] for i in train_idx]
    X_val_fold = [full_train_dict[i] for i in val_idx]
    
    y_train_fold = y_full_train.iloc[train_idx]
    y_val_fold = y_full_train.iloc[val_idx]
    
    # Apply DictVectorizer
    dv = DictVectorizer(sparse=False)
    X_train_encoded = dv.fit_transform(X_train_fold)
    X_val_encoded = dv.transform(X_val_fold)
    
    # Train the model
    model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000)
    model.fit(X_train_encoded, y_train_fold)
    
    y_pred_val = model.predict_proba(X_val_encoded)[:, 1]
    
    auc = roc_auc_score(y_val_fold, y_pred_val)
    auc_scores.append(auc)
    
    print(f"Fold AUC: {auc:.4f}")

mean_auc = np.mean(auc_scores)
std_auc = np.std(auc_scores)

print(f"\nCross-validation results:")
print(f"Mean AUC: {mean_auc:.4f}")
print(f"Standard deviation: {std_auc:.4f}")

options = [0.0001, 0.006, 0.06, 0.36]
closest_option = min(options, key=lambda x: abs(x - std_auc))
print(f"\nClosest option to std {std_auc:.4f} is: {closest_option}")

print(f"\nAll AUC scores: {[f'{score:.4f}' for score in auc_scores]}")

Fold AUC: 0.8180
Fold AUC: 0.8035
Fold AUC: 0.8425
Fold AUC: 0.8024
Fold AUC: 0.8448

Cross-validation results:
Mean AUC: 0.8222
Standard deviation: 0.0183

Closest option to std 0.0183 is: 0.006

All AUC scores: ['0.8180', '0.8035', '0.8425', '0.8024', '0.8448']


In [None]:
from sklearn.model_selection import KFold
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
import numpy as np

df_full_train = pd.concat([df_train, df_val], ignore_index=True)
y_full_train = pd.concat([pd.Series(y_train), pd.Series(y_val)], ignore_index=True)

df_full_train_features = df_full_train.copy()
full_train_dict = df_full_train_features.to_dict(orient='records')

C_values = [0.000001, 0.001, 1]

results = []

for C in C_values:
    print(f"\nTesting C = {C}")
    
    kf = KFold(n_splits=5, shuffle=True, random_state=1)
    
    auc_scores = []
    
    for fold, (train_idx, val_idx) in enumerate(kf.split(full_train_dict)):
        X_train_fold = [full_train_dict[i] for i in train_idx]
        X_val_fold = [full_train_dict[i] for i in val_idx]
        
        y_train_fold = y_full_train.iloc[train_idx]
        y_val_fold = y_full_train.iloc[val_idx]
        
        dv = DictVectorizer(sparse=False)
        X_train_encoded = dv.fit_transform(X_train_fold)
        X_val_encoded = dv.transform(X_val_fold)
        
        model = LogisticRegression(solver='liblinear', C=C, max_iter=1000)
        model.fit(X_train_encoded, y_train_fold)
        
        y_pred_val = model.predict_proba(X_val_encoded)[:, 1]
        
        auc = roc_auc_score(y_val_fold, y_pred_val)
        auc_scores.append(auc)
        
        print(f"  Fold {fold+1} AUC: {auc:.4f}")
    
    mean_auc = np.mean(auc_scores)
    std_auc = np.std(auc_scores)
    
    results.append({
        'C': C,
        'mean_auc': mean_auc,
        'std_auc': std_auc,
        'auc_scores': auc_scores.copy()
    })
    
    print(f"  Mean AUC: {mean_auc:.4f} ± {std_auc:.4f}")

results_sorted = sorted(results, key=lambda x: (-x['mean_auc'], x['std_auc'], x['C']))

print(f"\n{'='*50}")
print("FINAL RESULTS:")
print(f"{'='*50}")
for res in results_sorted:
    print(f"C = {res['C']:8} | Mean AUC: {res['mean_auc']:.3f} | Std: {res['std_auc']:.3f}")

best_C = results_sorted[0]['C']
best_mean = results_sorted[0]['mean_auc']
best_std = results_sorted[0]['std_auc']

print(f"\nBest C: {best_C}")
print(f"Best mean AUC: {best_mean:.3f}")
print(f"Best std: {best_std:.3f}")

print(f"\nSelection process:")
print(f"1. Highest mean AUC: {results_sorted[0]['mean_auc']:.3f}")
if len(results_sorted) > 1 and results_sorted[0]['mean_auc'] == results_sorted[1]['mean_auc']:
    print(f"2. Tie in mean AUC, selecting lower std: {results_sorted[0]['std_auc']:.3f}")
    if results_sorted[0]['std_auc'] == results_sorted[1]['std_auc']:
        print(f"3. Tie in std, selecting smallest C: {results_sorted[0]['C']}")


Testing C = 1e-06
  Fold 1 AUC: 0.5804
  Fold 2 AUC: 0.5751
  Fold 3 AUC: 0.5199
  Fold 4 AUC: 0.6251
  Fold 5 AUC: 0.5070
  Mean AUC: 0.5615 ± 0.0431

Testing C = 0.001
  Fold 1 AUC: 0.8591
  Fold 2 AUC: 0.8448
  Fold 3 AUC: 0.8877
  Fold 4 AUC: 0.8614
  Fold 5 AUC: 0.8798
  Mean AUC: 0.8666 ± 0.0154

Testing C = 1
  Fold 1 AUC: 0.8180
  Fold 2 AUC: 0.8035
  Fold 3 AUC: 0.8425
  Fold 4 AUC: 0.8024
  Fold 5 AUC: 0.8448
  Mean AUC: 0.8222 ± 0.0183

FINAL RESULTS:
C =    0.001 | Mean AUC: 0.867 | Std: 0.015
C =        1 | Mean AUC: 0.822 | Std: 0.018
C =    1e-06 | Mean AUC: 0.561 | Std: 0.043

Best C: 0.001
Best mean AUC: 0.867
Best std: 0.015

Selection process:
1. Highest mean AUC: 0.867
