In [1]:
import pandas as pd
import numpy as np

In [2]:
data = 'https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv'

In [3]:
!wget $data

--2025-10-13 13:31:16--  https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.111.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 80876 (79K) [text/plain]
Saving to: ‘course_lead_scoring.csv’


2025-10-13 13:31:16 (30.1 MB/s) - ‘course_lead_scoring.csv’ saved [80876/80876]



In [5]:
df = pd.read_csv('course_lead_scoring.csv')
df.head()

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
0,paid_ads,,1,79450.0,unemployed,south_america,4,0.94,1
1,social_media,retail,1,46992.0,employed,south_america,1,0.8,0
2,events,healthcare,5,78796.0,unemployed,australia,3,0.69,1
3,paid_ads,retail,2,83843.0,,australia,1,0.87,0
4,referral,education,3,85012.0,self_employed,europe,3,0.62,1


In [6]:

# Display basic info
print("Dataset shape:", df.shape)
print("\nChecking for missing values:\n")
print(df.isnull().sum())




Dataset shape: (1462, 9)

Checking for missing values:

lead_source                 128
industry                    134
number_of_courses_viewed      0
annual_income               181
employment_status           100
location                     63
interaction_count             0
lead_score                    0
converted                     0
dtype: int64


In [7]:
# Handling missing values
for col in df.columns:
    if df[col].isnull().sum() > 0:  # Only handle if there are missing values
        if df[col].dtype == 'object':  # Categorical features
            df[col] = df[col].fillna('NA')
        else:  # Numerical features
            df[col] = df[col].fillna(0.0)

# to verify no missing values remain
print("\nMissing values after imputation:\n")
print(df.isnull().sum())


Missing values after imputation:

lead_source                 0
industry                    0
number_of_courses_viewed    0
annual_income               0
employment_status           0
location                    0
interaction_count           0
lead_score                  0
converted                   0
dtype: int64


In [8]:
df.industry.mode()

0    retail
Name: industry, dtype: object

In [9]:
# Select only numerical features
num_df = df.select_dtypes(include=['int64', 'float64'])

# Compute correlation matrix
corr_matrix = num_df.corr()

# Display correlation matrix
print("Correlation Matrix:\n", corr_matrix, "\n")

# Find the two features with the highest correlation (excluding self-correlation)
corr_unstacked = corr_matrix.unstack().sort_values(ascending=False)
# Drop self-correlations (where correlation == 1)
corr_unstacked = corr_unstacked[corr_unstacked < 1]

# Get the pair with the highest correlation
highest_corr = corr_unstacked.idxmax()
max_corr_value = corr_unstacked.max()

print(f"The two features with the highest correlation are: {highest_corr}")
print(f"Correlation coefficient: {max_corr_value:.3f}")


Correlation Matrix:
                           number_of_courses_viewed  annual_income  \
number_of_courses_viewed                  1.000000       0.009770   
annual_income                             0.009770       1.000000   
interaction_count                        -0.023565       0.027036   
lead_score                               -0.004879       0.015610   
converted                                 0.435914       0.053131   

                          interaction_count  lead_score  converted  
number_of_courses_viewed          -0.023565   -0.004879   0.435914  
annual_income                      0.027036    0.015610   0.053131  
interaction_count                  1.000000    0.009888   0.374573  
lead_score                         0.009888    1.000000   0.193673  
converted                          0.374573    0.193673   1.000000   

The two features with the highest correlation are: ('converted', 'number_of_courses_viewed')
Correlation coefficient: 0.436


In [11]:
from sklearn.model_selection import train_test_split
# Separate target and features
y = df['converted']
X = df.drop('converted', axis=1)

# First split: 60% train, 40% temp (val+test)
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.4, random_state=42
)

print("Training set:", X_train.shape, y_train.shape)


Training set: (877, 8) (877,)


In [13]:
# Second split: 20% val, 20% test (each half of temp)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42
)

# Print resulting shapes

print("Validation set:", X_val.shape, y_val.shape)
print("Test set:", X_test.shape, y_test.shape)

Validation set: (292, 8) (292,)
Test set: (293, 8) (293,)


In [12]:
1462-877


585

In [14]:
292+293

585

In [15]:

from sklearn.metrics import mutual_info_score

# Select categorical columns
cat_features = X_train.select_dtypes(include=['object']).columns

# Compute Mutual Information for each categorical feature
mi_scores = {}
for col in cat_features:
    mi = mutual_info_score(X_train[col], y_train)
    mi_scores[col] = round(mi, 2)

# Convert to DataFrame for neat display
mi_df = pd.DataFrame(list(mi_scores.items()), columns=['Feature', 'Mutual Information'])
mi_df = mi_df.sort_values(by='Mutual Information', ascending=False)

print("Mutual Information Scores between categorical features and target (y):\n")
print(mi_df)


Mutual Information Scores between categorical features and target (y):

             Feature  Mutual Information
0        lead_source                0.03
1           industry                0.02
2  employment_status                0.02
3           location                0.00


In [16]:
# logistic_regression_pipeline.py
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# One-hot encode categorical variables (use pandas.get_dummies)
X_train_enc = pd.get_dummies(X_train, drop_first=False)
X_val_enc = pd.get_dummies(X_val, drop_first=False)

# Align validation columns to training columns (add missing ones as zeros)
X_val_enc = X_val_enc.reindex(columns=X_train_enc.columns, fill_value=0)

# Train logistic regression with specified parameters
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
model.fit(X_train_enc, y_train)

# Predict on validation set and compute accuracy (rounded to 2 decimals)
y_val_pred = model.predict(X_val_enc)
accuracy = accuracy_score(y_val, y_val_pred)
print("Validation accuracy (rounded to 2 decimals):", round(accuracy, 2))


Validation accuracy (rounded to 2 decimals): 0.74


In [17]:
# One-hot encode
X_train_enc = pd.get_dummies(X_train, drop_first=False)
X_val_enc = pd.get_dummies(X_val, drop_first=False)
X_val_enc = X_val_enc.reindex(columns=X_train_enc.columns, fill_value=0)

# --- Train base model ---
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
model.fit(X_train_enc, y_train)
base_acc = accuracy_score(y_val, model.predict(X_val_enc))

# --- Feature elimination test ---
results = []
for feature in X.columns:
    # Drop one feature
    X_train_drop = X_train.drop(columns=[feature])
    X_val_drop = X_val.drop(columns=[feature])

    # One-hot encode again
    X_train_drop_enc = pd.get_dummies(X_train_drop, drop_first=False)
    X_val_drop_enc = pd.get_dummies(X_val_drop, drop_first=False)
    X_val_drop_enc = X_val_drop_enc.reindex(columns=X_train_drop_enc.columns, fill_value=0)

    # Train model
    model.fit(X_train_drop_enc, y_train)
    acc = accuracy_score(y_val, model.predict(X_val_drop_enc))

    # Record accuracy difference
    diff = base_acc - acc
    results.append((feature, acc, diff))

# --- Display results ---
results_df = pd.DataFrame(results, columns=["Feature", "Accuracy_without_feature", "Accuracy_difference"])
results_df = results_df.sort_values(by="Accuracy_difference", ascending=True)
print("Base validation accuracy:", base_acc)
print("\nFeature elimination results (least useful at the top):")
print(results_df)


Base validation accuracy: 0.7431506849315068

Feature elimination results (least useful at the top):
                    Feature  Accuracy_without_feature  Accuracy_difference
3             annual_income                  0.856164            -0.113014
4         employment_status                  0.746575            -0.003425
5                  location                  0.743151             0.000000
1                  industry                  0.743151             0.000000
7                lead_score                  0.743151             0.000000
0               lead_source                  0.729452             0.013699
2  number_of_courses_viewed                  0.678082             0.065068
6         interaction_count                  0.674658             0.068493


In [18]:
# Try different values of C
C_values = [0.01, 0.1, 1, 10, 100]
results = []

for c in C_values:
    model = LogisticRegression(solver='liblinear', C=c, max_iter=1000, random_state=42)
    model.fit(X_train_enc, y_train)
    y_pred = model.predict(X_val_enc)
    acc = round(accuracy_score(y_val, y_pred), 3)
    results.append((c, acc))

# Display results
results_df = pd.DataFrame(results, columns=['C', 'Validation_Accuracy'])
best = results_df.loc[results_df['Validation_Accuracy'].idxmax()]

print("Validation results:\n", results_df)
print("\nBest C:", best['C'], "with accuracy:", best['Validation_Accuracy'])


Validation results:
         C  Validation_Accuracy
0    0.01                0.743
1    0.10                0.743
2    1.00                0.743
3   10.00                0.743
4  100.00                0.743

Best C: 0.01 with accuracy: 0.743
