## Data preparation

In [123]:
import pandas as pd
import  numpy as np

In [124]:
data ='wget https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv'

In [125]:
!wget $data

--2025-10-13 23:36:38--  http://wget/
Resolving wget (wget)... failed: Name or service not known.
wget: unable to resolve host address ‘wget’
--2025-10-13 23:36:38--  https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.109.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 80876 (79K) [text/plain]
Saving to: ‘course_lead_scoring.csv.2’


2025-10-13 23:36:38 (50.7 MB/s) - ‘course_lead_scoring.csv.2’ saved [80876/80876]

FINISHED --2025-10-13 23:36:38--
Total wall clock time: 0.06s
Downloaded: 1 files, 79K in 0.002s (50.7 MB/s)


In [126]:
df = pd.read_csv('course_lead_scoring.csv')

In [127]:
df.head()

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
0,paid_ads,,1,79450.0,unemployed,south_america,4,0.94,1
1,social_media,retail,1,46992.0,employed,south_america,1,0.8,0
2,events,healthcare,5,78796.0,unemployed,australia,3,0.69,1
3,paid_ads,retail,2,83843.0,,australia,1,0.87,0
4,referral,education,3,85012.0,self_employed,europe,3,0.62,1


In [128]:
# A look at data shape and column types

print(df.dtypes)

lead_source                  object
industry                     object
number_of_courses_viewed      int64
annual_income               float64
employment_status            object
location                     object
interaction_count             int64
lead_score                  float64
converted                     int64
dtype: object


In [129]:
print(df.nunique())


lead_source                    5
industry                       7
number_of_courses_viewed      10
annual_income               1267
employment_status              4
location                       7
interaction_count             12
lead_score                   101
converted                      2
dtype: int64


In [130]:
print(df['converted'].unique())

# 1 should equal client signed up to the platform
# 0 not signed up

[1 0]


In [131]:
print(df['converted'].value_counts())

converted
1    905
0    557
Name: count, dtype: int64


In [132]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1462 entries, 0 to 1461
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   lead_source               1334 non-null   object 
 1   industry                  1328 non-null   object 
 2   number_of_courses_viewed  1462 non-null   int64  
 3   annual_income             1281 non-null   float64
 4   employment_status         1362 non-null   object 
 5   location                  1399 non-null   object 
 6   interaction_count         1462 non-null   int64  
 7   lead_score                1462 non-null   float64
 8   converted                 1462 non-null   int64  
dtypes: float64(2), int64(3), object(4)
memory usage: 102.9+ KB


In [133]:
df.isna().sum()

lead_source                 128
industry                    134
number_of_courses_viewed      0
annual_income               181
employment_status           100
location                     63
interaction_count             0
lead_score                    0
converted                     0
dtype: int64

## Checking for missing values in the features.

In [134]:
print("Missing values per column BEFORE cleaning:")
missing_before = df.isna().sum()
print(missing_before[missing_before > 0] if missing_before.sum() > 0 else "No missing values detected.")

# To detect categorical vs numeric columns
categorical_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()
numerical_cols = df.select_dtypes(include=np.number).columns.tolist()

print("\nCategorical columns detected (will fill missing with 'NA'):")
print(categorical_cols)
print("\nNumerical columns detected (will fill missing with 0.0):")
print(numerical_cols)

Missing values per column BEFORE cleaning:
lead_source          128
industry             134
annual_income        181
employment_status    100
location              63
dtype: int64

Categorical columns detected (will fill missing with 'NA'):
['lead_source', 'industry', 'employment_status', 'location']

Numerical columns detected (will fill missing with 0.0):
['number_of_courses_viewed', 'annual_income', 'interaction_count', 'lead_score', 'converted']


## Handle missing values

In [135]:
df[categorical_cols] = df[categorical_cols].fillna('NA')
df[numerical_cols] = df[numerical_cols].fillna(0.0)

In [136]:
# Verifying that there's no missing left

print("\nMissing values per column AFTER cleaning:")
missing_after = df.isna().sum()
print(missing_after[missing_after > 0] if missing_after.sum() > 0 else "No missing values detected.")


Missing values per column AFTER cleaning:
No missing values detected.


In [137]:
print(df['industry'].unique())


['NA' 'retail' 'healthcare' 'education' 'manufacturing' 'technology'
 'other' 'finance']


In [138]:
print("First 10 rows of cleaned data:")


df.head(10)

First 10 rows of cleaned data:


Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
0,paid_ads,,1,79450.0,unemployed,south_america,4,0.94,1
1,social_media,retail,1,46992.0,employed,south_america,1,0.8,0
2,events,healthcare,5,78796.0,unemployed,australia,3,0.69,1
3,paid_ads,retail,2,83843.0,,australia,1,0.87,0
4,referral,education,3,85012.0,self_employed,europe,3,0.62,1
5,events,manufacturing,1,59904.0,,africa,6,0.83,1
6,social_media,technology,0,51283.0,,middle_east,2,0.57,0
7,social_media,,5,62975.0,student,europe,4,0.62,1
8,referral,healthcare,4,38648.0,unemployed,south_america,2,0.86,1
9,paid_ads,other,3,59866.0,student,australia,3,0.43,1


## Question 1

In [139]:
print(df['industry'].value_counts().head())


industry
retail        203
finance       200
other         198
healthcare    187
education     187
Name: count, dtype: int64


In [140]:
mode_value = df['industry'].mode()[0]
print("The most frequent industry is:", mode_value)

The most frequent industry is: retail


## Question 2: correlation matrix


In [141]:
# Select only numeric columns

num_df = df.select_dtypes(include=['int64', 'float64'])

In [142]:
# Compute correlation matrix

corr_matrix = num_df.corr()
print(corr_matrix)

                          number_of_courses_viewed  annual_income  \
number_of_courses_viewed                  1.000000       0.009770   
annual_income                             0.009770       1.000000   
interaction_count                        -0.023565       0.027036   
lead_score                               -0.004879       0.015610   
converted                                 0.435914       0.053131   

                          interaction_count  lead_score  converted  
number_of_courses_viewed          -0.023565   -0.004879   0.435914  
annual_income                      0.027036    0.015610   0.053131  
interaction_count                  1.000000    0.009888   0.374573  
lead_score                         0.009888    1.000000   0.193673  
converted                          0.374573    0.193673   1.000000  


In [143]:
pairs = [
    ('interaction_count', 'lead_score'),
    ('number_of_courses_viewed', 'lead_score'),
    ('number_of_courses_viewed', 'interaction_count'),
    ('annual_income', 'interaction_count')
]

for a, b in pairs:
    corr_value = df[a].corr(df[b])
    print(f"Correlation between {a} and {b}: {corr_value:.3f}")

Correlation between interaction_count and lead_score: 0.010
Correlation between number_of_courses_viewed and lead_score: -0.005
Correlation between number_of_courses_viewed and interaction_count: -0.024
Correlation between annual_income and interaction_count: 0.027


## Data splicing

In [144]:
from sklearn.model_selection import train_test_split

In [145]:
print(df.columns)

Index(['lead_source', 'industry', 'number_of_courses_viewed', 'annual_income',
       'employment_status', 'location', 'interaction_count', 'lead_score',
       'converted'],
      dtype='object')


In [146]:
# Separate features (X) and target (y)
X = df.drop('converted', axis=1)
y = df['converted']

In [147]:
# Split into train (60%) and temp (40%)

X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.4, random_state=42
)


In [148]:
# Split temp into validation (20%) and test (20%)

X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42
)

In [149]:
print("Train set size:", len(X_train))
print("Validation set size:", len(X_val))
print("Test set size:", len(X_test))

Train set size: 877
Validation set size: 292
Test set size: 293


## Question 3

In [150]:
from sklearn.metrics import mutual_info_score

In [151]:
categorical_vars = ['industry', 'location', 'lead_source', 'employment_status']

In [152]:
for col in categorical_vars:
    score = mutual_info_score(X_train[col], y_train)
    print(f"{col}: {round(score, 2)}")

industry: 0.02
location: 0.0
lead_source: 0.03
employment_status: 0.02


## Question 4

In [164]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline

In [154]:
# Define categorical and numerical columns
# Done above so just printing

print("Categorical columns:")
print(categorical_cols)
print("\nNumerical columns:")
print(numerical_cols)

Categorical columns:
['lead_source', 'industry', 'employment_status', 'location']

Numerical columns:
['number_of_courses_viewed', 'annual_income', 'interaction_count', 'lead_score', 'converted']


In [155]:
# Create the pipeline

model = make_pipeline(
    ColumnTransformer([
        ('encoder', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
    ], remainder='passthrough'),
    LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
)

In [156]:
# Fit the model

model.fit(X_train, y_train)

0,1,2
,steps,"[('columntransformer', ...), ('logisticregression', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('encoder', ...)]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,42
,solver,'liblinear'
,max_iter,1000


In [157]:
# Predict on validation set

y_pred = model.predict(X_val)

In [158]:
# Calculate accuracy

acc = accuracy_score(y_val, y_pred)
print(round(acc, 2))

0.74


## Question 5

In [168]:
features_to_check = ['industry', 'employment_status', 'lead_score']
accuracy_diffs = {}

for feature in features_to_check:
    # Drop one feature
    X_train_reduced = X_train.drop(columns=[feature])
    X_val_reduced = X_val.drop(columns=[feature])

        # Update preprocessor for reduced features
    cat_cols_reduced = [c for c in categorical_cols if c in X_train_reduced.columns]
    preprocessor_reduced = ColumnTransformer(
        transformers=[('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols_reduced)],
        remainder='passthrough'
    )
    
    # Train new model
    model_reduced = Pipeline(steps=[
        ('preprocessor', preprocessor_reduced),
        ('classifier', LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42))
    ])
    
    model_reduced.fit(X_train_reduced, y_train)
    y_pred_reduced = model_reduced.predict(X_val_reduced)
    acc_reduced = accuracy_score(y_val, y_pred_reduced)
    
    # Compute difference
    accuracy_diffs[feature] = base_acc - acc_reduced

In [169]:
print("\nAccuracy differences (base - reduced):")
for feature, diff in accuracy_diffs.items():
    print(f"{feature}: {diff:.4f}")

least_useful = min(accuracy_diffs, key=accuracy_diffs.get)
print("\nLeast useful feature:", least_useful)


Accuracy differences (base - reduced):
industry: 0.0000
employment_status: -0.0034
lead_score: 0.0000

Least useful feature: employment_status


## Question 6

In [177]:
# Store results

C_values = [0.01, 0.1, 1, 10, 100]
accuracies = {}

In [178]:
# Loop through each C value
for c in C_values:
    model = make_pipeline(
        ColumnTransformer([
            ('encoder', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
        ], remainder='passthrough'),
        LogisticRegression(solver='liblinear', C=c, max_iter=1000, random_state=42)
    )

    # Fit model
    model.fit(X_train, y_train)

    # Predict
    y_pred = model.predict(X_val)

    # Calculate accuracy
    acc = accuracy_score(y_val, y_pred)
    accuracies[c] = round(acc, 3)
    print(f"C={c}: Validation Accuracy = {accuracies[c]}")

# Find best C
best_c = max(accuracies, key=accuracies.get)
print("\nBest C:", best_c, "with accuracy:", accuracies[best_c])

C=0.01: Validation Accuracy = 0.743
C=0.1: Validation Accuracy = 0.743
C=1: Validation Accuracy = 0.743
C=10: Validation Accuracy = 0.743
C=100: Validation Accuracy = 0.743

Best C: 0.01 with accuracy: 0.743
