In [2]:
import pandas as pd
import numpy as np

## Dataset

In [3]:
# download data

!wget https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv

--2025-10-19 02:28:57--  https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.110.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 80876 (79K) [text/plain]
Saving to: ‘course_lead_scoring.csv’


2025-10-19 02:28:58 (81.8 MB/s) - ‘course_lead_scoring.csv’ saved [80876/80876]



In [4]:
df = pd.read_csv('course_lead_scoring.csv')

In [5]:
df

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
0,paid_ads,,1,79450.0,unemployed,south_america,4,0.94,1
1,social_media,retail,1,46992.0,employed,south_america,1,0.80,0
2,events,healthcare,5,78796.0,unemployed,australia,3,0.69,1
3,paid_ads,retail,2,83843.0,,australia,1,0.87,0
4,referral,education,3,85012.0,self_employed,europe,3,0.62,1
...,...,...,...,...,...,...,...,...,...
1457,referral,manufacturing,1,,self_employed,north_america,4,0.53,1
1458,referral,technology,3,65259.0,student,europe,2,0.24,1
1459,paid_ads,technology,1,45688.0,student,north_america,3,0.02,1
1460,referral,,5,71016.0,self_employed,north_america,0,0.25,1


In [9]:
df.head().T

Unnamed: 0,0,1,2,3,4
lead_source,paid_ads,social_media,events,paid_ads,referral
industry,,retail,healthcare,retail,education
number_of_courses_viewed,1,1,5,2,3
annual_income,79450.0,46992.0,78796.0,83843.0,85012.0
employment_status,unemployed,employed,unemployed,,self_employed
location,south_america,south_america,australia,australia,europe
interaction_count,4,1,3,1,3
lead_score,0.94,0.8,0.69,0.87,0.62
converted,1,0,1,0,1


In [10]:
df.describe(include='all')

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
count,1334,1328,1462.0,1281.0,1362,1399,1462.0,1462.0,1462.0
unique,5,7,,,4,7,,,
top,organic_search,retail,,,self_employed,north_america,,,
freq,282,203,,,352,225,,,
mean,,,2.031464,59886.273224,,,2.976744,0.506108,0.619015
std,,,1.449717,15070.140389,,,1.681564,0.288465,0.485795
min,,,0.0,13929.0,,,0.0,0.0,0.0
25%,,,1.0,49698.0,,,2.0,0.2625,0.0
50%,,,2.0,60148.0,,,3.0,0.51,1.0
75%,,,3.0,69639.0,,,4.0,0.75,1.0


In [11]:
df.dtypes

lead_source                  object
industry                     object
number_of_courses_viewed      int64
annual_income               float64
employment_status            object
location                     object
interaction_count             int64
lead_score                  float64
converted                     int64
dtype: object

In [12]:
df.dtypes == 'object'

lead_source                  True
industry                     True
number_of_courses_viewed    False
annual_income               False
employment_status            True
location                     True
interaction_count           False
lead_score                  False
converted                   False
dtype: bool

In [13]:
df.isnull().sum()

lead_source                 128
industry                    134
number_of_courses_viewed      0
annual_income               181
employment_status           100
location                     63
interaction_count             0
lead_score                    0
converted                     0
dtype: int64

## Data preparation

- Check if the missing values are presented in the features.
- If there are missing values:
    - For categorical features, replace them with 'NA'
    - For numerical features, replace with with 0.0

In [14]:
df_filled = df.copy()

In [15]:
df_filled

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
0,paid_ads,,1,79450.0,unemployed,south_america,4,0.94,1
1,social_media,retail,1,46992.0,employed,south_america,1,0.80,0
2,events,healthcare,5,78796.0,unemployed,australia,3,0.69,1
3,paid_ads,retail,2,83843.0,,australia,1,0.87,0
4,referral,education,3,85012.0,self_employed,europe,3,0.62,1
...,...,...,...,...,...,...,...,...,...
1457,referral,manufacturing,1,,self_employed,north_america,4,0.53,1
1458,referral,technology,3,65259.0,student,europe,2,0.24,1
1459,paid_ads,technology,1,45688.0,student,north_america,3,0.02,1
1460,referral,,5,71016.0,self_employed,north_america,0,0.25,1


In [16]:
cat_cols = df_filled.select_dtypes(include=['object']).columns
df_filled[cat_cols] = df_filled[cat_cols].fillna('NA')

In [17]:
num_cols = df_filled.select_dtypes(include=['number']).columns
df_filled[num_cols] = df_filled[num_cols].fillna(0.0)

In [18]:
df_filled.isnull().sum()


lead_source                 0
industry                    0
number_of_courses_viewed    0
annual_income               0
employment_status           0
location                    0
interaction_count           0
lead_score                  0
converted                   0
dtype: int64

In [19]:
df_filled

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
0,paid_ads,,1,79450.0,unemployed,south_america,4,0.94,1
1,social_media,retail,1,46992.0,employed,south_america,1,0.80,0
2,events,healthcare,5,78796.0,unemployed,australia,3,0.69,1
3,paid_ads,retail,2,83843.0,,australia,1,0.87,0
4,referral,education,3,85012.0,self_employed,europe,3,0.62,1
...,...,...,...,...,...,...,...,...,...
1457,referral,manufacturing,1,0.0,self_employed,north_america,4,0.53,1
1458,referral,technology,3,65259.0,student,europe,2,0.24,1
1459,paid_ads,technology,1,45688.0,student,north_america,3,0.02,1
1460,referral,,5,71016.0,self_employed,north_america,0,0.25,1


## Question 1

What is the most frequent observation (mode) for the column industry?

- NA
- technology
- healthcare
- retail


Answer: retail

In [25]:
df_filled['industry'].mode()

0    retail
Name: industry, dtype: object

In [26]:
df_filled['industry'].value_counts(dropna=False)


industry
retail           203
finance          200
other            198
healthcare       187
education        187
technology       179
manufacturing    174
NA               134
Name: count, dtype: int64

## Question 2

Biggest correlation (1 point)


- interaction_count and lead_score
- number_of_courses_viewed and lead_score
- number_of_courses_viewed and interaction_count
- annual_income and interaction_count


Answer: annual_income and interaction_count

In [31]:
num_df_filled = df.select_dtypes(include=['number'])
num_df_filled.head()

Unnamed: 0,number_of_courses_viewed,annual_income,interaction_count,lead_score,converted
0,1,79450.0,4,0.94,1
1,1,46992.0,1,0.8,0
2,5,78796.0,3,0.69,1
3,2,83843.0,1,0.87,0
4,3,85012.0,3,0.62,1


In [32]:
corr_matrix = num_df_filled.corr()
corr_matrix

Unnamed: 0,number_of_courses_viewed,annual_income,interaction_count,lead_score,converted
number_of_courses_viewed,1.0,0.031551,-0.023565,-0.004879,0.435914
annual_income,0.031551,1.0,0.048618,0.005334,0.078256
interaction_count,-0.023565,0.048618,1.0,0.009888,0.374573
lead_score,-0.004879,0.005334,0.009888,1.0,0.193673
converted,0.435914,0.078256,0.374573,0.193673,1.0


In [33]:
from sklearn.model_selection import train_test_split

# Separate features and target
X = df_filled.drop(columns=['converted'])
y = df_filled['converted']

# First split: train + temp (60%) / (40%)
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=42)

# Second split: validation + test (20% each)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Check sizes
print(len(X_train), len(X_val), len(X_test))


877 292 293


## Question 3. Biggest MI (1 point)

- industry
- location
- lead_source
- employment_status


Answer: lead_source

In [35]:
from sklearn.metrics import mutual_info_score

In [36]:
# Categorical columns
cat_cols = ['industry', 'location', 'lead_source', 'employment_status']

# Compute MI for each categorical variable
for col in cat_cols:
    score = mutual_info_score(X_train[col], y_train)
    print(col, round(score, 2))

industry 0.02
location 0.0
lead_source 0.03
employment_status 0.02


## Question 4. Accuracy (1 point)

- 0.64
- 0.74
- 0.84
- 0.94


Answer: 0.74

In [37]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline

cat_cols = ['lead_source', 'industry', 'employment_status', 'location']
num_cols = ['number_of_courses_viewed', 'annual_income', 'interaction_count', 'lead_score']

# Preprocessing: One-hot encode categorical features
preprocessor = make_column_transformer(
    (OneHotEncoder(handle_unknown='ignore'), cat_cols),
    remainder='passthrough'
)

In [38]:
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)

pipeline = make_pipeline(preprocessor, model)

pipeline.fit(X_train, y_train)

0,1,2
,steps,"[('columntransformer', ...), ('logisticregression', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('onehotencoder', ...)]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,42
,solver,'liblinear'
,max_iter,1000


In [40]:
y_pred = pipeline.predict(X_val)
acc = accuracy_score(y_val, y_pred)
print(round(acc, 2))

0.74


## Question 5. Feature selection (1 point)

- 'industry'
- 'employment_status'
- 'lead_score'


Answer: 'industry'

In [41]:
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline

cat_cols = ['lead_source', 'industry', 'employment_status', 'location']
num_cols = ['number_of_courses_viewed', 'annual_income', 'interaction_count', 'lead_score']

features = cat_cols + num_cols

base_model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)

# Baseline pipeline
preprocessor = make_column_transformer(
    (OneHotEncoder(handle_unknown='ignore'), cat_cols),
    remainder='passthrough'
)

pipeline = make_pipeline(preprocessor, base_model)
pipeline.fit(X_train, y_train)
baseline_acc = accuracy_score(y_val, pipeline.predict(X_val))
print("Baseline:", baseline_acc)

# Test removing each feature
for f in features:
    reduced_X_train = X_train.drop(columns=[f])
    reduced_X_val = X_val.drop(columns=[f])

    preproc = make_column_transformer(
        (OneHotEncoder(handle_unknown='ignore'), [c for c in cat_cols if c in reduced_X_train.columns]),
        remainder='passthrough'
    )

    model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
    pipe = make_pipeline(preproc, model)
    pipe.fit(reduced_X_train, y_train)
    acc = accuracy_score(y_val, pipe.predict(reduced_X_val))

    diff = baseline_acc - acc
    print(f"{f}: {diff:.4f}")


Baseline: 0.7431506849315068
lead_source: 0.0137
industry: 0.0000
employment_status: -0.0034
location: 0.0000
number_of_courses_viewed: 0.0651
annual_income: -0.1130
interaction_count: 0.0685
lead_score: 0.0000


## Question 6. Parameter tuning (1 point)

- 0.01
- 0.1
- 1
- 10
- 100


Answer: C = 0.01

In [43]:
import pandas as pd


from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [44]:
df_full_train, df_test = train_test_split(df_filled, test_size=0.2, random_state=42)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=42) 

In [45]:
y_train = df_train.converted
y_val = df_val.converted

In [46]:
categorical = ['lead_source', 'industry', 'employment_status', 'location']

In [47]:
df_train_encoded = pd.get_dummies(df_train, columns=categorical, drop_first=True)
df_val_encoded = pd.get_dummies(df_val, columns=categorical, drop_first=True)

In [48]:
df_val_encoded = df_val_encoded.reindex(columns=df_train_encoded.columns, fill_value=0)

In [49]:
C_values = [0.01, 0.1, 1, 10, 100]

In [50]:
accuracies = {}

for C in C_values:
    model = LogisticRegression(solver='liblinear', C=C, max_iter=1000, random_state=42)
    model.fit(df_train_encoded, y_train)
    y_pred = model.predict(df_val_encoded)
    acc = accuracy_score(y_val, y_pred)
    accuracies[C] = round(acc, 3)
    print(f"C={C:<6} → Validation Accuracy = {acc:.3f}")

C=0.01   → Validation Accuracy = 0.768
C=0.1    → Validation Accuracy = 0.758
C=1      → Validation Accuracy = 0.758
C=10     → Validation Accuracy = 0.758
C=100    → Validation Accuracy = 0.758


In [52]:
best_C = max(accuracies, key=accuracies.get)
print("\n Best C:", best_C)
print(" Validation Accuracies:", accuracies)


 Best C: 0.01
 Validation Accuracies: {0.01: 0.768, 0.1: 0.758, 1: 0.758, 10: 0.758, 100: 0.758}
