In [1]:
!wget https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv

--2025-10-15 20:42:46--  https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.110.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 80876 (79K) [text/plain]
Saving to: ‘course_lead_scoring.csv’


2025-10-15 20:42:46 (29.6 MB/s) - ‘course_lead_scoring.csv’ saved [80876/80876]



In [12]:
import pandas as pd

df = pd.read_csv("course_lead_scoring.csv")
df.head()

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
0,paid_ads,,1,79450.0,unemployed,south_america,4,0.94,1
1,social_media,retail,1,46992.0,employed,south_america,1,0.8,0
2,events,healthcare,5,78796.0,unemployed,australia,3,0.69,1
3,paid_ads,retail,2,83843.0,,australia,1,0.87,0
4,referral,education,3,85012.0,self_employed,europe,3,0.62,1


In [13]:
print(df.isna().sum())
print()
print(df.dtypes)


lead_source                 128
industry                    134
number_of_courses_viewed      0
annual_income               181
employment_status           100
location                     63
interaction_count             0
lead_score                    0
converted                     0
dtype: int64

lead_source                  object
industry                     object
number_of_courses_viewed      int64
annual_income               float64
employment_status            object
location                     object
interaction_count             int64
lead_score                  float64
converted                     int64
dtype: object


In [14]:
num_cols = df.select_dtypes(include=['int64', 'float64']).columns
print("Numerical columns:", num_cols.tolist())
cat_cols = df.select_dtypes(include=['object']).columns
print("Categorical columns:", cat_cols.tolist())

df[num_cols] = df[num_cols].fillna(0)
df[cat_cols] = df[cat_cols].fillna('NA')

df.head()

Numerical columns: ['number_of_courses_viewed', 'annual_income', 'interaction_count', 'lead_score', 'converted']
Categorical columns: ['lead_source', 'industry', 'employment_status', 'location']


Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
0,paid_ads,,1,79450.0,unemployed,south_america,4,0.94,1
1,social_media,retail,1,46992.0,employed,south_america,1,0.8,0
2,events,healthcare,5,78796.0,unemployed,australia,3,0.69,1
3,paid_ads,retail,2,83843.0,,australia,1,0.87,0
4,referral,education,3,85012.0,self_employed,europe,3,0.62,1


In [15]:
df.isna().sum()

lead_source                 0
industry                    0
number_of_courses_viewed    0
annual_income               0
employment_status           0
location                    0
interaction_count           0
lead_score                  0
converted                   0
dtype: int64

In [16]:
# Q1

# mode for column 'industry'
mode_industry = df['industry'].mode()[0]
print("Mode of 'industry':", mode_industry)

Mode of 'industry': retail


In [17]:
# Q2

# correlation between numerical columns
corr_cols = ['interaction_count', 'lead_score', 'number_of_courses_viewed', 'annual_income']
correlation_matrix = df[corr_cols].corr()
print("Correlation matrix:\n", correlation_matrix)
# Which pair of numerical features has the highest correlation?
import numpy as np
# Set the diagonal and lower triangle to NaN to avoid self-correlation and duplicate pairs
corr_matrix_upper = correlation_matrix.where(np.triu(np.ones(correlation_matrix.shape), k=1).astype(bool))
# Unstack and drop NaN values
corr_pairs = corr_matrix_upper.unstack().dropna()
# Find the pair with the maximum correlation
max_corr_pair = corr_pairs.idxmax()
max_corr_value = corr_pairs.max()
print(f"The pair of numerical features with the highest correlation is: {max_corr_pair} with a correlation of {max_corr_value}")

Correlation matrix:
                           interaction_count  lead_score  \
interaction_count                  1.000000    0.009888   
lead_score                         0.009888    1.000000   
number_of_courses_viewed          -0.023565   -0.004879   
annual_income                      0.027036    0.015610   

                          number_of_courses_viewed  annual_income  
interaction_count                        -0.023565       0.027036  
lead_score                               -0.004879       0.015610  
number_of_courses_viewed                  1.000000       0.009770  
annual_income                             0.009770       1.000000  
The pair of numerical features with the highest correlation is: ('annual_income', 'interaction_count') with a correlation of 0.027036472404814417


In [18]:
# Split the data into train/val/test sets (60%/20%/20%)
# Make sure that the target value 'converted' is not in your dataframe.
from sklearn.model_selection import train_test_split

df_copy = df.copy()
df_X = df_copy.drop(columns=['converted'])
df_y = df_copy['converted']

df_X_train, df_X_test, df_y_train, df_y_test = train_test_split(df_X, df_y, test_size=0.2, random_state=42)
df_X_train, df_X_val, df_y_train, df_y_val = train_test_split(df_X_train, df_y_train, test_size=0.25, random_state=42) # 0.25 x 0.8 = 0.2
print(len(df_X_train), len(df_X_val), len(df_X_test))





876 293 293


In [22]:
#Q3

# Calculate mutual information between each categorical feature and the target ('converted')
from sklearn.feature_selection import mutual_info_classif

# encode categorical columns to integer codes
X_train_cat = df_X_train[cat_cols].apply(lambda s: s.astype('category').cat.codes)

# compute mutual information (features are discrete)
mutual_info = mutual_info_classif(X_train_cat, df_y_train, discrete_features=True, random_state=42)

# format and print results
import pandas as pd
mi_series = pd.Series(mutual_info, index=cat_cols).sort_values(ascending=False).round(2)
print("Mutual information scores:\n", mi_series)


Mutual information scores:
 lead_source          0.04
employment_status    0.01
industry             0.01
location             0.00
dtype: float64


In [24]:
list(cat_cols)

['lead_source', 'industry', 'employment_status', 'location']

In [25]:
# Q4

# convert categorical columns using one-hot encoding
df_X_train_enc = pd.get_dummies(df_X_train, columns=list(cat_cols), drop_first=True)
df_X_val_enc = pd.get_dummies(df_X_val, columns=list(cat_cols), drop_first=True)
df_X_test_enc = pd.get_dummies(df_X_test, columns=list(cat_cols), drop_first=True)

# Align the train, validation, and test sets
df_X_val_enc = df_X_val_enc.reindex(columns=df_X_train_enc.columns, fill_value=0)
df_X_test_enc = df_X_test_enc.reindex(columns=df_X_train_enc.columns, fill_value=0)


In [26]:
# Fit a logistic regression model: model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
model.fit(df_X_train_enc, df_y_train)
# Evaluate the model on the validation set using accuracy score
from sklearn.metrics import accuracy_score
df_y_val_pred = model.predict(df_X_val_enc)
val_accuracy = accuracy_score(df_y_val, df_y_val_pred)
print("Validation Accuracy:", round(val_accuracy, 2))



Validation Accuracy: 0.7


In [29]:
val_accuracy

0.6996587030716723

In [33]:
# Q5

# Least useful feature using feature elimination
# from sklearn.feature_selection import RFE
# selector = RFE(model, n_features_to_select=1)
# selector = selector.fit(df_X_train_enc, df_y_train)
# least_important_feature = df_X_train_enc.columns[~selector.support_][0]
# print("Least important feature:", least_important_feature)
cols_dropped = ['industry','employment_status','lead_score']
for col in cols_dropped:
    # Restore the original encoded dataframes for the next iteration
    df_X_train_enc_dropped = df_X_train_enc.copy()
    df_X_val_enc_dropped = df_X_val_enc.copy()
    df_X_test_enc_dropped = df_X_test_enc.copy()
    # Drop the least important feature
    if col in df_X_train_enc.columns:
        df_X_train_enc_dropped = df_X_train_enc.drop(columns=[col])
    if col in df_X_val_enc.columns:
        df_X_val_enc_dropped = df_X_val_enc.drop(columns=[col])
    if col in df_X_test_enc.columns:
        df_X_test_enc_dropped = df_X_test_enc.drop(columns=[col])
        
    model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
    model.fit(df_X_train_enc_dropped, df_y_train)
    # Evaluate the model on the validation set using accuracy score
    from sklearn.metrics import accuracy_score
    df_y_val_pred_dropped = model.predict(df_X_val_enc_dropped)
    val_accuracy_dropped = accuracy_score(df_y_val, df_y_val_pred_dropped)
    print("Validation Accuracy:", val_accuracy_dropped)

    print("Difference in accuracy after dropping the least important feature:", round(val_accuracy - val_accuracy_dropped, 3))

    # Restore the original encoded dataframes for the next iteration
    df_X_train_enc_dropped = df_X_train_enc.copy()
    df_X_val_enc_dropped = df_X_val_enc.copy()
    df_X_test_enc_dropped = df_X_test_enc.copy()

Validation Accuracy: 0.6996587030716723
Difference in accuracy after dropping the least important feature: 0.0
Validation Accuracy: 0.6996587030716723
Difference in accuracy after dropping the least important feature: 0.0
Validation Accuracy: 0.7064846416382252
Difference in accuracy after dropping the least important feature: -0.007


In [36]:
#Q6

# train a regularized logistic regression model with C: [0.01, 0.1, 1, 10, 100]
C_values = [0.01, 0.1, 1, 10, 100]


best_C = None
best_val_accuracy = 0
for C in C_values:
    model = LogisticRegression(solver='liblinear', C=C, max_iter=1000, random_state=42)
    model.fit(df_X_train_enc, df_y_train)
    df_y_val_pred = model.predict(df_X_val_enc)
    val_accuracy = accuracy_score(df_y_val, df_y_val_pred)
    print(f"Validation Accuracy for C={C}: {round(val_accuracy, 3)}")
    if val_accuracy > best_val_accuracy:
        best_val_accuracy = val_accuracy
        best_C = C
print("Best Validation Accuracy:", round(best_val_accuracy, 3))
print("Best C:", best_C)


Validation Accuracy for C=0.01: 0.7
Validation Accuracy for C=0.1: 0.7
Validation Accuracy for C=1: 0.7
Validation Accuracy for C=10: 0.7
Validation Accuracy for C=100: 0.7
Best Validation Accuracy: 0.7
Best C: 0.01
