In [47]:
import pandas as pd
import numpy as np
import seaborn as sns
import os

import matplotlib.pyplot as plt

In [48]:
data = 'https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv'


In [5]:
!wget $data 

--2025-10-16 09:31:45--  https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.110.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 80876 (79K) [text/plain]
Saving to: 'course_lead_scoring.csv.4'

     0K .......... .......... .......... .......... .......... 63% 2.76K 11s
    50K .......... .......... ........                        100% 9.12K=21s

2025-10-16 09:32:14 (3.70 KB/s) - 'course_lead_scoring.csv.4' saved [80876/80876]



In [6]:
df = pd.read_csv('course_lead_scoring.csv.4')
df.head()

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
0,paid_ads,,1,79450.0,unemployed,south_america,4,0.94,1
1,social_media,retail,1,46992.0,employed,south_america,1,0.8,0
2,events,healthcare,5,78796.0,unemployed,australia,3,0.69,1
3,paid_ads,retail,2,83843.0,,australia,1,0.87,0
4,referral,education,3,85012.0,self_employed,europe,3,0.62,1


In [7]:
df.dtypes

lead_source                  object
industry                     object
number_of_courses_viewed      int64
annual_income               float64
employment_status            object
location                     object
interaction_count             int64
lead_score                  float64
converted                     int64
dtype: object

In [9]:
numerical_vars = df.select_dtypes(include=['int64', 'float64'])
categorical_vars = df.select_dtypes(include=['object'])

In [10]:
print("Numerical Variables:")
print(numerical_vars.columns)

print("\nCategorical Variables:")
print(categorical_vars.columns)

Numerical Variables:
Index(['number_of_courses_viewed', 'annual_income', 'interaction_count',
       'lead_score', 'converted'],
      dtype='object')

Categorical Variables:
Index(['lead_source', 'industry', 'employment_status', 'location'], dtype='object')


In [11]:
for col in categorical_vars.columns:
    df[col] = df[col].fillna('NA')

for col in numerical_vars.columns:
    df[col] = df[col].fillna(0.0)

In [12]:
industry_mode = df['industry'].mode()[0]
print(f"Q1 Answer - Mode of industry: {industry_mode}")

Q1 Answer - Mode of industry: retail


In [22]:
corr_matrix = df[numerical_vars.columns].corr()

#specified pairs
pairs_to_check = [
   ('interaction_count', 'lead_score'),
   ('number_of_courses_viewed', 'lead_score'),
   ('number_of_courses_viewed', 'interaction_count'),
   ('annual_income', 'interaction_count')
]

In [34]:
max_corr = 0
max_pair = None

for feat1, feat2 in pairs_to_check:
    if feat1 in numerical_vars.columns and feat2 in numerical_vars.columns:
      corr_value = abs(corr_matrix.loc[feat1, feat2])
    if corr_value > max_corr:
        max_corr = corr_value
        max_pair = (feat1, feat2)
print(f"Answer - Pair with biggest correlation: {max_pair[0]} and {max_pair[1]}")
    

Answer - Pair with biggest correlation: annual_income and interaction_count


In [54]:
from sklearn.model_selection import train_test_split

X = df.drop(columns=['converted'])  
y = df['converted']

X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=42)

X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

print(f"Train set: {X_train.shape}")
print(f"Validation set: {X_val.shape}")
print(f"Test set: {X_test.shape}")


Train set: (877, 8)
Validation set: (292, 8)
Test set: (293, 8)


In [56]:
from sklearn.feature_selection import mutual_info_classif
from sklearn.preprocessing import LabelEncoder

categorical_features = ['industry', 'location', 'lead_source', 'employment_status']


X_train_encoded = X_train[categorical_features].apply(LabelEncoder().fit_transform)


mi_scores = mutual_info_classif(X_train_encoded, y_train, discrete_features=True)
                                
mi_dict = {feature: round(score, 2) for feature, score in zip(categorical_features, mi_scores)}


for feature, score in mi_dict.items():
    print(f"{feature}: {score}")


max_feature = max(mi_dict, key=mi_dict.get)
print(f"Answer = with biggest MI score: {max_feature}")


industry: 0.02
location: 0.0
lead_source: 0.03
employment_status: 0.02
Answer = with biggest MI score: lead_source


In [60]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score

categorical_cols = X_train.select_dtypes(include=['object', 'category']).columns.tolist()
numerical_cols = X_train.select_dtypes(include=['int64', 'float64']).columns.tolist()

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
    ],
    remainder='passthrough'  
)

model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)

pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', model)
])

pipeline.fit(X_train, y_train)

y_pred = pipeline.predict(X_val)

accuracy = accuracy_score(y_val, y_pred)
print("Validation Accuracy:", round(accuracy, 2))

Validation Accuracy: 0.74


In [63]:
from sklearn.preprocessing import OneHotEncoder


model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)

features_to_test = ['industry', 'employment_status', 'lead_score']

categorical_cols = X_train.select_dtypes(include=['object', 'category']).columns.tolist()
numerical_cols = X_train.select_dtypes(include=['int64', 'float64']).columns.tolist()
preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
], remainder='passthrough')

pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', model)
])

pipeline.fit(X_train, y_train)
baseline_accuracy = accuracy_score(y_val, pipeline.predict(X_val))

differences = {}

for feature in features_to_test:
    X_train_drop = X_train.drop(columns=[feature])
    X_val_drop = X_val.drop(columns=[feature])
    
    cat_cols = X_train_drop.select_dtypes(include=['object', 'category']).columns.tolist()
    
    preprocessor_drop = ColumnTransformer([
        ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols)
    ], remainder='passthrough')
    
    pipeline_drop = Pipeline([
        ('preprocessor', preprocessor_drop),
        ('classifier', model)
    ])
    
    pipeline_drop.fit(X_train_drop, y_train)
    accuracy_drop = accuracy_score(y_val, pipeline_drop.predict(X_val_drop))
    differences[feature] = baseline_accuracy - accuracy_drop
    
for feature, diff in differences.items():
    print(f"{feature}: {diff:.4f}")

# Identify least useful feature
least_impactful = min(differences, key=lambda k: abs(differences[k]))
print(f"\nLeast useful feature: {least_impactful}")

industry: 0.0000
employment_status: -0.0034
lead_score: 0.0000

Least useful feature: industry


In [67]:
categorical_cols = X_train.select_dtypes(include=['object', 'category']).columns.tolist()

preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
], remainder='passthrough')

C_values = [0.01, 0.1, 1, 10, 100]
accuracies = {}

for C in C_values:
    model = LogisticRegression(solver='liblinear', C=C, max_iter=1000, random_state=42)
    
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('classifier', model)
    ])
    
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_val)
    acc = round(accuracy_score(y_val, y_pred), 3)
    accuracies[C] = acc
    print(f"C={C}: Accuracy={acc}")

best_C = max(accuracies, key=accuracies.get)
print(f"Answer C value: {best_C} with accuracy {accuracies[best_C]}")


C=0.01: Accuracy=0.743
C=0.1: Accuracy=0.743
C=1: Accuracy=0.743
C=10: Accuracy=0.743
C=100: Accuracy=0.743
Answer C value: 0.01 with accuracy 0.743
