In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Course lead-scoring CSV hosted in the alexeygrigorev/datasets GitHub repository.
DATA_URL = "https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv"
df = pd.read_csv(DATA_URL)

In [4]:
# Preview the first five rows to sanity-check the load.
df.head(n=5)

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
0,paid_ads,,1,79450.0,unemployed,south_america,4,0.94,1
1,social_media,retail,1,46992.0,employed,south_america,1,0.8,0
2,events,healthcare,5,78796.0,unemployed,australia,3,0.69,1
3,paid_ads,retail,2,83843.0,,australia,1,0.87,0
4,referral,education,3,85012.0,self_employed,europe,3,0.62,1


In [5]:
# Count the NaNs in each column before any imputation is applied.
missing_per_column = df.isna().sum()
print("\nMissing values before handling:")
print(missing_per_column)


Missing values before handling:
lead_source                 128
industry                    134
number_of_courses_viewed      0
annual_income               181
employment_status           100
location                     63
interaction_count             0
lead_score                    0
converted                     0
dtype: int64


In [9]:
# Impute missing values: gaps in categorical columns become the literal
# string 'NA', gaps in numerical columns become 0.0.
catigorical_cols = ['lead_source', 'industry', 'employment_status', 'location']
numerical_cols = ['number_of_courses_viewed', 'annual_income', 'interaction_count', 'lead_score']

# Apply each placeholder to its own group of columns.
for cols, placeholder in ((catigorical_cols, 'NA'), (numerical_cols, 0.0)):
    df[cols] = df[cols].fillna(placeholder)

# Re-count the NaNs per column to confirm the imputation.
print(df.isnull().sum())

lead_source                 0
industry                    0
number_of_courses_viewed    0
annual_income               0
employment_status           0
location                    0
interaction_count           0
lead_score                  0
converted                   0
dtype: int64


In [14]:
# Q1
# Most frequent value of the `industry` column.
# Series.mode() returns a Series (one entry per tied value), so print the
# value(s) themselves instead of the Series repr with its index and dtype.
mode = df['industry'].mode()
print("Most frequent: ", ", ".join(mode.astype(str)))

Most frequent:  0    retail
Name: industry, dtype: object


In [16]:
# Q2
# Pairwise correlation matrix of the numerical features
# (DataFrame.corr() with its default method).
corr_matrix = df[numerical_cols].corr()

print(corr_matrix)

# Feature pairs whose correlation is reported individually.
pairs = [
    ('interaction_count', 'lead_score'),
    ('number_of_courses_viewed', 'lead_score'),
    ('number_of_courses_viewed', 'interaction_count'),
    ('annual_income', 'interaction_count'),
]

for first, second in pairs:
    # Scalar lookup of a single cell of the matrix.
    coefficient = corr_matrix.at[first, second]
    print(f"Correlation between {first} and {second}: {coefficient:.3f}")

                          number_of_courses_viewed  annual_income  \
number_of_courses_viewed                  1.000000       0.009770   
annual_income                             0.009770       1.000000   
interaction_count                        -0.023565       0.027036   
lead_score                               -0.004879       0.015610   

                          interaction_count  lead_score  
number_of_courses_viewed          -0.023565   -0.004879  
annual_income                      0.027036    0.015610  
interaction_count                  1.000000    0.009888  
lead_score                         0.009888    1.000000  
Correlation between interaction_count and lead_score: 0.010
Correlation between number_of_courses_viewed and lead_score: -0.005
Correlation between number_of_courses_viewed and interaction_count: -0.024
Correlation between annual_income and interaction_count: 0.027


In [18]:
# Split the data into train / validation / test parts.
from sklearn.model_selection import train_test_split

# Separate the target column from the feature columns.
target = 'converted'
X, y = df.drop(columns=[target]), df[target]

# Step 1: hold out 40% of the rows; the remaining 60% is the training set.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.4, random_state=42
)

# Step 2: cut the held-out part in half -> validation and test sets.
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42
)

In [20]:
# Q3
from sklearn.feature_selection import mutual_info_classif

categorical = ['lead_source', 'industry', 'employment_status', 'location']

# Replace every categorical value with its integer category code
# (training rows only).
X_train_encoded = X_train[categorical].apply(
    lambda column: column.astype('category').cat.codes
)

# Mutual information between each coded feature and the target; all
# features are declared discrete.
mi_scores = mutual_info_classif(X_train_encoded, y_train, discrete_features=True)

# Label the scores with their feature names, highest score first.
mi_results = pd.Series(mi_scores, index=categorical).sort_values(ascending=False)
print("Mutual Information scores:\n", mi_results.round(2))

Mutual Information scores:
 lead_source          0.03
employment_status    0.02
industry             0.02
location             0.00
dtype: float64


In [22]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# One-hot encode the training and validation feature matrices.
#
# Only the training frame decides which dummy columns exist, including which
# level `drop_first` removes. The validation frame is encoded with every level
# kept and then reindexed onto the training columns: levels unseen in training
# are discarded and columns absent from validation are filled with 0.
#
# Encoding validation independently with drop_first=True would drop *its own*
# first level; if that split lacked the training baseline level, rows of some
# other level would be zero-filled and silently read as the baseline.
X_train_enc = pd.get_dummies(X_train, drop_first=True)
X_val_enc = pd.get_dummies(X_val).reindex(columns=X_train_enc.columns, fill_value=0)

In [25]:
# Hyper-parameters of the logistic regression model.
logreg_params = dict(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
model = LogisticRegression(**logreg_params)

# Train on the encoded training set; as the cell's last expression, the
# fitted estimator is also displayed.
model.fit(X_train_enc, y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,42
,solver,'liblinear'
,max_iter,1000


In [26]:
# Q4
# Predict labels for the encoded validation set with the fitted model.
y_pred = model.predict(X_val_enc)

# Share of validation rows predicted correctly, reported to two decimals.
val_accuracy = accuracy_score(y_true=y_val, y_pred=y_pred)
print("Validation accuracy:", round(val_accuracy, 2))

Validation accuracy: 0.74


In [28]:
# Rebuild the one-hot encoded training and validation matrices.
#
# The training frame alone defines the dummy columns (and the level that
# `drop_first` removes). Validation is encoded with all levels kept and then
# reindexed onto the training columns, so unseen levels are discarded and
# missing columns are filled with 0. Encoding validation independently with
# drop_first=True could drop a different level than training did and make
# rows of that level look like the training baseline.
X_train_enc = pd.get_dummies(X_train, drop_first=True)
X_val_enc = pd.get_dummies(X_val).reindex(columns=X_train_enc.columns, fill_value=0)

In [29]:
# Q5
# Leave-one-feature-out check: retrain without each candidate feature and
# report how much validation accuracy changes relative to the full model.
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Baseline: model trained on every encoded feature.
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
model.fit(X_train_enc, y_train)

# Baseline validation accuracy.
y_pred = model.predict(X_val_enc)
baseline_acc = accuracy_score(y_val, y_pred)
print("Baseline accuracy:", baseline_acc)

features_to_test = ['industry', 'employment_status', 'lead_score']
differences = {}

for feature in features_to_test:
    if feature in X_train_enc.columns:
        # The feature survived encoding under its own name: drop that column.
        cols_to_drop = [feature]
    else:
        # The feature was one-hot encoded: its dummies are named
        # "<feature>_<level>". Matching on the separator avoids catching
        # other features that merely share the same leading characters.
        prefix = f"{feature}_"
        cols_to_drop = [col for col in X_train_enc.columns if col.startswith(prefix)]

    if not cols_to_drop:
        # Nothing to remove would silently re-score the baseline model.
        raise KeyError(f"No encoded columns found for feature {feature!r}")

    X_train_drop = X_train_enc.drop(columns=cols_to_drop)
    X_val_drop = X_val_enc.drop(columns=cols_to_drop)

    # Use dedicated names so the baseline `model` / `y_pred` stay intact and
    # cells that rely on them can still be re-run after this one.
    model_drop = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
    model_drop.fit(X_train_drop, y_train)

    y_pred_drop = model_drop.predict(X_val_drop)
    acc_drop = accuracy_score(y_val, y_pred_drop)

    # Positive: accuracy fell without the feature; negative: it rose.
    differences[feature] = baseline_acc - acc_drop

# Show differences
for f, diff in differences.items():
    print(f"{f}: {diff:.4f}")

Baseline accuracy: 0.7431506849315068
industry: 0.0000
employment_status: -0.0034
lead_score: 0.0000


In [30]:
# Rebuild the one-hot encoded training and validation matrices.
#
# The training frame alone defines the dummy columns (and the level that
# `drop_first` removes). Validation is encoded with all levels kept and then
# reindexed onto the training columns, so unseen levels are discarded and
# missing columns are filled with 0. Encoding validation independently with
# drop_first=True could drop a different level than training did and make
# rows of that level look like the training baseline.
X_train_enc = pd.get_dummies(X_train, drop_first=True)
X_val_enc = pd.get_dummies(X_val).reindex(columns=X_train_enc.columns, fill_value=0)

In [36]:
# Q6
# Sweep the regularization parameter C and record the validation accuracy
# (rounded to three decimals) obtained for each value.
C_values = [0.01, 0.1, 1, 10, 100]
results = {}

for C in C_values:
    model = LogisticRegression(
        solver='liblinear',
        C=C,
        max_iter=1000,
        random_state=42,
    )
    model.fit(X_train_enc, y_train)

    y_pred = model.predict(X_val_enc)
    acc = round(accuracy_score(y_val, y_pred), 3)
    results[C] = acc

# Report one line per C value, in the order they were tried.
for strength, score in results.items():
    print(f"C={strength}: Accuracy={score}")

C=0.01: Accuracy=0.74
C=0.1: Accuracy=0.743
C=1: Accuracy=0.743
C=10: Accuracy=0.743
C=100: Accuracy=0.743
