In [362]:
!wget https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv


--2025-10-14 10:01:36--  https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.111.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 80876 (79K) [text/plain]
Saving to: ‘course_lead_scoring.csv.1’


2025-10-14 10:01:36 (588 KB/s) - ‘course_lead_scoring.csv.1’ saved [80876/80876]



In [363]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import accuracy_score, mutual_info_score
import warnings
# Silence every warning category for the rest of the session.
warnings.filterwarnings('ignore')


In [364]:
# Load the lead-scoring CSV from the working directory and preview the frame.
csv_path = 'course_lead_scoring.csv'
df = pd.read_csv(csv_path)
df

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
0,paid_ads,,1,79450.0,unemployed,south_america,4,0.94,1
1,social_media,retail,1,46992.0,employed,south_america,1,0.80,0
2,events,healthcare,5,78796.0,unemployed,australia,3,0.69,1
3,paid_ads,retail,2,83843.0,,australia,1,0.87,0
4,referral,education,3,85012.0,self_employed,europe,3,0.62,1
...,...,...,...,...,...,...,...,...,...
1457,referral,manufacturing,1,,self_employed,north_america,4,0.53,1
1458,referral,technology,3,65259.0,student,europe,2,0.24,1
1459,paid_ads,technology,1,45688.0,student,north_america,3,0.02,1
1460,referral,,5,71016.0,self_employed,north_america,0,0.25,1


In [365]:
# Inspect the row index of the loaded frame.
df.index

RangeIndex(start=0, stop=1462, step=1)

In [366]:
# (rows, columns) of the loaded frame.
df.shape

(1462, 9)

In [367]:
# Per-column dtypes; used below to separate numerical from categorical columns.
df.dtypes

lead_source                  object
industry                     object
number_of_courses_viewed      int64
annual_income               float64
employment_status            object
location                     object
interaction_count             int64
lead_score                  float64
converted                     int64
dtype: object

In [368]:
# Partition the frame by dtype: int64/float64 columns are treated as numerical,
# object columns as categorical. Both results are DataFrames, not name lists.
NUMERICAL_DTYPES = ['int64', 'float64']
CATEGORICAL_DTYPES = ['object']

numerical_vars = df.select_dtypes(include=NUMERICAL_DTYPES)
categorical_vars = df.select_dtypes(include=CATEGORICAL_DTYPES)

In [369]:
# Show the column names that fell into each dtype group.
for heading, frame in (
    ("Numerical Variables:", numerical_vars),
    ("\nCategorical Variables:", categorical_vars),
):
    print(heading)
    print(frame.columns)

Numerical Variables:
Index(['number_of_courses_viewed', 'annual_income', 'interaction_count',
       'lead_score', 'converted'],
      dtype='object')

Categorical Variables:
Index(['lead_source', 'industry', 'employment_status', 'location'], dtype='object')


In [370]:
# Replace missing values: the string 'NA' for categorical columns, 0.0 for
# numerical ones.
# NOTE(review): numerical_vars was selected from the full frame, so the target
# column `converted` is among the columns filled with 0.0 here.
categorical_cols = list(categorical_vars.columns)
numerical_cols = list(numerical_vars.columns)

df[categorical_cols] = df[categorical_cols].fillna('NA')
df[numerical_cols] = df[numerical_cols].fillna(0.0)

In [371]:
# Most frequent value of `industry` (computed after missing values became 'NA').
industry_modes = df['industry'].mode()
industry_mode = industry_modes.iloc[0]
print(f"Q1 Answer - Mode of industry: {industry_mode}")


Q1 Answer - Mode of industry: retail


In [372]:
# Pairwise correlation between all numerical columns.
numeric_columns = numerical_vars.columns
corr_matrix = df.loc[:, numeric_columns].corr()

# Feature pairs whose correlation will be compared.
pairs_to_check = [
    ('interaction_count', 'lead_score'),
    ('number_of_courses_viewed', 'lead_score'),
    ('number_of_courses_viewed', 'interaction_count'),
    ('annual_income', 'interaction_count'),
]

In [373]:
# Running best for the correlation search: -1 sits below any absolute
# correlation, and no pair has been selected yet.
max_corr, max_pair = -1, None

In [374]:
# Scan the candidate pairs and keep the one with the largest absolute
# correlation; pairs that reference a non-numerical column are skipped.
known_numeric = numerical_vars.columns
for left, right in pairs_to_check:
    if left not in known_numeric or right not in known_numeric:
        continue
    strength = abs(corr_matrix.at[left, right])
    if strength > max_corr:
        max_corr, max_pair = strength, (left, right)

print(f"Answer - Pair with biggest correlation: {max_pair[0]} and {max_pair[1]}")


Answer - Pair with biggest correlation: annual_income and interaction_count


In [375]:
# Separate the target column from the feature columns.
y = df['converted']
X = df.drop(columns=['converted'])

In [376]:
# First split: 40% of the rows are held out, the remainder is the training set.
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=42)

In [377]:
# Second split: divide the held-out rows evenly into validation and test sets.
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

In [378]:
# Names of the object-dtype (categorical) columns in the training features.
train_dtypes = X_train.dtypes
categorical_features = train_dtypes[train_dtypes == 'object'].index.tolist()


In [379]:
# Mutual information between the target and each categorical training feature.
# Raw scores are stored in `mi_scores` so the ranking is not distorted: rounding
# before comparison can collapse distinct scores into a tie, which `max` would
# then resolve by column order instead of by the actual score.
mi_scores = {
    col: mutual_info_score(y_train, X_train[col])
    for col in categorical_features
}

# Two-decimal view, for reporting only.
mi_scores_rounded = {col: round(score, 2) for col, score in mi_scores.items()}


In [380]:
# Pick the feature whose stored mutual-information score is largest.
best_feature, _best_score = max(mi_scores.items(), key=lambda item: item[1])
max_mi_var = best_feature
print(f"Answer - Variable with biggest MI score: {max_mi_var}")


Answer - Variable with biggest MI score: lead_source


In [381]:
# Convert each row to a {column: value} dict, the input format DictVectorizer expects.
train_dicts, val_dicts = (
    frame.to_dict(orient='records') for frame in (X_train, X_val)
)


In [382]:
# One-hot encode the categorical values. The vectorizer is fitted on the
# training rows only and then reused unchanged for the validation rows.
dv = DictVectorizer(sparse=False)
dv.fit(train_dicts)
X_train_encoded = dv.transform(train_dicts)
X_val_encoded = dv.transform(val_dicts)


In [383]:
# Baseline classifier: logistic regression on the encoded training matrix.
baseline_params = dict(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
model = LogisticRegression(**baseline_params)
model.fit(X_train_encoded, y_train)


In [384]:
# Score the baseline model on the validation set; the unrounded value is kept
# for later comparisons and a two-decimal copy is made for reporting.
y_val_pred = model.predict(X_val_encoded)
baseline_accuracy = accuracy_score(y_true=y_val, y_pred=y_val_pred)
accuracy_rounded = round(baseline_accuracy, 2)


In [385]:
# Report the rounded validation accuracy of the baseline model.
print(f" Answer - Validation Accuracy: {accuracy_rounded}")

 Answer - Validation Accuracy: 0.74


In [386]:
# Accumulator for the per-feature accuracy deltas, plus the list of columns to
# remove one at a time.
feature_differences = {}
all_features = list(X_train.columns)


In [387]:
# Leave-one-feature-out: retrain with each column removed and record how far
# validation accuracy moves from the all-features baseline.
for feature in all_features:
    # Row dicts without the current feature
    train_dicts_without = X_train.drop(columns=[feature]).to_dict(orient='records')
    val_dicts_without = X_val.drop(columns=[feature]).to_dict(orient='records')

    # Fresh vectorizer, fitted on the reduced training rows only
    dv_without = DictVectorizer(sparse=False)
    X_train_encoded_without = dv_without.fit_transform(train_dicts_without)
    X_val_encoded_without = dv_without.transform(val_dicts_without)

    # Same hyperparameters as the baseline model
    model_without = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
    model_without.fit(X_train_encoded_without, y_train)

    # Positive difference means accuracy dropped when the feature was removed
    accuracy_without = accuracy_score(y_val, model_without.predict(X_val_encoded_without))
    feature_differences[feature] = baseline_accuracy - accuracy_without


In [388]:
# The feature whose removal moved validation accuracy the least, in absolute terms.
least_impact = min(feature_differences.items(), key=lambda item: abs(item[1]))
min_diff_feature = least_impact[0]
print(f" Answer - Feature with smallest difference: {min_diff_feature}")


 Answer - Feature with smallest difference: industry


In [389]:
# Regularisation strengths to try, and a mapping C -> validation accuracy to fill in.
C_values = [
    0.01,
    0.1,
    1,
    10,
    100,
]
c_results = dict()

In [390]:
# Fit one model per regularisation strength on the full encoded training matrix
# and store its validation accuracy rounded to three decimals.
for C in C_values:
    candidate = LogisticRegression(solver='liblinear', C=C, max_iter=1000, random_state=42)
    candidate.fit(X_train_encoded, y_train)

    candidate_accuracy = accuracy_score(y_val, candidate.predict(X_val_encoded))
    c_results[C] = round(candidate_accuracy, 3)

In [391]:
# Highest rounded accuracy; when several C values share it, take the smallest C.
best_accuracy = max(c_results.values())
tied_c_values = [c for c, acc in c_results.items() if acc == best_accuracy]
best_c = min(tied_c_values)


In [392]:
# Report the selected regularisation strength.
print(f"Answer - Best C value: {best_c}")


Answer - Best C value: 0.01
