In [1]:
import pandas as pd
import numpy as np
from numpy.linalg import inv

In [2]:
df = pd.read_csv("https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv")

In [3]:
df.head()

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
0,paid_ads,,1,79450.0,unemployed,south_america,4,0.94,1
1,social_media,retail,1,46992.0,employed,south_america,1,0.8,0
2,events,healthcare,5,78796.0,unemployed,australia,3,0.69,1
3,paid_ads,retail,2,83843.0,,australia,1,0.87,0
4,referral,education,3,85012.0,self_employed,europe,3,0.62,1


In [4]:
df.isna().sum()

lead_source                 128
industry                    134
number_of_courses_viewed      0
annual_income               181
employment_status           100
location                     63
interaction_count             0
lead_score                    0
converted                     0
dtype: int64

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1462 entries, 0 to 1461
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   lead_source               1334 non-null   object 
 1   industry                  1328 non-null   object 
 2   number_of_courses_viewed  1462 non-null   int64  
 3   annual_income             1281 non-null   float64
 4   employment_status         1362 non-null   object 
 5   location                  1399 non-null   object 
 6   interaction_count         1462 non-null   int64  
 7   lead_score                1462 non-null   float64
 8   converted                 1462 non-null   int64  
dtypes: float64(2), int64(3), object(4)
memory usage: 102.9+ KB


In [6]:
# select categorical features
categorical = df.select_dtypes(include=['object']).columns.tolist()
categorical

['lead_source', 'industry', 'employment_status', 'location']

In [7]:
# select numerical features
numerical = df.select_dtypes(include=['int64', 'float64']).columns.tolist()
numerical

['number_of_courses_viewed',
 'annual_income',
 'interaction_count',
 'lead_score',
 'converted']

In [8]:
data = df.copy()

In [9]:
data[categorical] = data[categorical].fillna('NA')

In [10]:
data[numerical] = data[numerical].fillna(0.0)

In [11]:
data.isna().sum()

lead_source                 0
industry                    0
number_of_courses_viewed    0
annual_income               0
employment_status           0
location                    0
interaction_count           0
lead_score                  0
converted                   0
dtype: int64

### Q1

In [12]:
data['industry'].mode()

0    retail
Name: industry, dtype: object

### Q2

In [13]:
data[numerical].corr()

Unnamed: 0,number_of_courses_viewed,annual_income,interaction_count,lead_score,converted
number_of_courses_viewed,1.0,0.00977,-0.023565,-0.004879,0.435914
annual_income,0.00977,1.0,0.027036,0.01561,0.053131
interaction_count,-0.023565,0.027036,1.0,0.009888,0.374573
lead_score,-0.004879,0.01561,0.009888,1.0,0.193673
converted,0.435914,0.053131,0.374573,0.193673,1.0


In [14]:
corrs = [
    data[numerical].corr()['interaction_count']['lead_score'],
    data[numerical].corr()['number_of_courses_viewed']['lead_score'],
    data[numerical].corr()['number_of_courses_viewed']['interaction_count'],
    data[numerical].corr()['annual_income']['interaction_count']
]

In [15]:
corrs

[np.float64(0.009888182496913131),
 np.float64(-0.004878998354681276),
 np.float64(-0.023565222882888037),
 np.float64(0.02703647240481443)]

### Q3

In [16]:
from sklearn.model_selection import train_test_split

In [17]:
# First split: train (60%) and temp (40%)
train_data, temp_data = train_test_split(data, test_size=0.4, random_state=42)

# Second split: val (20%) and test (20%) from temp (which is 40% of original)
val_data, test_data = train_test_split(temp_data, test_size=0.5, random_state=42)

# Check the sizes
print(f"Train: {len(train_data)}, Val: {len(val_data)}, Test: {len(test_data)}")

Train: 877, Val: 292, Test: 293


In [18]:
# Remove the target column from each split to get features
X_train = train_data.drop(columns=['converted'])
X_val = val_data.drop(columns=['converted'])
X_test = test_data.drop(columns=['converted'])

# Extract the target column from each split
y_train = train_data['converted']
y_val = val_data['converted']
y_test = test_data['converted']

In [19]:
from sklearn.metrics import mutual_info_score

In [20]:
for col in categorical:
    score = mutual_info_score(X_train[col], y_train)
    print(col, round(score,2))

lead_source 0.03
industry 0.02
employment_status 0.02
location 0.0


### Q4

In [21]:
from sklearn.preprocessing import OneHotEncoder

In [22]:
onehot = OneHotEncoder(handle_unknown='ignore', sparse_output=False)

In [23]:
X_train_cat  = onehot.fit_transform(X_train[categorical])

In [24]:
numerical = [col for col in numerical if col != 'converted']

In [25]:
ohe_feature_names = onehot.get_feature_names_out(categorical)
all_feature_names = list(numerical) + list(ohe_feature_names)

In [26]:
X_train_transformed_df = pd.DataFrame(
    np.hstack([X_train[numerical], X_train_cat]),
    columns=all_feature_names,
    index=X_train.index  # keep the same index for easy reference
)

In [27]:
X_val_cat = onehot.transform(X_val[categorical])

X_val_transformed_df = pd.DataFrame(
    np.hstack([X_val[numerical], X_val_cat]),
    columns=all_feature_names,
    index=X_val.index
)

X_test_cat = onehot.transform(X_test[categorical])

X_test_transformed_df = pd.DataFrame(
    np.hstack([X_test[numerical], X_test_cat]),
    columns=all_feature_names,
    index=X_test.index
)

In [28]:
from sklearn.linear_model import LogisticRegression

In [29]:
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)

In [30]:
model.fit(X_train_transformed_df, y_train)

In [31]:
preds_val = model.predict(X_val_transformed_df)

In [32]:
from sklearn.metrics import accuracy_score

In [33]:
original_accuracy = round(accuracy_score(y_val, preds_val), 2)
original_accuracy

0.74

### Q5

In [34]:
accuracy_scores_drop = {}

for col in X_train_transformed_df.columns:
    
    data_train = X_train_transformed_df.copy()
    data_train.drop(columns=[col], inplace=True)
    
    model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
    model.fit(data_train, y_train)
    
    data_val = X_val_transformed_df.copy()
    data_val.drop(columns=[col], inplace=True)
    
    preds_val = model.predict(data_val)
    
    accuracy_drop = accuracy_score(y_val, preds_val)
    accuracy_scores_drop[col] = original_accuracy - accuracy_drop
    
    

In [35]:
accuracy_scores_drop = {}

for col in X_train.columns:
    
    X_train_df = X_train.copy()
    X_train_df = X_train_df.drop(columns=[col])
    categorical = X_train_df.select_dtypes(include=['object']).columns.tolist()
    
    onehot = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
    X_train_cat  = onehot.fit_transform(X_train_df[categorical])
    
    ohe_feature_names = onehot.get_feature_names_out(categorical)
    all_feature_names = list(numerical) + list(ohe_feature_names)
    
    X_train_transformed_df = pd.DataFrame(
        np.hstack([X_train[numerical], X_train_cat]),
        columns=all_feature_names,
        index=X_train.index  # keep the same index for easy reference
    )
     
    model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
    model.fit(X_train_transformed_df, y_train)
    
    X_val_df = X_val.copy()
    X_val_df = X_val_df.drop(columns=[col])
    X_val_cat = onehot.transform(X_val_df[categorical])

    X_val_transformed_df = pd.DataFrame(
        np.hstack([X_val[numerical], X_val_cat]),
        columns=all_feature_names,
        index=X_val.index
    )

    preds_val = model.predict(X_val_transformed_df)
    
    accuracy_drop = accuracy_score(y_val, preds_val)
    accuracy_scores_drop[col] = original_accuracy - accuracy_drop
    

In [36]:
pd.Series(accuracy_scores_drop).abs().sort_values().head()

industry                    0.003151
number_of_courses_viewed    0.003151
annual_income               0.003151
location                    0.003151
lead_score                  0.003151
dtype: float64

### Q6

In [37]:
for reg_val in [0.01, 0.1, 1, 10, 100]:
    
    model = LogisticRegression(solver='liblinear', C=reg_val, max_iter=1000, random_state=42)
    model.fit(X_train_transformed_df, y_train)
    
    preds_val = model.predict(X_val_transformed_df)
    
    val_accuracy = round(accuracy_score(y_val, preds_val), 3)
    print(f"C={reg_val}: {val_accuracy}")

C=0.01: 0.743
C=0.1: 0.743
C=1: 0.743
C=10: 0.743
C=100: 0.743
