# 03-classification hw

In [3]:
from pathlib import Path
from urllib.request import urlretrieve

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [2]:
# URL of the lead-scoring dataset used throughout this notebook.
data = 'https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv'

# Fetch the file only when it is not already on disk, so that re-running this
# cell (or Restart & Run All) does not download a second copy, and so the cell
# does not depend on the `wget` binary being installed.
csv_path = Path('course_lead_scoring.csv')
if not csv_path.exists():
    urlretrieve(data, csv_path)

--2025-10-13 08:32:57--  https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.110.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 80876 (79K) [text/plain]
Saving to: ‘course_lead_scoring.csv’


2025-10-13 08:32:57 (4,46 MB/s) - ‘course_lead_scoring.csv’ saved [80876/80876]



In [65]:
# Load the downloaded CSV into a DataFrame and preview its first rows.
csv_file = "course_lead_scoring.csv"
df = pd.read_csv(csv_file)
df.head()

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
0,paid_ads,,1,79450.0,unemployed,south_america,4,0.94,1
1,social_media,retail,1,46992.0,employed,south_america,1,0.8,0
2,events,healthcare,5,78796.0,unemployed,australia,3,0.69,1
3,paid_ads,retail,2,83843.0,,australia,1,0.87,0
4,referral,education,3,85012.0,self_employed,europe,3,0.62,1


In this dataset, the target for the classification task is the `converted` variable: whether or not the client has signed up to the platform.

## Data preparation

* Check whether there are missing values in the features.
* If there are missing values:
    * For categorical features, replace them with 'NA'
    * For numerical features, replace them with 0.0

In [66]:
# Normalise the header: lower-case every column name, then turn spaces into underscores.
lowercased = df.columns.str.lower()
df.columns = lowercased.str.replace(' ', '_')
df.columns

Index(['lead_source', 'industry', 'number_of_courses_viewed', 'annual_income',
       'employment_status', 'location', 'interaction_count', 'lead_score',
       'converted'],
      dtype='object')

In [67]:
# Number of missing values in each column.
df.isna().sum()

lead_source                 128
industry                    134
number_of_courses_viewed      0
annual_income               181
employment_status           100
location                     63
interaction_count             0
lead_score                    0
converted                     0
dtype: int64

In [68]:
# Split the columns into categorical and numerical lists by dtype and fill
# missing values along the way: 'NA' for categorical columns, 0.0 for
# numerical ones.
#
# NOTE: the target `converted` is an integer column, so it is collected into
# `numerical` together with the numeric features.
categorical = []
numerical = []

for col in df.columns:
    col_dtype = df[col].dtype
    has_missing = df[col].isnull().any()

    print(col)
    print(col_dtype)  # pandas dtype

    # Ask pandas whether the dtype is numeric instead of comparing against
    # `object`: text columns stored with pandas' dedicated string dtype are
    # not equal to `object` and would otherwise be treated as numeric and
    # filled with 0.
    if pd.api.types.is_numeric_dtype(col_dtype):
        numerical.append(col)
        if has_missing:
            df[col] = df[col].fillna(0.0)
            print('Column with missing values and numeric dtype')
    else:
        categorical.append(col)
        if has_missing:
            df[col] = df[col].fillna('NA')
            print('Column with missing values and string dtype')
    print('---')

lead_source
object
Column with missing values and string dtype
---
industry
object
Column with missing values and string dtype
---
number_of_courses_viewed
int64
---
annual_income
float64
Column with missing values and numeric dtype
---
employment_status
object
Column with missing values and string dtype
---
location
object
Column with missing values and string dtype
---
interaction_count
int64
---
lead_score
float64
---
converted
int64
---


In [69]:
# Count missing values per column again to confirm none are left.
df.isna().sum()

lead_source                 0
industry                    0
number_of_courses_viewed    0
annual_income               0
employment_status           0
location                    0
interaction_count           0
lead_score                  0
converted                   0
dtype: int64

#### Question 1

In [70]:
# Frequency of each value in the `industry` column, most common first.
df['industry'].value_counts()

industry
retail           203
finance          200
other            198
healthcare       187
education        187
technology       179
manufacturing    174
NA               134
Name: count, dtype: int64

#### Question 2
Correlation matrix for the numerical values

In [79]:
# Pairwise correlations between the columns listed in `numerical`,
# rounded to three decimal places.
numeric_frame = df[numerical]
corr_matrix = numeric_frame.corr().round(3)
print(corr_matrix)

                          number_of_courses_viewed  annual_income  \
number_of_courses_viewed                     1.000          0.010   
annual_income                                0.010          1.000   
interaction_count                           -0.024          0.027   
lead_score                                  -0.005          0.016   
converted                                    0.436          0.053   

                          interaction_count  lead_score  converted  
number_of_courses_viewed             -0.024      -0.005      0.436  
annual_income                         0.027       0.016      0.053  
interaction_count                     1.000       0.010      0.375  
lead_score                            0.010       1.000      0.194  
converted                             0.375       0.194      1.000  


### Split the data

In [80]:
from sklearn.model_selection import train_test_split

In [88]:
# First cut: hold out 20% of all rows as the test set.
df_full_train, df_test = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
)

# Second cut: 25% of the remaining 80% becomes the validation set,
# i.e. a 60/20/20 train/validation/test split overall.
df_train, df_val = train_test_split(
    df_full_train,
    test_size=0.25,
    random_state=42,
)

In [89]:
# Row counts of the train / validation / test splits.
tuple(len(split) for split in (df_train, df_val, df_test))

(876, 293, 293)

In [90]:
# Replace the shuffled row labels left by the split with a fresh 0..n-1 index.
df_train, df_val, df_test = (
    split.reset_index(drop=True) for split in (df_train, df_val, df_test)
)

In [91]:
# Move the target column out of each split: `pop` removes `converted` from the
# frame and returns it, and `.values` keeps it as a NumPy array.
y_train = df_train.pop('converted').values
y_val = df_val.pop('converted').values
y_test = df_test.pop('converted').values

#### Mutual information score

In [86]:
from sklearn.metrics import mutual_info_score

In [92]:
# Mutual information between each categorical feature and the training target.
for feature in categorical:
    mi = mutual_info_score(df_train[feature], y_train)
    print(feature, mi)

lead_source 0.03539624379726594
industry 0.011574521435657112
employment_status 0.012937677269442782
location 0.004464157884038034


#### One hot encoding