# **ML on Classification: To determine customer conversion via course lead scoring**

#### In this task, the target for the classification task is the `converted` variable: whether or not the client has signed up to the platform.

In [92]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

## 1. Data preparation

In [93]:
df = pd.read_csv("course_lead_scoring.csv")

In [94]:
df.head()

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
0,paid_ads,,1,79450.0,unemployed,south_america,4,0.94,1
1,social_media,retail,1,46992.0,employed,south_america,1,0.8,0
2,events,healthcare,5,78796.0,unemployed,australia,3,0.69,1
3,paid_ads,retail,2,83843.0,,australia,1,0.87,0
4,referral,education,3,85012.0,self_employed,europe,3,0.62,1


In [95]:
df.dtypes

lead_source                  object
industry                     object
number_of_courses_viewed      int64
annual_income               float64
employment_status            object
location                     object
interaction_count             int64
lead_score                  float64
converted                     int64
dtype: object

In [96]:
# Names in the raw data are not always formatted uniformly, so both the column
# names and the string values are lowercased and have spaces swapped for underscores.

df.columns = df.columns.str.replace(' ', '_').str.lower()

# Columns stored with the object dtype are the string (categorical) ones.
object_mask = df.dtypes == 'object'
categorical_columns = object_mask[object_mask].index.tolist()

for c in categorical_columns:
    lowered = df[c].str.lower()
    df[c] = lowered.str.replace(' ', '_')

In [97]:
df.isnull().sum()

lead_source                 128
industry                    134
number_of_courses_viewed      0
annual_income               181
employment_status           100
location                     63
interaction_count             0
lead_score                    0
converted                     0
dtype: int64

In [98]:
df.head()

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
0,paid_ads,,1,79450.0,unemployed,south_america,4,0.94,1
1,social_media,retail,1,46992.0,employed,south_america,1,0.8,0
2,events,healthcare,5,78796.0,unemployed,australia,3,0.69,1
3,paid_ads,retail,2,83843.0,,australia,1,0.87,0
4,referral,education,3,85012.0,self_employed,europe,3,0.62,1


In [99]:
# annual_income is the numerical column with missing values; replace them with 0.0.

df['annual_income'] = df['annual_income'].fillna(0.0)

In [100]:
df.annual_income.isnull().sum()

np.int64(0)

In [101]:
# Missing values in the categorical columns (lead_source, industry,
# employment_status, location) become an explicit 'NA' category.

categorical_columns = [name for name, dtype in df.dtypes.items() if dtype == 'object']
df[categorical_columns] = df[categorical_columns].fillna(value='NA')

In [102]:
df.head()

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
0,paid_ads,,1,79450.0,unemployed,south_america,4,0.94,1
1,social_media,retail,1,46992.0,employed,south_america,1,0.8,0
2,events,healthcare,5,78796.0,unemployed,australia,3,0.69,1
3,paid_ads,retail,2,83843.0,,australia,1,0.87,0
4,referral,education,3,85012.0,self_employed,europe,3,0.62,1


**Question 1.** 

What is the most frequent observation (mode) for the column industry?

In [103]:
df.industry.mode()

0    retail
Name: industry, dtype: object

## 2. Exploratory data analysis

Compute the correlation coefficients between the numerical features.

In [104]:
df.dtypes

lead_source                  object
industry                     object
number_of_courses_viewed      int64
annual_income               float64
employment_status            object
location                     object
interaction_count             int64
lead_score                  float64
converted                     int64
dtype: object

In [105]:
# A correlation matrix holds the correlation coefficient for every pair of features.
# `numerical` lists the numerical features, leaving out the target variable `converted` (y).
# On a large dataset the list could instead come from
# df.select_dtypes(include=['float64', 'int64']).columns, which picks up every numerical column.

numerical = [
    'number_of_courses_viewed',
    'annual_income',
    'interaction_count',
    'lead_score',
]
corr_matrix = df.loc[:, numerical].corr()

corr_matrix

Unnamed: 0,number_of_courses_viewed,annual_income,interaction_count,lead_score
number_of_courses_viewed,1.0,0.00977,-0.023565,-0.004879
annual_income,0.00977,1.0,0.027036,0.01561
interaction_count,-0.023565,0.027036,1.0,0.009888
lead_score,-0.004879,0.01561,0.009888,1.0


The **annual_income and interaction_count** pair has the biggest (most positive) correlation, at about 0.027.

## 3. Setting up the validation framework

**First**, we split and set up the **train,val and test sets**; as well as the **y, target variable.**

In [106]:
from sklearn.model_selection import train_test_split

In [107]:
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=42)

In [108]:
len(df_full_train), len(df_test)

(1169, 293)

In [109]:
# The validation part is 20% of the whole dataset, i.e. 20 out of the 80 held in
# df_full_train, hence test_size=0.25 here.
df_train, df_val = train_test_split(
    df_full_train,
    test_size=0.25,
    random_state=42,
)

In [110]:
len(df_train), len(df_val), len(df_test)

(876, 293, 293)

In [111]:
# Replace the row labels inherited from the original dataframe with a fresh
# 0..n-1 index on every split.
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
# Fixed typo: the result used to be bound to `df_tes`, so df_test was never reset.
df_test = df_test.reset_index(drop=True)

In [112]:
# Pull the target column out of each split as a plain NumPy array.
y_train = df_train['converted'].to_numpy()
y_val = df_val['converted'].to_numpy()
y_test = df_test['converted'].to_numpy()

In [113]:
# Remove the target column in place from every split so it cannot be used as a feature.
for split_frame in (df_train, df_val, df_test):
    del split_frame['converted']

We delete the target `converted` (y) from each dataframe (df_train, df_val, df_test) so that it cannot be used as a feature during training or testing.
At test time we assume its true value is unknown, and only compare it against the predicted value.

#### **Next, the mutual information score between y and the categorical variables, using the training set only**

In [114]:
categorical = ['lead_source', 'industry', 'employment_status', 'location']

In [115]:
from sklearn.metrics import mutual_info_score

In [116]:
# Helper applied to each categorical column; it scores against df_full_train.

def mutual_info_converted_score(series):
    """Return the mutual information score between `series` and `converted`.

    The target is read from the notebook-level `df_full_train`, so `series`
    is expected to be a column of that same frame.
    """
    target = df_full_train.converted
    return mutual_info_score(series, target)

In [117]:
# Score every categorical feature against the target on the training split only
# (df_train / y_train), so validation and test rows play no part in this step.
mi = df_train[categorical].apply(lambda column: mutual_info_score(column, y_train))

# sort_values returns a new Series; display that result instead of discarding it.
mi.sort_values(ascending=False).round(2)

lead_source          0.03
industry             0.01
employment_status    0.01
location             0.00
dtype: float64

**Question 3. Which of these variables has the biggest mutual information score?**

- lead_source has the biggest mutual information score

## 4. Training a logistic regression.

**First transforming via one hot encoding**; then logistic regression. 

In [118]:
from sklearn.feature_extraction import DictVectorizer

In [119]:
dicts = df_train[categorical].to_dict(orient='records')

In [120]:
dv = DictVectorizer(sparse=False)

In [121]:
dv.fit(dicts)

0,1,2
,dtype,<class 'numpy.float64'>
,separator,'='
,sparse,False
,sort,True


In [122]:
dv.get_feature_names_out()

array(['employment_status=NA', 'employment_status=employed',
       'employment_status=self_employed', 'employment_status=student',
       'employment_status=unemployed', 'industry=NA',
       'industry=education', 'industry=finance', 'industry=healthcare',
       'industry=manufacturing', 'industry=other', 'industry=retail',
       'industry=technology', 'lead_source=NA', 'lead_source=events',
       'lead_source=organic_search', 'lead_source=paid_ads',
       'lead_source=referral', 'lead_source=social_media', 'location=NA',
       'location=africa', 'location=asia', 'location=australia',
       'location=europe', 'location=middle_east',
       'location=north_america', 'location=south_america'], dtype=object)

In [123]:
dv.transform(dicts)

array([[0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 1., 0., ..., 0., 1., 0.],
       ...,
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], shape=(876, 27))

In [124]:
train_dicts = df_train[categorical + numerical].to_dict(orient='records')

In [125]:
dv = DictVectorizer(sparse = False)

In [126]:
# fit_transform learns the feature mapping and encodes the training records in one
# call; the long form is dv.fit(train_dicts) followed by dv.transform(train_dicts).
X_train = dv.fit_transform(train_dicts)

In [127]:
X_train.shape

(876, 31)

In [128]:
dv.get_feature_names_out()

array(['annual_income', 'employment_status=NA',
       'employment_status=employed', 'employment_status=self_employed',
       'employment_status=student', 'employment_status=unemployed',
       'industry=NA', 'industry=education', 'industry=finance',
       'industry=healthcare', 'industry=manufacturing', 'industry=other',
       'industry=retail', 'industry=technology', 'interaction_count',
       'lead_score', 'lead_source=NA', 'lead_source=events',
       'lead_source=organic_search', 'lead_source=paid_ads',
       'lead_source=referral', 'lead_source=social_media', 'location=NA',
       'location=africa', 'location=asia', 'location=australia',
       'location=europe', 'location=middle_east',
       'location=north_america', 'location=south_america',
       'number_of_courses_viewed'], dtype=object)

In [129]:
# you can check to see what X_train looks like; rem, X_train = dv.transform(train_dicts)

dv.transform(train_dicts[:5])[0]

# Gets first 5 dictionaries (indices 0-4), transforms via one hot encoding and,
## Gets just the first row.

array([5.8472e+04, 0.0000e+00, 0.0000e+00, 0.0000e+00, 1.0000e+00,
       0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
       0.0000e+00, 0.0000e+00, 1.0000e+00, 0.0000e+00, 5.0000e+00,
       3.0000e-02, 0.0000e+00, 0.0000e+00, 0.0000e+00, 1.0000e+00,
       0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
       0.0000e+00, 0.0000e+00, 1.0000e+00, 0.0000e+00, 0.0000e+00,
       0.0000e+00])

In [130]:
# doing same for the validation dataset,

val_dicts = df_val[categorical + numerical].to_dict(orient='records')

In [131]:

X_val = dv.transform(val_dicts)

### Logistic regression

In [132]:
from sklearn.linear_model import LogisticRegression

In [133]:
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)

**Logistic regression for X_train**

In [134]:
model.fit(X_train, y_train)

#NB: Xtrain = dv.fit_transform(train_dicts); y_train = df_train.converted.values

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,42
,solver,'liblinear'
,max_iter,1000


In [135]:
model.intercept_[0]

np.float64(-0.06914728027824993)

In [136]:
model.coef_[0].round(3)

array([-0.   , -0.015,  0.034,  0.003,  0.012, -0.103, -0.025,  0.049,
       -0.02 , -0.013, -0.003, -0.009, -0.032, -0.016,  0.311,  0.051,
        0.02 , -0.012, -0.012, -0.115,  0.08 , -0.03 ,  0.004, -0.011,
       -0.011, -0.006,  0.008,  0.006, -0.033, -0.025,  0.454])

In [137]:
model.predict_proba(X_train)

array([[0.42085658, 0.57914342],
       [0.12716509, 0.87283491],
       [0.41183895, 0.58816105],
       ...,
       [0.25265786, 0.74734214],
       [0.3302157 , 0.6697843 ],
       [0.14407824, 0.85592176]], shape=(876, 2))

**Calculating accuracy on the validation set**

In [141]:
# predict_proba gives a two-column matrix; keep only column 1, the "converted" part.

val_probabilities = model.predict_proba(X_val)
y_pred = val_probabilities[:, 1]
y_pred[:5]

array([0.61192162, 0.79982616, 0.53021342, 0.47131479, 0.5706613 ])

In [142]:
# Threshold the predicted probabilities at 0.5 to get a yes/no decision per lead.
converted_decision = np.greater_equal(y_pred, 0.5)
converted_decision.astype(int)

array([1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1,
       0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1,
       1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1,
       0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0,
       1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1,
       0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1,
       1, 1, 0, 1, 0, 1, 0])

In [143]:
# Accuracy: the share of validation rows where the actual value (y_val) equals
# the thresholded decision, rounded to 2 decimal places.

matches = y_val == converted_decision
df_pred_correct = np.round(matches.mean(), 2)
df_pred_correct

np.float64(0.7)

## 5. Feature elimination in Logistic regression

In [149]:
# Full feature set: every categorical and numerical column.
features = [*categorical, *numerical]

# One dict per row for the train and validation splits, used by the all-features model.
train_dicts, val_dicts = (
    frame[features].to_dict(orient='records') for frame in (df_train, df_val)
)

In [150]:
# Fit the encoder on the training records only, then reuse it for validation.
dv = DictVectorizer(sparse=False)
X_train = dv.fit_transform(train_dicts)
X_val = dv.transform(val_dicts)

In [151]:
# All-features model; its validation accuracy is the reference for feature elimination.
model = LogisticRegression(
    solver='liblinear',
    C=1.0,
    max_iter=1000,
    random_state=42,
)
model.fit(X_train, y_train)
y_pred = model.predict_proba(X_val)[:, 1]
val_matches = y_val == (y_pred >= 0.5)
original_accuracy = val_matches.mean()

In [153]:
original_accuracy #without rounding

np.float64(0.6996587030716723)

In [154]:
# Leave-one-feature-out: retrain without each feature in turn and store
# original_accuracy minus the accuracy of the reduced model.
results = {}
for f in features:
    reduced_features = [name for name in features if name != f]

    train_dicts_reduced = df_train[reduced_features].to_dict(orient='records')
    val_dicts_reduced = df_val[reduced_features].to_dict(orient='records')

    # The encoder is re-fitted for each reduced feature set.
    dv_reduced = DictVectorizer(sparse=False)
    X_train_reduced = dv_reduced.fit_transform(train_dicts_reduced)
    X_val_reduced = dv_reduced.transform(val_dicts_reduced)

    model_reduced = LogisticRegression(
        solver='liblinear',
        C=1.0,
        max_iter=1000,
        random_state=42,
    )
    model_reduced.fit(X_train_reduced, y_train)
    y_pred_reduced = model_reduced.predict_proba(X_val_reduced)[:, 1]
    acc_reduced = np.mean(y_val == (y_pred_reduced >= 0.5))

    results[f] = original_accuracy - acc_reduced

# Show the accuracy difference recorded for each removed feature
results

{'lead_source': np.float64(-0.0034129692832765013),
 'industry': np.float64(0.0),
 'employment_status': np.float64(0.0034129692832763903),
 'location': np.float64(-0.010238907849829393),
 'number_of_courses_viewed': np.float64(0.14334470989761094),
 'annual_income': np.float64(-0.15358361774744034),
 'interaction_count': np.float64(0.14334470989761094),
 'lead_score': np.float64(-0.0068259385665528916)}

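annual_income has the smallest (most negative) signed difference, -0.15358361774744034, meaning validation accuracy actually goes up when it is removed. By absolute value, the smallest difference is industry (0.0).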

## 6. Regularised logistic regression  

**What is C?**

C is the regularization parameter (inverse of regularization strength) in Logistic Regression.
Key points:

- Smaller C = Stronger regularization = Simpler model (more penalty on coefficients)

- Larger C = Weaker regularization = More complex model (less penalty on coefficients)

How it works:

C = 0.01   # Strong regularization - coefficients pushed toward zero
           # Helps prevent overfitting, simpler model

C = 1.0    # Moderate regularization (default)
           # Balanced approach

C = 100    # Weak regularization - coefficients can be large
           # Model fits training data more closely
           # Risk of overfitting

In [176]:
# Refit with weak regularization (C=100). Note that this overwrites `original_accuracy`.
model = LogisticRegression(
    solver='liblinear',
    C=100,
    max_iter=1000,
    random_state=42,
).fit(X_train, y_train)
y_pred = model.predict_proba(X_val)[:, 1]
original_accuracy = np.mean(y_val == (y_pred >= 0.5))

# NB: substituting different values of C made no difference to the result here.
# Switching the solver to 'lbfgs' did change the results; with solver='lbfgs',
# C=0.01 gave the best accuracy.

It is normal for some datasets to show little or no change in accuracy for different regularization strengths, especially if the model is already performing well and not overfitting.

It usually means:

The model is not overfitting or underfitting with the current features and data size.
The regularization parameter C does not have a strong effect on your validation accuracy for this dataset.
The features may be well-behaved, or the dataset is not complex enough for regularization to make a difference.
What you can check:

Try rounding the accuracy to 3 decimal places (with coarser rounding, small differences may be hidden).
Try printing more decimal places.

In [177]:
original_accuracy

np.float64(0.6996587030716723)

In [179]:
# Sweep the regularization parameter and record validation accuracy for each value.
C_values = [0.01, 0.1, 1, 10, 100]
accuracies = []

for C in C_values:
    model = LogisticRegression(
        solver='liblinear',
        C=C,
        max_iter=1000,
        random_state=42,
    ).fit(X_train, y_train)
    y_pred = model.predict_proba(X_val)[:, 1]
    acc = np.mean(y_val == (y_pred >= 0.5))
    accuracies.append(acc)

# Report each C next to its accuracy, unrounded
for position, acc in enumerate(accuracies):
    C = C_values[position]
    print(f"C={C}: accuracy={acc}")

C=0.01: accuracy=0.6996587030716723
C=0.1: accuracy=0.6996587030716723
C=1: accuracy=0.6996587030716723
C=10: accuracy=0.6996587030716723
C=100: accuracy=0.6996587030716723
