# Exploratory Analysis

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Remove pandas' display limits so outputs are never truncated with "...".
# NOTE: with no row limit, displaying a whole DataFrame renders every row,
# so keep previewing large frames with .head() or aggregated views.
pd.options.display.max_rows = None
pd.options.display.max_columns = None

##  Import clean dataset

In [2]:
# Read the cleaned bank dataset from the processed-data folder (path is
# relative to the notebook's working directory) and preview the first 5 rows.
bank_df = pd.read_csv(filepath_or_buffer="../data/processed/bank_clean.csv")
bank_df.head(n=5)

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,duration,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y,date,latitude,longitude,id_,contact_year
0,38.0,housemaid,MARRIED,basic.4y,0.0,False,False,telephone,261,1,999,0,NONEXISTENT,1.1,93.994,-36.4,4.857,5191,no,2019-08-02,41.495,-71.233,089b39d8-e4d0-461b-87d4-814d71e0e079,2019
1,57.0,services,MARRIED,high.school,unknown,False,False,telephone,149,1,999,0,NONEXISTENT,1.1,93.994,-36.4,3.626521,5191,no,2016-09-14,34.601,-83.923,e9d37224-cb6f-4942-98d7-46672963d097,2016
2,37.0,services,MARRIED,high.school,0.0,True,False,telephone,226,1,999,0,NONEXISTENT,1.1,93.994,-36.4,4.857,5191,no,2019-02-15,34.939,-94.847,3f9f49b5-e410-4948-bf6e-f9244f04918b,2019
3,40.0,admin.,MARRIED,basic.6y,0.0,False,False,telephone,151,1,999,0,NONEXISTENT,1.1,93.994,-36.4,3.625715,5191,no,2015-11-29,49.041,-70.308,9991fafb-4447-451a-8be2-b0df6098d13e,2015
4,56.0,services,MARRIED,high.school,0.0,False,True,telephone,307,1,999,0,NONEXISTENT,1.1,93.994,-36.4,3.581983,5191,no,2017-01-29,38.033,-104.463,eca60b76-70b6-4077-80ba-bc52e8ebb0eb,2017


In [3]:
# List the column labels available in the loaded frame.
bank_df.columns

Index(['age', 'job', 'marital', 'education', 'default', 'housing', 'loan',
       'contact', 'duration', 'campaign', 'pdays', 'previous', 'poutcome',
       'emp.var.rate', 'cons.price.idx', 'cons.conf.idx', 'euribor3m',
       'nr.employed', 'y', 'date', 'latitude', 'longitude', 'id_',
       'contact_year'],
      dtype='object')

### Target feature
- `y`

In [4]:
# Absolute number of rows for each value of the target column "y".
bank_df["y"].value_counts()

y
no     37941
yes     4811
Name: count, dtype: int64

In [5]:
# Same breakdown as relative frequencies (fractions that sum to 1).
bank_df["y"].value_counts(normalize=True)

y
no     0.887467
yes    0.112533
Name: proportion, dtype: float64

The target variable is highly imbalanced. Approximately 89% of clients did not subscribe to the product, while only around 11% converted.
This indicates that the marketing campaign has a relatively low success rate.

### conversion rate by job

In [6]:
# Within each job, compute the share of every "y" outcome and flatten the
# grouped result into a regular table with a "proportion" column.
job_conversion_rate = (
    bank_df.groupby("job")["y"]
    .value_counts(normalize=True)
    .reset_index()
)

# Display only the "yes" share per job, highest conversion rate first.
(
    job_conversion_rate
    .loc[job_conversion_rate["y"].eq("yes")]
    .sort_values("proportion", ascending=False)
)

Unnamed: 0,job,y,proportion
17,student,yes,0.313616
11,retired,yes,0.252525
21,unemployed,yes,0.144476
1,admin.,yes,0.129791
23,unknown,yes,0.113372
9,management,yes,0.111331
13,self-employed,yes,0.108564
19,technician,yes,0.108391
7,housemaid,yes,0.099373
5,entrepreneur,yes,0.082174


In [7]:
# Fraction of all rows that falls into each job category, as a table.
(
    bank_df["job"]
    .value_counts(normalize=True)
    .reset_index()
)

Unnamed: 0,job,proportion
0,admin.,0.252667
1,blue-collar,0.224668
2,technician,0.163361
3,services,0.096721
4,management,0.071014
5,retired,0.041682
6,entrepreneur,0.035297
7,self-employed,0.034688
8,housemaid,0.026127
9,unemployed,0.024771


Although students and retired clients show the highest conversion rates (31% and 25% respectively), they represent a small fraction of the dataset (2% and 4%). In contrast, administrative and blue-collar workers have lower conversion rates but represent a significantly larger portion of the population. Therefore, both efficiency (conversion rate) and population size must be considered when designing marketing strategies.

### conversion rate by marital status

In [12]:
# Within each marital status, compute the share of every "y" outcome and
# flatten the grouped result into a regular table with a "proportion" column.
marital_conversion = (
    bank_df.groupby("marital")["y"]
    .value_counts(normalize=True)
    .reset_index()
)

# Display only the "yes" share per marital status, highest rate first.
(
    marital_conversion
    .loc[marital_conversion["y"].eq("yes")]
    .sort_values("proportion", ascending=False)
)

Unnamed: 0,marital,y,proportion
7,unknown,yes,0.152941
5,SINGLE,yes,0.139417
3,MARRIED,yes,0.101858
1,DIVORCED,yes,0.101798


In [16]:
# Percentage of rows in each marital-status category.
bank_df["marital"].value_counts(normalize=True).mul(100)

marital
MARRIED     60.441617
SINGLE      28.169442
DIVORCED    11.190120
unknown      0.198821
Name: proportion, dtype: float64

Looking at the conversion rate by marital status, "unknown" appears to have the highest conversion rate (~15%). However, this category represents only about 0.2% of clients, so its rate rests on too few observations to be meaningful.
The remaining categories fall between roughly 10% and 14%, a spread that is not of any big significance either.

### conversion rate by education

In [19]:
# Within each education level, compute the share of every "y" outcome and
# flatten the grouped result into a regular table with a "proportion" column.
edu_conversion = (
    bank_df.groupby("education")["y"]
    .value_counts(normalize=True)
    .reset_index()
)

# Display only the "yes" share per education level, highest rate first.
(
    edu_conversion
    .loc[edu_conversion["y"].eq("yes")]
    .sort_values("proportion", ascending=False)
)

Unnamed: 0,education,y,proportion
9,illiterate,yes,0.222222
15,unknown,yes,0.14437
13,university.degree,yes,0.136766
11,professional.course,yes,0.113787
7,high.school,yes,0.108387
1,basic.4y,yes,0.10344
3,basic.6y,yes,0.081719
5,basic.9y,yes,0.077695


In [21]:
# Percentage of rows in each education category.
bank_df["education"].value_counts(normalize=True).mul(100)

education
university.degree      29.570546
high.school            23.091317
basic.9y               14.691710
professional.course    12.724551
basic.4y               10.130520
basic.6y                5.552957
unknown                 4.196295
illiterate              0.042103
Name: proportion, dtype: float64