In [1]:
# import data manipulation libraries

import numpy as np
import pandas as pd

# import data visualization libraries

import matplotlib.pyplot as plt
import seaborn as sns

#import filter warning libraries
import warnings
warnings.filterwarnings('ignore')

#import logging files

import logging
logging.basicConfig(level= logging.INFO,
                    filename='model.log',
                    filemode='w',
                    format='%(asctime)s - %(levelname)s - %(message)s',
                    force=True)

#import stats scipy library
import scipy.stats as stats

In [2]:
# data import using pandas function
url='https://raw.githubusercontent.com/Frisk516/BankMarketing_MLModel/refs/heads/main/BankTelemarketing.csv'

df=pd.read_csv(url,sep=';')
df.sample(frac=1)#shuffle dataset


Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
6552,31,blue-collar,single,tertiary,no,-701,yes,no,unknown,27,may,42,3,-1,0,unknown,no
32602,54,management,married,tertiary,no,6525,yes,no,cellular,17,apr,231,1,149,4,failure,no
35336,44,services,married,secondary,no,191,yes,yes,cellular,7,may,490,1,-1,0,unknown,no
4641,33,services,married,secondary,no,4963,yes,no,unknown,20,may,148,2,-1,0,unknown,no
24524,46,entrepreneur,divorced,tertiary,no,567,yes,no,cellular,17,nov,554,2,-1,0,unknown,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36877,45,blue-collar,married,secondary,no,3092,yes,yes,cellular,12,may,180,2,-1,0,unknown,no
34673,45,admin.,single,secondary,no,341,yes,yes,cellular,5,may,77,2,-1,0,unknown,no
8380,30,admin.,single,secondary,no,3809,yes,no,unknown,3,jun,159,4,-1,0,unknown,no
25224,30,blue-collar,married,secondary,no,246,yes,no,cellular,18,nov,166,2,113,5,failure,no


In [3]:
logging.info('Dataset Uploaded Successfully.....')

In [5]:
# split the dataset into Numerical_Data and Categorical_Data

Numerical_Data=df.select_dtypes(exclude='object')

Categorical_Data=df.select_dtypes(include='object')

In [6]:
# Checking the Numerical_Data
Numerical_Data


Unnamed: 0,age,balance,day,duration,campaign,pdays,previous
0,58,2143,5,261,1,-1,0
1,44,29,5,151,1,-1,0
2,33,2,5,76,1,-1,0
3,47,1506,5,92,1,-1,0
4,33,1,5,198,1,-1,0
...,...,...,...,...,...,...,...
45206,51,825,17,977,3,-1,0
45207,71,1729,17,456,2,-1,0
45208,72,5715,17,1127,5,184,3
45209,57,668,17,508,4,-1,0


In [7]:
#checking Categorical_Data
Categorical_Data


Unnamed: 0,job,marital,education,default,housing,loan,contact,month,poutcome,y
0,management,married,tertiary,no,yes,no,unknown,may,unknown,no
1,technician,single,secondary,no,yes,no,unknown,may,unknown,no
2,entrepreneur,married,secondary,no,yes,yes,unknown,may,unknown,no
3,blue-collar,married,unknown,no,yes,no,unknown,may,unknown,no
4,unknown,single,unknown,no,no,no,unknown,may,unknown,no
...,...,...,...,...,...,...,...,...,...,...
45206,technician,married,tertiary,no,no,no,cellular,nov,unknown,yes
45207,retired,divorced,primary,no,no,no,cellular,nov,unknown,yes
45208,retired,married,secondary,no,no,no,cellular,nov,success,yes
45209,blue-collar,married,secondary,no,no,no,telephone,nov,unknown,no


In [17]:
#checking descriptive stats: Numerical_Data and Categorical_Data

from collections import OrderedDict
stats=[]

for i in Numerical_Data:
    Numerical_Stats=OrderedDict({
                    'Feature':i,
                    'Maximum':Numerical_Data[i].max(),
                    'Minimum':Numerical_Data[i].min(),
                    'Median':Numerical_Data[i].median(),
                    '25%':Numerical_Data[i].quantile(0.25),
                    '75%':Numerical_Data[i].quantile(0.75),
                    'Mean':Numerical_Data[i].mean(),
                    'Standard_Deviation':Numerical_Data[i].std(),
                    'kurtosis':Numerical_Data[i].kurt(),
                    'Skewdness':Numerical_Data[i].skew()



    })
    stats.append(Numerical_Stats)

    report=pd.DataFrame(stats)

report

Unnamed: 0,Feature,Maximum,Minimum,Median,25%,75%,Mean,Standard_Deviation,kurtosis,Skewdness
0,age,95,18,39.0,33.0,48.0,40.93621,10.618762,0.31957,0.684818
1,balance,102127,-8019,448.0,72.0,1428.0,1362.272058,3044.765829,140.751547,8.360308
2,day,31,1,16.0,8.0,21.0,15.806419,8.322476,-1.059897,0.093079
3,duration,4918,0,180.0,103.0,319.0,258.16308,257.527812,18.153915,3.144318
4,campaign,63,1,2.0,1.0,3.0,2.763841,3.098021,39.249651,4.89865
5,pdays,871,-1,-1.0,-1.0,-1.0,40.197828,100.128746,6.935195,2.615715
6,previous,275,0,0.0,0.0,0.0,0.580323,2.303441,4506.86066,41.846454


In [18]:
logging.info('The Above dataset is non-normally distributed')

In [None]:
# Checking Categorical_Data

for i in Categorical_Data:
    print(Categorical_Data[i].value_counts())
    print('*'*40)


job
blue-collar      9732
management       9458
technician       7597
admin.           5171
services         4154
retired          2264
self-employed    1579
entrepreneur     1487
unemployed       1303
housemaid        1240
student           938
unknown           288
Name: count, dtype: int64
****************************************
marital
married     27214
single      12790
divorced     5207
Name: count, dtype: int64
****************************************
education
secondary    23202
tertiary     13301
primary       6851
unknown       1857
Name: count, dtype: int64
****************************************
default
no     44396
yes      815
Name: count, dtype: int64
****************************************
housing
yes    25130
no     20081
Name: count, dtype: int64
****************************************
loan
no     37967
yes     7244
Name: count, dtype: int64
****************************************
contact
cellular     29285
unknown      13020
telephone     2906
Name: count, dtype