In [1]:
# Import Data Manipulation Libraries
import numpy as np
import pandas as pd

# Import Data Visualization Libraries
import matplotlib.pyplot as plt
import seaborn as sns

# Import Data Filter Warning Libraries
import warnings
warnings.filterwarnings('ignore')

# Import Logging Files
import logging
logging.basicConfig(level=logging.INFO,
                    filemode='w',
                    filename='model.log',
                    format= '%(asctime)s  %(levelname)s  %(message)s',force = True)


# Import Stats Scipy Library
import scipy.stats as stats

In [2]:
# Data Import Using Pandas Function

url = 'https://raw.githubusercontent.com/Arbajshaikh7030/BankMarketing_MLModel/refs/heads/main/BankTelemarketing.csv'

df = pd.read_csv(url,sep = ';')

df.sample(frac = 1)  # Shuffle Dataset

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
40682,34,admin.,married,tertiary,no,35,yes,no,cellular,7,aug,88,2,-1,0,unknown,no
40383,71,retired,married,tertiary,no,2887,no,no,cellular,30,jun,128,1,-1,0,unknown,yes
39631,21,blue-collar,single,secondary,no,164,no,no,cellular,26,may,260,1,-1,0,unknown,no
24373,57,housemaid,married,tertiary,no,3760,no,no,cellular,17,nov,1070,1,-1,0,unknown,no
1570,30,management,single,tertiary,no,528,yes,no,unknown,8,may,145,4,-1,0,unknown,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19570,46,retired,married,secondary,no,23,yes,yes,cellular,7,aug,122,2,-1,0,unknown,no
3731,31,unemployed,single,secondary,no,199,yes,no,unknown,16,may,86,1,-1,0,unknown,no
33793,51,management,married,tertiary,no,4917,no,no,cellular,23,apr,113,1,-1,0,unknown,yes
44484,28,management,single,tertiary,no,6791,no,no,telephone,9,aug,361,1,46,35,failure,no


In [3]:
# cheking Data Shape and Data Information
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45211 entries, 0 to 45210
Data columns (total 17 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   age        45211 non-null  int64 
 1   job        45211 non-null  object
 2   marital    45211 non-null  object
 3   education  45211 non-null  object
 4   default    45211 non-null  object
 5   balance    45211 non-null  int64 
 6   housing    45211 non-null  object
 7   loan       45211 non-null  object
 8   contact    45211 non-null  object
 9   day        45211 non-null  int64 
 10  month      45211 non-null  object
 11  duration   45211 non-null  int64 
 12  campaign   45211 non-null  int64 
 13  pdays      45211 non-null  int64 
 14  previous   45211 non-null  int64 
 15  poutcome   45211 non-null  object
 16  y          45211 non-null  object
dtypes: int64(7), object(10)
memory usage: 5.9+ MB


In [4]:
logging.info('Datasse Uploded Successfully')

In [5]:
# Split the Dataset into Numarical_Data and Categorical_Data 

Numarical_Data = df.select_dtypes(exclude = 'object')


Categorical_Data = df.select_dtypes(include = 'object')

In [6]:
# Checking Numarical_Data
Numarical_Data 

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous
0,58,2143,5,261,1,-1,0
1,44,29,5,151,1,-1,0
2,33,2,5,76,1,-1,0
3,47,1506,5,92,1,-1,0
4,33,1,5,198,1,-1,0
...,...,...,...,...,...,...,...
45206,51,825,17,977,3,-1,0
45207,71,1729,17,456,2,-1,0
45208,72,5715,17,1127,5,184,3
45209,57,668,17,508,4,-1,0


In [7]:
# Checking Categorical_Data
Categorical_Data

Unnamed: 0,job,marital,education,default,housing,loan,contact,month,poutcome,y
0,management,married,tertiary,no,yes,no,unknown,may,unknown,no
1,technician,single,secondary,no,yes,no,unknown,may,unknown,no
2,entrepreneur,married,secondary,no,yes,yes,unknown,may,unknown,no
3,blue-collar,married,unknown,no,yes,no,unknown,may,unknown,no
4,unknown,single,unknown,no,no,no,unknown,may,unknown,no
...,...,...,...,...,...,...,...,...,...,...
45206,technician,married,tertiary,no,no,no,cellular,nov,unknown,yes
45207,retired,divorced,primary,no,no,no,cellular,nov,unknown,yes
45208,retired,married,secondary,no,no,no,cellular,nov,success,yes
45209,blue-collar,married,secondary,no,no,no,telephone,nov,unknown,no


In [12]:
# Checking Descripteive Stats : Numarical_data and Categorical_data

from collections import OrderedDict

stats = []

for i in Numarical_Data:

    numarical_stats = OrderedDict({
        'Feature': i,
        'Maximum': Numarical_Data[i].max(),
        'Minimum': Numarical_Data[i].min(),
        'Mean':Numarical_Data[i].mean(),
        '25%' : Numarical_Data[i].quantile(0.25),
        'Median': Numarical_Data[i].quantile(0.50),
        '75%': Numarical_Data[i].quantile(0.75),
        'Kurtosis':Numarical_Data[i].kurt(),
        'Skewness': Numarical_Data[i].skew(),
        'Standard Daviation':Numarical_Data[i].std()
    })

    stats.append(numarical_stats)

    report = pd.DataFrame(stats)


report


Unnamed: 0,Feature,Maximum,Minimum,Mean,25%,Median,75%,Kurtosis,Skewness,Standard Daviation
0,age,95,18,40.93621,33.0,39.0,48.0,0.31957,0.684818,10.618762
1,balance,102127,-8019,1362.272058,72.0,448.0,1428.0,140.751547,8.360308,3044.765829
2,day,31,1,15.806419,8.0,16.0,21.0,-1.059897,0.093079,8.322476
3,duration,4918,0,258.16308,103.0,180.0,319.0,18.153915,3.144318,257.527812
4,campaign,63,1,2.763841,1.0,2.0,3.0,39.249651,4.89865,3.098021
5,pdays,871,-1,40.197828,-1.0,-1.0,-1.0,6.935195,2.615715,100.128746
6,previous,275,0,0.580323,0.0,0.0,0.0,4506.86066,41.846454,2.303441


In [13]:
logging.info('The above Dataset is Non Normal distributed')