In [10]:
# import Data Manipulation Libraries
import numpy as np
import pandas as pd
# import data visualization Libraries
import matplotlib.pyplot as plt
import seaborn as sns
# import Filter Warning Libraries
import warnings
warnings.filterwarnings("ignore")

# import Logging Files
import logging
logging.basicConfig(level = logging.INFO,
                    filename= "model.log",
                    filemode= "w",
                    format= "%(asctime)s - %(levelname)s - %(message)s",
                    force= True)
# import Stats Scipy Library
import scipy.stats as stats

In [11]:
# data Import Using Pandas Function

url = "https://raw.githubusercontent.com/Digraskarpratik/BankMarketing_MLModel/refs/heads/main/BankTelemarketing.csv"

df = pd.read_csv(url, sep= ";")

df.sample(frac=1) # shuffle Dataset
# print (df.sample(frac=1))

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
34265,26,management,single,tertiary,no,4683,yes,no,cellular,4,may,334,2,180,1,failure,no
31510,22,student,single,secondary,no,722,yes,no,cellular,2,apr,108,1,-1,0,unknown,no
27219,31,blue-collar,married,secondary,no,3653,yes,yes,cellular,21,nov,252,1,168,4,failure,no
13900,56,retired,married,tertiary,no,89,no,no,telephone,10,jul,83,1,-1,0,unknown,no
15339,59,technician,married,tertiary,no,0,no,no,cellular,18,jul,274,5,-1,0,unknown,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6955,29,management,divorced,tertiary,no,415,yes,yes,unknown,28,may,449,4,-1,0,unknown,no
3619,54,management,married,secondary,no,1394,yes,no,unknown,15,may,438,5,-1,0,unknown,no
39361,20,blue-collar,single,secondary,no,76,yes,no,cellular,18,may,639,2,-1,0,unknown,yes
33198,30,admin.,single,secondary,no,1473,yes,no,cellular,20,apr,401,1,-1,0,unknown,no


In [12]:
# Checking Data Shape and Data Information
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45211 entries, 0 to 45210
Data columns (total 17 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   age        45211 non-null  int64 
 1   job        45211 non-null  object
 2   marital    45211 non-null  object
 3   education  45211 non-null  object
 4   default    45211 non-null  object
 5   balance    45211 non-null  int64 
 6   housing    45211 non-null  object
 7   loan       45211 non-null  object
 8   contact    45211 non-null  object
 9   day        45211 non-null  int64 
 10  month      45211 non-null  object
 11  duration   45211 non-null  int64 
 12  campaign   45211 non-null  int64 
 13  pdays      45211 non-null  int64 
 14  previous   45211 non-null  int64 
 15  poutcome   45211 non-null  object
 16  y          45211 non-null  object
dtypes: int64(7), object(10)
memory usage: 5.9+ MB


In [13]:
logging.info("Dataset Uploaded Successfully.......")


In [14]:
logging.info("Staring the work........!")

In [None]:
# split the Dataset into Numerical_Data and Categorical_Data

Numerical_Data = df.select_dtypes(exclude = "object")

Categorical_Data = df.select_dtypes(include= "object")

In [18]:
#Checking the Numerical_Data
Numerical_Data

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous
0,58,2143,5,261,1,-1,0
1,44,29,5,151,1,-1,0
2,33,2,5,76,1,-1,0
3,47,1506,5,92,1,-1,0
4,33,1,5,198,1,-1,0
...,...,...,...,...,...,...,...
45206,51,825,17,977,3,-1,0
45207,71,1729,17,456,2,-1,0
45208,72,5715,17,1127,5,184,3
45209,57,668,17,508,4,-1,0


In [19]:
# Categorical_Data
Categorical_Data

Unnamed: 0,job,marital,education,default,housing,loan,contact,month,poutcome,y
0,management,married,tertiary,no,yes,no,unknown,may,unknown,no
1,technician,single,secondary,no,yes,no,unknown,may,unknown,no
2,entrepreneur,married,secondary,no,yes,yes,unknown,may,unknown,no
3,blue-collar,married,unknown,no,yes,no,unknown,may,unknown,no
4,unknown,single,unknown,no,no,no,unknown,may,unknown,no
...,...,...,...,...,...,...,...,...,...,...
45206,technician,married,tertiary,no,no,no,cellular,nov,unknown,yes
45207,retired,divorced,primary,no,no,no,cellular,nov,unknown,yes
45208,retired,married,secondary,no,no,no,cellular,nov,success,yes
45209,blue-collar,married,secondary,no,no,no,telephone,nov,unknown,no


In [31]:
# Checking Descriptive Stats :- Numerical_Data and Categorical_Data

from collections import OrderedDict

stats = []

for i in Numerical_Data:
    
    Numerical_Stats = OrderedDict({
        "Feature" : i,
        "Maximum" : Numerical_Data[i].max(),
        "Minimum" : Numerical_Data[i].min(),
        "25" : Numerical_Data[i].quantile(0.25),
        "Mean" : Numerical_Data [i].mean(),
        "Median" : Numerical_Data[i].median(),
        "75" : Numerical_Data [i].quantile(0.75),
        "Kurtosis" : Numerical_Data [i].kurt(),
        "Skewness" : Numerical_Data [i].skew(),
        "Standard" : Numerical_Data [i].std()
        
    })
    
    stats.append(Numerical_Stats)
    
    report = pd.DataFrame(stats)
    
report

Unnamed: 0,Feature,Maximum,Minimum,25,Mean,Median,75,Kurtosis,Skewness,Standard
0,age,95,18,33.0,40.93621,39.0,48.0,0.31957,0.684818,10.618762
1,balance,102127,-8019,72.0,1362.272058,448.0,1428.0,140.751547,8.360308,3044.765829
2,day,31,1,8.0,15.806419,16.0,21.0,-1.059897,0.093079,8.322476
3,duration,4918,0,103.0,258.16308,180.0,319.0,18.153915,3.144318,257.527812
4,campaign,63,1,1.0,2.763841,2.0,3.0,39.249651,4.89865,3.098021
5,pdays,871,-1,-1.0,40.197828,-1.0,-1.0,6.935195,2.615715,100.128746
6,previous,275,0,0.0,0.580323,0.0,0.0,4506.86066,41.846454,2.303441


In [32]:
logging.info("The Above Dataset is Non Normal Distributed..!!")