# DOMAIN ANALYSIS

### INPUT VARIABLES

#### ATTRIBUTES INFORMATION:

1 - age (numeric)

2 - job : type of job (categorical: 'admin.','blue-collar','entrepreneur','housemaid','management','retired','self-employed','services','student','technician','unemployed','unknown')

3 - marital : marital status (categorical: 'divorced','married','single','unknown'; note: 'divorced' means divorced or widowed)

4 - education (categorical: 'basic.4y','basic.6y','basic.9y','high.school','illiterate','professional.course','university.degree','unknown')

5 - default: has credit in default? (categorical: 'no','yes','unknown')

6 - housing: has housing loan? (categorical: 'no','yes','unknown')

7 - loan: has personal loan? (categorical: 'no','yes','unknown')

### related with the last contact of the current campaign:

8 - contact: contact communication type (categorical: 'cellular','telephone')

9 - month: last contact month of year (categorical: 'jan', 'feb', 'mar', ..., 'nov', 'dec')

10 - day_of_week: last contact day of the week (categorical: 'mon','tue','wed','thu','fri')

11 - duration: last contact duration, in seconds (numeric). Important note: this attribute highly affects the output target (e.g., if duration=0 then y='no'). Yet, the duration is not known before a call is performed. Also, after the end of the call y is obviously known. Thus, this input should only be included for benchmark purposes and should be discarded if the intention is to have a realistic predictive model.

### other attributes:

12 - campaign: number of contacts performed during this campaign and for this client (numeric, includes last contact)

13 - pdays: number of days that passed by after the client was last contacted from a previous campaign (numeric; 999 means client was not previously contacted)

14 - previous: number of contacts performed before this campaign and for this client (numeric)

### social and economic 

16 - emp.var.rate: employment varia15 - poutcome: outcome of the previous marketing campaign (categorical: 'failure','nonexistent','success
context attributes
tion rate - quarterly indicator (numeric)

17 - cons.price.idx: consumer price index - monthly indicator (numeric)

18 - cons.conf.idx: consumer confidence index - monthly indicator (numeric)

19 - euribor3m: euribor 3 month rate - daily indicator (numeric)

20 - nr.employed: number of employees - quarterly indicator (numeric)

### Output variable (desired target):

21 - y - has the client subscribed a term deposit? (binary: 'yes','no')


In [1]:
# importing libraries
import pandas as pd
import numpy as np 
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
# data reading
data = pd.read_csv("Portughese bank.csv",sep =';')

FileNotFoundError: [Errno 2] No such file or directory: 'Portughese bank.csv'

In [None]:
data

In [None]:
data.columns

In [4]:
# replacing the catogerical data of y to numerical
data['y']=data['y'].replace({'yes':1,'no':0})
data['y']=data['y'].astype("int64")
data

NameError: name 'data' is not defined

#### BASIC CHECKS

In [None]:
data.head()

In [None]:
data.tail()

In [None]:
data.info()

#### Data has no null values

In [None]:
data.shape

In [None]:
# describing the numerical datas
data.describe()

In [None]:
# describing the catogerical datas
data.describe (include='O')

In [None]:
data.dtypes

# EXPLORATORY DATA ANALYSIS

In [None]:
data.head()

### Univariate analysis 

In [None]:
# univariate analysis using count plots
sns.countplot(x=data["age"],data=data)
plt.show()

In [None]:
data.columns

In [None]:
# ANALYSING ALL NUMERICAL DATA SETS USING COUNTPLOT
import warnings
warnings.filterwarnings('ignore')
data1=data[['age', 'duration', 'campaign', 'pdays', 'previous', 'emp.var.rate','cons.price.idx', 'cons.conf.idx', 'euribor3m', 'nr.employed','y']]
plt.figure(figsize=(10,100),facecolor="white")
plotnumber=1
for column in data1:
    
    if plotnumber<=10:
        
        ax=plt.subplot(10,1,plotnumber)
        sns.countplot(data1[column])
        plt.xlabel(column,fontsize=25)
        
        
    plotnumber+=1
plt.tight_layout()
plt.show()
    

In [None]:
# Anlaysing the all categorical datas using countplot
import warnings
warnings.filterwarnings('ignore')
data2=data[['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'day_of_week', 'poutcome']]
plt.figure(figsize=(10,100),facecolor="white")
plotnumber=1
for column in data2:
    
    if plotnumber<=11:
        
        ax=plt.subplot(11,1,plotnumber)
        sns.countplot(data2[column])
        plt.xlabel(column,fontsize=25)
        
        
    plotnumber+=1
plt.tight_layout()
plt.show()

In [3]:
# Analysing the distribution of numerical datas using histplot
import warnings
warnings.filterwarnings('ignore')
plt.figure(figsize=(10,80),facecolor="white")
plotnumber=1
for column in data1:
    
    if plotnumber<=10:
        
        ax=plt.subplot(10,1,plotnumber)
        sns.histplot(data[column],kde=True)
        plt.xlabel(column,fontsize=25)
        
        
    plotnumber+=1
plt.tight_layout()
plt.show()

NameError: name 'data1' is not defined

<Figure size 1000x8000 with 0 Axes>

In [None]:
# Analysing the categorical datas using histplot
import warnings
warnings.filterwarnings("ignore")
data2=data[['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'day_of_week', 'poutcome']]
plt.figure(figsize=(10,100),facecolor="white")
plotnumber=1
for column in data2:
    if plotnumber<=11:
        ax=plt.subplot(11,1,plotnumber)
        sns.histplot(data2[column],kde=True)
        plt.xlabel(column,fontsize=25)
    plotnumber+=1
plt.tight_layout()
plt.show()
        

In [None]:
# Analysing the data using pieplot
data2=data[['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'day_of_week', 'poutcome']]
plt.figure(figsize=(100,100))
plotnumber=1
for column in data2:
    if plotnumber<=21:
        ax = plt.subplot(21,1,plotnumber)
        ax.pie(data2[column].value_counts(normalize=True),labels=data[column].value_counts().index,autopct='%1.1f%%')
        ax.set_title(column,fontsize=25)
    plotnumber+=1
plt.tight_layout()
plt.show()
         

In [None]:
data.columns

In [None]:
data1=data[['age','duration','campaign','pdays','previous','emp.var.rate','cons.price.idx','cons.conf.idx','euribor3m','nr.employed']]
data1

In [None]:
# Analysing the data using distplot
data1=data[['age','duration','campaign','pdays','previous','emp.var.rate','cons.price.idx','cons.conf.idx','euribor3m','nr.employed']]
plt.figure(figsize=(10,75))
plotnumber=1
for column in data1:
    if plotnumber<=10:
        ax=plt.subplot(10,1,plotnumber)
        sns.distplot(data1[column])
        plt.xlabel(column,fontsize=25)
    plotnumber+=1
plt.tight_layout()
plt.show()

In [None]:
data.loc()

In [None]:
# checking outliers using boxplots
import warnings
warnings.filterwarnings("ignore")
data1=data[['age', 'duration', 'campaign', 'pdays', 'previous', 'emp.var.rate','cons.price.idx', 'cons.conf.idx', 'euribor3m', 'nr.employed']]
plt.figure(figsize=(10,100),facecolor="white")
plotnumber=1
for column in data1:
    if plotnumber<=11:
        ax=plt.subplot(11,1,plotnumber)
        sns.boxplot(data1[column])
        plt.xlabel(column,fontsize=25)
    plotnumber+=1
plt.tight_layout()
plt.show()
        

In [None]:
!pip install sweetviz

In [None]:
import sweetviz as sv
my_report= sv.analyze(data)
my_report.show_html()

### Bivariate analysis

In [None]:
data

In [None]:
# bivariate analysis using countplot
import warnings
warnings.filterwarnings("ignore")
data=data[['age','duration','campaign','pdays','previous','emp.var.rate','cons.price.idx','cons.conf.idx','euribor3m','nr.employed','y']]
plt.figure(figsize=(20,50),facecolor="white")
plotnumber=1
for column in data:
    if plotnumber<=20:
        ax=plt.subplot(10,2,plotnumber)
        sns.countplot(data[column],hue=data['y'])
        plt.xlabel(column,fontsize=20)
        
    plotnumber+=1
plt.tight_layout()
plt.show()

## INSIGHTS IN THE EDA 

### Types of data

@ Variable Types:
    

Numerical (10):- ['age', 'duration', 'campaign', 'pdays', 'previous','emp.var.rate', 'cons.price.idx', 'cons.conf.idx', 'euribor3m', 'nr.employed']


Categorical (11) :-['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'day_of_week', 'poutcome', 'y']


The information is written on the basis of above graphs and statistical summary.

- age:

We have clients with all the ages (minimum age 17 to maximum age 98).
Mean(40.0241) and median(38) have very less diference.
age attribute does not have any missing values.

- job:

Job attribute shows the clients job information.
It also shows that there are some unknown values. It mean that they are not mentioned there job type.

- marital:

mariatal attribute is shows the client marital information(married,single,divorced).
Pie and Count plots shows that married people are taking more subscriptions than the other clients.

- Education:

Education attribue shows the clients Education information
Pie and Count plots shows that secondary education type people are taking more subscription than the other clients.

- default:

default attribute shows the client credit information-['yes','no'].
Pie and count plots shows that no credit clients are taking more subscription than the other clients.

- housing:

housing attribute shows the client housing loan information.['yes','No']
Pie and Count plots shows that housing loan ('yes') are taken more when compair to othere clients('no').

- loan:

housing attribute shows the client personal loan information.['yes','No'].
Pie and Count plots shows that personal loan ('no') are taken more when compair to other clients('yes').
It quite opposite to the housing loan.

- contact:

contact attribute show the communication type-['cellular', 'telephone'].
Pie and count plot shows that clients are more contacted through the celluar type of communication when to other type.

- month:

Month attribute is about the 12 months information.
pie and count plot shows that the May has the highest count when compair to other months.

- day_of_week:

day_of_week attribute is about the 5 days information.
The mean is 3 and median is 3.
It is normally distributed. The box plot, histogram and distribution plot also showing same. It doesnot have outilers and missing values.

- duration

Duration is in seconds.
The mean is 258.285 and median is 180. there is more difference between mean and the median.

- campaign:

They conducted 56 campaigns.
The campaign is distributed as right skewed.
mean is 2.5676 and median is 2.
There is no missing values in campaigns.

- pdays:

it is about the number of days that passed by after the client was last contacted from a previous campaign.
If pdays=999 means client was not previously contacted.
mean is 962.4755 and median is 999.

- previous:

It is about the number of contacts performed before this campaign and for this client.

- poutcome:

poutcome attribute is about the previous outcome information['nonexistent', 'failure', 'success'].
pie and count plots shows that highest count is nonexistent and lowest count is Success.
mean is 10.4537 and median is 11.

- emp.var.rate:

mean is 0.08189 and median is 1.1.

- cons.price.idx:

It is about the consumer price index in monthly period.
The mean is 93.5757 and median is 93.749. there is only small difference between mean and the median.

- cons.conf.idx:

It is about the consumer confidence index in monthly period.

- euribor3m:

mean is 3.6213 and median is 4.857.

- nr.employed:

It is indicated as the number of employees.
The mean is 5167.0359 and median is 5191.

- y:

y is the target variable. It is about the clients subscribe the term desposite (yes or no).
pie and count plots shows that highest count is no and lowest count is yes.

### Multivariate analysis

In [None]:
sns.pairplot(data,size=2.1,hue='y',palette='crest')

## DATA PREPROCESSING AND FEATURE ENGINEERING

### checking correlation

In [None]:
# checking correlation
plt.figure(figsize=(20,10))
sns.heatmap(data.corr(),cmap='viridis',annot=True,linewidths=0.03,center=0)


In [None]:
data.corr()

### checking null values

In [None]:
# checking null value
data.isnull().sum()

#### data has no null value.

### checking outliers

In [None]:
# checking the outliers by boxplot
sns.boxplot(data['age'])
plt.show()

In [None]:
# Age
Q1=data['age'].quantile(q=.25)
Q3=data['age'].quantile(q=.75)
print('Q1 is:',Q1)
print('Q3 is:',Q3)

In [None]:
# IQR = Q3-Q1
# LOWER 1.5*IQR IS Q1-1.5*IQR
# UPPER 1.5*IQR IS Q3+1.5*IQR
l_outlier=Q1-1.5*(Q3-Q1)
U_outlier=Q3+1.5*(Q3-Q1)
print('l_outliers:',l_outlier)
print('U_outliers:',U_outlier)

In [None]:
print('Number of outliers in age upper:',data[data['age']> 69.5]['age'].count())
print('Number of outliers in age lower:',data[data['age']< 9.5]['age'].count())                                              
                                              

In [None]:
data.loc[data['age']>69.5]

In [None]:
median= data.loc[data['age']<69.5,'age'].median()
median

In [None]:
data.loc[data.age>69.5,'age']=np.nan
data['age'].fillna(data['age'].median(),inplace=True)

In [None]:
 sns.boxplot(data['age'])

In [None]:
# DURATION
Q1=data['duration'].quantile(q=.25)
Q3=data['duration'].quantile(q=.75)
print('Q1 IS :',Q1)
print('Q3 IS :',Q3)

In [None]:
L_outliers=(Q1)-1.5*(Q3-Q1)
U_outliers=(Q3)+1.5*(Q3-Q1)
print('L_outliers :',L_outliers)
print('U_outliers :',U_outliers)

In [None]:
print('Number of oultliers in Duration upper : ',data[data['duration']>644.5]['duration'].count())
print('Number of outliers in Duration lower : ', data[data['duration']<(-223.5)]['duration'].count())

In [None]:
data.loc[data['duration']>644.5]

In [None]:
median=data.loc[data['duration']<644.5,'duration'].median()
median

In [None]:
data.loc[data.duration>644.5,'duration']=np.nan
data['duration'].fillna(data['duration'].median(),inplace=True)

In [None]:
sns.boxplot(data['duration'])

In [None]:
# CAMPAIGN
Q1= data['campaign'].quantile(q=.25)
Q3= data['campaign'].quantile(q=.75)
print('Q1 IS:',Q1)
print('Q3 IS:',Q3)

In [None]:
L_outliers=(Q1)-1.5*(Q3-Q1)
U_outliers=(Q3)+1.5*(Q3-Q1)
print('L_outliers :',L_outliers)
print('U_outliers :',U_outliers)

In [None]:
print('Number of oultliers in campaign upper : ',data[data['campaign']>6.0]['duration'].count())
print('Number of outliers in campaign lower : ', data[data['campaign']<(-2.0)]['duration'].count())

In [None]:
data.loc[data['campaign']>6]

In [None]:
median=data.loc[data['campaign']<6,'campaign'].median()
median

In [None]:
data.loc[data.campaign>6,'campaign']=np.nan
data['campaign'].fillna(data['campaign'].median(),inplace=True)

In [None]:
sns.boxplot(data['campaign'])

In [None]:
# Pdays
Q1= data['pdays'].quantile(q=.25)
Q3= data['pdays'].quantile(q=.75)
print('Q1 IS:',Q1)
print('Q3 IS:',Q3)

In [None]:
L_outliers=(Q1)-1.5*(Q3-Q1)
U_outliers=(Q3)+1.5*(Q3-Q1)
print('L_outliers :',L_outliers)
print('U_outliers :',U_outliers)

In [None]:
print('Number of oultliers in pdays upper : ',data[data['pdays']>999]['pdays'].count())
print('Number of outliers in pdays lower : ', data[data['pdays']<999]['pdays'].count())

In [None]:
data.loc[data['pdays']<999]

In [None]:
median=data.loc[data['pdays']<999,'pdays'].median()
median

In [None]:
data.loc[data.pdays<999,'pdays']=np.nan
data['pdays'].fillna(data['pdays'].median(),inplace=True)

In [None]:
sns.boxplot(data['pdays'])

### CONVERTING CATEGORICAL DATA IN TO NUMERICAL DATA

In [None]:
df=data.copy()

In [None]:
# converting to numerical data
df['marital'].replace(['married', 'single', 'divorced','unknown'],[3,2,1,0],inplace=True)
df['education'].replace(['basic.4y','basic.6y','basic.9y','professional.course', 'unknown', 'high.school','university.degree','illiterate'],[3,2,0,1,4,5,6,7],inplace=True)
df['contact'].replace([ 'cellular', 'telephone'],[1,2],inplace=True)
df['month'].replace(['may', 'jun', 'jul', 'aug', 'oct', 'nov', 'dec', 'jan', 'feb', 'mar', 'apr', 'sep'],[5,6,7,8,10,11,12,1,2,3,4,9],inplace=True)
df['poutcome'].replace([ 'success','nonexistent' ,'failure'],[0,2,1],inplace=True)
df['day_of_week'].replace(['mon','tue','wed','thu','fri'],[0,1,2,3,4],inplace=True)
df['job'].replace(['housemaid','services','admin.', 'blue-collar', 'technician' ,'retired','management', 'unemployed', 'self-employed', 'unknown' ,'entrepreneur','student'],[0,1,2,3,4,5,6,7,8,9,10,11],inplace=True)
df['default'].replace(['unknown','no','yes'],[1,0,2],inplace=True)
df['housing'].replace(['yes','no','unknown'],[1,0,2],inplace=True)
df['loan'].replace(['yes','no','unknown'],[1,0,2],inplace=True)
df['y'].replace(['no','yes'],[0,1],inplace=True)

In [None]:
df

In [None]:
df.info()

# MODEL BUILDING

In [None]:
X=df.drop(columns=['y'])
Y=df['y']

##### scaling the data using MinMaxScaler

In [None]:
from sklearn.preprocessing import MinMaxScaler
Scaler=MinMaxScaler()

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X , Y, test_size=.25, random_state=42, )

In [None]:
X_train1= Scaler.fit_transform(X_train)
X_test1=Scaler.transform(X_test)

In [None]:
X_train1.shape,X_test1.shape

## LOGISTIC REGRESSION

In [None]:
from sklearn.linear_model import LogisticRegression
log_reg= LogisticRegression()
log_reg.fit(X_train1,Y_train)
Y_pred=log_reg.predict(X_test1)
Y_pred

In [None]:
from sklearn.metrics import accuracy_score,auc,confusion_matrix,precision_score,recall_score
accuracy = accuracy_score(Y_test,Y_pred)
accuracy

In [None]:
Y_train_predict=log_reg.predict(X_train1)
acc_train=accuracy_score(Y_train,Y_train_predict)
acc_train

In [None]:
from sklearn.metrics import classification_report
print(classification_report(Y_test,Y_pred))


### Logistic regression algorithm got 91% of perfomance.

# KNN

In [None]:
from imblearn.over_sampling import SMOTE

In [None]:
sm=SMOTE()

In [None]:
x_sm,y_sm=sm.fit_resample(X_train1,Y_train)

In [None]:
from collections import Counter
counter=Counter()

In [None]:
print('Actual classes :',Counter(Y_train))
print("smoted classes:",Counter(y_sm))

In [None]:
from sklearn.neighbors import KNeighborsClassifier
error_rate = []
for i in range(1,11):             
    knn = KNeighborsClassifier(n_neighbors=i)                               
    knn.fit(x_sm,y_sm)                      
    pred_knn = knn.predict(X_test1)
    error_rate.append(np.mean(pred_knn != Y_test))         

In [None]:
plt.figure(figsize=(10,6))
plt.plot(range(1,11),error_rate,color='blue',linestyle='dashed',marker='o',markerfacecolor='red',markersize=10)
plt.title('Error Rate vs. K value')
plt.xlabel('k')
plt.ylabel('Error Rate')

In [None]:
knn=KNeighborsClassifier(n_neighbors=5)
knn.fit(x_sm,y_sm)
Y_pred=knn.predict(X_test1)

In [None]:
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix,precision_score

In [None]:
print("accuracy_score is : ",accuracy_score(Y_test,Y_pred))

In [None]:
print(confusion_matrix(Y_test,Y_pred))

In [None]:
print(classification_report(Y_test,Y_pred))

In [None]:
print(precision_score(Y_test,Y_pred))

### KNN classification accuracy got 82%.

# SVM

In [None]:
from sklearn.svm import SVC
svm=SVC()
svm.fit(X_train,Y_train)

In [None]:
Y_pred_sv=svm.predict(X_test)

In [None]:
from sklearn.metrics import accuracy_score,recall_score,classification_report,f1_score

In [None]:
print(accuracy_score(Y_test,Y_pred_sv))

In [None]:
print(classification_report(Y_test,Y_pred_sv))

In [None]:
cm=pd.crosstab(Y_test,Y_pred_sv)

In [None]:
cm

### from data applying SVM algorithm we have get 90% perfomance.

#  RANDOM  FOREST

In [None]:
from sklearn.ensemble import RandomForestClassifier
rf=RandomForestClassifier()
rf.fit(X_train,Y_train)

In [None]:
Y_pred_rf= rf.predict(X_test)

In [None]:
Y_pred_rf

In [None]:
(Y_test==Y_pred_rf).sum()/len(Y_test)*100

In [None]:
print(accuracy_score(Y_test,Y_pred_rf))

In [None]:
print(classification_report(Y_test,Y_pred_rf))

In [None]:
pd.crosstab(Y_test,Y_pred_rf)

### RANDOM FOREST algorithm got 91% accuracy. 

# XG BOOSTER

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
XGB=GradientBoostingClassifier()
XGB.fit(X_train,Y_train)

In [None]:
Y_pred_GB=XGB.predict(X_test)

In [None]:
from sklearn.metrics import accuracy_score,classification_report,recall_score,f1_score

In [None]:
print(accuracy_score(Y_test,Y_pred_GB))

In [None]:
print(classification_report(Y_test,Y_pred_GB))

In [None]:
pd.crosstab(Y_test,Y_pred_GB)

### XG BOOSTER Algorithm got 92% perfomance.

# DECISION TREE

In [None]:
from sklearn.tree import DecisionTreeClassifier
DT=DecisionTreeClassifier()
DT.fit(X_train,Y_train)

In [None]:
Y_pred_DT=DT.predict(X_test)

In [None]:
from sklearn.metrics import classification_report,accuracy_score,f1_score,recall_score

In [None]:
print(accuracy_score(Y_test,Y_pred_DT))

In [None]:
print(classification_report(Y_test,Y_pred_DT))

In [None]:
pd.crosstab(Y_test,Y_pred_DT)

   ## DECISION TREE algorithm got perfomance rate is 89%.

# CONCLUSION

## 1) Portughese bank data sets we predict the model using different type of machine learning algorithms LOGISTIC REGRESSION, KNN, SVM, RANDOM FOREST, XG BOOSTER, DECISION TREE.

## 2) We have get a different accuracy score in machine learning algorithms. here we got maximum accuracy score in XG BOOSTER Perfomance rate is 92% compared to other model evaluation.