# CAPSTONE 1: PREDICT IF CLIENT WILL SUBSCRIBE TO A TERM DEPOSIT  

This Capstone Project seeks to explore the following concepts:
- Exploratory Data Analysis
- Data Visualisation
- Data Clustering: K-means clustering
- Data Correlation
- Predictive Analysis
- Machine Learning

#### IMPORT LIBRARIES AND INSPECT DATA

In [289]:
# import libraries

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
from random import sample
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

In [290]:
# Load the two semicolon-delimited CSV files into the `train` and `test` DataFrames.
# NOTE(review): these are absolute paths on one machine; point them at a shared,
# configurable data directory before running the notebook elsewhere.

train = pd.read_csv(r"C:\Users\fkole\Desktop\Kemi_Drive\Other_springboard\Capstone_1_project\bank-additional-full.csv", sep=';')
test = pd.read_csv(r"C:\Users\fkole\Desktop\Kemi_Drive\Other_springboard\Capstone_1_project\bank-additional.csv", sep=';')

# Inspect the (rows, columns) shape of the loaded frame.
# The variable is `train`; the previous `Train` raised a NameError.
print(train.shape)

(41188, 21)


#### EXPLORATORY DATA ANALYSIS AND PREPARATION

In [291]:
# View the column summary: dtype and non-null count per column, plus memory usage.
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() only adds a stray "None" line to the output.

test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4119 entries, 0 to 4118
Data columns (total 21 columns):
age               4119 non-null int64
job               4119 non-null object
marital           4119 non-null object
education         4119 non-null object
default           4119 non-null object
housing           4119 non-null object
loan              4119 non-null object
contact           4119 non-null object
month             4119 non-null object
day_of_week       4119 non-null object
duration          4119 non-null int64
campaign          4119 non-null int64
pdays             4119 non-null int64
previous          4119 non-null int64
poutcome          4119 non-null object
emp.var.rate      4119 non-null float64
cons.price.idx    4119 non-null float64
cons.conf.idx     4119 non-null float64
euribor3m         4119 non-null float64
nr.employed       4119 non-null float64
y                 4119 non-null object
dtypes: float64(5), int64(5), object(11)
memory usage: 675.9+ KB
None


In [292]:
# Count the missing values in each column of the test frame.
test.isna().sum()

age               0
job               0
marital           0
education         0
default           0
housing           0
loan              0
contact           0
month             0
day_of_week       0
duration          0
campaign          0
pdays             0
previous          0
poutcome          0
emp.var.rate      0
cons.price.idx    0
cons.conf.idx     0
euribor3m         0
nr.employed       0
y                 0
dtype: int64

In [300]:
# Drop the columns below; they mostly relate to events that have already happened.

test_updated = test.drop(
    columns=[
        'contact',
        'month',
        'day_of_week',
        'duration',
        'campaign',
        'pdays',
        'previous',
        'poutcome',
    ]
)
test_updated.head()

Unnamed: 0,age,job,marital,education,default,housing,loan,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,30,blue-collar,married,basic.9y,no,yes,no,-1.8,92.893,-46.2,1.313,5099.1,no
1,39,services,single,high.school,no,no,no,1.1,93.994,-36.4,4.855,5191.0,no
2,25,services,married,high.school,no,yes,no,1.4,94.465,-41.8,4.962,5228.1,no
3,38,services,married,basic.9y,no,unknown,unknown,1.4,94.465,-41.8,4.959,5228.1,no
4,47,admin.,married,university.degree,no,yes,no,-0.1,93.2,-42.0,4.191,5195.8,no


In [301]:
# ESTIMATE MISSING DATA

# Total number of cells in the dataframe (rows x columns).
# np.prod replaces np.product, which was deprecated in NumPy 1.25 and removed
# in NumPy 2.0.
totalCells = np.prod(test_updated.shape)

# Number of missing values in each column
missingCount = test_updated.isnull().sum()

# Total number of missing values across all columns
totalMissing = missingCount.sum()

# Share of cells that are missing, as a percentage rounded to 2 decimal places
print("The Test dataset contains", round(((totalMissing/totalCells) * 100), 2), "%", "missing values.")

The Test dataset contains 0.0 % missing values.


#### EXPLORATORY ANALYSIS

In [302]:
# Use the describe function to get summary statistics of the data.
test_updated.describe()

Unnamed: 0,age,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed
count,4119.0,4119.0,4119.0,4119.0,4119.0,4119.0
mean,40.11362,0.084972,93.579704,-40.499102,3.621356,5166.481695
std,10.313362,1.563114,0.579349,4.594578,1.733591,73.667904
min,18.0,-3.4,92.201,-50.8,0.635,4963.6
25%,32.0,-1.8,93.075,-42.7,1.334,5099.1
50%,38.0,1.1,93.749,-41.8,4.857,5191.0
75%,47.0,1.4,93.994,-36.4,4.961,5228.1
max,88.0,1.4,94.767,-26.9,5.045,5228.1


In [303]:
# Preview the first five rows of the reduced test frame.
test_updated.head(n=5)

Unnamed: 0,age,job,marital,education,default,housing,loan,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,30,blue-collar,married,basic.9y,no,yes,no,-1.8,92.893,-46.2,1.313,5099.1,no
1,39,services,single,high.school,no,no,no,1.1,93.994,-36.4,4.855,5191.0,no
2,25,services,married,high.school,no,yes,no,1.4,94.465,-41.8,4.962,5228.1,no
3,38,services,married,basic.9y,no,unknown,unknown,1.4,94.465,-41.8,4.959,5228.1,no
4,47,admin.,married,university.degree,no,yes,no,-0.1,93.2,-42.0,4.191,5195.8,no


In [305]:
# Draw 5 distinct row positions from a locally seeded generator, so the same
# sample is shown every time the notebook is re-run from a fresh kernel.
randomIndex = np.random.default_rng(42).choice(len(test_updated), size=5, replace=False)

# Select by position (.iloc) rather than by label (.loc): the drawn values are
# positions in 0..len-1, which match the index labels only when the frame still
# has its default RangeIndex.
test_updatedSample = test_updated.iloc[randomIndex]

# Display the sample
test_updatedSample

Unnamed: 0,age,job,marital,education,default,housing,loan,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
313,41,admin.,divorced,high.school,no,no,no,1.1,93.994,-36.4,4.857,5191.0,no
2430,26,admin.,single,high.school,no,no,no,1.1,93.994,-36.4,4.859,5191.0,yes
1049,37,unemployed,unknown,university.degree,no,no,no,-2.9,92.963,-40.8,1.262,5076.2,no
1264,26,student,single,unknown,no,yes,no,-3.4,92.649,-30.1,0.716,5017.5,no
3781,31,admin.,single,high.school,no,no,no,1.4,93.918,-42.7,4.962,5228.1,no
