In [1]:
# importing libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Read the raw banking dataset from the CSV file next to the notebook
banking_data = pd.read_csv('banking_data.csv')

# Print, in order: the first rows, the column index, the number of distinct
# values per column, and the summary statistics of the numeric columns.
dataset_overviews = (
    banking_data.head(),
    banking_data.columns,
    banking_data.nunique(),
    banking_data.describe(),
)
for overview in dataset_overviews:
    print(overview)

   age           job  marital marital_status  education default  balance  \
0   58    management  married        married   tertiary      no     2143   
1   44    technician   single         single  secondary      no       29   
2   33  entrepreneur  married        married  secondary      no        2   
3   47   blue-collar  married        married    unknown      no     1506   
4   33       unknown   single         single    unknown      no        1   

  housing loan  contact  day month day_month  duration  campaign  pdays  \
0     yes   no  unknown    5   may    05-May       261         1     -1   
1     yes   no  unknown    5   may    05-May       151         1     -1   
2     yes  yes  unknown    5   may    05-May        76         1     -1   
3     yes   no  unknown    5   may    05-May        92         1     -1   
4      no   no  unknown    5   may    05-May       198         1     -1   

   previous poutcome   y  
0         0  unknown  no  
1         0  unknown  no  
2         0

In [3]:
# DATA CLEANING

# Compare the value counts of fields that look like duplicates of one another
for candidate in ('marital', 'marital_status', 'day', 'month', 'day_month'):
    print(banking_data[candidate].value_counts())

# Remove the two redundant fields in a single call
banking_data.drop(columns=['marital_status', 'day_month'], inplace=True)

# Primary descriptive analysis of the key categorical fields
for field in ('marital', 'job', 'default', 'y'):
    print(banking_data[field].value_counts())

# Handling fields with null values: first count the nulls in every field
print(banking_data.isnull().sum())

# The missing values are in the categorical fields 'marital' and 'education'.
# Those rows are few in number (3 each), so drop every row missing either one.
banking_data.dropna(subset=['marital', 'education'], inplace=True)

marital
married     27216
single      12790
divorced     5207
Name: count, dtype: int64
marital_status
married     27216
single      12790
divorced     5207
Name: count, dtype: int64
day
20    2752
18    2308
21    2026
17    1942
6     1932
5     1910
14    1848
8     1842
28    1830
7     1817
19    1757
29    1745
15    1703
12    1603
13    1585
30    1566
9     1561
11    1479
4     1445
16    1417
2     1293
27    1121
3     1079
26    1035
23     939
22     905
25     840
31     643
10     524
24     447
1      322
Name: count, dtype: int64
month
may    13766
jul     6895
aug     6247
jun     5341
nov     3975
apr     2932
feb     2649
jan     1403
oct      738
sep      579
mar      477
dec      214
Name: count, dtype: int64
day_month
15-May    1126
14-May    1011
13-May    1002
07-May     891
08-May     841
          ... 
22-Jan       1
21-Jun       1
13-Jun       1
10-Jun       1
01-Aug       1
Name: count, Length: 318, dtype: int64
marital
married     27216
single      12790


In [4]:
# DERIVING PRELIMINARY INFERENCES

# Row masks reused by several of the metrics below
subscribed_mask = banking_data['y'] == 'yes'
not_subscribed_mask = banking_data['y'] == 'no'

# Conversion Rate: share of all clients whose target 'y' is 'yes'
total_clients = len(banking_data)
subscribed_clients = int(subscribed_mask.sum())
conversion_rate = (subscribed_clients / total_clients) * 100
print(f"Conversion Rate: {conversion_rate:.2f}%")

# Average Contact Duration
average_duration = banking_data['duration'].mean()
print(f"Average Contact Duration: {average_duration:.2f} seconds")

# Average Balance by Subscription Status
# NOTE(review): these messages report the balance in "Rs." while the plots in
# this notebook label the same column in Euros - confirm the currency.
average_balance_subscribed = banking_data.loc[subscribed_mask, 'balance'].mean()
average_balance_not_subscribed = banking_data.loc[not_subscribed_mask, 'balance'].mean()
print(f"Average Yearly Balance for Subscribed Clients: {average_balance_subscribed:.2f} Rs.")
print(f"Average Yearly Balance for Non-Subscribed Clients: {average_balance_not_subscribed:.2f} Rs.")

# Loan Default Rate: share of all clients whose 'default' is 'yes'
clients_with_default = int((banking_data['default'] == 'yes').sum())
default_rate = (clients_with_default / total_clients) * 100
print(f"Loan Default Rate: {default_rate:.2f}%")

# Average Number of Contacts
average_contacts = banking_data['campaign'].mean()
print(f"Average Number of Contacts per Client: {average_contacts:.2f}")

# Previous Campaign Success Rate: successes among rows whose previous outcome
# is anything other than 'unknown'
previous_campaign_success = int((banking_data['poutcome'] == 'success').sum())
previous_campaign_total = int((banking_data['poutcome'] != 'unknown').sum())
previous_campaign_success_rate = (previous_campaign_success / previous_campaign_total) * 100
print(f"Previous Campaign Success Rate: {previous_campaign_success_rate:.2f}%")

# Customer Churn Data: rows with pdays other than -1 that did not subscribe,
# as a share of all clients
churned_mask = (banking_data['pdays'] != -1) & not_subscribed_mask
churned_clients = int(churned_mask.sum())
churn_rate = churned_clients / total_clients * 100
print(f"Churn Rate: {churn_rate:.2f}%")

Conversion Rate: 11.71%
Average Contact Duration: 258.16 seconds
Average Yearly Balance for Subscribed Clients: 1804.18 Rs.
Average Yearly Balance for Non-Subscribed Clients: 1303.74 Rs.
Loan Default Rate: 1.80%
Average Number of Contacts per Client: 2.76
Previous Campaign Success Rate: 18.34%
Churn Rate: 14.04%


In [None]:
# VISUALISATIONS
# Each figure below is built on an explicit (fig, ax) pair.

# Histogram of client age
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(banking_data['age'], bins=30, edgecolor='k', alpha=0.7)
ax.set(title='Distribution of Age among Clients', xlabel='Age', ylabel='Frequency')
plt.show()

# Bar chart of the job type distribution
fig, ax = plt.subplots(figsize=(12, 6))
banking_data['job'].value_counts().plot.bar(ax=ax, color='skyblue', edgecolor='k', rot=10)
ax.set(title='Job Type Distribution among Clients', xlabel='Job Type', ylabel='Number of Clients')
plt.show()

# Pie chart of the marital status distribution
fig, ax = plt.subplots(figsize=(8, 8))
banking_data['marital'].value_counts().plot.pie(
    ax=ax,
    autopct='%1.1f%%',
    startangle=140,
    colors=['skyblue', 'lightgreen', 'lightcoral'],
)
ax.set_title('Marital Status Distribution among Clients')
ax.set_ylabel('')  # clear the y label that pandas puts on pie charts
plt.show()

# Bar chart of the education level distribution
fig, ax = plt.subplots(figsize=(12, 6))
banking_data['education'].value_counts().plot.bar(ax=ax, color='lightgreen', edgecolor='k', rot=0)
ax.set(title='Education Level Distribution among Clients', xlabel='Education Level', ylabel='Number of Clients')
plt.show()

# Pie chart of the credit-in-default proportion
fig, ax = plt.subplots(figsize=(8, 8))
banking_data['default'].value_counts().plot.pie(
    ax=ax,
    autopct='%1.1f%%',
    startangle=140,
    colors=['lightcoral', 'skyblue'],
)
ax.set_title('Proportion of Clients with Credit in Default')
ax.set_ylabel('')
plt.show()

# Histogram of the average yearly balance
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(banking_data['balance'], bins=30, edgecolor='k', alpha=0.7, color='purple')
ax.set(
    title='Distribution of Average Yearly Balance among Clients',
    xlabel='Average Yearly Balance (Euros)',
    ylabel='Frequency',
)
plt.show()

# Pie charts of the housing loan and personal loan distributions.
# The two charts differ only in their data, colours and title, so they are
# drawn by one loop.
housing_loan_counts = banking_data['housing'].value_counts()
personal_loan_counts = banking_data['loan'].value_counts()
loan_pies = (
    (housing_loan_counts, ['lightgreen', 'lightcoral'], 'Proportion of Clients with Housing Loans'),
    (personal_loan_counts, ['lightblue', 'lightcoral'], 'Proportion of Clients with Personal Loans'),
)
for loan_counts, wedge_colors, heading in loan_pies:
    fig, ax = plt.subplots(figsize=(8, 8))
    loan_counts.plot.pie(ax=ax, autopct='%1.1f%%', startangle=140, colors=wedge_colors)
    ax.set_title(heading)
    ax.set_ylabel('')  # clear the y label that pandas puts on pie charts
    plt.show()

# Bar chart of the communication types distribution
fig, ax = plt.subplots(figsize=(10, 6))
banking_data['contact'].value_counts().plot.bar(ax=ax, color='orange', edgecolor='k', rot=0)
ax.set(
    title='Communication Types Used for Contacting Clients',
    xlabel='Communication Type',
    ylabel='Number of Clients',
)
plt.show()

# Plotting a histogram of the distribution of the last contact day of the month.
# 'day' holds whole days of the month, so use 31 unit-wide bins centred on
# each day (edges 0.5, 1.5, ..., 31.5). With 30 automatic bins over the values
# 1..31 the edges fall on 1, 2, ..., 31 and the closed last bin counts days 30
# and 31 together in one bar.
plt.figure(figsize=(10, 6))
plt.hist(banking_data['day'], bins=31, range=(0.5, 31.5), edgecolor='k', alpha=0.7, color='skyblue')
plt.title('Distribution of the Last Contact Day of the Month')
plt.xlabel('Day of the Month')
plt.ylabel('Frequency')
plt.show()

# Bar chart of the last contact month distribution
fig, ax = plt.subplots(figsize=(10, 6))
banking_data['month'].value_counts().plot.bar(ax=ax, color='lightgreen', edgecolor='k', rot=45)
ax.set(title='Last Contact Month Distribution among Clients', xlabel='Month', ylabel='Number of Clients')
plt.show()

# Histograms (30 bins each) of the numeric contact fields, drawn in order:
# duration of the last contact, number of contacts during the campaign, days
# since the client was last contacted, and contacts before the current campaign.
contact_histograms = [
    ('duration', 'lightcoral', 'Distribution of the Duration of the Last Contact', 'Duration (seconds)'),
    ('campaign', 'skyblue', 'Distribution of the Number of Contacts during the Campaign', 'Number of Contacts'),
    ('pdays', 'lightblue', 'Distribution of the Number of Days since Last Contact', 'Number of Days'),
    ('previous', 'purple', 'Distribution of the Number of Contacts before the Current Campaign', 'Number of Contacts'),
]
for field, fill_color, heading, x_text in contact_histograms:
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.hist(banking_data[field], bins=30, edgecolor='k', alpha=0.7, color=fill_color)
    ax.set(title=heading, xlabel=x_text, ylabel='Frequency')
    plt.show()

# Bar chart of the outcomes of the previous marketing campaigns
fig, ax = plt.subplots(figsize=(10, 6))
banking_data['poutcome'].value_counts().plot.bar(ax=ax, color='lightgreen', edgecolor='k', rot=0)
ax.set(title='Outcomes of the Previous Marketing Campaigns', xlabel='Outcome', ylabel='Number of Clients')
plt.show()

# Pie chart of the distribution of the target variable
fig, ax = plt.subplots(figsize=(8, 8))
banking_data['y'].value_counts().plot.pie(
    ax=ax,
    autopct='%1.1f%%',
    startangle=140,
    colors=['skyblue', 'lightcoral'],
)
ax.set_title('Distribution of Clients who Subscribed to a Term Deposit')
ax.set_ylabel('')  # clear the y label that pandas puts on pie charts
plt.show()

# Plotting the correlation heatmap
# Encode the yes/no categorical fields as 1/0 on a copy of the data so they
# can take part in the correlation analysis.
encoded_data = banking_data.copy()
yes_no_codes = {'yes': 1, 'no': 0}
for binary_field in ['default', 'housing', 'loan', 'y']:
    # Assign the mapped column back to the frame. Calling
    # `encoded_data[col].replace(..., inplace=True)` is a chained in-place
    # update on a column selection: pandas warns about it, and under
    # Copy-on-Write it leaves `encoded_data` unchanged, which would silently
    # drop these fields from the heatmap.
    # Any value other than 'yes'/'no' becomes NaN, which `corr()` skips.
    encoded_data[binary_field] = encoded_data[binary_field].map(yes_no_codes)
# Keep only the numeric columns (the encoded fields are numeric now)
numeric_data = encoded_data.select_dtypes(include=['int64', 'float64'])
# Calculate the pairwise correlation matrix
correlation_matrix = numeric_data.corr()
# Plotting the heatmap
plt.figure(figsize=(12, 6))
sns.heatmap(correlation_matrix, annot=True, cmap='YlGnBu',fmt=".2f")
plt.title('Correlation Matrix Heatmap')
plt.show()
# ADDITIONAL VISUALISATIONS
# Each figure relates one client attribute to the subscription target 'y'.

# Relationship between Age and Subscription
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=banking_data, x='y', y='age', ax=ax)
ax.set(title='Age vs Subscription to Term Deposit', xlabel='Subscribed to Term Deposit', ylabel='Age')
plt.show()

# Impact of Education Level on Subscription
fig, ax = plt.subplots(figsize=(12, 6))
sns.countplot(data=banking_data, x='education', hue='y', ax=ax)
ax.set(
    title='Education Level vs Subscription to Term Deposit',
    xlabel='Education Level',
    ylabel='Number of Clients',
)
ax.tick_params(axis='x', labelrotation=0)
plt.show()

# Effect of Job Type on Subscription
fig, ax = plt.subplots(figsize=(12, 6))
sns.countplot(data=banking_data, x='job', hue='y', ax=ax)
ax.set(title='Job Type vs Subscription to Term Deposit', xlabel='Job Type', ylabel='Number of Clients')
ax.tick_params(axis='x', labelrotation=15)  # tilt the job labels slightly
plt.show()

# Subscription Rate by Month
fig, ax = plt.subplots(figsize=(12, 6))
sns.countplot(data=banking_data, x='month', hue='y', ax=ax)
ax.set(title='Subscription Rate by Month', xlabel='Month', ylabel='Number of Clients')
ax.tick_params(axis='x', labelrotation=0)
plt.show()

# Duration of Contact and Subscription Rate
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=banking_data, x='y', y='duration', ax=ax)
ax.set(
    title='Duration of Contact vs Subscription to Term Deposit',
    xlabel='Subscribed to Term Deposit',
    ylabel='Duration of Last Contact (seconds)',
)
plt.show()

# Balance and Subscription
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=banking_data, x='y', y='balance', ax=ax)
ax.set(
    title='Balance vs Subscription to Term Deposit',
    xlabel='Subscribed to Term Deposit',
    ylabel='Average Yearly Balance (Euros)',
)
plt.show()

# Correlation between Multiple Loans and Subscription
# For every housing-loan / personal-loan combination, plot the share of
# clients with each subscription outcome. The shares are normalised within
# each combination so the bars show what the 'Proportion of Clients' axis
# label promises; raw counts were plotted under that label before.
# `loan_subscription` holds the Axes returned by the plot call.
loan_subscription = (
    banking_data.groupby(['housing', 'loan'])['y']
    .value_counts(normalize=True)
    .unstack()
    .fillna(0)  # a combination with no clients for an outcome has a 0 share
    .plot(kind='bar', stacked=True, figsize=(10, 6))
)
plt.title('Multiple Loans vs Subscription to Term Deposit')
plt.xlabel('Housing Loan and Personal Loan Status')
plt.ylabel('Proportion of Clients')
plt.legend(title='Subscribed to Term Deposit', loc='upper right')
plt.xticks(rotation=0)
plt.show()

# Effectiveness of Different Contact Methods
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(data=banking_data, x='contact', hue='y', ax=ax)
ax.set(
    title='Contact Method vs Subscription to Term Deposit',
    xlabel='Contact Method',
    ylabel='Number of Clients',
)
plt.show()

# Average Balance by Job Type, highest average first
mean_balance_per_job = banking_data.groupby('job')['balance'].mean()
avg_balance_by_job = mean_balance_per_job.sort_values(ascending=False)
fig, ax = plt.subplots(figsize=(12, 6))
avg_balance_by_job.plot.bar(ax=ax, color='skyblue', edgecolor='k', rot=12)
ax.set(title='Average Balance by Job Type', xlabel='Job Type', ylabel='Average Balance (Euros)')
plt.show()

# Subscription Rate by Age Group
# Bin the ages into right-closed groups. Two gaps in the earlier binning are
# closed here so that no client is silently turned into NaN and dropped:
#   * include_lowest=True keeps clients aged exactly 18 in the first group
#     (pandas labels that group with a left edge just below 18);
#   * when the oldest client is above 90, a final edge at that age is added.
age_bin_edges = [18, 30, 40, 50, 60, 70, 80, 90]
oldest_client_age = banking_data['age'].max()
if oldest_client_age > age_bin_edges[-1]:
    age_bin_edges.append(oldest_client_age)
banking_data['age_group'] = pd.cut(banking_data['age'], bins=age_bin_edges, include_lowest=True)

# Share of each subscription outcome within every age group. The shares are
# normalised so the bars match the 'Proportion of Clients' axis label; raw
# counts were plotted under that label before. observed=False is passed
# explicitly to keep every age group on the axis and avoid relying on the
# changing groupby default for categorical keys.
subscription_rate_by_age_group = (
    banking_data.groupby('age_group', observed=False)['y']
    .value_counts(normalize=True)
    .unstack()
    .fillna(0)
)
subscription_rate_by_age_group.plot(kind='bar', stacked=True, figsize=(12, 6), color=['lightcoral', 'skyblue'])
plt.title('Subscription Rate by Age Group')
plt.xlabel('Age Group')
plt.ylabel('Proportion of Clients')
plt.xticks(rotation=0)
plt.legend(title='Subscribed to Term Deposit')
plt.show()

# Subscription Rate by Education Level
# Share of each subscription outcome within every education level. The shares
# are normalised so the bars match the 'Proportion of Clients' axis label and
# the 'Subscription Rate' title; raw counts were plotted under them before.
subscription_rate_by_education = (
    banking_data.groupby('education')['y']
    .value_counts(normalize=True)
    .unstack()
    .fillna(0)
)
subscription_rate_by_education.plot(kind='bar', stacked=True, figsize=(12, 6), color=['lightcoral', 'skyblue'])
plt.title('Subscription Rate by Education Level')
plt.xlabel('Education Level')
plt.ylabel('Proportion of Clients')
plt.xticks(rotation=0)
plt.legend(title='Subscribed to Term Deposit')
plt.show()

# Subscription Rate by Marital Status
# Same normalised view, grouped by marital status.
subscription_rate_by_marital = (
    banking_data.groupby('marital')['y']
    .value_counts(normalize=True)
    .unstack()
    .fillna(0)
)
subscription_rate_by_marital.plot(kind='bar', stacked=True, figsize=(12, 6), color=['lightcoral', 'skyblue'])
plt.title('Subscription Rate by Marital Status')
plt.xlabel('Marital Status')
plt.ylabel('Proportion of Clients')
plt.xticks(rotation=0)
plt.legend(title='Subscribed to Term Deposit')
plt.show()

# Average Contact Duration by Contact Method, longest average first
mean_duration_per_method = banking_data.groupby('contact')['duration'].mean()
avg_duration_by_contact = mean_duration_per_method.sort_values(ascending=False)
fig, ax = plt.subplots(figsize=(12, 6))
avg_duration_by_contact.plot.bar(ax=ax, color='orange', edgecolor='k', rot=0)
ax.set(
    title='Average Contact Duration by Contact Method',
    xlabel='Contact Method',
    ylabel='Average Duration (seconds)',
)
plt.show()

# Subscription Rate by Housing Loan Status
# Share of each subscription outcome among clients with and without a housing
# loan. The shares are normalised so the bars match the 'Proportion of
# Clients' axis label and the 'Subscription Rate' title; raw counts were
# plotted under them before.
subscription_rate_by_housing = (
    banking_data.groupby('housing')['y']
    .value_counts(normalize=True)
    .unstack()
    .fillna(0)
)
subscription_rate_by_housing.plot(kind='bar', stacked=True, figsize=(12, 6), color=['lightcoral', 'skyblue'])
plt.title('Subscription Rate by Housing Loan Status')
plt.xlabel('Housing Loan Status')
plt.ylabel('Proportion of Clients')
plt.xticks(rotation=0)
plt.legend(title='Subscribed to Term Deposit')
plt.show()

# Subscription Rate by Personal Loan Status
# Same normalised view, grouped by personal loan status.
subscription_rate_by_loan = (
    banking_data.groupby('loan')['y']
    .value_counts(normalize=True)
    .unstack()
    .fillna(0)
)
subscription_rate_by_loan.plot(kind='bar', stacked=True, figsize=(12, 6), color=['lightcoral', 'skyblue'])
plt.title('Subscription Rate by Personal Loan Status')
plt.xlabel('Personal Loan Status')
plt.ylabel('Proportion of Clients')
plt.xticks(rotation=0)
plt.legend(title='Subscribed to Term Deposit')
plt.show()