# Project: Banking Marketing Campaign

# Step 1: Import Essential Libraries 

In [1]:
# Libraries are specialized "expert kits" that plug into Python; these are the ones essential for EDA.
import requests
import os
from pathlib import Path
import pandas as pd
from tabulate import tabulate
import matplotlib.pyplot as plt
import seaborn as sns
import json
from numpy._core.defchararray import upper
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.feature_selection import f_classif, SelectKBest
import pickle # -- Binary (unreadable by humans) -- Can save almost any Python object -- Very fast for complex objects --


# Step 2: Problem Statement / Data Collection
## 2.1 Description of the problem

    - The Portuguese bank is experiencing a decline in revenue, so they want to be able to identify existing customers who are more likely to take out a long-term deposit. This will allow the bank to focus their marketing efforts on those customers and avoid wasting money and time on customers who are unlikely to sign up.
    - To address this problem we will create a ranking algorithm to help predict whether or not a customer will sign up for a long-term deposit.

## 2.2 Initial Loading and Inspection

In [2]:
def setup_project_structure(base_path, subfolders):
    """Create every folder in ``subfolders`` underneath ``base_path``.

    Missing parent directories are created as needed and folders that
    already exist are left untouched, so the call can be repeated safely.
    A confirmation line is printed once all folders have been processed.
    """
    for name in subfolders:
        # Path(a, b) joins the segments exactly like Path(a) / b.
        Path(base_path, name).mkdir(parents=True, exist_ok=True)
    print(f"Project structure initialized in: {base_path}")

def download_dataset_to_raw(url, full_path, timeout=30):
    """Stream a remote file to ``full_path`` in 8 KiB chunks.

    Parameters
    ----------
    url : str
        Address of the file to download.
    full_path : str or Path
        Destination file. It is opened in binary write mode, so an existing
        file is overwritten; a missing parent directory makes the call fail.
    timeout : float or tuple, optional
        Forwarded to ``requests.get``. Without a timeout a stalled server
        would block the notebook indefinitely.

    Returns
    -------
    bool
        True when the whole file was written, False on any failure. Errors
        are printed, never raised.
    """
    try:
        # Using the response as a context manager releases the connection
        # even when writing to disk fails half-way through.
        with requests.get(url, stream=True, timeout=timeout) as response:
            response.raise_for_status()

            with open(full_path, 'wb') as file:
                for chunk in response.iter_content(chunk_size=8192):
                    if chunk:  # skip empty keep-alive chunks
                        file.write(chunk)
    except requests.exceptions.RequestException as e:
        # Network-level problems (HTTP error status, timeout, DNS, ...).
        # This handler must come before the generic one to be reachable.
        print(f"An error occurred during the download: {e}")
        return False
    except Exception as e:
        # Anything else, e.g. the destination cannot be opened for writing.
        print(f"Download failed: {e}")
        return False

    print(f"Download successful! Saved to: {full_path}")
    return True
    
# --- SETTINGS ---
# Root of the local data tree and the stage folders created beneath it.
BASE_DIR = "./data"
FOLDERS = ["raw", "processed", "interim"]
# Remote location of the CSV and the file name it gets on disk.
DATA_URL = "https://breathecode.herokuapp.com/asset/internal-link?id=413&path=bank-marketing-campaign-data.csv"
TARGET_FILE = "bank-marketing-campaign-data.csv"

# Destination of the untouched download: <BASE_DIR>/raw/<TARGET_FILE>.
RAW_FILE_PATH = Path(BASE_DIR, "raw", TARGET_FILE)

# --- RUN ---
# Create the folder tree first, then fetch the dataset into the raw folder.
setup_project_structure(BASE_DIR, FOLDERS)
download_dataset_to_raw(DATA_URL, RAW_FILE_PATH)

Project structure initialized in: ./data
Download successful! Saved to: data/raw/bank-marketing-campaign-data.csv


True

In [3]:
# Load the raw CSV from RAW_FILE_PATH. sep=None makes the python engine
# detect the delimiter instead of assuming a comma.
try:
    df = pd.read_csv(RAW_FILE_PATH, sep= None, engine= 'python')
except FileNotFoundError:
    print(f"\nError: File not found at {RAW_FILE_PATH}")
    # Every later cell needs `df`; stop here rather than letting the
    # notebook continue and fail further down with an unrelated NameError.
    raise
else:
    # Only reached when the read succeeded, so the try block guards nothing
    # but the file access itself.
    print("\nDataset loaded successfully into a DataFrame:")
    print(tabulate(df.head(), headers= "keys", tablefmt= "psql"))


Dataset loaded successfully into a DataFrame:
+----+-------+-----------+-----------+-------------+-----------+-----------+--------+-----------+---------+---------------+------------+------------+---------+------------+-------------+----------------+------------------+-----------------+-------------+---------------+-----+
|    |   age | job       | marital   | education   | default   | housing   | loan   | contact   | month   | day_of_week   |   duration |   campaign |   pdays |   previous | poutcome    |   emp.var.rate |   cons.price.idx |   cons.conf.idx |   euribor3m |   nr.employed | y   |
|----+-------+-----------+-----------+-------------+-----------+-----------+--------+-----------+---------+---------------+------------+------------+---------+------------+-------------+----------------+------------------+-----------------+-------------+---------------+-----|
|  0 |    56 | housemaid | married   | basic.4y    | no        | no        | no     | telephone | may     | mon           

# Step 3: Exploration and Data Cleaning
## 3.1 Dataset Variables Description
* **Variables**
    1. **age.** Age of customer (numeric)
    2. **job.** Type of job (categorical)
    3. **marital.** Marital status (categorical)
    4. **education.** Level of education (categorical)
    5. **default.** Whether the customer has credit in default (categorical)
    6. **housing.** Do you have a housing loan (categorical)
    7. **loan.** Do you have a personal loan? (categorical)
    8. **contact.** Type of contact communication (categorical)
    9. **month.** Last month in which you have been contacted (categorical)
    10. **day_of_week.** Last day on which you have been contacted (categorical)
    11. **duration.** Duration of previous contact in seconds (numeric)
    12. **campaign.** Number of contacts made during this campaign to the customer (numeric)
    13. **pdays.** Number of days that elapsed since the last campaign until the customer was contacted (numeric)
    14. **previous.** Number of contacts made during the previous campaign to the customer (numeric)
    15. **poutcome.** Result of the previous marketing campaign (categorical)
    16. **emp.var.rate.** Employment variation rate. Quarterly indicator (numeric)
    17. **cons.price.idx.** Consumer price index. Monthly indicator (numeric)
    18. **cons.conf.idx.** Consumer confidence index. Monthly indicator (numeric)
    19. **euribor3m.** EURIBOR 3-month rate. Daily indicator (numeric)
    20. **nr.employed.** Number of employees. Quarterly indicator (numeric)
    21. **y.** TARGET. Whether the customer takes out a long-term deposit or not (categorical)

## 3.2 DataSet Dimension & Typology Visibility

In [4]:
# df.shape is a (rows, columns) tuple; pick each dimension by position.
row = df.shape[0]
col = df.shape[1]
print(f"The Dataset has {row} number of rows and {col} columns.")

The Dataset has 41188 number of rows and 21 columns.


## 3.3 Top 5 Row View

In [5]:
# head() with no argument returns the first five rows of the DataFrame.
top_five_data = df.head()
# The preview shows rows, not columns, so the label says so.
print("These are the top 5 rows of the DataSet to be analyzed:")
print(tabulate(top_five_data, headers='keys', tablefmt='psql'))

These are the DataSet top 5 columns view to be analized:
+----+-------+-----------+-----------+-------------+-----------+-----------+--------+-----------+---------+---------------+------------+------------+---------+------------+-------------+----------------+------------------+-----------------+-------------+---------------+-----+
|    |   age | job       | marital   | education   | default   | housing   | loan   | contact   | month   | day_of_week   |   duration |   campaign |   pdays |   previous | poutcome    |   emp.var.rate |   cons.price.idx |   cons.conf.idx |   euribor3m |   nr.employed | y   |
|----+-------+-----------+-----------+-------------+-----------+-----------+--------+-----------+---------+---------------+------------+------------+---------+------------+-------------+----------------+------------------+-----------------+-------------+---------------+-----|
|  0 |    56 | housemaid | married   | basic.4y    | no        | no        | no     | telephone | may     | mon 

## 3.4 Data Types and Non-Null Values Overview

In [6]:
print("This is the information about Non-Null and Dtype:\n================================================")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() appends a stray "None" line.
df.info()

This is the information about Non-Null and Dtype:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41188 entries, 0 to 41187
Data columns (total 21 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   age             41188 non-null  int64  
 1   job             41188 non-null  object 
 2   marital         41188 non-null  object 
 3   education       41188 non-null  object 
 4   default         41188 non-null  object 
 5   housing         41188 non-null  object 
 6   loan            41188 non-null  object 
 7   contact         41188 non-null  object 
 8   month           41188 non-null  object 
 9   day_of_week     41188 non-null  object 
 10  duration        41188 non-null  int64  
 11  campaign        41188 non-null  int64  
 12  pdays           41188 non-null  int64  
 13  previous        41188 non-null  int64  
 14  poutcome        41188 non-null  object 
 15  emp.var.rate    41188 non-null  float64
 16  cons.price.idx  41188 non-

## 3.5 Check Unique

In [7]:
# Heading followed by the per-column count of distinct values; sep="\n" puts
# the Series on its own line exactly as a second print() call would.
print(
    "These are the Unique Values for each columns on the Dataset:\n============================================================",
    df.nunique(),
    sep="\n",
)

These are the Unique Values for each columns on the Dataset:
age                 78
job                 12
marital              4
education            8
default              3
housing              3
loan                 3
contact              2
month               10
day_of_week          5
duration          1544
campaign            42
pdays               27
previous             8
poutcome             3
emp.var.rate        10
cons.price.idx      26
cons.conf.idx       26
euribor3m          316
nr.employed         11
y                    2
dtype: int64


### 3.5.1 Conclusions about the data info:
- There is a total of 41188 rows and 21 columns.
- INTEGER dtype: 5 int64 columns.
- FLOAT dtype: 5 float64 columns.
- OBJECT dtype: 11 object columns.
### Non-Null checking:
- int64: 100% ok.
- float64: 100% ok.
- object: 100% ok. 

## 3.6 Check for Duplicate Values

In [8]:
# The dataset has no identifier column, so rows are compared on all fields.
# keep="first" (the default) flags only the repeats, not the first occurrence.
duplicate_val = df.duplicated(keep="first").sum()
print(f"There are << {duplicate_val} >> duplicated values in the dataset.")

There are << 12 >> duplicated values in the dataset.


## 3.7 Duplicates Inspection

In [9]:
# keep=False flags every member of a duplicated group (first occurrence and
# repeats alike), so the matching rows can be inspected together.
duplicate_rows = df.loc[df.duplicated(keep=False)]
print("Table with duplicates rows:\n======================================================")
# 24 rows is enough to show all 12 repeats together with their originals.
print(tabulate(duplicate_rows.head(24), headers="keys", tablefmt="psql"))

Table with duplicates rows:
+-------+-------+-------------+-----------+---------------------+-----------+-----------+--------+-----------+---------+---------------+------------+------------+---------+------------+-------------+----------------+------------------+-----------------+-------------+---------------+-----+
|       |   age | job         | marital   | education           | default   | housing   | loan   | contact   | month   | day_of_week   |   duration |   campaign |   pdays |   previous | poutcome    |   emp.var.rate |   cons.price.idx |   cons.conf.idx |   euribor3m |   nr.employed | y   |
|-------+-------+-------------+-----------+---------------------+-----------+-----------+--------+-----------+---------+---------------+------------+------------+---------+------------+-------------+----------------+------------------+-----------------+-------------+---------------+-----|
|  1265 |    39 | blue-collar | married   | basic.6y            | no        | no        | no     | tel

## 3.8 Drop Duplicates

In [10]:
# Drop exact duplicate rows (the first occurrence is kept) and relabel the
# remaining rows 0..n-1 in the same call.
df = df.drop_duplicates(ignore_index=True)
print(df.shape)
print(tabulate(df.head(), headers="keys", tablefmt="psql"))

(41176, 21)
+----+-------+-----------+-----------+-------------+-----------+-----------+--------+-----------+---------+---------------+------------+------------+---------+------------+-------------+----------------+------------------+-----------------+-------------+---------------+-----+
|    |   age | job       | marital   | education   | default   | housing   | loan   | contact   | month   | day_of_week   |   duration |   campaign |   pdays |   previous | poutcome    |   emp.var.rate |   cons.price.idx |   cons.conf.idx |   euribor3m |   nr.employed | y   |
|----+-------+-----------+-----------+-------------+-----------+-----------+--------+-----------+---------+---------------+------------+------------+---------+------------+-------------+----------------+------------------+-----------------+-------------+---------------+-----|
|  0 |    56 | housemaid | married   | basic.4y    | no        | no        | no     | telephone | may     | mon           |        261 |          1 |     999

## 3.9 Eliminate Irrelevant Information

- I will eliminate the columns ['duration', 'day_of_week'], as they might add noise rather than signal for the target variable **y**.
- In the next steps I will check whether any other variable fails to correlate with or affect the target variable, so that it can also be considered for removal from the dataset.

In [14]:
# Remove the two columns judged irrelevant above. errors='ignore' keeps the
# cell re-runnable: `df` is reassigned here, so a second execution would
# otherwise raise KeyError because the columns are already gone.
df = df.drop(columns=['duration', 'day_of_week'], errors='ignore')
print(tabulate(df.head(), headers= "keys", tablefmt= "psql"))

+----+-------+-----------+-----------+-------------+-----------+-----------+--------+-----------+---------+------------+---------+------------+-------------+----------------+------------------+-----------------+-------------+---------------+-----+
|    |   age | job       | marital   | education   | default   | housing   | loan   | contact   | month   |   campaign |   pdays |   previous | poutcome    |   emp.var.rate |   cons.price.idx |   cons.conf.idx |   euribor3m |   nr.employed | y   |
|----+-------+-----------+-----------+-------------+-----------+-----------+--------+-----------+---------+------------+---------+------------+-------------+----------------+------------------+-----------------+-------------+---------------+-----|
|  0 |    56 | housemaid | married   | basic.4y    | no        | no        | no     | telephone | may     |          1 |     999 |          0 | nonexistent |            1.1 |           93.994 |           -36.4 |       4.857 |          5191 | no  |
|  1 |  

# Step 4: Analysis of Univariate Variables
## 4.1 Categorical Variable Analysis

In [18]:
# Names of the columns stored with the 'object' dtype (text / categories).
categorical_val = list(df.select_dtypes(include='object').columns)
print(f"Categorical Variables: {categorical_val}")

Categorical Variables: ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'poutcome', 'y']
