# Banking Marketing Campaign

# Step 1: Problem statement and data collection

In [1]:
import requests
import os
from pathlib import Path

def setup_project_structure(base_path, subfolders):
    """Create every folder listed in ``subfolders`` beneath ``base_path``.

    Missing parent directories are created as needed and folders that
    already exist are left untouched, so the call can safely be repeated.
    A confirmation line is printed once all folders have been processed.
    """
    # Lazily build one target path per requested subfolder.
    targets = (Path(base_path) / name for name in subfolders)
    for target in targets:
        target.mkdir(parents=True, exist_ok=True)
    print(f"Project structure initialized in: {base_path}")

def download_dataset_to_raw(url, full_path, timeout=30):
    """Download ``url`` to ``full_path`` and report whether it succeeded.

    The response body is streamed in 8192-byte chunks into a sibling
    ``<name>.part`` file that is renamed onto ``full_path`` only after the
    whole response has been written. A failed or interrupted download
    therefore never leaves a truncated file at ``full_path`` and never
    clobbers a previously downloaded copy.

    Args:
        url: Address of the file to fetch.
        full_path: Destination file (``str`` or ``Path``). The parent
            directory is not created here and must already exist.
        timeout: Value passed to ``requests.get`` as ``timeout`` so a stalled
            server cannot block the notebook indefinitely.

    Returns:
        ``True`` when the file was saved, ``False`` on any failure. Errors
        are printed rather than raised.
    """
    partial_path = None
    try:
        target_path = Path(full_path)
        partial_path = target_path.with_name(target_path.name + ".part")

        # Using the response as a context manager releases the streamed
        # connection even when an error interrupts the transfer.
        with requests.get(url, stream=True, timeout=timeout) as response:
            response.raise_for_status()
            with open(partial_path, 'wb') as file:
                for chunk in response.iter_content(chunk_size=8192):
                    if chunk:  # skip empty chunks
                        file.write(chunk)

        # Publish the file only once it is complete.
        partial_path.replace(target_path)
        print(f"Download successful! Saved to: {full_path}")
        return True
    except requests.exceptions.RequestException as e:
        # Must come before the generic handler; listed after it, this
        # clause could never run.
        print(f"An error occurred during the download: {e}")
    except Exception as e:
        # Deliberate catch-all: callers rely on a boolean result instead of
        # an exception (e.g. the destination folder is missing).
        print(f"Download failed: {e}")

    # Failure path: best-effort removal of any partially written data.
    if partial_path is not None:
        try:
            partial_path.unlink()
        except OSError:
            pass  # nothing was written, or the file is already gone
    return False
    
# --- CONFIGURATION ---
# Where the project folders live, which ones to create, and what to fetch.
BASE_DIR = "./data"
FOLDERS = ["raw", "processed", "interim"]
DATA_URL = "https://breathecode.herokuapp.com/asset/internal-link?id=413&path=bank-marketing-campaign-data.csv"
TARGET_FILE = "bank-marketing-campaign-data.csv"
# Destination of the downloaded CSV inside the "raw" folder.
RAW_FILE_PATH = Path(BASE_DIR, "raw", TARGET_FILE)

# --- EXECUTION ---
# Create the folder tree first so the "raw" folder exists before downloading.
setup_project_structure(base_path=BASE_DIR, subfolders=FOLDERS)

# Fetch the CSV into the "raw" folder; the returned flag is the cell's output.
download_dataset_to_raw(url=DATA_URL, full_path=RAW_FILE_PATH)

Project structure initialized in: ./data
Download successful! Saved to: data/raw/bank-marketing-campaign-data.csv


True

In [2]:
import pandas as pd
# Load the raw CSV saved at RAW_FILE_PATH by the download cell above.
# `sep=None` together with the python engine lets pandas detect the delimiter
# instead of assuming a comma.
try:
    df = pd.read_csv(RAW_FILE_PATH, sep= None, engine= 'python')
except FileNotFoundError:
    print(f"\nError: File not found at {RAW_FILE_PATH}")
    # Re-raise instead of continuing: every later cell needs `df`, so stopping
    # here surfaces the real cause rather than a NameError further down.
    raise
else:
    print("\nDataset loaded successfully into a DataFrame:")
    print(df.head())


Dataset loaded successfully into a DataFrame:
   age        job  marital    education  default housing loan    contact  \
0   56  housemaid  married     basic.4y       no      no   no  telephone   
1   57   services  married  high.school  unknown      no   no  telephone   
2   37   services  married  high.school       no     yes   no  telephone   
3   40     admin.  married     basic.6y       no      no   no  telephone   
4   56   services  married  high.school       no      no  yes  telephone   

  month day_of_week  ...  campaign  pdays  previous     poutcome emp.var.rate  \
0   may         mon  ...         1    999         0  nonexistent          1.1   
1   may         mon  ...         1    999         0  nonexistent          1.1   
2   may         mon  ...         1    999         0  nonexistent          1.1   
3   may         mon  ...         1    999         0  nonexistent          1.1   
4   may         mon  ...         1    999         0  nonexistent          1.1   

   cons.p

## 1.1 Dataset variables description
* **Variables**
    1. **age.** Age of customer (numeric)
    2. **job.** Type of job (categorical)
    3. **marital.** Marital status (categorical)
    4. **education.** Level of education (categorical)
    5. **default.** Whether the customer has credit in default (categorical)
    6. **housing.** Do you have a housing loan (categorical)
    7. **loan.** Do you have a personal loan? (categorical)
    8. **contact.** Type of contact communication (categorical)
    9. **month.** Last month in which you have been contacted (categorical)
    10. **day_of_week.** Last day on which you have been contacted (categorical)
    11. **duration.** Duration of previous contact in seconds (numeric)
    12. **campaign.** Number of contacts made during this campaign to the customer (numeric)
    13. **pdays.** Number of days since the customer was last contacted in a previous campaign; 999 means the customer was not previously contacted (numeric)
    14. **previous.** Number of contacts made during the previous campaign to the customer (numeric)
    15. **poutcome.** Result of the previous marketing campaign (categorical)
    16. **emp.var.rate.** Employment variation rate. Quarterly indicator (numeric)
    17. **cons.price.idx.** Consumer price index. Monthly indicator (numeric)
    18. **cons.conf.idx.** Consumer confidence index. Monthly indicator (numeric)
    19. **euribor3m.** EURIBOR 3-month rate. Daily indicator (numeric)
    20. **nr.employed.** Number of employees. Quarterly indicator (numeric)
    21. **y.** TARGET. Whether the customer takes out a long-term deposit or not (categorical)

## 1.2 Shape information on the dataset

In [9]:
# Read the row and column counts of the DataFrame for the message below.
# NOTE(review): the saved output reports 41176 rows, which equals the shape
# printed after the drop_duplicates cell further down. This cell appears to
# have been executed after deduplication, so a fresh top-to-bottom run may
# print a different row count.
row = df.shape[0]
col = df.shape[1]
print(f"The file has {row} rows and {col} columns for the EDA.")

The file has 41176 rows and 21 columns for the EDA.


## 1.3 Complete dataset information: non-null counts and data types per column

In [11]:
# Per-column overview: number of non-missing entries next to each column's dtype.
summary = (
    df.notna()
    .sum()
    .to_frame('Non-Null Count')
    .assign(**{'Data Type': df.dtypes})
)

print("Dataset Summary:")
display(summary)

Dataset Summary:


Unnamed: 0,Non-Null Count,Data Type
age,41176,int64
job,41176,object
marital,41176,object
education,41176,object
default,41176,object
housing,41176,object
loan,41176,object
contact,41176,object
month,41176,object
day_of_week,41176,object


## 1.4 Unique values per column alongside the data type

In [14]:
# Profile each column by its number of distinct values and its dtype,
# ordered from the fewest to the most distinct values.
summary = (
    df.nunique()
    .to_frame('Unique Values')
    .assign(**{'Data Type': df.dtypes})
    .sort_values(by='Unique Values')
)

print("Detailed Column Profile, likely show categorical variables at the top:")
display(summary)

Detailed Column Profile, likely show categorical variables at the top:


Unnamed: 0,Unique Values,Data Type
contact,2,object
y,2,object
default,3,object
loan,3,object
poutcome,3,object
housing,3,object
marital,4,object
day_of_week,5,object
previous,8,int64
education,8,object


In [6]:
# Missing-value (NaN/None) count for every column. Placeholder strings such as
# 'unknown' (visible in the `default` column of the preview) are ordinary
# values and are not counted here.
df.isna().sum()

age               0
job               0
marital           0
education         0
default           0
housing           0
loan              0
contact           0
month             0
day_of_week       0
duration          0
campaign          0
pdays             0
previous          0
poutcome          0
emp.var.rate      0
cons.price.idx    0
cons.conf.idx     0
euribor3m         0
nr.employed       0
y                 0
dtype: int64

In [7]:
# Count the rows that exactly repeat an earlier row across all columns.
df.duplicated(keep='first').sum()

np.int64(12)

In [8]:
# Remove fully duplicated rows (all columns compared), keeping the first
# occurrence, then show the resulting shape and a preview.
# The original index labels are kept (no reset), so they may contain gaps.
df = df.drop_duplicates()
print(df.shape)
df.head()

(41176, 21)


Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
2,37,services,married,high.school,no,yes,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
4,56,services,married,high.school,no,no,yes,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
