# Load the dataset and get basic information


In [13]:
import pandas as pd

# Load the dataset
data = pd.read_csv('Banking_Call_Data.csv')

In [33]:
print("First 5 rows of the dataset:")
data.head()

First 5 rows of the dataset:


Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


In [17]:
print("\nDataset Information:")
data.info()
print("\nDataset shape:")
print(data.shape)


Dataset Information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45211 entries, 0 to 45210
Data columns (total 17 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   age        45211 non-null  int64 
 1   job        45211 non-null  object
 2   marital    45211 non-null  object
 3   education  45211 non-null  object
 4   default    45211 non-null  object
 5   balance    45211 non-null  int64 
 6   housing    45211 non-null  object
 7   loan       45211 non-null  object
 8   contact    45211 non-null  object
 9   day        45211 non-null  int64 
 10  month      45211 non-null  object
 11  duration   45211 non-null  int64 
 12  campaign   45211 non-null  int64 
 13  pdays      45211 non-null  int64 
 14  previous   45211 non-null  int64 
 15  poutcome   45211 non-null  object
 16  y          45211 non-null  object
dtypes: int64(7), object(10)
memory usage: 5.9+ MB

Dataset shape:
(45211, 17)


In [31]:
data.describe()

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous
count,45211.0,45211.0,45211.0,45211.0,45211.0,45211.0,45211.0
mean,40.93621,1362.272058,15.806419,258.16308,2.763841,40.197828,0.580323
std,10.618762,3044.765829,8.322476,257.527812,3.098021,100.128746,2.303441
min,18.0,-8019.0,1.0,0.0,1.0,-1.0,0.0
25%,33.0,72.0,8.0,103.0,1.0,-1.0,0.0
50%,39.0,448.0,16.0,180.0,2.0,-1.0,0.0
75%,48.0,1428.0,21.0,319.0,3.0,-1.0,0.0
max,95.0,102127.0,31.0,4918.0,63.0,871.0,275.0


In [36]:
# Describe categorical columns
print("Descriptive statistics for categorical columns:")
data.describe(include='object')

Descriptive statistics for categorical columns:


Unnamed: 0,job,marital,education,default,housing,loan,contact,month,poutcome,y
count,45211,45211,45211,45211,45211,45211,45211,45211,45211,45211
unique,12,3,4,2,2,2,3,12,4,2
top,blue-collar,married,secondary,no,yes,no,cellular,may,unknown,no
freq,9732,27214,23202,44396,25130,37967,29285,13766,36959,39922


# Data Cleaning
This part is about cleaning the data. We will fix missing values and handle some special cases in the dataset. This is important because clean data helps the model work better.

### Check for the missing value

Now we have descriptive statistics for both numerical columns and categorical columns. And right now we also want to print unique values in categorical columns to ensure that the value in categorical columns are good to use.

In [32]:
# Check for unique values in categorical columns
categorical_columns = ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'poutcome']

print("\nUnique values in each categorical column:")
for col in categorical_columns:
    if col in data.columns:
        print(f"\n{col}: {data[col].unique()}")


Unique values in each categorical column:

job: ['management' 'technician' 'entrepreneur' 'blue-collar' 'unknown'
 'retired' 'admin.' 'services' 'self-employed' 'unemployed' 'housemaid'
 'student']

marital: ['married' 'single' 'divorced']

education: ['tertiary' 'secondary' 'unknown' 'primary']

default: ['no' 'yes']

housing: ['yes' 'no']

loan: ['no' 'yes']

contact: ['unknown' 'cellular' 'telephone']

poutcome: ['unknown' 'failure' 'other' 'success']


According to the information and shape of the dataset we print in the first section, there is no actual missing value (NaN) in the dataset. All 45211 rows have complete data for all 17 columns.

But for the categorical columns, some columns have the value "unknown", including job, education, contact and poutcome.

Since we plan to use **Logistic Regression** and **Decision Trees**, we need to consider how each model handles categorical data:

**Decision Trees:**
- Can handle categorical data directly, including "unknown" values
- Can treat "unknown" as a separate category

**Logistic Regression:**
- Requires numerical input (needs encoding)
- "Unknown" values need to be handled before encoding
- More sensitive to data preprocessing choices

We have several options for handling "unknown" values:
1. **Keep "unknown" as a separate category** - Works well for both models
2. **Replace with most frequent value or just drop it** - Simple but may lose information and shrink the sample size
3. **Use predictive imputation** - More sophisticated but complex
4. **Create indicator variables** - Shows whether data was missing

For our analysis, we'll keep "unknown" as a separate category since it may contain meaningful information about customer behavior.

### Check for implausible values (numerical columns)


In [40]:
data.describe()

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous
count,45211.0,45211.0,45211.0,45211.0,45211.0,45211.0,45211.0
mean,40.93621,1362.272058,15.806419,258.16308,2.763841,40.197828,0.580323
std,10.618762,3044.765829,8.322476,257.527812,3.098021,100.128746,2.303441
min,18.0,-8019.0,1.0,0.0,1.0,-1.0,0.0
25%,33.0,72.0,8.0,103.0,1.0,-1.0,0.0
50%,39.0,448.0,16.0,180.0,2.0,-1.0,0.0
75%,48.0,1428.0,21.0,319.0,3.0,-1.0,0.0
max,95.0,102127.0,31.0,4918.0,63.0,871.0,275.0


According to table, there is no implausible values except "-1" for pdays. However, accoridng to the ReadMe file, -1 means never contacted before. Thus, it is meaningful and we will keep it.

# Feature Engineering
This part is about creating new features that can help the model predict better. We will focus on features that are important for our research question and algorithms.

### Important Features
We already have many good features in the dataset, but we will create a few new ones that are directly related to our research question. These features will help us understand customer behavior and activity better.

### Season
The column `month` tells us when the customer was contacted. We will group months into seasons like Spring, Summer, Fall, and Winter. This will help us see if the season affects the customer's decision.

In [41]:
# Create a new column for season based on the month
def get_season(month):
    if month in ['mar', 'apr', 'may']:
        return 'Spring'
    elif month in ['jun', 'jul', 'aug']:
        return 'Summer'
    elif month in ['sep', 'oct', 'nov']:
        return 'Fall'
    else:
        return 'Winter'

data['season'] = data['month'].apply(get_season)

# Check the new feature
data[['month', 'season']].head()

Unnamed: 0,month,season
0,may,Spring
1,may,Spring
2,may,Spring
3,may,Spring
4,may,Spring


### Duration Per Contact
The column `duration` tells us how long the call lasted, and `campaign` tells us how many times the customer was contacted. We will create a new feature called `duration_per_contact` to see the average call duration per contact.

In [42]:
# Create a new column for average duration per contact
data['duration_per_contact'] = data['duration'] / (data['campaign'] + 1)

# Check the new feature
data[['duration', 'campaign', 'duration_per_contact']].head()

Unnamed: 0,duration,campaign,duration_per_contact
0,261,1,130.5
1,151,1,75.5
2,76,1,38.0
3,92,1,46.0
4,198,1,99.0


### Was Contacted Before
The column `pdays` tells us how many days ago the customer was last contacted. If the value is "Not Contacted", it means the customer was never contacted before. We will create a new feature called `was_contacted_before` to show if the customer was contacted before or not.

In [43]:
# Create a new column for whether the customer was contacted before
data['was_contacted_before'] = data['pdays'].apply(lambda x: 0 if x == 'Not Contacted' else 1)

# Check the new feature
data[['pdays', 'was_contacted_before']].head()

Unnamed: 0,pdays,was_contacted_before
0,-1,1
1,-1,1
2,-1,1
3,-1,1
4,-1,1


## Summary
We created three new features:
1. `season`: Groups months into seasons to analyze seasonal trends.
2. `duration_per_contact`: Shows the average call duration per contact.
3. `was_contacted_before`: Indicates if the customer was contacted before.

These features will help us understand customer behavior and improve the model's predictions.

### One-Hot Encoding
We will encode some columns to make them easier for the model to understand. These columns are `default`, `housing`, `loan`, and `y`. One-Hot Encoding will turn these columns into numbers.

In [44]:
# Perform One-Hot Encoding on selected columns
encoded_columns = ['default', 'housing', 'loan', 'y']
data = pd.get_dummies(data, columns=encoded_columns, drop_first=True)

# Ensure the encoded columns are 0 and 1
data = data.astype({col: 'int' for col in data.columns if col.endswith('_yes')})

# Check the changes
data.head()

Unnamed: 0,age,job,marital,education,balance,contact,day,month,duration,campaign,pdays,previous,poutcome,season,duration_per_contact,was_contacted_before,default_yes,housing_yes,loan_yes,y_yes
0,58,management,married,tertiary,2143,unknown,5,may,261,1,-1,0,unknown,Spring,130.5,1,0,1,0,0
1,44,technician,single,secondary,29,unknown,5,may,151,1,-1,0,unknown,Spring,75.5,1,0,1,0,0
2,33,entrepreneur,married,secondary,2,unknown,5,may,76,1,-1,0,unknown,Spring,38.0,1,0,1,1,0
3,47,blue-collar,married,unknown,1506,unknown,5,may,92,1,-1,0,unknown,Spring,46.0,1,0,1,0,0
4,33,unknown,single,unknown,1,unknown,5,may,198,1,-1,0,unknown,Spring,99.0,1,0,0,0,0


### Save Changes
We will save the cleaned and processed data. This will make sure all the changes we made are kept. We will overwrite the original dataset to keep things simple.

In [47]:
# Save the processed data back to the original variable
processed_data = data 
data.to_csv('Processed_Banking_Call_Data.csv', index=False)

# Check the saved file
print("Data saved successfully!")

Data saved successfully!


In [46]:
data.head()  # Display the first few rows of the processed data

Unnamed: 0,age,job,marital,education,balance,contact,day,month,duration,campaign,pdays,previous,poutcome,season,duration_per_contact,was_contacted_before,default_yes,housing_yes,loan_yes,y_yes
0,58,management,married,tertiary,2143,unknown,5,may,261,1,-1,0,unknown,Spring,130.5,1,0,1,0,0
1,44,technician,single,secondary,29,unknown,5,may,151,1,-1,0,unknown,Spring,75.5,1,0,1,0,0
2,33,entrepreneur,married,secondary,2,unknown,5,may,76,1,-1,0,unknown,Spring,38.0,1,0,1,1,0
3,47,blue-collar,married,unknown,1506,unknown,5,may,92,1,-1,0,unknown,Spring,46.0,1,0,1,0,0
4,33,unknown,single,unknown,1,unknown,5,may,198,1,-1,0,unknown,Spring,99.0,1,0,0,0,0
