In [2]:
import pandas as pd

# Location of the dataset, relative to the notebook's working directory.
file_path = 'data/bank-full.csv'

# Load the CSV. Remember the separator is a semicolon!
# Only the read itself sits inside `try`, so the handler below can only be
# triggered by the file being absent, never by the preview code.
try:
    df = pd.read_csv(file_path, sep=';')
except FileNotFoundError:
    # Build the hint from file_path so it always names the file and folder
    # that were actually requested. rpartition returns (head, sep, tail);
    # the [::2] slice keeps head and tail only.
    folder, filename = file_path.rpartition('/')[::2]
    print(f"Error: File not found at {file_path}")
    print(f"Please make sure '{filename}' is inside the '{folder or '.'}' folder.")
    # NOTE: `df` is not assigned on this path, so any later cell that uses
    # it will fail until the file is put in place and this cell is re-run.
else:
    print("File loaded successfully!")
    print("--- First 5 Rows ---")
    display(df.head())

    print("\n--- Column Info (Data Types) ---")
    df.info()

File loaded successfully!
--- First 5 Rows ---


Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no



--- Column Info (Data Types) ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45211 entries, 0 to 45210
Data columns (total 17 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   age        45211 non-null  int64 
 1   job        45211 non-null  object
 2   marital    45211 non-null  object
 3   education  45211 non-null  object
 4   default    45211 non-null  object
 5   balance    45211 non-null  int64 
 6   housing    45211 non-null  object
 7   loan       45211 non-null  object
 8   contact    45211 non-null  object
 9   day        45211 non-null  int64 
 10  month      45211 non-null  object
 11  duration   45211 non-null  int64 
 12  campaign   45211 non-null  int64 
 13  pdays      45211 non-null  int64 
 14  previous   45211 non-null  int64 
 15  poutcome   45211 non-null  object
 16  y          45211 non-null  object
dtypes: int64(7), object(10)
memory usage: 5.9+ MB


In [3]:
# Feature groups, split by the dtypes that df.info() reported:
# the int64 columns are treated as numeric, the object columns as categorical.
# The target column 'y' is deliberately left out of both lists.
numeric_features = ['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous']

categorical_features = [
    'job', 'marital', 'education',
    'default', 'housing', 'loan',
    'contact', 'month', 'poutcome',
]

# Echo both lists so the split can be checked by eye.
print(
    f"Numeric features: {numeric_features}",
    f"Categorical features: {categorical_features}",
    sep="\n",
)

Numeric features: ['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous']
Categorical features: ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'poutcome']


In [4]:
# Map target variable 'y' to integers: 'yes' -> 1, 'no' -> 0.
# The mapping also lists 1 and 0 as keys that map to themselves. Series.map
# turns every value that is missing from the dict into NaN, so without these
# identity entries re-running this cell on the already-encoded column would
# silently replace the whole target with NaN.
df['y'] = df['y'].map({'yes': 1, 'no': 0, 1: 1, 0: 0})

# Check the result (this is important!)
# Any value that is not a key of the mapping above ends up as NaN and is
# counted by the second check.
print("\n--- Target variable 'y' values (after mapping) ---")
print(df['y'].value_counts())

print("\n--- Let's check for any missing values (NaN) in 'y' ---")
print(f"Missing 'y' values: {df['y'].isna().sum()}")


--- Target variable 'y' values (after mapping) ---
y
0    39922
1     5289
Name: count, dtype: int64

--- Let's check for any missing values (NaN) in 'y' ---
Missing 'y' values: 0
