In [13]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Read the bank customer records from the CSV file into a DataFrame
file_path = 'bank-data.csv'
data = pd.read_csv(file_path)

# Preview the first rows of the raw data (heading and table in one call)
print("First few rows of the dataset:", data.head(), sep="\n")

# Per-column count of missing entries
print("\nMissing values in each column:", data.isna().sum(), sep="\n")

# Encoding categorical variables.
# Every two-level column is mapped to 0/1. Series.map converts any value that
# is missing from the mapping into NaN without warning, so unexpected labels
# are rejected up front instead of silently leaking NaN into the features or
# the target.
yes_no_codes = {'NO': 0, 'YES': 1}
binary_codes = {
    'sex': {'FEMALE': 0, 'MALE': 1},
    'married': yes_no_codes,
    'car': yes_no_codes,
    'save_act': yes_no_codes,
    'current_act': yes_no_codes,
    'mortgage': yes_no_codes,
    'pep': yes_no_codes,
}
for column, codes in binary_codes.items():
    # Values that are already missing stay missing; only labels that are
    # present but have no code are treated as an error.
    unexpected = set(data[column].dropna().unique()) - set(codes)
    if unexpected:
        raise ValueError(
            f"Column {column!r} contains values with no encoding: "
            f"{sorted(unexpected, key=str)}"
        )
    data[column] = data[column].map(codes)

# Encoding 'region' column using one-hot encoding; drop_first=True omits the
# first level so it acts as the baseline category
data = pd.get_dummies(data, columns=['region'], drop_first=True)

# Dropping the 'id' column as it is not needed for modeling.
# Rebinding the name (rather than inplace=True) keeps this a plain expression.
data = data.drop(columns=['id'])

# Display the first few rows of the preprocessed dataset
print("First few rows of the preprocessed dataset:")
print(data.head())

# Report the dtype of every column so non-numeric leftovers are easy to spot
print("\nColumn data types:", data.dtypes, sep="\n")

# 'pep' is the target; every remaining column is used as a feature
y = data['pep']
X = data.drop(columns=['pep'])

# Hold out 20% of the rows for testing (fixed seed for a reproducible split)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Sanity checks on the training features: object-typed columns and total NaN count
object_columns = X_train.select_dtypes(include=['object']).columns
print("\nNon-numeric columns in X_train:", object_columns, sep="\n")

missing_total = X_train.isna().sum().sum()
print("\nMissing values in X_train:", missing_total, sep="\n")

# Feature scaling: the scaler is fitted on the training split only and then
# applied to both splits
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Train a random forest with 100 trees and a fixed seed (fit returns the estimator)
model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Predict on the held-out split
y_pred = model.predict(X_test)

# Evaluate the model: each metric is computed right before it is printed,
# in the order listed
for heading, metric in (
    ("\nConfusion Matrix:", confusion_matrix),
    ("\nClassification Report:", classification_report),
    ("\nAccuracy Score:", accuracy_score),
):
    print(heading)
    print(metric(y_test, y_pred))

First few rows of the dataset:
        id  age     sex      region   income married  children  car save_act  \
0  ID12101   48  FEMALE  INNER_CITY  17546.0      NO         1   NO       NO   
1  ID12102   40    MALE        TOWN  30085.1     YES         3  YES       NO   
2  ID12103   51  FEMALE  INNER_CITY  16575.4     YES         0  YES      YES   
3  ID12104   23  FEMALE        TOWN  20375.4     YES         3   NO       NO   
4  ID12105   57  FEMALE       RURAL  50576.3     YES         0   NO      YES   

  current_act mortgage  pep  
0          NO       NO  YES  
1         YES      YES   NO  
2         YES       NO   NO  
3         YES       NO   NO  
4          NO       NO   NO  

Missing values in each column:
id             0
age            0
sex            0
region         0
income         0
married        0
children       0
car            0
save_act       0
current_act    0
mortgage       0
pep            0
dtype: int64
First few rows of the preprocessed dataset:
   age  sex   i