In [6]:
!pip install pandas numpy matplotlib seaborn scikit-learn



In [5]:
!c:\users\hii\appdata\local\programs\python\python37\python.exe -m pip install --upgrade pip

Collecting pip
  Downloading pip-24.0-py3-none-any.whl (2.1 MB)
Installing collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 20.1.1
    Uninstalling pip-20.1.1:
      Successfully uninstalled pip-20.1.1
Successfully installed pip-24.0


In [8]:
!pip show matplotlib

Name: matplotlib
Version: 3.5.1
Summary: Python plotting package
Home-page: https://matplotlib.org
Author: John D. Hunter, Michael Droettboom
Author-email: matplotlib-users@python.org
License: PSF
Location: c:\users\hii\appdata\local\programs\python\python37\lib\site-packages
Requires: cycler, fonttools, kiwisolver, numpy, packaging, pillow, pyparsing, python-dateutil
Required-by: seaborn


In [None]:
#Imports and Setup

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, classification_report
from sklearn.linear_model import LogisticRegression

from sklearn.ensemble import RandomForestClassifier

# Apply the seaborn theme with a white grid background to all subsequent plots.
# set_theme is the preferred spelling of the legacy sns.set alias (seaborn >= 0.11).
sns.set_theme(style="whitegrid")

In [10]:
#Load Data

# Location of the semicolon-delimited input file. A relative path is resolved
# against the kernel's working directory; edit this if the CSV lives elsewhere.
DATA_PATH = "bank.csv"

try:
    df = pd.read_csv(DATA_PATH, sep=';')
except FileNotFoundError as exc:
    # Re-raise the same exception type with guidance on how to fix the problem,
    # keeping the original error chained for debugging.
    raise FileNotFoundError(
        f"Could not find {DATA_PATH!r}. Place the file in the notebook's working "
        "directory or point DATA_PATH at its actual location."
    ) from exc

# Quick sanity check: dimensions first, then the first rows as the cell output.
print("Shape:", df.shape)
df.head()

FileNotFoundError: [Errno 2] No such file or directory: 'bank.csv'

In [None]:
#EDA

# Class balance: relative frequency of each target label, then raw counts as bars.
class_share = df['y'].value_counts(normalize=True)
print(class_share)

fig, ax = plt.subplots()
sns.countplot(x='y', data=df, ax=ax)
ax.set_title("Class Balance")
plt.show()

# Job vs target: with 'yes' mapped to 1 and 'no' to 0, each bar shows the
# mean of the 0/1 target within that job category.
subscribed = df['y'].map({'yes': 1, 'no': 0})

fig, ax = plt.subplots()
sns.barplot(x='job', y=subscribed, data=df, ax=ax)
ax.tick_params(axis='x', labelrotation=45)  # tilt the job labels
ax.set_title("Subscription rate by Job")
plt.show()

# Balance vs target: one box per target label.
fig, ax = plt.subplots()
sns.boxplot(x='y', y='balance', data=df, ax=ax)
ax.set_title("Balance distribution by Target")
plt.show()

# Summary statistics of the numeric columns (a correlation heatmap of the
# numeric features would be a natural follow-up).
df.describe()

In [None]:
#Preprocessing

# Check missing values
print(df.isnull().sum())

# This dataset usually has no missing values, but if any:
# df = df.dropna() or fillna

# Encode categorical variables.
# Every object-dtype column except the target 'y' is treated as categorical.
categorical_cols = [
    col for col in df.select_dtypes(include=['object']).columns if col != 'y'
]

# Leave the encoder on its default sparse output and densify explicitly below.
# The keyword that requests dense output was renamed from `sparse` to
# `sparse_output` in scikit-learn 1.2 and the old name was removed in 1.4, so
# passing either keyword fails on some versions; `.toarray()` works on all.
# NOTE(review): the encoder is fitted on the full frame before the split below;
# fit it on the training rows only if strict train/validation separation matters.
encoder = OneHotEncoder(drop='first')

X_cat = encoder.fit_transform(df[categorical_cols]).toarray()
# Everything that is neither categorical nor the target is used as-is.
X_num = df.drop(columns=categorical_cols + ['y']).values

# Final design matrix: raw numeric columns followed by the one-hot columns.
X = np.hstack((X_num, X_cat))

# Map the target to 0/1. Series.map turns any label missing from the mapping
# into NaN without complaint, so fail loudly instead of training on NaN labels.
y_mapped = df['y'].map({'yes': 1, 'no': 0})
if y_mapped.isnull().any():
    raise ValueError("Target column 'y' contains values other than 'yes'/'no'.")
y = y_mapped.values

# Train-validation split (80/20), stratified so both parts keep the class ratio;
# the fixed random_state makes the split reproducible.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [None]:
#Baseline Model

# Reference model: logistic regression with default class weighting.
# fit() returns the estimator itself, so construction and training are chained.
model = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# Hard labels feed accuracy and F1; column 1 of predict_proba (the probability
# of class 1) feeds the AUC.
y_pred = model.predict(X_val)
y_proba = model.predict_proba(X_val)[:, 1]

acc, f1, auc = (
    accuracy_score(y_val, y_pred),
    f1_score(y_val, y_pred),
    roc_auc_score(y_val, y_proba),
)

print(f"Accuracy: {acc:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"AUC: {auc:.4f}")

# Per-class precision / recall / F1 on the validation set.
print(classification_report(y_val, y_pred))

In [None]:
#Improvement Step

# Same logistic regression, but with class_weight='balanced' so that the
# classes are re-weighted during training.
improved_model = LogisticRegression(max_iter=1000, class_weight='balanced').fit(X_train, y_train)

# Hard labels feed accuracy and F1; column 1 of predict_proba (the probability
# of class 1) feeds the AUC.
y_pred_imp = improved_model.predict(X_val)
y_proba_imp = improved_model.predict_proba(X_val)[:, 1]

acc_imp, f1_imp, auc_imp = (
    accuracy_score(y_val, y_pred_imp),
    f1_score(y_val, y_pred_imp),
    roc_auc_score(y_val, y_proba_imp),
)

print("After improvement (balanced):")
print(f"Accuracy: {acc_imp:.4f}")
print(f"F1 Score: {f1_imp:.4f}")
print(f"AUC: {auc_imp:.4f}")

# Per-class precision / recall / F1 on the validation set.
print(classification_report(y_val, y_pred_imp))