<a href="https://colab.research.google.com/github/FutureRonin/bank-marketing-mini-portfolio/blob/main/Delete%20Bank_Marketing%20-%20Colab_Notebook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Bank Marketing Mini-Portfolio

**What I will do:**

1) Basic cleaning → save `bank_clean.csv`  
2) K-Means clustering (`balance`, `age`, `campaign`)  
3) Simple charts (bar, pie, scatter)  
4) Correlations (numeric subset)  

> This notebook uses **matplotlib** (no seaborn) and does not set custom colors.


In [None]:
# 0) Setup: Libraries
# Data handling and numerics.
import pandas as pd
import numpy as np
# Plotting (this notebook uses matplotlib only).
import matplotlib.pyplot as plt
# Clustering and feature scaling, used in the K-Means section.
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

# Printed only if every import above succeeded.
print('Libraries ready')

## 1) Upload the dataset (Kaggle Bank Marketing CSV)

In [None]:
# Colab-only helper: opens a browser file picker and stores the chosen
# file(s) in the current working directory.
from google.colab import files

uploaded = files.upload()  # Choose your CSV here

# files.upload() returns a dict keyed by uploaded file name; it is empty when
# the dialog is cancelled, so fail early with a clear message.
if not uploaded:
    raise ValueError('No file was uploaded. Re-run this cell and choose the Bank Marketing CSV.')

# Name of the uploaded CSV, read by the load cell below. If several files
# were selected, the first one is used.
CSV_NAME = next(iter(uploaded))
print('Using file:', CSV_NAME)

## 2) Load & quick look

In [None]:
# Read the CSV into a DataFrame. CSV_NAME must hold the name (or path) of the
# uploaded file, readable from the current working directory.
df = pd.read_csv(CSV_NAME)
# (rows, columns) as a quick sanity check of the load.
print('Shape:', df.shape)
# Last expression of the cell: rendered as a table of the first rows.
df.head()

## 3) Basic understanding & missing values

In [None]:
# Print a summary of the frame's columns (dtypes and non-null counts).
df.info()

In [None]:
# Summary statistics, transposed so each described column becomes a row;
# only the first 20 rows are shown to keep the output short.
df.describe().T.head(20)

In [None]:
# Fraction of missing values per column (mean of the boolean null mask),
# sorted so the columns with the most missing data come first.
missing = df.isnull().mean().sort_values(ascending=False)
# Show the 20 columns with the highest missing fraction.
missing.head(20)

## 4) Very light cleaning
- Drop duplicates  
- Fill numeric with median  
- Fill categorical with 'Unknown'  
- Create simple buckets (age, balance) if present


In [None]:
# --- De-duplicate -----------------------------------------------------------
# Remove exact duplicate rows and continue with an independent copy.
df = df.drop_duplicates().copy()

# --- Fill missing values ----------------------------------------------------
# Columns are grouped by dtype: int64/float64 columns are filled with their
# median, object/category columns with the literal 'Unknown'.
num_cols = list(df.select_dtypes(include=['int64', 'float64']).columns)
cat_cols = list(df.select_dtypes(include=['object', 'category']).columns)

for col in num_cols:
    series = df[col]
    if series.isnull().any():
        df[col] = series.fillna(series.median())

for col in cat_cols:
    series = df[col]
    if series.isnull().any():
        df[col] = series.fillna('Unknown')

# --- Buckets ----------------------------------------------------------------
# Each bucket column is created only when its source column exists.
# Intervals are right-closed and the lowest edge is included.
if 'age' in df.columns:
    bins_age = [0, 25, 35, 45, 55, 65, 120]
    labels_age = ['<=25', '26-35', '36-45', '46-55', '56-65', '65+']
    df['age_bucket'] = pd.cut(
        df['age'],
        bins=bins_age,
        labels=labels_age,
        right=True,
        include_lowest=True,
    )

if 'balance' in df.columns:
    # The +/-1e12 outer edges act as open-ended lower/upper bounds.
    bins_bal = [-1e12, 0, 500, 1000, 5000, 10000, 1e12]
    labels_bal = ['<=0', '1-500', '501-1000', '1001-5000', '5001-10000', '>10000']
    df['balance_bucket'] = pd.cut(
        df['balance'],
        bins=bins_bal,
        labels=labels_bal,
        right=True,
        include_lowest=True,
    )

print('Cleaned. Preview:')
df.head()

## 5) Save processed CSV (for reproducibility / Tableau later)

In [None]:
# Output file name for the cleaned data (relative to the working directory).
PROC_NAME = 'bank_clean.csv'
# index=False keeps the DataFrame index out of the file, so the CSV holds
# only the data columns.
df.to_csv(PROC_NAME, index=False)
print('Saved:', PROC_NAME)

## 6) K-Means clustering (with scaling)
- Features: `balance`, `age`, `campaign`
- Clusters: 5 (you can change `n_clusters`)


In [None]:
# Number of clusters to fit; change this value to experiment.
N_CLUSTERS = 5

features = ['balance','age','campaign']
# Keep only the requested features that actually exist in this CSV.
features = [f for f in features if f in df.columns]
if len(features) < 2:
    raise ValueError('Not enough numeric features found. Please adjust the feature list to match your CSV columns.')

# Work on a copy so the imputation below never touches df itself.
X = df[features].copy()
X = X.fillna(X.median())

# Standardize so that no single feature dominates the distance computation
# purely because of its larger numeric range.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Fixed random_state makes the cluster assignment reproducible across runs.
kmeans = KMeans(n_clusters=N_CLUSTERS, n_init=10, random_state=42)
# fit_predict fits the model and returns each row's cluster label in one
# step, avoiding a second pass over the same data with predict().
labels = kmeans.fit_predict(X_scaled)

df['cluster'] = labels
# Cluster sizes, displayed as the cell output.
df['cluster'].value_counts()

## 7) Plots (matplotlib only)

In [None]:
# Cluster scatter: one point per row, x = balance, y = age, colored by the
# K-Means label stored in df['cluster'].
required_cols = {'balance', 'age'}
if not required_cols.issubset(df.columns):
    print('Skipping scatter: need both balance and age columns.')
else:
    fig, ax = plt.subplots()
    ax.scatter(df['balance'], df['age'], c=df['cluster'])
    ax.set_xlabel('balance')
    ax.set_ylabel('age')
    ax.set_title('K-Means Clusters (balance vs age)')
    plt.show()

In [None]:
# Bar chart: how many rows there are for each distinct age, ordered by age.
if 'age' not in df.columns:
    print('Skipping bar: age column not found.')
else:
    vc_age = df['age'].value_counts().sort_index()
    fig, ax = plt.subplots()
    ax.bar(vc_age.index, vc_age.values)
    ax.set_xlabel('Age')
    ax.set_ylabel('Number of customers')
    ax.set_title('Number of customers in each age')
    plt.show()

In [None]:
# Pie chart: share of rows for each distinct value of the 'previous' column,
# with each slice annotated by its percentage.
if 'previous' not in df.columns:
    print("Skipping pie chart.")
else:
    counts = df['previous'].value_counts()
    fig, ax = plt.subplots()
    ax.pie(counts, labels=counts.index, autopct='%1.1f%%')
    ax.set_title('Customers by previous contacts')
    plt.show()

In [None]:
# Plain scatter (no cluster coloring): x = age, y = balance.
needed = {'age', 'balance'}
if not needed.issubset(df.columns):
    print('Skipping scatter.')
else:
    fig, ax = plt.subplots()
    ax.scatter(df['age'], df['balance'])
    ax.set_xlabel('Age')
    ax.set_ylabel('Balance')
    ax.set_title('Customer age vs. balance')
    plt.show()

## 8) Correlations

In [None]:
# Numeric columns of interest, restricted to those present in this CSV.
subset = [c for c in ['age','balance','day','duration','campaign','pdays','previous'] if c in df.columns]
if subset:
    corr = df[subset].corr()
    # A bare expression inside an `if` body is not rendered by Jupyter (only
    # the last top-level expression of a cell is), so show it explicitly.
    # `display` is provided by the IPython/Colab kernel.
    display(corr)
    if 'previous' in corr.columns:
        print("\nTop correlated with 'previous':")
        # Drop the self-correlation (always 1.0) so the ranking lists only
        # the other columns.
        print(corr['previous'].drop('previous').sort_values(ascending=False).head(5))
else:
    print('No matching numeric columns found for correlation subset.')

In [None]:
# Optional: correlation to target y
# Uses `subset` (the numeric column list) from the correlation cell above.
if 'y' in df.columns:
    # Encode the target as 1 for 'yes' (case-insensitive) and 0 otherwise.
    df['y_bin'] = (df['y'].astype(str).str.lower()=='yes').astype(int)
    # `subset` never contains 'y_bin', so plain concatenation is already
    # duplicate-free and keeps a deterministic column order.
    cols_y = subset + ['y_bin']
    # Drop y_bin's correlation with itself (always 1.0) so the ranking shows
    # only the actual features.
    corr_to_y = df[cols_y].corr()['y_bin'].drop('y_bin').sort_values(ascending=False)
    print('\nTop correlated with y_bin:')
    print(corr_to_y.head(10))
else:
    print("No column 'y' found; skipping y correlation.")

## 9) Download outputs (optional)
Use the code block below in Colab to download `bank_clean.csv` to your computer.


In [None]:
# Best-effort download of the cleaned CSV through the browser (Colab only).
try:
    from google.colab import files
    files.download('bank_clean.csv')
except ImportError:
    # google.colab is not importable, so this is not a Colab runtime.
    print('Created in Colab, if not using colab you can manually download bank_clean.csv from the working directory.')
except Exception as e:
    # Running in Colab but the download itself failed (for example the file
    # was never written); report the real cause instead of hiding it.
    print(f'Could not download bank_clean.csv: {e!r}')