# ===============================================================
#  Project: Banking Insights Dashboard
#  Step 1: Data Familiarization
#  Author: [Your Name]
#  Description:
#    This notebook explores and summarizes the raw banking dataset
#    to understand its structure, relationships, and initial insights.
# ===============================================================

In [1]:
import pandas as pd
import numpy as np

# Set display options for better readability
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 120)

In [2]:
# Load raw CSV files
customers = pd.read_csv('../data/Bank_Customer_Data.csv')
accounts = pd.read_csv('../data/Bank_Account_Data.csv')
branches = pd.read_csv('../data/Bank_Branch_Data.csv')
loans = pd.read_csv('../data/Bank_Loan_Data.csv')
transactions = pd.read_csv('../data/Bank_Transacation_Data.csv')

print("✅ Datasets loaded successfully!")
print(f"Customers: {customers.shape}, Accounts: {accounts.shape}, Branches: {branches.shape}, Loans: {loans.shape}, Transactions: {transactions.shape}")


✅ Datasets loaded successfully!
Customers: (3000, 7), Accounts: (3000, 7), Branches: (150, 3), Loans: (1500, 4), Transactions: (15000, 6)


In [3]:
print("CUSTOMERS DATA:")
display(customers.head())

print("ACCOUNTS DATA:")
display(accounts.head())

print("BRANCHES DATA:")
display(branches.head())

print("LOANS DATA:")
display(loans.head())

print("TRANSACTIONS DATA:")
display(transactions.head())


CUSTOMERS DATA:


Unnamed: 0,CUSTOMER_ID,First_Name,Last_Name,City,Phone_Number,Occupation,DOB
0,C00001,Timothy,Nelson,Davidville,1592081539,Magician,1915-09-03
1,C00002,Courtney,Farley,Port Charlesberg,9356145501,Scientist,1922-05-11
2,C00003,Sheena,Solis,New John,4311466522,Businessman,1933-10-27
3,C00004,Amy,Martinez,Joneston,6112876134,Mechanic,1981-07-29
4,C00005,Emily,Olson,South Christine,8756770692,Repairman,1999-03-01


ACCOUNTS DATA:


Unnamed: 0,ACCOUNT_ID,CUSTOMER_ID,BRANCH_ID,OPENING_BALANCE,ACCOUNT_OPEN_DATE,ACCOUNT_TYPE,ACCOUNT_STATUS
0,A00001,C00001,B00019,520482,2014-03-01,Savings,Pending
1,A00002,C00002,B00038,268824,2018-09-24,Fixed Deposit,Active
2,A00003,C00003,B00041,589218,2021-03-24,Savings,Active
3,A00004,C00004,B00021,202917,2017-11-15,Checking,Closed
4,A00005,C00005,B00049,426326,2015-11-28,Savings,Closed


BRANCHES DATA:


Unnamed: 0,BRANCH_ID,BRANCH_NAME,BRANCH_STATE
0,B00001,ND00000,North Dakota
1,B00002,NE00001,Nebraska
2,B00003,CT00002,Connecticut
3,B00004,MO00003,Missouri
4,B00005,NV00004,Nevada


LOANS DATA:


Unnamed: 0,LOAN_ID,CUSTOMER_ID,BRANCH_ID,LOAN_AMOUNT
0,L00001,C00341,B00033,6411642
1,L00002,C00344,B00009,7470012
2,L00003,C00551,B00028,3698741
3,L00004,C00974,B00004,7179821
4,L00005,C00097,B00022,8802165


TRANSACTIONS DATA:


Unnamed: 0,TRANSCATION_ID,ACCOUNT_ID,TRANSCATION_DATE,TRANSCATION_MEDIA,TRANSCATION_TYPE,TRANSCATION_AMOUNT
0,T00001,A00866,2013-12-15,Credit_Card,Deposit,37615
1,T00002,A00106,2018-08-21,Debit_Card,Transfer,9212
2,T00003,A00233,2020-12-25,Check,Deposit,62526
3,T00004,A00848,2021-10-01,Credit_Card,Transfer,90070
4,T00005,A00404,2018-09-06,Cash,Deposit,80153


In [4]:
# View basic information
print("\n--- Customers Info ---")
customers.info()

print("\n--- Accounts Info ---")
accounts.info()

print("\n--- Transactions Info ---")
transactions.info()

# Summary of numeric columns
print("\n--- Numeric Summary (Customers) ---")
display(customers.describe())

print("\n--- Numeric Summary (Accounts) ---")
display(accounts.describe())

print("\n--- Numeric Summary (Loans) ---")
display(loans.describe())



--- Customers Info ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000 entries, 0 to 2999
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   CUSTOMER_ID   3000 non-null   object
 1   First_Name    3000 non-null   object
 2   Last_Name     3000 non-null   object
 3   City          3000 non-null   object
 4   Phone_Number  3000 non-null   int64 
 5   Occupation    3000 non-null   object
 6   DOB           3000 non-null   object
dtypes: int64(1), object(6)
memory usage: 164.2+ KB

--- Accounts Info ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000 entries, 0 to 2999
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   ACCOUNT_ID         3000 non-null   object
 1   CUSTOMER_ID        3000 non-null   object
 2   BRANCH_ID          3000 non-null   object
 3   OPENING_BALANCE    3000 non-null   int64 
 4   ACCOUNT_OPEN_DATE  3000 

Unnamed: 0,Phone_Number
count,3000.0
mean,5014965000.0
std,2897979000.0
min,1472273.0
25%,2466813000.0
50%,5066656000.0
75%,7580025000.0
max,9994012000.0



--- Numeric Summary (Accounts) ---


Unnamed: 0,OPENING_BALANCE
count,3000.0
mean,495450.233333
std,284926.442682
min,213.0
25%,254713.0
50%,496964.0
75%,735207.25
max,999719.0



--- Numeric Summary (Loans) ---


Unnamed: 0,LOAN_AMOUNT
count,1500.0
mean,5124803.0
std,2796704.0
min,11551.0
25%,2904425.0
50%,5037467.0
75%,7563591.0
max,9994199.0


In [5]:
def missing_summary(df, name):
    missing = df.isnull().sum()
    print(f"\nMissing Values Summary for {name}:")
    print(missing[missing > 0])

missing_summary(customers, 'Customers')
missing_summary(accounts, 'Accounts')
missing_summary(branches, 'Branches')
missing_summary(loans, 'Loans')
missing_summary(transactions, 'Transactions')



Missing Values Summary for Customers:
Series([], dtype: int64)

Missing Values Summary for Accounts:
Series([], dtype: int64)

Missing Values Summary for Branches:
Series([], dtype: int64)

Missing Values Summary for Loans:
Series([], dtype: int64)

Missing Values Summary for Transactions:
Series([], dtype: int64)


In [6]:
print("\n--- Unique IDs ---")
print("Unique Customers:", customers['CUSTOMER_ID'].nunique())
print("Unique Accounts:", accounts['ACCOUNT_ID'].nunique())
print("Unique Branches:", branches['BRANCH_ID'].nunique())
print("Unique Loans:", loans['LOAN_ID'].nunique())

print("\n--- Relationship Validation ---")

# Each account should have a valid customer
invalid_accounts = accounts[~accounts['CUSTOMER_ID'].isin(customers['CUSTOMER_ID'])]
print("Invalid Accounts (missing customer link):", len(invalid_accounts))

# Each loan should have a valid customer
invalid_loans = loans[~loans['CUSTOMER_ID'].isin(customers['CUSTOMER_ID'])]
print("Invalid Loans (missing customer link):", len(invalid_loans))

# Each account should belong to a valid branch
invalid_branches = accounts[~accounts['BRANCH_ID'].isin(branches['BRANCH_ID'])]
print("Invalid Accounts (missing branch link):", len(invalid_branches))



--- Unique IDs ---
Unique Customers: 1000
Unique Accounts: 1000
Unique Branches: 50
Unique Loans: 500

--- Relationship Validation ---
Invalid Accounts (missing customer link): 0
Invalid Loans (missing customer link): 0
Invalid Accounts (missing branch link): 0


In [7]:
import IPython.display as display
from IPython.display import Markdown

display.Markdown("""
### 🔗 Data Relationship Summary

| Table | Primary Key | Foreign Keys | Relationship |
|--------|--------------|--------------|---------------|
| Bank_Customer_Data | CUSTOMER_ID | – | Base entity |
| Bank_Account_Data | ACCOUNT_ID | CUSTOMER_ID, BRANCH_ID | Links customers → branches |
| Bank_Branch_Data | BRANCH_ID | – | Connects to accounts and loans |
| Bank_Loan_Data | LOAN_ID | CUSTOMER_ID, BRANCH_ID | Connects to customers and branches |
| Bank_Transacation_Data | TRANSCATION_ID | ACCOUNT_ID | Linked to accounts (many-to-one) |

**Relationship Flow:**
Customer (1) → Account (Many) → Transaction (Many)  
Customer (1) → Loan (Many)  
Branch (1) → Account/Loan (Many)
""")



### 🔗 Data Relationship Summary

| Table | Primary Key | Foreign Keys | Relationship |
|--------|--------------|--------------|---------------|
| Bank_Customer_Data | CUSTOMER_ID | – | Base entity |
| Bank_Account_Data | ACCOUNT_ID | CUSTOMER_ID, BRANCH_ID | Links customers → branches |
| Bank_Branch_Data | BRANCH_ID | – | Connects to accounts and loans |
| Bank_Loan_Data | LOAN_ID | CUSTOMER_ID, BRANCH_ID | Connects to customers and branches |
| Bank_Transacation_Data | TRANSCATION_ID | ACCOUNT_ID | Linked to accounts (many-to-one) |

**Relationship Flow:**
Customer (1) → Account (Many) → Transaction (Many)  
Customer (1) → Loan (Many)  
Branch (1) → Account/Loan (Many)


In [8]:
summary_report = {
    "customers_count": customers['CUSTOMER_ID'].nunique(),
    "accounts_count": accounts['ACCOUNT_ID'].nunique(),
    "branches_count": branches['BRANCH_ID'].nunique(),
    "loans_count": loans['LOAN_ID'].nunique(),
    "transactions_count": transactions['TRANSCATION_ID'].nunique()
}

pd.DataFrame([summary_report]).to_csv('../docs/data_summary.csv', index=False)
print("📄 Summary report saved to docs/data_summary.csv")


📄 Summary report saved to docs/data_summary.csv


In [9]:
display.Markdown("""
## ✅ Next Step: Data Cleaning & Preprocessing

You can now proceed to `02_data_cleaning.ipynb`, where you will:
- Standardize column names
- Handle missing values
- Convert dates to proper format
- Merge datasets (customer → account → transaction → loan)
- Create derived KPIs (Customer Age, Churn Flag, Avg Balance, Loan Conversion)

After cleaning, you’ll have a single file: `merged_dataset.csv` ready for Tableau.
""")



## ✅ Next Step: Data Cleaning & Preprocessing

You can now proceed to `02_data_cleaning.ipynb`, where you will:
- Standardize column names
- Handle missing values
- Convert dates to proper format
- Merge datasets (customer → account → transaction → loan)
- Create derived KPIs (Customer Age, Churn Flag, Avg Balance, Loan Conversion)

After cleaning, you’ll have a single file: `merged_dataset.csv` ready for Tableau.
