In [None]:
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.preprocessing import StandardScaler

# Preprocess the credit-card dataset: report the class balance, standardize
# the two raw columns (Amount, Time), plot the class counts, and write the
# processed frame to disk.
RAW_CREDIT_PATH = Path('../data/raw/creditcard.csv')
PROCESSED_CREDIT_PATH = Path('../data/processed/processed_creditcard.csv')

# 1. Load the bank data
credit_df = pd.read_csv(RAW_CREDIT_PATH)

# 2. EXPLICIT CLASS IMBALANCE DOCUMENTATION
# 'Class' is treated as a 0/1 label, so its mean is the positive-class share.
print("--- Credit Card Class Distribution ---")
print(credit_df['Class'].value_counts())
print(f"Fraud Percentage: {credit_df['Class'].mean()*100:.4f}%")

# 3. SCALING
# Per the original note, V1-V28 are already scaled; Time and Amount are not.
# Each column gets its own scaler so that both sets of fitted statistics
# survive and can be reused (transform / inverse_transform) later. Refitting
# a single scaler on Time would discard the Amount statistics.
# NOTE(review): both scalers are fitted on the full dataset. If a train/test
# split happens downstream, confirm that this leakage is acceptable.
amount_scaler = StandardScaler()
time_scaler = StandardScaler()
credit_df['scaled_amount'] = amount_scaler.fit_transform(credit_df[['Amount']]).ravel()
credit_df['scaled_time'] = time_scaler.fit_transform(credit_df[['Time']]).ravel()
# Backward-compatible alias: `scaler` previously ended this cell fitted on Time.
scaler = time_scaler

# Drop the old unscaled columns
credit_df = credit_df.drop(columns=['Time', 'Amount'])

# 4. VISUALIZING THE CHALLENGE
plt.figure(figsize=(8, 5))
sns.countplot(x='Class', data=credit_df)
plt.title('Extreme Class Imbalance in Bank Data')
plt.show()

# 5. SAVE PROCESSED DATA
# Create the output directory first; to_csv does not create missing parents.
PROCESSED_CREDIT_PATH.parent.mkdir(parents=True, exist_ok=True)
credit_df.to_csv(PROCESSED_CREDIT_PATH, index=False)
print("Task 1b Complete: Data Scaled and Saved.")

Legit: 284315, Fraud: 492
Percentage of Fraud: 0.1727%
