In [1]:
import joblib
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder

In [2]:
# Load the raw credit-card transactions and preview the first rows.
creditcard_csv = "../data/raw/creditcard.csv"
df = pd.read_csv(creditcard_csv)
df.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [3]:
# Fit the scaler on training rows only, so that test-set statistics do not leak
# into the standardization. The row selection uses the same test_size /
# stratify / random_state as the train_test_split call on X, y later in this
# notebook, which is intended to select the same training rows
# (NOTE(review): keep the two calls in sync if either one changes).
train_pos, _ = train_test_split(
    np.arange(len(df)),
    test_size=0.2,
    stratify=df['Class'],
    random_state=42
)

scaler = StandardScaler()
scaler.fit(df.iloc[train_pos][['Amount', 'Time']])

# Apply the train-fitted transform to every row; the columns are overwritten in
# place so the later split yields consistently scaled train and test sets.
df[['Amount', 'Time']] = scaler.transform(df[['Amount', 'Time']])

In [4]:
# Target is the 'Class' column; features are every remaining column.
y = df['Class']
X = df.drop(columns=['Class'])

In [5]:
# Hold out 20% of the rows for testing, stratified on the target, with a fixed
# seed so the partition is reproducible.
cc_splits = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = cc_splits

In [6]:
# Write each credit-card split to its own CSV, without the index column.
cc_outputs = {
    "../data/processed/cc_X_train.csv": X_train,
    "../data/processed/cc_X_test.csv": X_test,
    "../data/processed/cc_y_train.csv": y_train,
    "../data/processed/cc_y_test.csv": y_test,
}
for csv_path, split in cc_outputs.items():
    split.to_csv(csv_path, index=False)

In [7]:
# Load the two raw IEEE tables: transactions first, then identity.
train_trans, train_id = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../data/raw/train_transaction.csv",
        "../data/raw/train_identity.csv",
    )
)

In [8]:
# Left-join identity onto transactions by TransactionID, so every transaction
# row is kept even when it has no matching identity row.
ieee_df = pd.merge(train_trans, train_id, how='left', on='TransactionID')
ieee_df.head()

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,...,id_31,id_32,id_33,id_34,id_35,id_36,id_37,id_38,DeviceType,DeviceInfo
0,2987000,0,86400,68.5,W,13926,,150.0,discover,142.0,...,,,,,,,,,,
1,2987001,0,86401,29.0,W,2755,404.0,150.0,mastercard,102.0,...,,,,,,,,,,
2,2987002,0,86469,59.0,W,4663,490.0,150.0,visa,166.0,...,,,,,,,,,,
3,2987003,0,86499,50.0,W,18132,567.0,150.0,mastercard,117.0,...,,,,,,,,,,
4,2987004,0,86506,50.0,H,4497,514.0,150.0,mastercard,102.0,...,samsung browser 6.2,32.0,2220x1080,match_status:2,T,F,T,T,mobile,SAMSUNG SM-G892A Build/NRD90M


In [9]:
# Replace every missing value in the merged frame with the sentinel -999.
ieee_df = ieee_df.fillna(value=-999)

In [10]:
# Columns stored with object dtype are treated as categorical.
cat_cols = ieee_df.select_dtypes(include='object').columns

# Use one encoder per column and keep each of them: a single shared encoder is
# refit on every iteration, which throws away the category -> integer mapping of
# all but the last column and makes the encoding impossible to invert or to
# reapply to new data.
label_encoders = {}
for col in cat_cols:
    le = LabelEncoder()
    # Cast to str first so mixed values (e.g. the -999 fill value next to
    # strings) are encoded uniformly.
    ieee_df[col] = le.fit_transform(ieee_df[col].astype(str))
    label_encoders[col] = le

In [11]:
# Target is the 'isFraud' column; features are every remaining column.
y_ieee = ieee_df['isFraud']
X_ieee = ieee_df.drop(columns=['isFraud'])

In [12]:
# Hold out 20% of the rows for testing, stratified on the target, with a fixed
# seed so the partition is reproducible.
ieee_splits = train_test_split(
    X_ieee, y_ieee, test_size=0.2, random_state=42, stratify=y_ieee
)
X_ieee_train, X_ieee_test, y_ieee_train, y_ieee_test = ieee_splits

In [13]:
# Write each IEEE split to its own CSV, without the index column.
ieee_outputs = {
    "../data/processed/ieee_X_train.csv": X_ieee_train,
    "../data/processed/ieee_X_test.csv": X_ieee_test,
    "../data/processed/ieee_y_train.csv": y_ieee_train,
    "../data/processed/ieee_y_test.csv": y_ieee_test,
}
for csv_path, split in ieee_outputs.items():
    split.to_csv(csv_path, index=False)


In [14]:
# Persist the scaler fitted on the credit-card Amount/Time columns so the same
# transform can be reloaded later. joblib is imported in the notebook's import
# cell rather than here, so all dependencies are visible in one place.
joblib.dump(scaler, "../models/scaler.pkl")

['../models/scaler.pkl']

## Preprocessing Summary

### Credit Card Dataset
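- Standardized the `Time` and `Amount` features with `StandardScaler`
- Split into training and testing sets (80/20, stratified on `Class`, `random_state=42`)
- Saved the processed splits to `../data/processed/` and the fitted scaler to `../models/scaler.pkl`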

### IEEE Dataset
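- Merged transaction and identity tables (left join on `TransactionID`)
- Filled missing values with the sentinel `-999`
- Label-encoded categorical (object-dtype) features
- Split into training and testing sets (80/20, stratified on `isFraud`, `random_state=42`)
- Saved the processed splits to `../data/processed/`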

The datasets are now ready for modeling.
