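# Purpose of the Data Preprocessing Notebook

This notebook loads the raw credit card transactions (`../data/raw/creditcard.csv`), removes duplicate rows, splits the data into stratified train/test sets, scales `Amount` with a `RobustScaler` fitted on the training set only, derives an `Hour` feature from `Time`, and saves the preprocessed splits and the fitted scaler to disk.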

# 1. Import Libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, RobustScaler

# 2. Load Raw Dataset

In [2]:
# Location of the raw transactions file, relative to this notebook.
data_path = '../data/raw/creditcard.csv'

# Parse the CSV into a DataFrame using pandas' default settings.
df = pd.read_csv(filepath_or_buffer=data_path)

In [3]:
# Preview the raw frame as loaded (the notebook shows a truncated head/tail view).
df

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.166480,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.167170,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.379780,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.108300,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.50,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.206010,0.502292,0.219422,0.215153,69.99,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
284802,172786.0,-11.881118,10.071785,-9.834783,-2.066656,-5.364473,-2.606837,-4.918215,7.305334,1.914428,...,0.213454,0.111864,1.014480,-0.509348,1.436807,0.250034,0.943651,0.823731,0.77,0
284803,172787.0,-0.732789,-0.055080,2.035030,-0.738589,0.868229,1.058415,0.024330,0.294869,0.584800,...,0.214205,0.924384,0.012463,-1.016226,-0.606624,-0.395255,0.068472,-0.053527,24.79,0
284804,172788.0,1.919565,-0.301254,-3.249640,-0.557828,2.630515,3.031260,-0.296827,0.708417,0.432454,...,0.232045,0.578229,-0.037501,0.640134,0.265745,-0.087371,0.004455,-0.026561,67.88,0
284805,172788.0,-0.240440,0.530483,0.702510,0.689799,-0.377961,0.623708,-0.686180,0.679145,0.392087,...,0.265245,0.800049,-0.163298,0.123205,-0.569159,0.546668,0.108821,0.104533,10.00,0


In [4]:
# Report the (rows, columns) size, then render the first five rows.
print(f"Dataset shape: {df.shape}")
display(df.head(5))

Dataset shape: (284807, 31)


Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


# 3. Drop Duplicate Rows

In [6]:
# Keep the first occurrence of every fully identical row and discard the repeats.
# The original row labels are kept (the index is not reset).
df = df.drop_duplicates(keep='first')

In [7]:
# Inspect the frame again now that duplicate rows have been dropped.
df

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.166480,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.167170,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.379780,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.108300,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.50,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.206010,0.502292,0.219422,0.215153,69.99,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
284802,172786.0,-11.881118,10.071785,-9.834783,-2.066656,-5.364473,-2.606837,-4.918215,7.305334,1.914428,...,0.213454,0.111864,1.014480,-0.509348,1.436807,0.250034,0.943651,0.823731,0.77,0
284803,172787.0,-0.732789,-0.055080,2.035030,-0.738589,0.868229,1.058415,0.024330,0.294869,0.584800,...,0.214205,0.924384,0.012463,-1.016226,-0.606624,-0.395255,0.068472,-0.053527,24.79,0
284804,172788.0,1.919565,-0.301254,-3.249640,-0.557828,2.630515,3.031260,-0.296827,0.708417,0.432454,...,0.232045,0.578229,-0.037501,0.640134,0.265745,-0.087371,0.004455,-0.026561,67.88,0
284805,172788.0,-0.240440,0.530483,0.702510,0.689799,-0.377961,0.623708,-0.686180,0.679145,0.392087,...,0.265245,0.800049,-0.163298,0.123205,-0.569159,0.546668,0.108821,0.104533,10.00,0


In [8]:
# Confirm how many rows remain once duplicates are gone.
print(f"Shape after removing duplicates: {df.shape}")

Shape after removing duplicates: (283726, 31)


# 4. Separate Features and Target

In [9]:
# Target: the 'Class' label column.
y = df['Class']

# Features: every remaining column of df.
X = df.drop(columns=['Class'])

In [10]:
# Feature matrix: every column of df except 'Class'.
X

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,0.251412,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62
1,0.0,1.191857,0.266151,0.166480,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.069083,-0.225775,-0.638672,0.101288,-0.339846,0.167170,0.125895,-0.008983,0.014724,2.69
2,1.0,-1.358354,-1.340163,1.773209,0.379780,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.524980,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.208038,-0.108300,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.50
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,0.408542,-0.009431,0.798278,-0.137458,0.141267,-0.206010,0.502292,0.219422,0.215153,69.99
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
284802,172786.0,-11.881118,10.071785,-9.834783,-2.066656,-5.364473,-2.606837,-4.918215,7.305334,1.914428,...,1.475829,0.213454,0.111864,1.014480,-0.509348,1.436807,0.250034,0.943651,0.823731,0.77
284803,172787.0,-0.732789,-0.055080,2.035030,-0.738589,0.868229,1.058415,0.024330,0.294869,0.584800,...,0.059616,0.214205,0.924384,0.012463,-1.016226,-0.606624,-0.395255,0.068472,-0.053527,24.79
284804,172788.0,1.919565,-0.301254,-3.249640,-0.557828,2.630515,3.031260,-0.296827,0.708417,0.432454,...,0.001396,0.232045,0.578229,-0.037501,0.640134,0.265745,-0.087371,0.004455,-0.026561,67.88
284805,172788.0,-0.240440,0.530483,0.702510,0.689799,-0.377961,0.623708,-0.686180,0.679145,0.392087,...,0.127434,0.265245,0.800049,-0.163298,0.123205,-0.569159,0.546668,0.108821,0.104533,10.00


In [11]:
# Target vector: the 'Class' column of df.
y

0         0
1         0
2         0
3         0
4         0
         ..
284802    0
284803    0
284804    0
284805    0
284806    0
Name: Class, Length: 283726, dtype: int64

In [12]:
# Features and target must have the same number of rows.
print(f"Feature shape: {X.shape}")
print(f"Target shape: {y.shape}")

Feature shape: (283726, 30)
Target shape: (283726,)


# 5. Train/Test Split

In [13]:
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,    # hold out 20% of the rows for evaluation
    stratify=y,       # preserve the Class proportions in both splits
    random_state=42,  # fixed seed so the split is reproducible
)

In [14]:
# Check the resulting sizes of the two feature splits.
print(f"X_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")


X_train shape: (226980, 30)
X_test shape: (56746, 30)


# 6. Scaling Features

In [15]:
# RobustScaler centers on the median and scales by the interquartile range
# (scikit-learn defaults), which makes it less sensitive to extreme values
# than mean/variance scaling.
scaler = RobustScaler()

In [16]:
# Show the scaler's repr; it has not been fitted yet at this point.
scaler

In [17]:
# Learn the scaling statistics from the training 'Amount' values only, so
# nothing from the test split influences the transformation.
scaler.fit(X_train[['Amount']])

# Apply the fitted scaler to both splits and store the result in a new column.
X_train['Amount_scaled'] = scaler.transform(X_train[['Amount']])
X_test['Amount_scaled'] = scaler.transform(X_test[['Amount']])

In [18]:
# Spot-check the scaled amounts on the training split.
X_train['Amount_scaled']

226238    0.137472
134253   -0.209119
186465   -0.098808
149493   -0.066242
18461     0.026608
            ...   
225116    1.412417
3525      3.322062
242674    0.206763
60594    -0.236696
67057    -0.135394
Name: Amount_scaled, Length: 226980, dtype: float64

In [19]:
# Spot-check the scaled amounts on the test split (transformed with the training-fitted scaler).
X_test['Amount_scaled']

86568    -0.146619
251557   -0.279102
20232     0.958980
68952     2.853659
191852   -0.286447
            ...   
56516    -0.278271
188431   -0.167544
115561   -0.179324
187644   -0.293653
8267     -0.084950
Name: Amount_scaled, Length: 56746, dtype: float64

In [20]:
# The raw 'Amount' column is superseded by 'Amount_scaled'; remove it from both splits.
X_train = X_train.drop(columns=['Amount'])
X_test = X_test.drop(columns=['Amount'])

In [21]:
# Training features after 'Amount' has been replaced by 'Amount_scaled'.
X_train

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount_scaled
226238,144549.0,2.238954,-1.724499,-2.151484,-2.577803,0.993668,3.565492,-1.785957,0.860122,-1.264003,...,-0.323810,-0.149574,-0.049333,0.278442,0.684735,-0.219028,-0.159167,0.037920,-0.049932,0.137472
134253,80716.0,-1.315062,1.630783,0.597001,-0.038359,-0.404580,-0.965712,0.212249,0.735381,-1.267926,...,-0.067580,-0.238898,-0.946773,0.323904,0.515632,-0.713000,-0.266503,-0.017794,0.051058,-0.209119
186465,127116.0,1.908801,0.021184,-2.087997,0.129310,1.161468,0.605244,-0.022371,0.180296,0.283819,...,-0.210474,0.293609,1.095842,-0.044874,-1.689517,0.106098,0.007758,0.045164,-0.053068,-0.098808
149493,91342.0,1.811257,0.316556,0.316751,3.880231,0.048454,1.020163,-0.734868,0.233651,0.681423,...,-0.228032,0.138869,0.700422,0.174064,0.702997,-0.212523,-0.010018,-0.017740,-0.038006,-0.066242
18461,29522.0,1.358817,-1.120881,0.550266,-1.547659,-1.194950,0.275448,-1.201843,0.212889,-2.094285,...,-0.361686,-0.340972,-0.636442,0.252758,-0.344160,-0.064282,-0.439622,0.062524,0.013095,0.026608
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
225116,144113.0,-0.098309,-0.382823,0.202726,-0.732301,0.336342,-0.253872,0.837947,-0.402611,-1.424484,...,0.028820,0.154790,1.009327,0.002124,-0.462865,-0.422526,0.010597,0.041090,-0.004501,1.412417
3525,3013.0,-1.708940,-1.296255,1.953170,-2.277028,-0.510388,0.408807,0.329231,0.477331,1.058119,...,0.557379,0.466843,0.860939,0.361317,-0.264732,0.423478,0.065689,-0.026392,0.077562,3.322062
242674,151594.0,-0.875424,1.159992,0.573970,1.178888,-0.206267,0.260623,0.371124,0.443532,-0.011588,...,-0.353885,-0.100597,-0.157726,0.049751,-0.014156,-0.513855,-0.771982,-0.242970,0.094879,0.206763
60594,49395.0,1.247036,0.348595,0.589407,0.982998,-0.222000,-0.452687,-0.012336,-0.140168,0.052173,...,-0.042753,-0.211495,-0.556038,0.037106,-0.133330,0.403622,-0.595756,0.043911,0.032002,-0.236696


In [22]:
# Test features after 'Amount' has been replaced by 'Amount_scaled'.
X_test

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount_scaled
86568,61290.0,1.228821,-0.063408,0.274145,0.647465,-0.048135,0.372073,-0.224231,0.079939,0.640759,...,-0.096566,-0.129554,-0.083779,-0.151661,-0.700372,0.598550,0.491409,0.002989,0.001782,-0.146619
251557,155394.0,-0.203154,1.176678,-0.759595,-0.518472,0.629649,-0.721675,0.638893,0.243377,-0.157488,...,-0.098429,0.384629,1.206754,-0.082753,0.508386,-0.710906,-0.234510,0.379640,0.261351,-0.279102
20232,30881.0,-1.672836,1.401297,1.503940,2.175491,0.699791,1.062139,1.114364,-0.535822,-0.252983,...,0.828472,-0.525950,-0.408743,-0.280083,-0.846468,-0.155456,-0.062383,0.007777,0.113900,0.958980
68952,53203.0,0.819379,-1.124913,0.515025,0.513945,-1.009048,0.488484,-0.580672,0.187686,-0.999142,...,-0.110144,-0.057205,-0.168061,-0.198732,-0.337408,0.238538,-0.289454,0.038214,0.058407,2.853659
191852,129429.0,2.009701,0.105635,-1.752759,0.588312,0.374801,-0.637884,0.009260,-0.129487,0.492619,...,-0.135479,0.008970,0.357394,-0.013472,-0.446920,0.111522,0.642944,-0.036998,-0.043404,-0.286447
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56516,47464.0,-1.162452,0.795672,1.953356,1.654216,1.069023,-0.183501,0.093182,0.377642,-1.362161,...,0.023948,0.258460,0.502164,-0.121908,0.094908,-0.078996,-0.084585,0.093070,0.146178,-0.278271
188431,127974.0,-2.023289,2.214345,-2.097511,-1.041892,-0.120896,-0.027650,-0.595211,1.820963,-0.391636,...,0.014967,-0.202295,-0.810599,0.165183,-0.360040,-0.103692,0.173059,0.060668,-0.003436,-0.167544
115561,73912.0,1.180651,-0.199219,1.233565,0.869184,-1.138767,-0.315881,-0.660840,0.103571,1.011595,...,-0.134275,-0.097776,-0.114066,0.037897,0.419476,0.236291,0.318145,0.013352,0.026229,-0.179324
187644,127623.0,2.061059,0.136698,-1.830805,0.206364,0.705093,-0.326154,0.107304,-0.093259,0.184680,...,-0.102156,-0.332798,-0.861005,0.274903,0.113347,-0.219966,0.176290,-0.063574,-0.043578,-0.293653


In [23]:
# Preview the first rows using the notebook's rich DataFrame rendering.
# print() would fall back to plain text, which wraps wide frames into several
# hard-to-read column chunks.
X_train.head()

            Time        V1        V2        V3        V4        V5        V6  \
226238  144549.0  2.238954 -1.724499 -2.151484 -2.577803  0.993668  3.565492   
134253   80716.0 -1.315062  1.630783  0.597001 -0.038359 -0.404580 -0.965712   
186465  127116.0  1.908801  0.021184 -2.087997  0.129310  1.161468  0.605244   
149493   91342.0  1.811257  0.316556  0.316751  3.880231  0.048454  1.020163   
18461    29522.0  1.358817 -1.120881  0.550266 -1.547659 -1.194950  0.275448   

              V7        V8        V9  ...       V20       V21       V22  \
226238 -1.785957  0.860122 -1.264003  ... -0.323810 -0.149574 -0.049333   
134253  0.212249  0.735381 -1.267926  ... -0.067580 -0.238898 -0.946773   
186465 -0.022371  0.180296  0.283819  ... -0.210474  0.293609  1.095842   
149493 -0.734868  0.233651  0.681423  ... -0.228032  0.138869  0.700422   
18461  -1.201843  0.212889 -2.094285  ... -0.361686 -0.340972 -0.636442   

             V23       V24       V25       V26       V27       V28  

# 7. Optional: Feature Engineering

In [24]:
# Floor-dividing 'Time' by 3600 and wrapping modulo 24 yields a 0-23 value.
# NOTE(review): this assumes 'Time' is expressed in seconds -- confirm against the dataset docs.
X_train['Hour'] = X_train['Time'].floordiv(3600).mod(24)
X_test['Hour'] = X_test['Time'].floordiv(3600).mod(24)

In [25]:
# Spot-check the derived 'Hour' feature on the training split.
X_train['Hour']

226238    16.0
134253    22.0
186465    11.0
149493     1.0
18461      8.0
          ... 
225116    16.0
3525       0.0
242674    18.0
60594     13.0
67057     14.0
Name: Hour, Length: 226980, dtype: float64

In [26]:
# Spot-check the derived 'Hour' feature on the test split.
X_test['Hour']

86568     17.0
251557    19.0
20232      8.0
68952     14.0
191852    11.0
          ... 
56516     13.0
188431    11.0
115561    20.0
187644    11.0
8267       3.0
Name: Hour, Length: 56746, dtype: float64

In [27]:
# 'Hour' has been derived from 'Time', so the raw 'Time' column is removed from both splits.
X_train = X_train.drop(columns=['Time'])
X_test = X_test.drop(columns=['Time'])

In [28]:
# Final training features: the V* columns plus 'Amount_scaled' and 'Hour'.
X_train

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount_scaled,Hour
226238,2.238954,-1.724499,-2.151484,-2.577803,0.993668,3.565492,-1.785957,0.860122,-1.264003,1.567867,...,-0.149574,-0.049333,0.278442,0.684735,-0.219028,-0.159167,0.037920,-0.049932,0.137472,16.0
134253,-1.315062,1.630783,0.597001,-0.038359,-0.404580,-0.965712,0.212249,0.735381,-1.267926,-0.482635,...,-0.238898,-0.946773,0.323904,0.515632,-0.713000,-0.266503,-0.017794,0.051058,-0.209119,22.0
186465,1.908801,0.021184,-2.087997,0.129310,1.161468,0.605244,-0.022371,0.180296,0.283819,-0.497766,...,0.293609,1.095842,-0.044874,-1.689517,0.106098,0.007758,0.045164,-0.053068,-0.098808,11.0
149493,1.811257,0.316556,0.316751,3.880231,0.048454,1.020163,-0.734868,0.233651,0.681423,1.146705,...,0.138869,0.700422,0.174064,0.702997,-0.212523,-0.010018,-0.017740,-0.038006,-0.066242,1.0
18461,1.358817,-1.120881,0.550266,-1.547659,-1.194950,0.275448,-1.201843,0.212889,-2.094285,1.492821,...,-0.340972,-0.636442,0.252758,-0.344160,-0.064282,-0.439622,0.062524,0.013095,0.026608,8.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
225116,-0.098309,-0.382823,0.202726,-0.732301,0.336342,-0.253872,0.837947,-0.402611,-1.424484,0.665378,...,0.154790,1.009327,0.002124,-0.462865,-0.422526,0.010597,0.041090,-0.004501,1.412417,16.0
3525,-1.708940,-1.296255,1.953170,-2.277028,-0.510388,0.408807,0.329231,0.477331,1.058119,-2.015791,...,0.466843,0.860939,0.361317,-0.264732,0.423478,0.065689,-0.026392,0.077562,3.322062,0.0
242674,-0.875424,1.159992,0.573970,1.178888,-0.206267,0.260623,0.371124,0.443532,-0.011588,-0.192711,...,-0.100597,-0.157726,0.049751,-0.014156,-0.513855,-0.771982,-0.242970,0.094879,0.206763,18.0
60594,1.247036,0.348595,0.589407,0.982998,-0.222000,-0.452687,-0.012336,-0.140168,0.052173,-0.069423,...,-0.211495,-0.556038,0.037106,-0.133330,0.403622,-0.595756,0.043911,0.032002,-0.236696,13.0


In [29]:
# Final test features: the V* columns plus 'Amount_scaled' and 'Hour'.
X_test

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount_scaled,Hour
86568,1.228821,-0.063408,0.274145,0.647465,-0.048135,0.372073,-0.224231,0.079939,0.640759,-0.273054,...,-0.129554,-0.083779,-0.151661,-0.700372,0.598550,0.491409,0.002989,0.001782,-0.146619,17.0
251557,-0.203154,1.176678,-0.759595,-0.518472,0.629649,-0.721675,0.638893,0.243377,-0.157488,-0.506595,...,0.384629,1.206754,-0.082753,0.508386,-0.710906,-0.234510,0.379640,0.261351,-0.279102,19.0
20232,-1.672836,1.401297,1.503940,2.175491,0.699791,1.062139,1.114364,-0.535822,-0.252983,3.071219,...,-0.525950,-0.408743,-0.280083,-0.846468,-0.155456,-0.062383,0.007777,0.113900,0.958980,8.0
68952,0.819379,-1.124913,0.515025,0.513945,-1.009048,0.488484,-0.580672,0.187686,-0.999142,0.871966,...,-0.057205,-0.168061,-0.198732,-0.337408,0.238538,-0.289454,0.038214,0.058407,2.853659,14.0
191852,2.009701,0.105635,-1.752759,0.588312,0.374801,-0.637884,0.009260,-0.129487,0.492619,-0.527043,...,0.008970,0.357394,-0.013472,-0.446920,0.111522,0.642944,-0.036998,-0.043404,-0.286447,11.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56516,-1.162452,0.795672,1.953356,1.654216,1.069023,-0.183501,0.093182,0.377642,-1.362161,-0.398247,...,0.258460,0.502164,-0.121908,0.094908,-0.078996,-0.084585,0.093070,0.146178,-0.278271,13.0
188431,-2.023289,2.214345,-2.097511,-1.041892,-0.120896,-0.027650,-0.595211,1.820963,-0.391636,-0.233509,...,-0.202295,-0.810599,0.165183,-0.360040,-0.103692,0.173059,0.060668,-0.003436,-0.167544,11.0
115561,1.180651,-0.199219,1.233565,0.869184,-1.138767,-0.315881,-0.660840,0.103571,1.011595,-0.224579,...,-0.097776,-0.114066,0.037897,0.419476,0.236291,0.318145,0.013352,0.026229,-0.179324,20.0
187644,2.061059,0.136698,-1.830805,0.206364,0.705093,-0.326154,0.107304,-0.093259,0.184680,-0.223070,...,-0.332798,-0.861005,0.274903,0.113347,-0.219966,0.176290,-0.063574,-0.043578,-0.293653,11.0


# 8. Check Class Distribution in Train/Test

In [30]:
# Relative frequency of each Class value in the two splits; because the split
# was stratified on y, the proportions should be nearly identical.
train_class_share = y_train.value_counts(normalize=True)
test_class_share = y_test.value_counts(normalize=True)

print("Training class distribution:")
print(train_class_share)
print("\nTesting class distribution:")
print(test_class_share)


Training class distribution:
Class
0    0.998335
1    0.001665
Name: proportion, dtype: float64

Testing class distribution:
Class
0    0.998326
1    0.001674
Name: proportion, dtype: float64


# 9. Save Preprocessed Data

In [31]:
import joblib
import os

# Ensure the output folder for the preprocessed CSVs exists; exist_ok=True
# makes this a no-op when the folder is already there.
os.makedirs(name='../data/preprocessed', exist_ok=True)

In [32]:
# Write both feature splits to CSV without the row index.
for csv_path, features in (
    ('../data/preprocessed/X_train.csv', X_train),
    ('../data/preprocessed/X_test.csv', X_test),
):
    features.to_csv(csv_path, index=False)

In [33]:
# Write both target splits to CSV without the row index, in the same row
# order as the matching feature files.
for csv_path, labels in (
    ('../data/preprocessed/y_train.csv', y_train),
    ('../data/preprocessed/y_test.csv', y_test),
):
    labels.to_csv(csv_path, index=False)

In [34]:
# joblib.dump does not create missing parent directories, so make sure the
# target folder exists first (the notebook only creates '../data/preprocessed').
scaler_path = '../models/scaler.pkl'
os.makedirs(os.path.dirname(scaler_path), exist_ok=True)

# Persist the scaler fitted on the training 'Amount' column so the identical
# transformation can be re-applied to new data later.
joblib.dump(scaler, scaler_path)

['../models/scaler.pkl']

# Summary