In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

warnings.filterwarnings('ignore')

# Plotting settings
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 6)
plt.rcParams['font.size'] = 10

print("✓ Libraries imported successfully")

✓ Libraries imported successfully


In [2]:
pd.set_option("display.max_columns", 31)
pd.set_option("display.precision",2)

In [3]:
df = pd.read_csv("../data/raw/creditcard.csv")
shape_df = df.shape

In [4]:
print(df.duplicated().sum())
df_nodup = df.drop_duplicates()
shape_nodup = df_nodup.shape
print(shape_nodup)
print(shape_df)
print(f"There were {shape_df[0] - shape_nodup[0]} duplicates, that have been removed")

1081
(283726, 31)
(284807, 31)
There were 1081 duplicates, that have been removed


In [5]:
from sklearn.model_selection import train_test_split
X = df_nodup.drop(columns = ["Class"])
y = df_nodup["Class"]

print(X.shape)
print(y.shape)

(283726, 30)
(283726,)


In [6]:
X_train, X_test, y_train, y_test = train_test_split( X, y, test_size = 0.2, stratify = y, random_state = 42)

In [7]:
print(f"X_train : \n {X_train.shape}")
print(f"X_test: \n {X_test.shape}")
print(f"y_train: \n {y_train.shape}\n y_test: \n {y_test.shape}")

X_train : 
 (226980, 30)
X_test: 
 (56746, 30)
y_train: 
 (226980,)
 y_test: 
 (56746,)


In [8]:
X_train["Hour"] = (X_train["Time"] /3600%24).round()

print(X_train.columns)
X_test["Hour"] = (X_test["Time"] /3600%24)
print(X_test.columns)

X_train.head()

Index(['Time', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10',
       'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20',
       'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'Amount',
       'Hour'],
      dtype='object')
Index(['Time', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10',
       'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20',
       'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'Amount',
       'Hour'],
      dtype='object')


Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14,V15,V16,V17,V18,V19,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Hour
226238,144549.0,2.24,-1.72,-2.15,-2.58,0.99,3.57,-1.79,0.86,-1.26,1.57,-0.24,-0.71,0.36,-0.31,0.29,-0.58,0.38,-0.18,-0.32,-0.32,-0.15,-0.05,0.28,0.68,-0.22,-0.159,0.04,-0.05,32.0,16.0
134253,80716.0,-1.32,1.63,0.6,-0.04,-0.4,-0.97,0.21,0.74,-1.27,-0.48,1.44,1.76,1.25,0.86,-0.27,0.39,-0.27,-0.34,0.39,-0.07,-0.24,-0.95,0.32,0.52,-0.71,-0.267,-0.02,0.05,6.99,22.0
186465,127116.0,1.91,0.02,-2.09,0.13,1.16,0.61,-0.02,0.18,0.28,-0.5,1.53,1.04,0.48,-0.69,0.96,-0.28,0.79,-0.16,-0.99,-0.21,0.29,1.1,-0.04,-1.69,0.11,0.00776,0.05,-0.05,14.95,11.0
149493,91342.0,1.81,0.32,0.32,3.88,0.05,1.02,-0.73,0.23,0.68,1.15,1.34,-1.78,2.18,1.24,-2.04,1.34,-0.48,0.82,-1.69,-0.23,0.14,0.7,0.17,0.7,-0.21,-0.01,-0.02,-0.04,17.3,1.0
18461,29522.0,1.36,-1.12,0.55,-1.55,-1.19,0.28,-1.2,0.21,-2.09,1.49,1.74,0.01,0.33,-0.04,0.47,-0.66,0.79,-0.64,-0.59,-0.36,-0.34,-0.64,0.25,-0.34,-0.06,-0.44,0.06,0.01,24.0,8.0


In [9]:
X_train["is_night"] = np.where((X_train["Hour"]>=0) & (X_train["Hour"] <=6), 1 , 0 )
print(X_train.head(20))
X_test["is_night"] = np.where((X_test["Hour"]>=0) & (X_test["Hour"] <=6), 1 , 0 )

            Time    V1    V2    V3    V4    V5    V6    V7    V8    V9   V10  \
226238  144549.0  2.24 -1.72 -2.15 -2.58  0.99  3.57 -1.79  0.86 -1.26  1.57   
134253   80716.0 -1.32  1.63  0.60 -0.04 -0.40 -0.97  0.21  0.74 -1.27 -0.48   
186465  127116.0  1.91  0.02 -2.09  0.13  1.16  0.61 -0.02  0.18  0.28 -0.50   
149493   91342.0  1.81  0.32  0.32  3.88  0.05  1.02 -0.73  0.23  0.68  1.15   
18461    29522.0  1.36 -1.12  0.55 -1.55 -1.19  0.28 -1.20  0.21 -2.09  1.49   
85069    60586.0 -5.86  4.83 -0.98 -1.25 -0.79 -1.28  1.08 -0.35  4.52  7.41   
35270    38042.0  1.24  0.02  0.32  0.56 -0.18 -0.16 -0.03 -0.12  0.43 -0.27   
279124  168663.0 -0.88  0.28 -0.79 -3.79 -0.06 -1.98  0.98 -0.10 -0.30 -1.10   
251528  155381.0  2.01  0.12 -1.60  0.35  0.40 -0.66  0.11 -0.11  0.19 -0.22   
248025  153773.0 -2.49  2.91 -2.49 -1.53  0.10 -1.46  0.58  0.89  0.49  1.41   
116723   74401.0  1.13 -0.26  1.34  0.90 -1.16 -0.05 -0.78  0.26  1.08 -0.25   
33895    37429.0  1.15 -1.03  0.03 -2.41

In [10]:
X_train["log_amount"] = np.log1p(X_train["Amount"])


In [11]:
X_test["log_amount"] = np.log1p(X_test["Amount"])

In [12]:
from numpy import isnan

In [13]:


print(X_train.shape)
print(X_test.shape)
print((X_train["Hour"]>24).any() | (X_train["Hour"]<0).any())
print(X_train["log_amount"].isna().any())
print(X_train['is_night'].value_counts())
print(X_train['Hour'].min(), X_train['Hour'].max())
print(X_train['log_amount'].describe())




(226980, 33)
(56746, 33)
False
False
is_night
0    206586
1     20394
Name: count, dtype: int64
0.0 24.0
count    226980.00
mean          3.16
std           1.66
min           0.00
25%           1.90
50%           3.14
75%           4.37
max           9.89
Name: log_amount, dtype: float64


In [14]:
print(y_test.value_counts(normalize = True))
print(y_train.value_counts(normalize = True))

Class
0    9.98e-01
1    1.67e-03
Name: proportion, dtype: float64
Class
0    9.98e-01
1    1.67e-03
Name: proportion, dtype: float64


In [15]:
from sklearn.preprocessing import StandardScaler

In [16]:
scaler = StandardScaler()

In [17]:
X_train_scaled = scaler.fit_transform(X_train)


In [18]:
X_test_scaled = scaler.transform(X_test)

In [19]:
print(X_test.shape)
print(X_train.shape)
print(X_train_scaled.shape)
print(X_test_scaled.shape)


(56746, 33)
(226980, 33)
(226980, 33)
(56746, 33)


In [20]:
X_train_scaled = pd.DataFrame(X_train_scaled, columns = X_train.columns, index = X_train.index)
X_test_scaled = pd.DataFrame(X_test_scaled, columns = X_test.columns, index = X_test.index)

In [21]:
print(X_train.head())
X_train_scaled.head()

            Time    V1    V2    V3    V4    V5    V6    V7    V8    V9   V10  \
226238  144549.0  2.24 -1.72 -2.15 -2.58  0.99  3.57 -1.79  0.86 -1.26  1.57   
134253   80716.0 -1.32  1.63  0.60 -0.04 -0.40 -0.97  0.21  0.74 -1.27 -0.48   
186465  127116.0  1.91  0.02 -2.09  0.13  1.16  0.61 -0.02  0.18  0.28 -0.50   
149493   91342.0  1.81  0.32  0.32  3.88  0.05  1.02 -0.73  0.23  0.68  1.15   
18461    29522.0  1.36 -1.12  0.55 -1.55 -1.19  0.28 -1.20  0.21 -2.09  1.49   

         V11   V12   V13   V14  ...   V18   V19   V20   V21   V22   V23   V24  \
226238 -0.24 -0.71  0.36 -0.31  ... -0.18 -0.32 -0.32 -0.15 -0.05  0.28  0.68   
134253  1.44  1.76  1.25  0.86  ... -0.34  0.39 -0.07 -0.24 -0.95  0.32  0.52   
186465  1.53  1.04  0.48 -0.69  ... -0.16 -0.99 -0.21  0.29  1.10 -0.04 -1.69   
149493  1.34 -1.78  2.18  1.24  ...  0.82 -1.69 -0.23  0.14  0.70  0.17  0.70   
18461   1.74  0.01  0.33 -0.04  ... -0.64 -0.59 -0.36 -0.34 -0.64  0.25 -0.34   

         V25       V26   V27   V

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14,...,V18,V19,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Hour,is_night,log_amount
226238,1.05,1.15,-1.04,-1.43,-1.82,0.73,2.7,-1.49,0.74,-1.15,1.46,-0.23,-0.71,0.37,-0.33,...,-0.22,-0.39,-0.43,-0.21,-0.07,0.45,1.13,-0.42,-0.33,0.09,-0.16,-0.23,0.25,-0.31,0.21
134253,-0.3,-0.68,0.99,0.39,-0.03,-0.3,-0.73,0.18,0.63,-1.16,-0.45,1.41,1.78,1.26,0.91,...,-0.41,0.48,-0.09,-0.33,-1.31,0.52,0.85,-1.37,-0.55,-0.05,0.15,-0.33,1.27,-0.31,-0.65
186465,0.68,0.98,0.02,-1.39,0.09,0.85,0.46,-0.02,0.15,0.26,-0.46,1.5,1.05,0.48,-0.72,...,-0.19,-1.22,-0.28,0.41,1.51,-0.07,-2.79,0.2,0.02,0.11,-0.17,-0.3,-0.61,-0.31,-0.23
149493,-0.07,0.93,0.2,0.21,2.74,0.03,0.77,-0.61,0.2,0.63,1.07,1.31,-1.79,2.19,1.3,...,0.98,-2.08,-0.3,0.19,0.97,0.28,1.16,-0.41,-0.02,-0.05,-0.12,-0.29,-2.31,3.18,-0.15
18461,-1.38,0.7,-0.68,0.36,-1.09,-0.88,0.21,-1.0,0.18,-1.91,1.39,1.71,0.01,0.33,-0.05,...,-0.77,-0.73,-0.48,-0.47,-0.88,0.4,-0.57,-0.12,-0.91,0.16,0.04,-0.26,-1.12,-0.31,0.04


In [25]:
print(X_train_scaled["Amount"].mean(axis = 0))
print(X_train_scaled["Hour"].mean(axis = 0))

7.48796468384069e-17
-1.2671940234191935e-16


In [26]:
print(X_train_scaled["Amount"].std(axis = 0))
print(X_train_scaled["Hour"].std(axis = 0))

1.000002202844533
1.0000022028445332


In [24]:
print(X_train["Amount"].iloc[0])
print(X_train_scaled["Amount"].iloc[0])

32.0
-0.22943372384467148


In [31]:
from imblearn.over_sampling import SMOTE

In [32]:
sm = SMOTE(random_state = 42)

In [35]:
X_train_smote, y_train_smote = sm.fit_resample(X_train_scaled,y_train)

In [45]:
print(X_train_smote.shape, y_train_smote.shape)
print(X_train_smote.head())
print(y_train_smote.value_counts())

(453204, 33) (453204,)
0    0
1    0
2    0
3    0
4    0
Name: Class, dtype: int64
Class
0    226602
1    226602
Name: count, dtype: int64


In [43]:
from imblearn.under_sampling import RandomUnderSampler


In [44]:
rus = RandomUnderSampler(random_state = 42)

In [46]:
X_train_under, y_train_under = rus.fit_resample(X_train_scaled, y_train)

In [50]:
print(y_train.value_counts())
print(X_train_under.shape, y_train_under.shape)
print(X_train_under.head())
print(y_train_under.value_counts())

Class
0    226602
1       378
Name: count, dtype: int64
(756, 33) (756,)
        Time    V1        V2    V3    V4    V5    V6    V7    V8        V9  \
227442  1.06  1.05  4.57e-02 -1.21  0.17  0.42 -0.30  0.04 -0.04  2.91e-01   
218717  0.98  0.99 -7.45e-01 -0.29 -0.49 -0.97 -0.64 -0.64 -0.17  7.03e-03   
76902  -0.80 -1.01  7.73e-01  0.81  0.22 -0.94  0.12 -0.83  1.21 -2.14e-01   
86663  -0.71 -0.63  7.08e-04 -0.09 -0.92 -0.27 -1.52  0.20  0.07 -1.21e+00   
167195  0.50  0.98 -1.16e-01 -0.19  1.13 -0.35 -0.23 -0.23  0.02  1.15e+00   

             V10   V11   V12   V13   V14  ...   V18   V19   V20   V21   V22  \
227442 -1.82e-01  0.78  0.53 -0.38 -0.61  ...  0.40  0.56 -0.21 -0.47 -1.27   
218717  6.05e-01 -0.62 -0.14  0.68 -0.53  ... -1.86  0.71  0.29  0.04 -0.18   
76902  -1.16e+00 -0.92  1.24  1.19  0.22  ... -0.64 -0.26 -0.50  0.26  0.42   
86663   5.17e-01 -0.62 -0.07  0.16  0.52  ...  0.82 -1.77 -0.43 -0.30 -0.32   
167195 -1.40e-03 -1.32  0.47 -0.76 -0.16  ... -1.09  0.17 -0.43

In [51]:
from imblearn.pipeline import Pipeline

In [52]:
pipeline = Pipeline([
    ('smote', SMOTE(sampling_strategy=0.5, random_state=42)),
    ('under', RandomUnderSampler(sampling_strategy=1.0, random_state=42))
])
X_train_combined, y_train_combined = pipeline.fit_resample(X_train_scaled, y_train)


In [53]:
print(X_train_combined.shape, y_train_combined.shape)

(226602, 33) (226602,)


In [54]:
print(y_train_combined.value_counts())

Class
0    113301
1    113301
Name: count, dtype: int64


In [59]:
print(X_train_scaled.shape,y_train.shape)
print(X_train_smote.shape, y_train_smote.shape)
print(X_train_under.shape, y_train_under.shape)
print(X_train_combined.shape, y_train_combined.shape)

(226980, 33) (226980,)
(453204, 33) (453204,)
(756, 33) (756,)
(226602, 33) (226602,)


In [61]:
print(y_train.value_counts())
print(y_train_smote.value_counts())
print(y_train_under.value_counts())
print(y_train_combined.value_counts())


Class
0    226602
1       378
Name: count, dtype: int64
Class
0    226602
1    226602
Name: count, dtype: int64
Class
0    378
1    378
Name: count, dtype: int64
Class
0    113301
1    113301
Name: count, dtype: int64
