In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.utils import resample

# Settings for this preprocessing cell, gathered in one place so they are easy to tune.
DATA_PATH = "creditcard.csv"
SCALED_COLUMNS = ["Amount", "Time"]
MAJORITY_PER_MINORITY = 10  # majority rows kept per minority row (10:1)
RANDOM_STATE = 42

# Load dataset
df = pd.read_csv(DATA_PATH)
print("Original Dataset:\n", df.head())

# Normalize 'Amount' and 'Time' columns.
# StandardScaler standardizes each column independently, so fitting both in a
# single call gives the same values as fitting them one at a time -- but the
# fitted `scaler` now keeps the statistics of BOTH columns. Refitting one
# scaler per column would leave it holding only the last column's statistics,
# making the first column's transform impossible to reuse or invert.
# NOTE(review): the scaler is fitted on the full dataset, and undersampling
# below also happens before the later train/test split, so the hold-out set is
# neither unseen by the scaler nor at the original class ratio -- confirm this
# is intended.
scaler = StandardScaler()
df[SCALED_COLUMNS] = scaler.fit_transform(df[SCALED_COLUMNS])
print("\nAfter Normalization:\n", df.head())

# Separate majority and minority classes
df_majority = df[df["Class"] == 0]
df_minority = df[df["Class"] == 1]

# Undersample majority class (10:1 ratio).
# resample(..., replace=False) raises ValueError when asked for more rows than
# exist, so the request is capped at the size of the majority class.
n_majority_keep = min(len(df_majority), len(df_minority) * MAJORITY_PER_MINORITY)
df_majority_downsampled = resample(
    df_majority,
    replace=False,
    n_samples=n_majority_keep,
    random_state=RANDOM_STATE,
)

# Combine and shuffle dataset; reset the index so row labels are 0..n-1 again.
df_balanced = (
    pd.concat([df_majority_downsampled, df_minority])
    .sample(frac=1, random_state=RANDOM_STATE)
    .reset_index(drop=True)
)
print("\nBalanced Dataset:\n", df_balanced.head())


Original Dataset:
    Time        V1        V2        V3        V4        V5        V6        V7  \
0   0.0 -1.359807 -0.072781  2.536347  1.378155 -0.338321  0.462388  0.239599   
1   0.0  1.191857  0.266151  0.166480  0.448154  0.060018 -0.082361 -0.078803   
2   1.0 -1.358354 -1.340163  1.773209  0.379780 -0.503198  1.800499  0.791461   
3   1.0 -0.966272 -0.185226  1.792993 -0.863291 -0.010309  1.247203  0.237609   
4   2.0 -1.158233  0.877737  1.548718  0.403034 -0.407193  0.095921  0.592941   

         V8        V9  ...       V21       V22       V23       V24       V25  \
0  0.098698  0.363787  ... -0.018307  0.277838 -0.110474  0.066928  0.128539   
1  0.085102 -0.255425  ... -0.225775 -0.638672  0.101288 -0.339846  0.167170   
2  0.247676 -1.514654  ...  0.247998  0.771679  0.909412 -0.689281 -0.327642   
3  0.377436 -1.387024  ... -0.108300  0.005274 -0.190321 -1.175575  0.647376   
4 -0.270533  0.817739  ... -0.009431  0.798278 -0.137458  0.141267 -0.206010   

        V26  

In [None]:
from sklearn.model_selection import train_test_split

# The "Class" column is the label; every other column is a predictor
y_balanced = df_balanced["Class"]
X_balanced = df_balanced.drop("Class", axis=1)

# Hold out 20% of the rows for testing. The split is stratified on the label
# so both parts keep the same class proportions, and seeded for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X_balanced,
    y_balanced,
    test_size=0.2,
    stratify=y_balanced,
    random_state=42,
)

# Preview the first rows of each partition
for heading, features in (("\nTraining Set:\n", X_train), ("\nTesting Set:\n", X_test)):
    print(heading, features.head())



Training Set:
           Time        V1        V2        V3        V4        V5        V6  \
4357 -0.086482 -0.255455 -4.467641 -1.795101  0.295300 -2.030577 -0.114797   
3334 -1.210828  1.438450 -1.108186  0.765152 -1.183354 -2.056695 -1.345164   
3392 -1.185116 -1.197948 -1.966561 -0.411709 -0.861518 -1.198300  0.249998   
3775 -0.590756 -0.830841  1.157957  1.052082  1.231561 -0.211692  0.385083   
2997  1.390058  0.208758  0.597888  0.117824 -0.770795  0.751306 -0.377900   

            V7        V8        V9  ...       V20       V21       V22  \
4357  0.611983 -0.262286  2.787857  ...  2.088147  0.750652 -0.230519   
3334 -1.044208 -0.147907 -1.590913  ... -0.429631 -0.430673 -1.031936   
3392 -0.286543 -2.835812 -1.638606  ...  1.990830 -0.545109  0.947411   
3775  0.244006  0.695039 -0.672355  ...  0.071039  0.070641  0.335528   
2997  0.723687 -0.110737  0.127532  ... -0.156048 -0.019723  0.041042   

           V23       V24       V25       V26       V27       V28    Amount  

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Fit a seeded 100-tree random forest on the training split
# (fit() returns the estimator itself, so construction and fitting can be chained)
clf = RandomForestClassifier(random_state=42, n_estimators=100).fit(X_train, y_train)

# Show the fitted forest's importance score for each input column,
# in the same order as the columns of X_train
importance_scores = clf.feature_importances_
print("\nFeature Importances:\n", importance_scores)



Feature Importances:
 [0.00478315 0.00564469 0.02325376 0.0555345  0.03802834 0.00790276
 0.008018   0.01677808 0.00544525 0.02834126 0.13142138 0.05872614
 0.12572202 0.00571568 0.16760984 0.00566437 0.08825614 0.13362368
 0.016886   0.0089506  0.00561887 0.00789739 0.00557432 0.00557771
 0.00421314 0.00444894 0.00598159 0.01210398 0.00460469 0.00767374]


In [None]:
from sklearn.metrics import classification_report

# Label the held-out rows with the fitted classifier
y_pred = clf.predict(X_test)

# Build the per-class precision / recall / F1 summary, then display it
performance_report = classification_report(y_test, y_pred)
print("\nModel Performance:\n", performance_report)



Model Performance:
               precision    recall  f1-score   support

           0       0.98      1.00      0.99       985
           1       0.95      0.79      0.86        98

    accuracy                           0.98      1083
   macro avg       0.96      0.89      0.92      1083
weighted avg       0.98      0.98      0.98      1083

