In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

                                              IMPORTING DATASET

In [2]:
df = pd.read_csv("/content/PS_20174392719_1491204439457_log.csv")
print(df.head())

   step      type    amount     nameOrig  oldbalanceOrg  newbalanceOrig  \
0     1   PAYMENT   9839.64  C1231006815       170136.0       160296.36   
1     1   PAYMENT   1864.28  C1666544295        21249.0        19384.72   
2     1  TRANSFER    181.00  C1305486145          181.0            0.00   
3     1  CASH_OUT    181.00   C840083671          181.0            0.00   
4     1   PAYMENT  11668.14  C2048537720        41554.0        29885.86   

      nameDest  oldbalanceDest  newbalanceDest  isFraud  isFlaggedFraud  
0  M1979787155             0.0             0.0        0               0  
1  M2044282225             0.0             0.0        0               0  
2   C553264065             0.0             0.0        1               0  
3    C38997010         21182.0             0.0        1               0  
4  M1230701703             0.0             0.0        0               0  


In [3]:
print(df.tail())

         step      type      amount     nameOrig  oldbalanceOrg  \
6362615   743  CASH_OUT   339682.13   C786484425      339682.13   
6362616   743  TRANSFER  6311409.28  C1529008245     6311409.28   
6362617   743  CASH_OUT  6311409.28  C1162922333     6311409.28   
6362618   743  TRANSFER   850002.52  C1685995037      850002.52   
6362619   743  CASH_OUT   850002.52  C1280323807      850002.52   

         newbalanceOrig     nameDest  oldbalanceDest  newbalanceDest  isFraud  \
6362615             0.0   C776919290            0.00       339682.13        1   
6362616             0.0  C1881841831            0.00            0.00        1   
6362617             0.0  C1365125890        68488.84      6379898.11        1   
6362618             0.0  C2080388513            0.00            0.00        1   
6362619             0.0   C873221189      6510099.11      7360101.63        1   

         isFlaggedFraud  
6362615               0  
6362616               0  
6362617               0  
636261

In [4]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6362620 entries, 0 to 6362619
Data columns (total 11 columns):
 #   Column          Dtype  
---  ------          -----  
 0   step            int64  
 1   type            object 
 2   amount          float64
 3   nameOrig        object 
 4   oldbalanceOrg   float64
 5   newbalanceOrig  float64
 6   nameDest        object 
 7   oldbalanceDest  float64
 8   newbalanceDest  float64
 9   isFraud         int64  
 10  isFlaggedFraud  int64  
dtypes: float64(5), int64(3), object(3)
memory usage: 534.0+ MB
None


                                                                  CHECKING MISSING VALUES IN THE DATASET

In [5]:
missing_values = df.isnull().sum()
print("Missing Values in Each Column:\n", missing_values)

missing_percentage = (df.isnull().sum() / len(df)) * 100
print("\nPercentage of Missing Values in Each Column:\n", missing_percentage)

Missing Values in Each Column:
 step              0
type              0
amount            0
nameOrig          0
oldbalanceOrg     0
newbalanceOrig    0
nameDest          0
oldbalanceDest    0
newbalanceDest    0
isFraud           0
isFlaggedFraud    0
dtype: int64

Percentage of Missing Values in Each Column:
 step              0.0
type              0.0
amount            0.0
nameOrig          0.0
oldbalanceOrg     0.0
newbalanceOrig    0.0
nameDest          0.0
oldbalanceDest    0.0
newbalanceDest    0.0
isFraud           0.0
isFlaggedFraud    0.0
dtype: float64


                                                                  CHECKING NO OF FRAUD AND NON FRAUD TRANSACTION

In [6]:
fraud_counts = df['isFraud'].value_counts()
print("Number of non-fraud transactions:", fraud_counts[0])
print("Number of fraud transactions:", fraud_counts[1])

Number of non-fraud transactions: 6354407
Number of fraud transactions: 8213


                                                                   NO OF VALUES IN EACH COLUMN

In [7]:
for column in df.columns:
  print(f"{column}: {df[column].nunique()}")

step: 743
type: 5
amount: 5316900
nameOrig: 6353307
oldbalanceOrg: 1845844
newbalanceOrig: 2682586
nameDest: 2722362
oldbalanceDest: 3614697
newbalanceDest: 3555499
isFraud: 2
isFlaggedFraud: 2


In [8]:
df.type.value_counts()

Unnamed: 0_level_0,count
type,Unnamed: 1_level_1
CASH_OUT,2237500
PAYMENT,2151495
CASH_IN,1399284
TRANSFER,532909
DEBIT,41432


                                                                  HANDLING ALL STRING VALUES IN MY DATASET

In [9]:
!pip install category_encoders

Collecting category_encoders
  Downloading category_encoders-2.6.4-py2.py3-none-any.whl.metadata (8.0 kB)
Downloading category_encoders-2.6.4-py2.py3-none-any.whl (82 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m82.0/82.0 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: category_encoders
Successfully installed category_encoders-2.6.4


In [10]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import category_encoders as ce

high_cardinality_columns = [col for col in df.select_dtypes(include='object').columns if df[col].nunique() > 10]
low_cardinality_columns = [col for col in df.select_dtypes(include='object').columns if df[col].nunique() <= 10]


label_encoder = LabelEncoder()
for col in high_cardinality_columns:
    df[col] = label_encoder.fit_transform(df[col])


df = pd.get_dummies(df, columns=low_cardinality_columns, drop_first=True)

target_encoder = ce.TargetEncoder(cols=high_cardinality_columns)
df[high_cardinality_columns] = target_encoder.fit_transform(df[high_cardinality_columns], df['isFraud'])


for col in df.select_dtypes(include='object').columns:
    df[col] = pd.to_numeric(df[col], errors='coerce').astype(float)


for col in df.columns:
    if df[col].dropna().isin([0, 1]).all():
        df[col] = df[col].astype(bool)

print("Encoding and conversion completed.")

Encoding and conversion completed.


In [11]:
print(df.dtypes)

step                int64
amount            float64
nameOrig          float64
oldbalanceOrg     float64
newbalanceOrig    float64
nameDest          float64
oldbalanceDest    float64
newbalanceDest    float64
isFraud              bool
isFlaggedFraud       bool
type_CASH_OUT        bool
type_DEBIT           bool
type_PAYMENT         bool
type_TRANSFER        bool
dtype: object


In [12]:
non_numeric_columns = df.select_dtypes(include=['object']).columns
print(non_numeric_columns)

Index([], dtype='object')


                                                                  FEATURE SCALING USING STANDARDIZATION

In [13]:
numeric_columns = df.select_dtypes(include=['float64', 'int64']).columns

scaler = StandardScaler()

df[numeric_columns] = scaler.fit_transform(df[numeric_columns])

print(df.head())

       step    amount   nameOrig  oldbalanceOrg  newbalanceOrig  nameDest  \
0 -1.703042 -0.281560  -0.035978      -0.229810       -0.237622 -0.022437   
1 -1.703042 -0.294767  -0.035978      -0.281359       -0.285812 -0.022437   
2 -1.703042 -0.297555  27.834774      -0.288654       -0.292442  5.003338   
3 -1.703042 -0.297555  27.834774      -0.288654       -0.292442  5.238083   
4 -1.703042 -0.278532  -0.035978      -0.274329       -0.282221 -0.022437   

   oldbalanceDest  newbalanceDest  isFraud  isFlaggedFraud  type_CASH_OUT  \
0       -0.323814       -0.333411    False           False          False   
1       -0.323814       -0.333411    False           False          False   
2       -0.323814       -0.333411     True           False          False   
3       -0.317582       -0.333411     True           False           True   
4       -0.323814       -0.333411    False           False          False   

   type_DEBIT  type_PAYMENT  type_TRANSFER  
0       False          True  

                                                                  NOW HANDLING THE IMBALANCE DATA USING OVERSAMPLING

In [14]:
!pip install imbalanced-learn



                                                                  SEPERATING THE DATASET FOR ANALYSIS

In [15]:
legit = df[df.isFraud == 0]
fraud = df[df.isFraud == 1]

In [16]:
print(legit.shape)
print(fraud.shape)

(6354407, 14)
(8213, 14)


In [17]:
pd.options.display.float_format = '{:.6f}'.format
print(legit.amount.describe())

count   6354407.000000
mean         -0.002757
std           0.987379
min          -0.297855
25%          -0.275716
50%          -0.174175
75%           0.047201
max         152.793582
Name: amount, dtype: float64


In [18]:
pd.options.display.float_format = '{:.6f}'.format
print(fraud.amount.describe())

count   8213.000000
mean       2.133126
std        3.981486
min       -0.297855
25%       -0.087389
50%        0.433151
75%        2.215602
max       16.262325
Name: amount, dtype: float64


In [19]:
from imblearn.over_sampling import RandomOverSampler
import pandas as pd

X = df.drop(columns=['isFraud'])
y = df['isFraud']

oversampler = RandomOverSampler(sampling_strategy={1: y.value_counts()[0]}, random_state=42)
X_resampled, y_resampled = oversampler.fit_resample(X, y)

df_resampled = pd.concat([X_resampled, y_resampled], axis=1)

print("Counts after upscaling fraud transactions:")
print(df_resampled['isFraud'].value_counts())

  oversampler = RandomOverSampler(sampling_strategy={1: y.value_counts()[0]}, random_state=42)


Counts after upscaling fraud transactions:
isFraud
False    6354407
True     6354407
Name: count, dtype: int64


                                                                  SPLITTING FOR TRAINING AND TESTING

In [20]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.3, random_state=42, stratify=y_resampled)
print("Training set class distribution:")
print(y_train.value_counts())

print("\nTesting set class distribution:")
print(y_test.value_counts())

Training set class distribution:
isFraud
True     4448085
False    4448084
Name: count, dtype: int64

Testing set class distribution:
isFraud
False    1906323
True     1906322
Name: count, dtype: int64


In [21]:
print(X_train.dtypes)

step              float64
amount            float64
nameOrig          float64
oldbalanceOrg     float64
newbalanceOrig    float64
nameDest          float64
oldbalanceDest    float64
newbalanceDest    float64
isFlaggedFraud       bool
type_CASH_OUT        bool
type_DEBIT           bool
type_PAYMENT         bool
type_TRANSFER        bool
dtype: object


                                                                  FITTING MODEL ON TRAINING DATA

In [22]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [23]:
dt_model = DecisionTreeClassifier(random_state=42)

In [24]:
dt_model.fit(X_train, y_train)

                                                                  PREDICTING AND EVALUATING MODEL

In [25]:
y_pred_dt = dt_model.predict(X_test)

In [26]:
accuracy_dt = accuracy_score(y_test, y_pred_dt)
print("Accuracy:", accuracy_dt)

Accuracy: 1.0


In [29]:
print("Classification Report:\n", classification_report(y_test, y_pred_dt))

Classification Report:
               precision    recall  f1-score   support

       False       1.00      1.00      1.00   1906323
        True       1.00      1.00      1.00   1906322

    accuracy                           1.00   3812645
   macro avg       1.00      1.00      1.00   3812645
weighted avg       1.00      1.00      1.00   3812645

