In [82]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

file_path = r"data/Fin_data.csv"

df = pd.read_csv(file_path)

In [83]:
# Using pandas' get_dummies for one-hot encoding
df = df.drop(columns=[ 'nameDest', 'nameOrig', 'isFlaggedFraud', 'step'])

data_encoded = pd.get_dummies(df, columns=['type'], dtype= int, drop_first= True)



data_encoded.to_csv("/Users/sa8/Desktop/Financial Fraud P2TL1/Financial_fraud_machine-1/data/cleaned_data.csv", index= False)

In [84]:
data_encoded.head()

Unnamed: 0,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFraud,type_CASH_OUT,type_DEBIT,type_PAYMENT,type_TRANSFER
0,9839.64,170136.0,160296.36,0.0,0.0,0,0,0,1,0
1,1864.28,21249.0,19384.72,0.0,0.0,0,0,0,1,0
2,181.0,181.0,0.0,0.0,0.0,1,0,0,0,1
3,181.0,181.0,0.0,21182.0,0.0,1,1,0,0,0
4,11668.14,41554.0,29885.86,0.0,0.0,0,0,0,1,0


### We are adjusting the dataset to enhance its suitability for predictive modeling:

- Dropping nameDest & nameOrig: These columns contain unique identifiers that don’t add much value for prediction.
- Dropping isFlaggedFraud as only 16 transfers accounts were flagged out of 6+ million accounts so this column doesn't hold 
the value we need to help with predicting. 
- Removing step as the time interval doesn't effect when fraud occurs. 
 Removing them simplifies the dataset and reduces unnecessary complexity.
- One-Hot Encoding the type Column: Since type is a categorical feature, we need to convert it into a numerical format for better model performance. Using one-hot encoding, we break it down into separate binary columns, making it easier for the model to understand different transaction types.

In [85]:
# Check missing values
print(data_encoded.isnull().sum())


amount            0
oldbalanceOrg     0
newbalanceOrig    0
oldbalanceDest    0
newbalanceDest    0
isFraud           0
type_CASH_OUT     0
type_DEBIT        0
type_PAYMENT      0
type_TRANSFER     0
dtype: int64


In [86]:
data_encoded['newbalanceOrig'].value_counts()

newbalanceOrig
0.00         3609566
5888.64            4
15073.44           4
5122.00            4
36875.73           4
              ...   
51345.64           1
46527.23           1
30839.51           1
25521.75           1
160296.36          1
Name: count, Length: 2682586, dtype: int64

In [87]:
data_encoded['oldbalanceDest'].value_counts()

oldbalanceDest
0.00           2704388
10000000.00        615
20000000.00        219
30000000.00         86
40000000.00         31
                ...   
967022.27            1
327279.67            1
2039554.04           1
587552.25            1
6510099.11           1
Name: count, Length: 3614697, dtype: int64

In [88]:
data_encoded['amount'].value_counts()

amount
10000000.00    3207
10000.00         88
5000.00          79
15000.00         68
500.00           65
               ... 
8513.48           1
189137.90         1
1615.78           1
128891.19         1
12257.89          1
Name: count, Length: 5316900, dtype: int64

In [89]:
data_encoded.drop_duplicates()

Unnamed: 0,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFraud,type_CASH_OUT,type_DEBIT,type_PAYMENT,type_TRANSFER
0,9839.64,170136.00,160296.36,0.00,0.00,0,0,0,1,0
1,1864.28,21249.00,19384.72,0.00,0.00,0,0,0,1,0
2,181.00,181.00,0.00,0.00,0.00,1,0,0,0,1
3,181.00,181.00,0.00,21182.00,0.00,1,1,0,0,0
4,11668.14,41554.00,29885.86,0.00,0.00,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...
6362615,339682.13,339682.13,0.00,0.00,339682.13,1,1,0,0,0
6362616,6311409.28,6311409.28,0.00,0.00,0.00,1,0,0,0,1
6362617,6311409.28,6311409.28,0.00,68488.84,6379898.11,1,1,0,0,0
6362618,850002.52,850002.52,0.00,0.00,0.00,1,0,0,0,1
