In [12]:
import pandas as pd
import numpy as np

In [5]:
# I have used a dataset which was similar to the provided one but it is from Kaggle
data=pd.read_csv('onlinefraud.csv')

In [11]:
data.head()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0.0,0.0
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0.0,0.0
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1.0,0.0
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1.0,0.0
4,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0.0,0.0


In [7]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

In [8]:
data.shape

(56203, 11)

In [10]:
# Exploring transaction type
print(data.type.value_counts())

type
PAYMENT     24242
CASH_OUT    15672
CASH_IN     10321
TRANSFER     5267
DEBIT         700
PAYMEN          1
Name: count, dtype: int64


In [16]:
# Convert the 'type' column to a categorical data type
data['type'] = data['type'].astype('category')

# Calculate the correlation matrix
# only include numerical features for correlation analysis
correlation = data.select_dtypes(include=np.number).corr()
print(correlation["isFraud"].sort_values(ascending=False))

isFraud           1.000000
amount            0.053073
oldbalanceOrg    -0.002842
newbalanceDest   -0.007438
oldbalanceDest   -0.011274
newbalanceOrig   -0.013623
step             -0.052039
isFlaggedFraud         NaN
Name: isFraud, dtype: float64


In [17]:
# Now let’s transform the categorical features into numerical. Here we will also transform the values of the isFraud column into
# No Fraud and Fraud labels to have a better understanding of the output
# Changing CASH_OUT to 1, PAYMENT to 2, CASH_IN to 3, TRANSFER to 4 and DEBIT to 5

data["type"] = data["type"].map({"CASH_OUT": 1, "PAYMENT": 2, "CASH_IN": 3, "TRANSFER": 4, "DEBIT": 5})
data["isFraud"] = data["isFraud"].map({0: "No Fraud", 1: "Fraud"})
print(data.head())

   step  type    amount     nameOrig  oldbalanceOrg  newbalanceOrig  \
0     1   2.0   9839.64  C1231006815       170136.0       160296.36   
1     1   2.0   1864.28  C1666544295        21249.0        19384.72   
2     1   4.0    181.00  C1305486145          181.0            0.00   
3     1   1.0    181.00   C840083671          181.0            0.00   
4     1   2.0  11668.14  C2048537720        41554.0        29885.86   

      nameDest  oldbalanceDest  newbalanceDest   isFraud  isFlaggedFraud  
0  M1979787155             0.0             0.0  No Fraud             0.0  
1  M2044282225             0.0             0.0  No Fraud             0.0  
2   C553264065             0.0             0.0     Fraud             0.0  
3    C38997010         21182.0             0.0     Fraud             0.0  
4  M1230701703             0.0             0.0  No Fraud             0.0  


In [21]:
# Check for NaN values in the entire DataFrame
print(data.isnull().sum())

# Handle NaN values - Choose one of the following methods or a combination of both

# 1. Remove rows with NaN values
data.dropna(inplace=True)

# 2. Impute NaN values with a suitable replacement (e.g., mean, median, or a constant)
# Exclude columns with non-numeric types from the mean calculation
numeric_data = data.select_dtypes(include=np.number)
data.fillna(numeric_data.mean(), inplace=True)  # Example: Replace with mean

# Split the data after handling NaN values
x = np.array(data[["type", "amount", "oldbalanceOrg", "newbalanceOrig"]])
y = np.array(data[["isFraud"]])

xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.10, random_state=42)
model = DecisionTreeClassifier()
model.fit(xtrain, ytrain)
print(model.score(xtest, ytest))

step              0
type              0
amount            0
nameOrig          0
oldbalanceOrg     0
newbalanceOrig    0
nameDest          0
oldbalanceDest    0
newbalanceDest    0
isFraud           0
isFlaggedFraud    0
dtype: int64
0.9971535314001068


In [22]:
features = np.array([[1, 8900.2, 8990.2, 0.0]])
print(model.predict(features))

['No Fraud']
