In [13]:

import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import accuracy_score

In [2]:
# Read the transaction records into a DataFrame and preview the first rows.
csv_path = "/content/transaction.csv"
data = pd.read_csv(csv_path)
print(data.head())

   step      type    amount     nameOrig  oldbalanceOrg  newbalanceOrig  \
0     1   PAYMENT   9839.64  C1231006815       170136.0       160296.36   
1     1   PAYMENT   1864.28  C1666544295        21249.0        19384.72   
2     1  TRANSFER    181.00  C1305486145          181.0            0.00   
3     1  CASH_OUT    181.00   C840083671          181.0            0.00   
4     1   PAYMENT  11668.14  C2048537720        41554.0        29885.86   

      nameDest  oldbalanceDest  newbalanceDest  isFraud  isFlaggedFraud  
0  M1979787155             0.0             0.0        0               0  
1  M2044282225             0.0             0.0        0               0  
2   C553264065             0.0             0.0        1               0  
3    C38997010         21182.0             0.0        1               0  
4  M1230701703             0.0             0.0        0               0  


In [3]:
# Number of missing values in each column.
missing_per_column = data.isnull().sum()
print(missing_per_column)

step              0
type              0
amount            0
nameOrig          0
oldbalanceOrg     0
newbalanceOrig    0
nameDest          0
oldbalanceDest    0
newbalanceDest    0
isFraud           0
isFlaggedFraud    0
dtype: int64


In [4]:
# How many transactions there are of each type.
transactions_per_type = data["type"].value_counts()
print(transactions_per_type)

type
PAYMENT     232
CASH_OUT    120
CASH_IN     109
TRANSFER     69
DEBIT        28
Name: count, dtype: int64


In [5]:
# Count the transactions of each type.  The counts are kept in `type_counts`
# rather than `type`, so the built-in `type()` is not shadowed for the rest of
# the notebook.
type_counts = data["type"].value_counts()
transactions = type_counts.index   # the distinct transaction types
quantity = type_counts.values      # number of transactions of each type

import plotly.express as px

# Donut chart (hole=0.5) with one slice per transaction type.
figure = px.pie(data,
                values=quantity,
                names=transactions,
                hole=0.5,
                title="Distribution of Transaction Type")
figure.show()

In [6]:
# Correlation of every numeric column with the fraud flag, strongest first.
# numeric_only=True keeps the text columns out of the computation.
correlation = data.corr(numeric_only=True)
fraud_correlation = correlation["isFraud"]
print(fraud_correlation.sort_values(ascending=False))

isFraud           1.000000
oldbalanceDest   -0.026970
newbalanceDest   -0.027843
oldbalanceOrg    -0.037329
newbalanceOrig   -0.037593
amount           -0.039868
step                   NaN
isFlaggedFraud         NaN
Name: isFraud, dtype: float64


In [7]:
# Encode the transaction type as an integer code and turn the numeric fraud
# flag into a readable label.
type_codes = {"CASH_OUT": 1, "PAYMENT": 2,
              "CASH_IN": 3, "TRANSFER": 4,
              "DEBIT": 5}
fraud_labels = {0: "No Fraud", 1: "Fraud"}

# Series.map() turns every value that is missing from the mapping into NaN, so
# running this cell a second time on the already converted columns would wipe
# them out.  Only convert a column that has not been converted yet.
if not data["type"].isin(list(type_codes.values())).all():
    data["type"] = data["type"].map(type_codes)
if not data["isFraud"].isin(list(fraud_labels.values())).all():
    data["isFraud"] = data["isFraud"].map(fraud_labels)
print(data.head())

   step  type    amount     nameOrig  oldbalanceOrg  newbalanceOrig  \
0     1     2   9839.64  C1231006815       170136.0       160296.36   
1     1     2   1864.28  C1666544295        21249.0        19384.72   
2     1     4    181.00  C1305486145          181.0            0.00   
3     1     1    181.00   C840083671          181.0            0.00   
4     1     2  11668.14  C2048537720        41554.0        29885.86   

      nameDest  oldbalanceDest  newbalanceDest   isFraud  isFlaggedFraud  
0  M1979787155             0.0             0.0  No Fraud               0  
1  M2044282225             0.0             0.0  No Fraud               0  
2   C553264065             0.0             0.0     Fraud               0  
3    C38997010         21182.0             0.0     Fraud               0  
4  M1230701703             0.0             0.0  No Fraud               0  


In [8]:
# splitting the data
# Feature matrix, one row per transaction:
# [type code, amount, oldbalanceOrg, newbalanceOrig]
x = np.array(data[["type", "amount", "oldbalanceOrg", "newbalanceOrig"]])
# Target as a flat 1-D array with one label per row (selecting the column with
# a single pair of brackets avoids an (n, 1) column vector).
y = np.array(data["isFraud"])

In [15]:
# training a machine learning model
#
# Inputs from earlier cells: the raw feature matrix `x` with the columns
# [type code, amount, oldbalanceOrg, newbalanceOrig] and the frame `data`.
# Neither is modified here, so the cell can be re-run and still produce a model
# with the same feature layout.

# Fill any missing feature value with the most frequent value of its column.
imputer = SimpleImputer(strategy='most_frequent') # Replace 'most_frequent' with other strategies if needed
x_imputed = imputer.fit_transform(x)

# Numeric target: 0 = "No Fraud", 1 = "Fraud".  Labels that are already 0/1
# are accepted as they are.  Series.map() yields NaN for anything else; stop
# with a clear error instead of silently treating those rows as "not fraud".
labels = data["isFraud"].map({"No Fraud": 0, "Fraud": 1, 0: 0, 1: 1})
if labels.isna().any():
    raise ValueError(
        "data['isFraud'] contains values other than 'No Fraud'/'Fraud'/0/1; "
        "reload the data and re-run the notebook from the top."
    )
y = labels.to_numpy(dtype=int)

# One-hot encode the 'type' column (first column of the feature matrix).
encoder = OneHotEncoder()
x_encoded = encoder.fit_transform(x_imputed[:, [0]])

# Model input layout: the one-hot 'type' columns first, then amount,
# oldbalanceOrg and newbalanceOrig.
x_model = np.concatenate((x_encoded.toarray(), x_imputed[:, 1:]), axis=1)

xtrain, xtest, ytrain, ytest = train_test_split(x_model, y, test_size=0.10, random_state=42)
# Fixed seed so that re-running the notebook gives the same tree.
model = DecisionTreeClassifier(random_state=42)
model.fit(xtrain, ytrain)
# Accuracy on the held-out 10% of the rows.
print(model.score(xtest, ytest))

0.9821428571428571


In [23]:
# prediction
# One raw sample in the column order used to build the training matrix:
# [type code, amount, oldbalanceOrg, newbalanceOrig]   (type code 2 = PAYMENT)
sample = np.array([[2, 11668.14, 41554, 29885.86]])

# The model was fitted on the one-hot encoded type followed by the three
# numeric columns, so the sample goes through the fitted `encoder` as well
# instead of being typed in as a hand-built vector.
features = np.concatenate(
    (encoder.transform(sample[:, [0]]).toarray(), sample[:, 1:]), axis=1
)
print(model.predict(features))

[0]


In [24]:
# Turn the numeric prediction for `features` into a readable label.
verdict = "Fraud" if model.predict(features) == 1 else "No Fraud"
print(verdict)

No Fraud


In [None]:
# Mount Google Drive inside the Colab runtime.
from google.colab import drive

mount_point = '/content/drive'
drive.mount(mount_point)