In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
from sklearn.linear_model import LogisticRegression

In [None]:
# Read the transactions file into a DataFrame and preview its first rows.
data = pd.read_csv(filepath_or_buffer="fraudolent.csv")
preview = data.head()
print(preview)

In [None]:
# Tally the missing entries in each column; the author found none in this dataset.
missing_per_column = data.isnull().sum()
print(missing_per_column)

In [None]:
# Number of transactions per type, split by the isFraud flag.
# On a linear axis the fraud bars are too small to see next to the non-fraud bars,
# so the count axis is drawn on a log scale.
# Author's observation from this plot: fraud is involved only in CASH_OUT and TRANSFER transactions.
ax = sns.countplot(data=data, x = 'type', hue = 'isFraud', order=data['type'].value_counts().index, linewidth = 2)
ax.set_yscale('log')  # bars with a zero count are simply not drawn
ax.set(title='Transactions by type and fraud flag',
       xlabel='Transaction type',
       ylabel='Number of transactions (log scale)')
plt.show()  # render the figure without echoing the Axes repr

In [None]:
# Keep only the rows flagged as fraudulent (isFraud == 1) and show the first ten.
# Author's observation on these rows: the transaction amount does not equal the
# difference between the new and the old destination balance.
fraud_mask = data['isFraud'].eq(1)
isfraud = data.loc[fraud_mask]
print(isfraud.head(10))

In [None]:
# Prepare the dataset for the supervised model.
# The isFraud column is the target.
# Features used to train and test the model: type (a text column, so it is
# label-encoded to integers here), amount, oldbalanceDest, newbalanceDest.

le = LabelEncoder()
le.fit(data['type'])                      # learn the distinct transaction types
type_codes = le.transform(data['type'])   # map each type to its integer code
data['type'] = type_codes                 # overwrite the text column with the codes
print(data.head())

In [None]:
# Select features and target by column name rather than by position, so a change in
# the CSV column order cannot silently feed the wrong columns to the model.
# NOTE(review): names taken from the feature list in the preparation cell's comment;
# confirm they match the CSV header exactly.
feature_columns = ['type', 'amount', 'oldbalanceDest', 'newbalanceDest']
data_to_use = data[feature_columns]
data_to_target = data['isFraud']

# 75% training set and 25% test set.
# stratify keeps the fraud / non-fraud ratio the same in both subsets, which matters
# when one class is rare (presumably the case here -- fraud is barely visible in the count plot).
x_train, x_test, y_train, y_test = train_test_split(
    data_to_use,
    data_to_target,
    test_size = 0.25,
    random_state = 0,
    stratify = data_to_target,
)

In [None]:
# Standardize the features. The scaler is fitted on the training set only and the
# same fitted transformation is then applied to both subsets.
scaler = StandardScaler().fit(x_train)
x_train = scaler.transform(x_train)  # scaled training set
x_test = scaler.transform(x_test)    # scaled test set

In [None]:
# Logistic regression was chosen because the target is binary: each transaction is
# classified as fraudulent (1) or not fraudulent (0).

logisticRegression_model = LogisticRegression(random_state=0) # Logistic Regression model
logisticRegression_model.fit(x_train, y_train) # Training the model
prediction = logisticRegression_model.predict(x_test) # Predict the response

# The author recorded roughly 99% accuracy here.
print("Accuracy: ", metrics.accuracy_score(y_test, prediction))

# Accuracy alone can be misleading: if fraud is a small minority of the rows
# (NOTE(review): presumably the case -- confirm with data['isFraud'].value_counts()),
# a model that never predicts fraud would also score very high. The confusion matrix
# and the per-class precision / recall / F1 show how many frauds are actually caught.
print("Confusion matrix (rows = actual, columns = predicted; label order [0, 1]):")
print(metrics.confusion_matrix(y_test, prediction, labels=[0, 1]))
print("Classification report:")
# zero_division=0 avoids a warning when a class is never predicted.
print(metrics.classification_report(y_test, prediction, digits=4, zero_division=0))