In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Importing data

In [2]:
# importing my dataset
data = pd.read_csv("../../dataset/fraud_data.csv")
print(data.sample(5)) # looking at some random samples from the dataset to understand the content
data.shape

       TransactionID  isFraud  TransactionDT  TransactionAmt ProductCD  card1  \
53881        3363183        0        9402702            15.0         W   5033   
24646        3150843        0        3464645            59.0         W   7703   
41078        3039387        0        1221512           144.0         W  12469   
41912        3104249        0        2291684           150.0         R  11218   
52206        3380641        0        9898529            34.5         W   1682   

       card2  card3       card4  card5  ...        id_31  id_32     id_33  \
53881  269.0  150.0  mastercard  224.0  ...          NaN    NaN       NaN   
24646  360.0  150.0  mastercard  166.0  ...          NaN    NaN       NaN   
41078  360.0  150.0  mastercard  166.0  ...          NaN    NaN       NaN   
41912  579.0  150.0        visa  226.0  ...  chrome 63.0   24.0  1366x768   
52206  409.0  150.0        visa  226.0  ...          NaN    NaN       NaN   

                id_34  id_35 id_36 id_37  id_38  D

(59054, 434)

In [3]:
print(data.isnull().sum())
print("Unique values in card4:",data["card4"].unique())

TransactionID         0
isFraud               0
TransactionDT         0
TransactionAmt        0
ProductCD             0
                  ...  
id_36             44849
id_37             44849
id_38             44849
DeviceType        44869
DeviceInfo        47133
Length: 434, dtype: int64
Unique values in card4: ['visa' 'mastercard' 'discover' 'american express' nan]


# Data Understanding

In [4]:
# checking for missing values
cols_with_missing = data.isnull().sum()
print("*"*5, "Columns with corresponding missing values", "*"*5)
print(cols_with_missing[cols_with_missing > 0])
print("*"*5, "Proportion with missing values", "*"*5)
print(cols_with_missing[cols_with_missing > 0]/data.shape[0])

***** Columns with corresponding missing values *****
card2           915
card3           158
card4           162
card5           444
card6           159
              ...  
id_36         44849
id_37         44849
id_38         44849
DeviceType    44869
DeviceInfo    47133
Length: 414, dtype: int64
***** Proportion with missing values *****
card2         0.015494
card3         0.002676
card4         0.002743
card5         0.007519
card6         0.002692
                ...   
id_36         0.759457
id_37         0.759457
id_38         0.759457
DeviceType    0.759796
DeviceInfo    0.798134
Length: 414, dtype: float64


In [5]:
print("Total number of transaction in the data: ", len(data))
print("Number of transaction in the data who were frauds: ", len(data[data["isFraud"] == 1]))

Total number of transaction in the data:  59054
Number of transaction in the data who were frauds:  2005


In [6]:
print("Percentage of transactions that were visa:", 100*np.mean(data["isFraud"][data["card4"] == 'visa']))
print("Percentage of transactions that were mastercard:", 100*np.mean(data["isFraud"][data["card4"] == 'mastercard']))

Percentage of transactions that were visa: 3.3878413269697205
Percentage of transactions that were mastercard: 3.2822182543850436


In [7]:
features = data.drop(columns=["isFraud"])
label = data.isFraud

# Data Preparation

In [8]:
# Delete every column that has more than 70 percent null values
null_percentage = features.isnull().mean()
columns_to_delete = null_percentage[null_percentage > 0.7].index
data = data.drop(columns=columns_to_delete)
data

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,...,V312,V313,V314,V315,V316,V317,V318,V319,V320,V321
0,2994681,0,242834,25.000,H,9803,583.0,150.0,visa,226.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0
1,3557242,0,15123000,117.000,W,7919,194.0,150.0,mastercard,166.0,...,234.000000,0.000000,225.500000,0.000000,288.000000,1707.000000,1707.000000,0.0,0.0,0.0
2,3327470,0,8378575,73.773,C,12778,500.0,185.0,mastercard,224.0,...,0.000000,0.000000,73.772797,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0
3,3118781,0,2607840,400.000,R,12316,548.0,150.0,visa,195.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0
4,3459772,0,12226544,31.950,W,9002,453.0,150.0,visa,226.0,...,0.000000,0.000000,99.900002,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59049,3549150,0,14898396,335.000,W,2616,327.0,150.0,discover,102.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0
59050,3497600,0,13371647,20.950,W,3277,111.0,150.0,visa,226.0,...,47.950001,47.950001,138.850006,47.950001,0.000000,0.000000,0.000000,0.0,0.0,0.0
59051,3575755,0,15775109,59.000,W,9486,127.0,150.0,mastercard,224.0,...,0.000000,49.000000,49.000000,49.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0
59052,3521246,0,14071016,25.950,W,2392,360.0,150.0,mastercard,126.0,...,25.950001,25.950001,25.950001,25.950001,51.900002,51.900002,51.900002,0.0,0.0,0.0


In [9]:
# # Delete every column that has more than 70 percent null values
zero_percentage = (features == 0).mean()
columns_to_delete = zero_percentage[zero_percentage > 0.7].index
data = data.drop(columns=columns_to_delete)
data

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,...,V130,V282,V283,V285,V290,V291,V292,V305,V307,V310
0,2994681,0,242834,25.000,H,9803,583.0,150.0,visa,226.0,...,0.000000,1.0,1.0,0.0,1.0,1.0,1.0,1.0,0.000000,0.000000
1,3557242,0,15123000,117.000,W,7919,194.0,150.0,mastercard,166.0,...,0.000000,0.0,0.0,2.0,2.0,5.0,5.0,1.0,1941.000000,234.000000
2,3327470,0,8378575,73.773,C,12778,500.0,185.0,mastercard,224.0,...,0.000000,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.000000,0.000000
3,3118781,0,2607840,400.000,R,12316,548.0,150.0,visa,195.0,...,0.000000,1.0,1.0,0.0,1.0,1.0,1.0,1.0,0.000000,0.000000
4,3459772,0,12226544,31.950,W,9002,453.0,150.0,visa,226.0,...,35.950001,0.0,0.0,1.0,1.0,1.0,1.0,1.0,35.950001,35.950001
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59049,3549150,0,14898396,335.000,W,2616,327.0,150.0,discover,102.0,...,0.000000,1.0,1.0,0.0,1.0,1.0,1.0,1.0,0.000000,0.000000
59050,3497600,0,13371647,20.950,W,3277,111.0,150.0,visa,226.0,...,138.850006,1.0,1.0,3.0,1.0,1.0,1.0,1.0,138.850006,138.850006
59051,3575755,0,15775109,59.000,W,9486,127.0,150.0,mastercard,224.0,...,49.000000,1.0,1.0,1.0,1.0,1.0,1.0,1.0,49.000000,49.000000
59052,3521246,0,14071016,25.950,W,2392,360.0,150.0,mastercard,126.0,...,25.950001,4.0,4.0,1.0,1.0,1.0,1.0,1.0,77.849998,25.950001


In [10]:
print(data.isFraud.value_counts())
print("*"*15)
print(data.card4.value_counts())

isFraud
0    57049
1     2005
Name: count, dtype: int64
***************
card4
visa                38343
mastercard          19042
american express      831
discover              676
Name: count, dtype: int64


In [11]:
X = data.drop(columns=["isFraud"])
y = data["isFraud"]

In [12]:
numeric = X.select_dtypes(include=np.number)
categoric = X.select_dtypes(exclude=np.number)

In [13]:
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
imputer.fit(numeric)
X[numeric.columns] = imputer.transform(numeric)
X

Unnamed: 0,TransactionID,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,...,V130,V282,V283,V285,V290,V291,V292,V305,V307,V310
0,2994681.0,242834.0,25.000,H,9803.0,583.0,150.0,visa,226.0,credit,...,0.000000,1.0,1.0,0.0,1.0,1.0,1.0,1.0,0.000000,0.000000
1,3557242.0,15123000.0,117.000,W,7919.0,194.0,150.0,mastercard,166.0,debit,...,0.000000,0.0,0.0,2.0,2.0,5.0,5.0,1.0,1941.000000,234.000000
2,3327470.0,8378575.0,73.773,C,12778.0,500.0,185.0,mastercard,224.0,credit,...,0.000000,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.000000,0.000000
3,3118781.0,2607840.0,400.000,R,12316.0,548.0,150.0,visa,195.0,credit,...,0.000000,1.0,1.0,0.0,1.0,1.0,1.0,1.0,0.000000,0.000000
4,3459772.0,12226544.0,31.950,W,9002.0,453.0,150.0,visa,226.0,debit,...,35.950001,0.0,0.0,1.0,1.0,1.0,1.0,1.0,35.950001,35.950001
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59049,3549150.0,14898396.0,335.000,W,2616.0,327.0,150.0,discover,102.0,credit,...,0.000000,1.0,1.0,0.0,1.0,1.0,1.0,1.0,0.000000,0.000000
59050,3497600.0,13371647.0,20.950,W,3277.0,111.0,150.0,visa,226.0,debit,...,138.850006,1.0,1.0,3.0,1.0,1.0,1.0,1.0,138.850006,138.850006
59051,3575755.0,15775109.0,59.000,W,9486.0,127.0,150.0,mastercard,224.0,debit,...,49.000000,1.0,1.0,1.0,1.0,1.0,1.0,1.0,49.000000,49.000000
59052,3521246.0,14071016.0,25.950,W,2392.0,360.0,150.0,mastercard,126.0,debit,...,25.950001,4.0,4.0,1.0,1.0,1.0,1.0,1.0,77.849998,25.950001


In [14]:
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
imputer.fit(categoric)
X[categoric.columns] = imputer.transform(categoric)
X

Unnamed: 0,TransactionID,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,...,V130,V282,V283,V285,V290,V291,V292,V305,V307,V310
0,2994681.0,242834.0,25.000,H,9803.0,583.0,150.0,visa,226.0,credit,...,0.000000,1.0,1.0,0.0,1.0,1.0,1.0,1.0,0.000000,0.000000
1,3557242.0,15123000.0,117.000,W,7919.0,194.0,150.0,mastercard,166.0,debit,...,0.000000,0.0,0.0,2.0,2.0,5.0,5.0,1.0,1941.000000,234.000000
2,3327470.0,8378575.0,73.773,C,12778.0,500.0,185.0,mastercard,224.0,credit,...,0.000000,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.000000,0.000000
3,3118781.0,2607840.0,400.000,R,12316.0,548.0,150.0,visa,195.0,credit,...,0.000000,1.0,1.0,0.0,1.0,1.0,1.0,1.0,0.000000,0.000000
4,3459772.0,12226544.0,31.950,W,9002.0,453.0,150.0,visa,226.0,debit,...,35.950001,0.0,0.0,1.0,1.0,1.0,1.0,1.0,35.950001,35.950001
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59049,3549150.0,14898396.0,335.000,W,2616.0,327.0,150.0,discover,102.0,credit,...,0.000000,1.0,1.0,0.0,1.0,1.0,1.0,1.0,0.000000,0.000000
59050,3497600.0,13371647.0,20.950,W,3277.0,111.0,150.0,visa,226.0,debit,...,138.850006,1.0,1.0,3.0,1.0,1.0,1.0,1.0,138.850006,138.850006
59051,3575755.0,15775109.0,59.000,W,9486.0,127.0,150.0,mastercard,224.0,debit,...,49.000000,1.0,1.0,1.0,1.0,1.0,1.0,1.0,49.000000,49.000000
59052,3521246.0,14071016.0,25.950,W,2392.0,360.0,150.0,mastercard,126.0,debit,...,25.950001,4.0,4.0,1.0,1.0,1.0,1.0,1.0,77.849998,25.950001


In [15]:
categoric_encoded = pd.get_dummies(data=categoric)
X = X.drop(columns=categoric.columns)
X = pd.concat([X, categoric_encoded], axis=1)

In [16]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [17]:
from sklearn.tree import DecisionTreeClassifier
model = DecisionTreeClassifier()
model.fit(X_train, y_train)

In [18]:
from sklearn.metrics import accuracy_score
print("Training accuracy:", accuracy_score(y_train, model.predict(X_train)))
print("Validation accuracy:", accuracy_score(y_test, model.predict(X_test)))

Training accuracy: 1.0
Validation accuracy: 0.9531523395608738


In [19]:
# from sklearn.tree import export_graphviz
# import graphviz

# dot_data = export_graphviz(model, out_file=None, filled=True, rounded=True, special_characters=True, feature_names=X.columns.values)
# graph = graphviz.Source(dot_data)
# graph

ExecutableNotFound: failed to execute WindowsPath('dot'), make sure the Graphviz executables are on your systems' PATH

<graphviz.sources.Source at 0x242a90034d0>

In [None]:
# model_improved = DecisionTreeClassifier(max_depth=4)
# model_improved.fit(X_train, y_train)

In [None]:
# print("Train score:", accuracy_score(y_train, model_improved.predict(X_train)))
# print("Test score:", accuracy_score(y_test, model_improved.predict(X_test)))

In [None]:
# export_graphviz(model_improved, "dtree1.tree", feature_names=X.columns)