In [17]:
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn import svm

In [3]:
data=pd.read_csv("creditcard.csv")

In [4]:
data.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [5]:
Number_Of_Tranactions = len(data)                                                 # Total Number of Transactions
Number_Of_Fraudulent_Transactions = len(data[data['Class']==1])                   # Total Number of Fraudulent Transactions
Number_Of_Valid_Transactions = len(data[data['Class']==0])                        # Total Number of Valid Transactions
Fraud_Percentage = Number_Of_Fraudulent_Transactions*100/Number_Of_Tranactions    # Fraud Percentage
print("Number of Transanctions: " + str(Number_Of_Tranactions))
print("Number of Fraudulent Transanctions: " + str(Number_Of_Fraudulent_Transactions))
print("Number of Valid Transanctions: " + str(Number_Of_Valid_Transactions))
print("Percentage of Fraudulent Transanctions: " + str(Fraud_Percentage))

Number of Transanctions: 284807
Number of Fraudulent Transanctions: 492
Number of Valid Transanctions: 284315
Percentage of Fraudulent Transanctions: 0.1727485630620034


In [6]:
data.info() # No null values

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 284807 entries, 0 to 284806
Data columns (total 31 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   Time    284807 non-null  float64
 1   V1      284807 non-null  float64
 2   V2      284807 non-null  float64
 3   V3      284807 non-null  float64
 4   V4      284807 non-null  float64
 5   V5      284807 non-null  float64
 6   V6      284807 non-null  float64
 7   V7      284807 non-null  float64
 8   V8      284807 non-null  float64
 9   V9      284807 non-null  float64
 10  V10     284807 non-null  float64
 11  V11     284807 non-null  float64
 12  V12     284807 non-null  float64
 13  V13     284807 non-null  float64
 14  V14     284807 non-null  float64
 15  V15     284807 non-null  float64
 16  V16     284807 non-null  float64
 17  V17     284807 non-null  float64
 18  V18     284807 non-null  float64
 19  V19     284807 non-null  float64
 20  V20     284807 non-null  float64
 21  V21     28

In [7]:
data.drop(['Time'], axis=1, inplace=True) # Remove the column 'Time'

In [8]:
data.shape # Shape of data before removing duplicates

(284807, 30)

In [9]:
data.drop_duplicates(inplace=True) # Remove duplicate rows

In [10]:
data.shape # Shape of data after removing duplicates

(275663, 30)

In [11]:

scaler = StandardScaler()
data['Amount']=scaler.fit_transform(data['Amount'].values.reshape(-1,1)) # Normalising the Amount

In [12]:
X = data.drop('Class', axis = 1).values # Independent Variables
y = data['Class'].values # Dependant Variable

In [13]:


X_train, X_validation, y_train, y_validation = train_test_split(X, y, train_size=0.8) # Split the data into training and (validation + testing) data

# X_validation, X_test, y_validation, y_test = train_test_split(X_test_validation, y_test_validation, test_size=0.5) # Further split the (validation + testing) data into validation and testing data


### Decision Tree

In [14]:
Decision_Tree_Classifier = DecisionTreeClassifier(max_depth=35)
Decision_Tree_Classifier.fit(X_train,y_train)
y_predicted_validation = Decision_Tree_Classifier.predict(X_validation)


print('accuracy :{}'.format(accuracy_score(y_validation, y_predicted_validation)))
print('precision :{}'.format(precision_score(y_validation, y_predicted_validation)))
print('recall: {}'.format(recall_score(y_validation, y_predicted_validation)))
print('f1 score: {}'.format(f1_score(y_validation, y_predicted_validation)))

accuracy :0.9990024123483213
precision :0.7731958762886598
recall: 0.6944444444444444
f1 score: 0.7317073170731706


### Random Forest

In [15]:
Random_Forest_Classifier = RandomForestClassifier(n_estimators = 20, criterion = 'entropy', random_state = 0)
Random_Forest_Classifier.fit(X_train, y_train)
y_predicted_validation = Random_Forest_Classifier.predict(X_validation)

print('accuracy :{}'.format(accuracy_score(y_validation, y_predicted_validation)))
print('precision :{}'.format(precision_score(y_validation, y_predicted_validation)))
print('recall: {}'.format(recall_score(y_validation, y_predicted_validation)))
print('f1 score: {}'.format(f1_score(y_validation, y_predicted_validation)))

accuracy :0.9994014474089928
precision :0.9629629629629629
recall: 0.7222222222222222
f1 score: 0.8253968253968254


### SVM

In [16]:
clf = svm.SVC()
clf.fit(X_train, y_train)
y_predicted_validation = clf.predict(X_validation)
print('accuracy :{}'.format(accuracy_score(y_validation, y_predicted_validation)))
print('precision :{}'.format(precision_score(y_validation, y_predicted_validation)))
print('recall: {}'.format(recall_score(y_validation, y_predicted_validation)))
print('f1 score: {}'.format(f1_score(y_validation, y_predicted_validation)))

accuracy :0.9991112400921408
precision :0.927536231884058
recall: 0.5925925925925926
f1 score: 0.7231638418079096


In [None]:
wclf = svm.SVC(kernel="linear", class_weight={1: 600})
wclf.fit(X_train, y_train)
y_predicted_validation = wclf.predict(X_validation)
print('accuracy :{}'.format(accuracy_score(y_validation, y_predicted_validation)))
print('precision :{}'.format(precision_score(y_validation, y_predicted_validation)))
print('recall: {}'.format(recall_score(y_validation, y_predicted_validation)))
print('f1 score: {}'.format(f1_score(y_validation, y_predicted_validation)))

### Logistic Regression

In [18]:
logistic = LogisticRegression(C=0.01)
logistic_model = logistic.fit(X_train, y_train)
y_predicted_validation = logistic_model.predict(X_validation)

print('accuracy :{}'.format(accuracy_score(y_validation, y_predicted_validation)))
print('precision :{}'.format(precision_score(y_validation, y_predicted_validation)))
print('recall: {}'.format(recall_score(y_validation, y_predicted_validation)))
print('f1 score: {}'.format(f1_score(y_validation, y_predicted_validation)))

accuracy :0.9990386882629279
precision :0.9661016949152542
recall: 0.5277777777777778
f1 score: 0.6826347305389222


# Creating Balanced DataSet

In [19]:
fraud_data=data[data['Class']==1]
non_fraud_data=data[data['Class']==0]

In [20]:
non_fraud_data_sample = non_fraud_data.sample(n = fraud_data.size)

In [21]:
bal_data = pd.concat([non_fraud_data_sample, fraud_data])

In [22]:
X_bal = data.drop('Class', axis = 1).values # Independent Variables
y_bal = data['Class'].values # Dependant Variable

X_bal_train, X_bal_validation, y_bal_train, y_bal_validation = train_test_split(X_bal, y_bal, train_size=0.8) # Split the data into training and (validation + testing) data

# X_validation, X_test, y_validation, y_test = train_test_split(X_test_validation, y_test_validation, test_size=0.5) # Further split the (validation + testing) data into validation and testing data


In [None]:
Decision_Tree_Classifier = DecisionTreeClassifier(max_depth=35)
Decision_Tree_Classifier.fit(X_bal_train,y_bal_train)
y_bal_predicted_validation = Decision_Tree_Classifier.predict(X_bal_validation)


print('accuracy :{}'.format(accuracy_score(y_bal_validation, y_bal_predicted_validation)))
print('precision :{}'.format(precision_score(y_bal_validation, y_bal_predicted_validation)))
print('recall: {}'.format(recall_score(y_bal_validation, y_bal_predicted_validation)))
print('f1 score: {}'.format(f1_score(y_bal_validation, y_bal_predicted_validation)))

accuracy :0.9994558612809026
precision :0.8481012658227848
recall: 0.788235294117647
f1 score: 0.8170731707317074


In [25]:
Random_Forest_Classifier = RandomForestClassifier(n_estimators = 20, criterion = 'entropy', random_state = 0)
Random_Forest_Classifier.fit(X_bal_train,y_bal_train)
y_bal_predicted_validation = Random_Forest_Classifier.predict(X_bal_validation)


print('accuracy :{}'.format(accuracy_score(y_bal_validation, y_bal_predicted_validation)))
print('precision :{}'.format(precision_score(y_bal_validation, y_bal_predicted_validation)))
print('recall: {}'.format(recall_score(y_bal_validation, y_bal_predicted_validation)))
print('f1 score: {}'.format(f1_score(y_bal_validation, y_bal_predicted_validation)))

accuracy :0.9994377233235993
precision :0.92
recall: 0.7340425531914894
f1 score: 0.8165680473372782


In [26]:
clf = svm.SVC()
clf.fit(X_bal_train, y_bal_train)
y_bal_predicted_validation = clf.predict(X_bal_validation)

print('accuracy :{}'.format(accuracy_score(y_bal_validation, y_bal_predicted_validation)))
print('precision :{}'.format(precision_score(y_bal_validation, y_bal_predicted_validation)))
print('recall: {}'.format(recall_score(y_bal_validation, y_bal_predicted_validation)))
print('f1 score: {}'.format(f1_score(y_bal_validation, y_bal_predicted_validation)))

accuracy :0.9992563437505668
precision :0.9344262295081968
recall: 0.6063829787234043
f1 score: 0.7354838709677419


In [27]:
logistic = LogisticRegression(C=0.01)
logistic_model = logistic.fit(X_train, y_train)
y_bal_predicted_validation = logistic_model.predict(X_bal_validation)

print('accuracy :{}'.format(accuracy_score(y_bal_validation, y_bal_predicted_validation)))
print('precision :{}'.format(precision_score(y_bal_validation, y_bal_predicted_validation)))
print('recall: {}'.format(recall_score(y_bal_validation, y_bal_predicted_validation)))
print('f1 score: {}'.format(f1_score(y_bal_validation, y_bal_predicted_validation)))

accuracy :0.9990386882629279
precision :0.847457627118644
recall: 0.5319148936170213
f1 score: 0.6535947712418301
