In [17]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score,classification_report

In [2]:
credit_card = pd.read_csv('creditcard.csv')

In [3]:
credit_card.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [4]:
credit_card['Class'].value_counts()

0    284315
1       492
Name: Class, dtype: int64

From above result we can tell that the data is unbalanced

Now creating new data set which is balanced for building the model

In [5]:
legal = credit_card[credit_card.Class == 0]

In [6]:
fraud = credit_card[credit_card.Class == 1]

Now taking only few samples from the legal data

In [7]:
new_legal = legal.sample(n = 492)

Now creating new dataset from the reduced data of legal and fraud transactions

In [8]:
new_credit_card = pd.concat([new_legal,fraud])

In [9]:
new_credit_card.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
5299,5174.0,-1.09954,-0.139671,1.623854,-0.310035,-2.252651,1.374979,1.806673,-0.195528,1.350469,...,-0.159353,0.002188,0.184613,0.040062,-0.35617,0.902065,0.125368,-0.034955,500.0,0
8511,11420.0,-0.402882,1.515974,2.150619,2.900177,-0.033952,0.079388,0.333082,0.134993,-0.292746,...,-0.292147,-0.593691,0.03698,0.417048,-0.525484,-0.106153,0.344009,0.186072,4.34,0
256125,157548.0,-0.482639,0.169882,-0.031977,-2.808843,1.168869,-0.352734,1.594909,-0.438407,-1.912391,...,0.233998,0.496542,-0.701734,-1.370966,1.356873,0.088284,-0.265383,-0.227007,82.0,0
189991,128640.0,-1.535643,-0.888103,0.801893,-0.627376,1.687012,-0.961929,0.532624,-0.280646,0.557993,...,-0.175339,-0.280979,-0.201788,0.699289,0.043588,-0.144714,-0.113044,-0.045926,99.92,0
221601,142629.0,-0.038505,0.764303,-0.517801,-0.685428,1.526381,0.645075,0.6406,0.380761,0.042724,...,-0.25513,-0.524701,0.211014,-0.467987,-0.671708,0.212106,0.268836,0.073066,4.49,0


spliting the data into x,y 

In [10]:
x = new_credit_card.drop(columns='Class')

In [11]:
y = new_credit_card['Class']

Now splitting the data into training data and testing data

In [12]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=0,stratify=y)

In [13]:
model = RandomForestClassifier(n_estimators=641,random_state=0)

Fitting the data into the model

In [14]:
model.fit(x_train,y_train)

Predicting the train data using the model

In [15]:
train_predict = model.predict(x_train)

Predicting the test data using the model

In [16]:
test_predict = model.predict(x_test)

testing the accuracy

In [20]:
# for train data
train_accuracy = accuracy_score(train_predict,y_train)
print(train_accuracy)

1.0


In [21]:
# for test data 
test_accuracy = accuracy_score(test_predict,y_test)
print(test_accuracy)

0.9137055837563451


In [23]:
# Classification report for train data
print(classification_report(train_predict,y_train))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       393
           1       1.00      1.00      1.00       394

    accuracy                           1.00       787
   macro avg       1.00      1.00      1.00       787
weighted avg       1.00      1.00      1.00       787



In [25]:
# Classification report for test data
print(classification_report(test_predict,y_test))

              precision    recall  f1-score   support

           0       0.96      0.88      0.92       108
           1       0.87      0.96      0.91        89

    accuracy                           0.91       197
   macro avg       0.91      0.92      0.91       197
weighted avg       0.92      0.91      0.91       197

