In [47]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
# set seaborn style because it prettier
sns.set()

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import roc_curve, auc

import xgboost as xgb
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier

In [48]:
data=pd.read_csv('transaction1.csv')

In [49]:
data.head()

Unnamed: 0,TransactionID,TransactionDate,Time,TransactionAmount (INR),Mobile,IsFraud
0,T1,02-08-2022,6242,25.0,9088909087,0
1,T2,02-08-2022,3761,27999.0,9088909088,0
2,T3,02-08-2022,5747,459.0,9088909089,0
3,T4,02-08-2022,3590,2060.0,9088909090,0
4,T5,02-08-2022,6329,1762.5,9088909091,0


In [50]:
data.shape

(90779, 6)

In [51]:
len(pd.unique(data['TransactionID']))

86373

In [52]:
len(pd.unique(data['Mobile']))

3122

In [53]:
duplicate=data[data.duplicated(['TransactionID'])]

In [54]:
duplicate

Unnamed: 0,TransactionID,TransactionDate,Time,TransactionAmount (INR),Mobile,IsFraud
693,T484,21-10-2022,3871,177.0,9088909738,0
694,T485,21-10-2022,740,2060.0,9088909739,1
695,T486,21-10-2022,1328,295.0,9088909740,0
696,T487,21-10-2022,726,1540.0,9088909741,1
697,T488,21-10-2022,4513,30.0,9088909742,0
...,...,...,...,...,...,...
90774,T5844,01-08-1930,3727,1526.0,9088910265,0
90775,T5845,01-08-1930,1521,1527.0,9088910266,0
90776,T5846,01-08-1930,6265,1528.0,9088910267,0
90777,T5847,01-08-1930,17,1529.0,9088910268,1


In [55]:
to_update=data.copy()

In [56]:
to_update

Unnamed: 0,TransactionID,TransactionDate,Time,TransactionAmount (INR),Mobile,IsFraud
0,T1,02-08-2022,6242,25.0,9088909087,0
1,T2,02-08-2022,3761,27999.0,9088909088,0
2,T3,02-08-2022,5747,459.0,9088909089,0
3,T4,02-08-2022,3590,2060.0,9088909090,0
4,T5,02-08-2022,6329,1762.5,9088909091,0
...,...,...,...,...,...,...
90774,T5844,01-08-1930,3727,1526.0,9088910265,0
90775,T5845,01-08-1930,1521,1527.0,9088910266,0
90776,T5846,01-08-1930,6265,1528.0,9088910267,0
90777,T5847,01-08-1930,17,1529.0,9088910268,1


In [57]:
del to_update['TransactionDate']

In [58]:
data

Unnamed: 0,TransactionID,TransactionDate,Time,TransactionAmount (INR),Mobile,IsFraud
0,T1,02-08-2022,6242,25.0,9088909087,0
1,T2,02-08-2022,3761,27999.0,9088909088,0
2,T3,02-08-2022,5747,459.0,9088909089,0
3,T4,02-08-2022,3590,2060.0,9088909090,0
4,T5,02-08-2022,6329,1762.5,9088909091,0
...,...,...,...,...,...,...
90774,T5844,01-08-1930,3727,1526.0,9088910265,0
90775,T5845,01-08-1930,1521,1527.0,9088910266,0
90776,T5846,01-08-1930,6265,1528.0,9088910267,0
90777,T5847,01-08-1930,17,1529.0,9088910268,1


In [59]:
to_update

Unnamed: 0,TransactionID,Time,TransactionAmount (INR),Mobile,IsFraud
0,T1,6242,25.0,9088909087,0
1,T2,3761,27999.0,9088909088,0
2,T3,5747,459.0,9088909089,0
3,T4,3590,2060.0,9088909090,0
4,T5,6329,1762.5,9088909091,0
...,...,...,...,...,...
90774,T5844,3727,1526.0,9088910265,0
90775,T5845,1521,1527.0,9088910266,0
90776,T5846,6265,1528.0,9088910267,0
90777,T5847,17,1529.0,9088910268,1


In [60]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 90779 entries, 0 to 90778
Data columns (total 6 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   TransactionID            90779 non-null  object 
 1   TransactionDate          90779 non-null  object 
 2   Time                     90779 non-null  int64  
 3   TransactionAmount (INR)  90779 non-null  float64
 4   Mobile                   90779 non-null  int64  
 5   IsFraud                  90779 non-null  int64  
dtypes: float64(1), int64(3), object(2)
memory usage: 4.2+ MB


In [61]:
to_update.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 90779 entries, 0 to 90778
Data columns (total 5 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   TransactionID            90779 non-null  object 
 1   Time                     90779 non-null  int64  
 2   TransactionAmount (INR)  90779 non-null  float64
 3   Mobile                   90779 non-null  int64  
 4   IsFraud                  90779 non-null  int64  
dtypes: float64(1), int64(3), object(1)
memory usage: 3.5+ MB


In [62]:
df_fraud=to_update.loc[to_update.IsFraud==1]
df_not_fraud=to_update.loc[to_update.IsFraud==0]

In [63]:
print('number of non fraud examples: ',len(df_not_fraud))
print('number of fraud examples: ',len(df_fraud))

number of non fraud examples:  77513
number of fraud examples:  13266


In [64]:
df_fraud.head()

Unnamed: 0,TransactionID,Time,TransactionAmount (INR),Mobile,IsFraud
10,T11,611,259.0,9088909097,1
15,T16,776,250.0,9088909102,1
17,T18,516,54.0,9088909104,1
21,T22,77,27.0,9088909108,1
24,T25,880,1892.0,9088909090,1


In [65]:
x=to_update.drop(['IsFraud'],axis=1)
y=to_update['IsFraud']

In [66]:
x.head()

Unnamed: 0,TransactionID,Time,TransactionAmount (INR),Mobile
0,T1,6242,25.0,9088909087
1,T2,3761,27999.0,9088909088
2,T3,5747,459.0,9088909089
3,T4,3590,2060.0,9088909090
4,T5,6329,1762.5,9088909091


In [67]:
y.head()

0    0
1    0
2    0
3    0
4    0
Name: IsFraud, dtype: int64

In [68]:
x.columns

Index(['TransactionID', 'Time', 'TransactionAmount (INR)', 'Mobile'], dtype='object')

In [69]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
x['TransactionID']=le.fit_transform(x.TransactionID.values)

In [70]:
le.transform(np.array(['T2']))

array([10176])

In [71]:
x

Unnamed: 0,TransactionID,Time,TransactionAmount (INR),Mobile
0,0,6242,25.0,9088909087
1,10176,3761,27999.0,9088909088
2,20931,5747,459.0,9088909089
3,31041,3590,2060.0,9088909090
4,41248,6329,1762.5,9088909091
...,...,...,...,...
90774,50627,3727,1526.0,9088910265
90775,50638,1521,1527.0,9088910266
90776,50649,6265,1528.0,9088910267
90777,50660,17,1529.0,9088910268


In [72]:
from sklearn.preprocessing import StandardScaler

In [73]:
sc=StandardScaler()

x3=sc.fit_transform(x)

In [74]:
x3

array([[-1.73246167e+00,  1.35841208e+00, -2.40000538e-01,
        -4.01080430e+00],
       [-1.32021657e+00,  1.29333240e-01,  4.18091775e+00,
        -4.00614501e+00],
       [-8.84515318e-01,  1.11319079e+00, -1.71412615e-01,
        -4.00148572e+00],
       ...,
       [ 3.19405643e-01,  1.36980620e+00, -2.47139691e-03,
         1.48716060e+00],
       [ 3.19851269e-01, -1.72543144e+00, -2.31336022e-03,
         1.49181990e+00],
       [ 3.20296896e-01, -1.20031110e+00, -2.15532354e-03,
         1.49647919e+00]])

In [75]:
xtrain,xtest,ytrain,ytest=train_test_split(np.array(x3),np.array(y),test_size=0.3,shuffle=True)

In [76]:
print("Size of the Training set:",len(xtrain))
print("Size of the Testing set:",len(xtest))

Size of the Training set: 63545
Size of the Testing set: 27234


In [77]:
ytrain

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [78]:
i,j=0,0
for a in ytrain:
    if a==1:
        i+=1
    else:
        j+=1
        
print("The fraudulent data in training set:",i)
print("the non fraudulent data in training set",j)

The fraudulent data in training set: 9238
the non fraudulent data in training set 54307


In [79]:
i,j=0,0
for a in ytest:
    if a==1:
        i+=1
    else:
        j+=1
        
print("The fraudulent data in testing set:",i)
print("the non fraudulent data in testing set",j)

The fraudulent data in testing set: 4028
the non fraudulent data in testing set 23206


In [80]:
rf_clf = RandomForestClassifier(n_estimators=100,max_depth=10,random_state=42,
                                verbose=1,class_weight="balanced")

In [81]:
rf_clf.fit(xtrain,ytrain)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    3.9s finished


In [82]:
y_pred1=rf_clf.predict(xtest)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.1s finished


In [83]:
y_pred1

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [84]:
ytest

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [85]:
print("Classification Report for Random Forest: \n", classification_report(ytest, y_pred1))

print("Confusion Matrix of Random Forest: \n", confusion_matrix(ytest,y_pred1))

Classification Report for Random Forest: 
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     23206
           1       1.00      0.98      0.99      4028

    accuracy                           1.00     27234
   macro avg       1.00      0.99      0.99     27234
weighted avg       1.00      1.00      1.00     27234

Confusion Matrix of Random Forest: 
 [[23204     2]
 [   72  3956]]


In [86]:
from sklearn.metrics import accuracy_score
print('the accuracy is :',accuracy_score(ytest,y_pred1))

the accuracy is : 0.9972828082543879


In [87]:
id=le.transform(np.array(['T5']))
id[0]

41248

In [88]:
l=np.array([[id[0],8300,0,'9088909903']])
l=sc.transform(l)



In [89]:
l

array([[-0.06144303,  2.37793818, -0.24395145, -0.20882179]])

In [90]:
rf_clf.predict(l)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.0s finished


array([1], dtype=int64)

In [76]:
print("Input")
print("enter the TransactionId")
id=input()
print("enter the Transaction date")
dt=input()
print("enter the Transactions time gap between each mobile")
time=float(input())
print("enter the Transaction Amount")
amount=int(input())
print("enter the mobile number")
number=input()

Input
enter the TransactionId
T1
enter the Transaction date
12/12/2000
enter the Transactions time gap between each mobile
1200
enter the Transaction Amount
0
enter the mobile number
9740653908


In [77]:
id=le.transform(np.array([id]))
id[0]

0

In [84]:
l=np.array([[id[0],time,8,number]])
l=sc.transform(l)



In [85]:
from datetime import date
from datetime import datetime
today = date.today()

# dd/mm/YY
d1 = today.strftime("%d/%m/%Y")
print("d1 =", d1)
d1 = datetime.strptime(d1, "%d/%m/%Y")
d2 = datetime.strptime(dt, "%d/%m/%Y")

d1 = 23/09/2022


In [87]:
if d1>=d2:
    print(rf_clf.predict(l))
else:
    print(1)

[0]


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.0s finished


In [507]:
knn = KNeighborsClassifier(n_neighbors=5,p=1)

knn.fit(xtrain,ytrain)

In [508]:
y_pred = knn.predict(xtest)

In [509]:
y_pred

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [510]:
ytest

array([0, 0, 0, ..., 0, 1, 0], dtype=int64)

In [566]:
xtest.shape

(27234, 4)

In [567]:
ytest.shape

(27234,)

In [571]:
print(ytest.reshape(len(ytest),1).shape)

(27234, 1)


In [580]:
print(len(y_pred),len(ytest))

27234 27234


In [581]:
print("Classification Report for K-Nearest Neighbours: \n", classification_report(ytest, y_pred))

print("Confusion Matrix of K-Nearest Neigbours: \n", confusion_matrix(ytest,y_pred))

Classification Report for K-Nearest Neighbours: 
               precision    recall  f1-score   support

           0       0.99      1.00      1.00     23218
           1       0.99      0.96      0.97      4016

    accuracy                           0.99     27234
   macro avg       0.99      0.98      0.98     27234
weighted avg       0.99      0.99      0.99     27234

Confusion Matrix of K-Nearest Neigbours: 
 [[23163    55]
 [  172  3844]]


In [582]:
from sklearn.metrics import accuracy_score

In [583]:
print('the accuracy is :',accuracy_score(ytest,y_pred))

the accuracy is : 0.991664830726298
