In [1]:
import numpy as np
import pandas as pd

In [38]:
import warnings
warnings.filterwarnings('ignore')

### Lode Data

In [13]:
df = pd.read_csv('D:/Data science/notebook/data/Hack.csv')

In [14]:
df.head()

Unnamed: 0,x1,x2,y
0,-119.366669,115.0,1
1,-101.108044,97.777159,1
2,-130.278658,106.767654,1
3,-114.703415,101.195477,1
4,-119.366669,115.0,1


In [15]:
df.shape

(2227, 3)

### Missing Value Treatment

In [16]:
df.isnull().sum()

x1    5
x2    3
y     0
dtype: int64

In [17]:
df = df.dropna(axis = 0)

In [18]:
df.isnull().sum()

x1    0
x2    0
y     0
dtype: int64

In [19]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2219 entries, 0 to 2226
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   x1      2219 non-null   float64
 1   x2      2219 non-null   float64
 2   y       2219 non-null   int64  
dtypes: float64(2), int64(1)
memory usage: 69.3 KB


In [20]:
df.describe(percentiles=[.25, .5, .75, .90, .95, .99])

Unnamed: 0,x1,x2,y
count,2219.0,2219.0,2219.0
mean,-4.771837,89677.82,0.442091
std,74.938264,2986680.0,0.496747
min,-134.36916,-134.8761,0.0
25%,-71.581453,-80.25966,0.0
50%,-9.75084,-1.566092,0.0
75%,59.963056,63.90448,1.0
90%,97.633331,99.27624,1.0
95%,113.027748,113.785,1.0
99%,128.120693,129.1919,1.0


### outliers Treating by use IQR method

In [21]:
Q1=df['x2'].quantile(0.25)
Q3=df['x2'].quantile(0.75)
IQR=Q3-Q1
print(Q1)
print(Q3)
print(IQR)
Lower_Whisker = Q1-(1.5*IQR)
Upper_Whisker = Q3+(1.5*IQR)
print(Lower_Whisker, Upper_Whisker)

-80.25965764
63.904483264999996
144.164140905
-296.5058689975 280.1506946225


In [22]:
df = df[df['x2']< Upper_Whisker]

In [23]:
df.describe(percentiles=[.25, .5, .75, .90, .95, .99])

Unnamed: 0,x1,x2,y
count,2217.0,2217.0,2217.0
mean,-4.855645,-6.730568,0.44249
std,74.917277,79.843938,0.496794
min,-134.36916,-134.876132,0.0
25%,-71.626564,-80.519315,0.0
50%,-9.922271,-1.827336,0.0
75%,59.633331,63.622498,1.0
90%,97.633331,99.063025,1.0
95%,113.066172,113.402165,1.0
99%,128.122964,129.004086,1.0


In [26]:
df['y'].value_counts()

0    1236
1     981
Name: y, dtype: int64

### UNDERSAMPLING

In [27]:
df = df.sample(frac=1)

# amount of fraud classes 492 rows.
fraud_df = df.loc[df['y'] == 1]
non_fraud_df = df.loc[df['y'] == 0][:981]

normal_distributed_df = pd.concat([fraud_df, non_fraud_df])

# Shuffle dataframe rows
new_df = normal_distributed_df.sample(frac=1, random_state=42)

new_df.head()

Unnamed: 0,x1,x2,y
1695,-103.765356,-131.228886,0
156,15.019865,-34.907744,1
40,-106.665999,102.548363,1
22,-122.156732,90.476993,1
1347,43.899533,14.320576,0


In [28]:
new_df['y'].value_counts()

1    981
0    981
Name: y, dtype: int64

### Sandardise the data

In [29]:
from sklearn.preprocessing import StandardScaler
features = new_df.columns
standardized_data = StandardScaler().fit_transform(new_df)
print(standardized_data.shape)

(1962, 3)


In [30]:
print(standardized_data)

[[-1.30941453 -1.57766615 -1.        ]
 [ 0.27466611 -0.36604639  1.        ]
 [-1.34809656  1.36300845  1.        ]
 ...
 [ 0.00279801 -0.22883831  1.        ]
 [ 1.0232563   0.87925097 -1.        ]
 [-1.64758251 -1.27150854 -1.        ]]


In [31]:
df1 = pd.DataFrame(standardized_data, columns = features)
df1.head()

Unnamed: 0,x1,x2,y
0,-1.309415,-1.577666,-1.0
1,0.274666,-0.366046,1.0
2,-1.348097,1.363008,1.0
3,-1.554676,1.211163,1.0
4,0.659796,0.253195,-1.0


### Train Test Split

In [32]:
y = df1.pop('y')
X = df1

In [33]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y, train_size=0.8, \
                                                    random_state=0)
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(1569, 2) (1569,)
(393, 2) (393,)


### Model Building - Logistic Regression

***Perform appropriate Hyperparameter Tuning***

In [34]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import classification_report

In [48]:
from sklearn.linear_model import LogisticRegression

***RandomizedSearchCV***

In [40]:
tuned_parameters = [{'C':[1, 2, 3, 4, 5, 6, 7, 8],'penalty':['l1','l2','l3']}]
A = ['accuracy','precision','recall']
for i in A:
    clf = RandomizedSearchCV(
        LogisticRegression(), tuned_parameters, scoring='{}'.format(i)
    )
    clf.fit(X_train, y_train)

    print("Best parameters set found on train set:")
    print(clf.best_params_)
    print(clf.best_estimator_)
    print('for {}'.format(i))

    print(clf.score(X_test, y_test))

Best parameters set found on train set:
{'penalty': 'l2', 'C': 5}
LogisticRegression(C=5)
for accuracy
0.6641221374045801
Best parameters set found on train set:
{'penalty': 'l2', 'C': 6}
LogisticRegression(C=6)
for precision
0.8074074074074075
Best parameters set found on train set:
{'penalty': 'l2', 'C': 5}
LogisticRegression(C=5)
for recall
0.5069767441860465


In [41]:
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(C=6,penalty='l2')
classifier.fit(X_train, y_train)

LogisticRegression(C=6)

In [42]:
y_pred = classifier.predict(X_test)

In [43]:
from sklearn import metrics
metrics.accuracy_score(y_test, y_pred)

0.6641221374045801

In [44]:
metrics.roc_auc_score(y_test, y_pred)

0.6804546642278546

In [45]:
metrics.f1_score(y_test, y_pred)

0.6228571428571429

### Model Building - SVM(rbf)

***Perform appropriate Hyperparameter Tuning***

In [46]:
from sklearn.svm import SVC

***RandomizedSearchCV***

In [57]:
tuned_parameters = [{'C': [0.1, 1, 10, 100, 1000], 
              'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
              'kernel': ['rbf']}]
A = ['accuracy','precision','recall']
for i in A:
    clf = RandomizedSearchCV(
        SVC(), tuned_parameters, scoring='{}'.format(i)
    )
    clf.fit(X_train, y_train)

    print("Best parameters set found on train set:")
    print(clf.best_params_)
    print(clf.best_estimator_)
    print('for {}'.format(i))

    print(clf.score(X_test, y_test))

Best parameters set found on train set:
{'kernel': 'rbf', 'gamma': 1, 'C': 0.1}
SVC(C=0.1, gamma=1)
for accuracy
0.9618320610687023
Best parameters set found on train set:
{'kernel': 'rbf', 'gamma': 0.01, 'C': 1}
SVC(C=1, gamma=0.01)
for precision
1.0
Best parameters set found on train set:
{'kernel': 'rbf', 'gamma': 1, 'C': 10}
SVC(C=10, gamma=1)
for recall
0.9813953488372092


In [58]:
classifiers = SVC(C=1,gamma=0.01,kernel='rbf')
classifiers.fit(X_train,y_train)
y_pred1 = classifiers.predict(X_test)

In [59]:
metrics.accuracy_score(y_test, y_pred1)

0.6997455470737913

In [60]:
metrics.roc_auc_score(y_test, y_pred1)

0.7255813953488373

In [61]:
metrics.f1_score(y_test, y_pred1)

0.6217948717948717

### Model Building - SVM(linear)

In [62]:
classifiers = SVC(C=1,gamma=0.01,kernel='linear')
classifiers.fit(X_train,y_train)
y_pred2 = classifiers.predict(X_test)

In [63]:
metrics.accuracy_score(y_test, y_pred2)

0.6793893129770993

In [64]:
metrics.roc_auc_score(y_test, y_pred2)

0.7069767441860465

In [65]:
metrics.f1_score(y_test, y_pred2)

0.5855263157894737

### Model Building - KNN

***Perform appropriate Hyperparameter Tuning***

In [66]:
from sklearn.neighbors import KNeighborsClassifier

***RandomizedSearchCV***

In [68]:
tuned_parameters = [{'n_neighbors': [2,3,4,5,6], 'weights': ['uniform','distance']}]
A = ['accuracy','precision','recall']
for i in A:
    clf = RandomizedSearchCV(
        KNeighborsClassifier(), tuned_parameters, scoring='{}'.format(i)
    )
    clf.fit(X_train, y_train)

    print("Best parameters set found on train set:")
    print(clf.best_params_)
    print(clf.best_estimator_)
    print('for {}'.format(i))

    print(clf.score(X_test, y_test))

Best parameters set found on train set:
{'weights': 'uniform', 'n_neighbors': 5}
KNeighborsClassifier()
for accuracy
0.9440203562340967
Best parameters set found on train set:
{'weights': 'uniform', 'n_neighbors': 2}
KNeighborsClassifier(n_neighbors=2)
for precision
0.9846938775510204
Best parameters set found on train set:
{'weights': 'uniform', 'n_neighbors': 5}
KNeighborsClassifier()
for recall
0.9441860465116279


In [70]:
classifiers = KNeighborsClassifier(n_neighbors=2,weights= 'uniform')
classifiers.fit(X_train,y_train)
y_pred3 = classifiers.predict(X_test)

In [71]:
metrics.accuracy_score(y_test, y_pred3)

0.9363867684478372

In [72]:
metrics.roc_auc_score(y_test, y_pred3)

0.9404102430101908

In [73]:
metrics.f1_score(y_test, y_pred3)

0.9391727493917276

### Model Building - Decision tree(DT)

In [74]:
from sklearn.tree import DecisionTreeClassifier

***RandomizedSearchCV***

In [81]:
tuned_parameters = [{'max_depth' :[1,2,3,4,5,6,7,8]}]
A = ['accuracy','precision','recall']
for i in A:
    clf = RandomizedSearchCV(
        DecisionTreeClassifier(), tuned_parameters, scoring='{}'.format(i)
    )
    clf.fit(X_train, y_train)

    print("Best parameters set found on train set:")
    print(clf.best_params_)
    print(clf.best_estimator_)
    print('for {}'.format(i))

    print(clf.score(X_test, y_test))

Best parameters set found on train set:
{'max_depth': 6}
DecisionTreeClassifier(max_depth=6)
for accuracy
0.9134860050890585
Best parameters set found on train set:
{'max_depth': 6}
DecisionTreeClassifier(max_depth=6)
for precision
0.9371980676328503
Best parameters set found on train set:
{'max_depth': 6}
DecisionTreeClassifier(max_depth=6)
for recall
0.9023255813953488


In [82]:
classifiers = DecisionTreeClassifier(max_depth = 6)
classifiers.fit(X_train,y_train)
y_pred4 = classifiers.predict(X_test)

In [83]:
metrics.accuracy_score(y_test, y_pred4)

0.9134860050890585

In [84]:
metrics.roc_auc_score(y_test, y_pred4)

0.9146459367650902

In [85]:
metrics.f1_score(y_test, y_pred4)

0.9194312796208531

***From all the above models KNN model has the highest accuracy that is 94%***