In [None]:
#LOF и ISOLATION FOREST

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neighbors import LocalOutlierFactor
from sklearn.ensemble import IsolationForest
from sklearn.metrics import roc_auc_score, confusion_matrix, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report


In [2]:
df = pd.read_csv('creditcard.csv')

In [3]:
df.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [4]:
df.isna().sum()

Time      0
V1        0
V2        0
V3        0
V4        0
V5        0
V6        0
V7        0
V8        0
V9        0
V10       0
V11       0
V12       0
V13       0
V14       0
V15       0
V16       0
V17       0
V18       0
V19       0
V20       0
V21       0
V22       0
V23       0
V24       0
V25       0
V26       0
V27       0
V28       0
Amount    0
Class     0
dtype: int64

In [5]:
df['Class'].value_counts()

Class
0    284315
1       492
Name: count, dtype: int64

In [6]:
df_copy = df.copy()
to_model_cols = df_copy.columns[0:30]
clf = IsolationForest(n_estimators=100, max_samples='auto', contamination=float(.00172), 
                        n_jobs=-1, random_state=0, verbose=1)
clf.fit(df_copy[to_model_cols])
pred = clf.predict(df_copy[to_model_cols])
df_copy['Class'] = pred
outliers = df_copy.loc[df_copy['Class']==-1]
outlier_index=list(outliers.index)
print(df_copy['Class'].value_counts())

[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done   2 out of  12 | elapsed:    0.4s remaining:    2.6s
[Parallel(n_jobs=12)]: Done  12 out of  12 | elapsed:    0.5s finished
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    1.0s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.9s


Class
 1    284317
-1       490
Name: count, dtype: int64


In [7]:
data = df.copy()
n_outliers = len(data[data['Class']==1])
outlier_fraction = len(data[data['Class']==1])/float(len(data[data['Class']==0]))
clf = IsolationForest(n_estimators=100, max_samples= 'auto', 
                      contamination=outlier_fraction, random_state=0, verbose=1)
X = data.loc[:,data.columns!='Class']
Y = data['Class']
clf.fit(X)
scores_prediction = clf.decision_function(X)
y_pred = clf.predict(X)
y_pred[y_pred == 1] = 0
y_pred[y_pred == -1] = 1
n_errors = (y_pred != Y).sum()

print("{}: {}".format("No. of Anomalous Points with Isolation Forest ",n_errors))
print("Classification Report :")
print(classification_report(Y,y_pred))

[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    1.2s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    1.0s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    1.0s


No. of Anomalous Points with Isolation Forest : 753
Classification Report :
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    284315
           1       0.24      0.24      0.24       492

    accuracy                           1.00    284807
   macro avg       0.62      0.62      0.62    284807
weighted avg       1.00      1.00      1.00    284807



In [37]:
data = df.copy()
non_fraud = data[data['Class'] == 0].sample(n=5000, random_state=42)
fraud = data[data['Class'] == 1]
balanced_data = pd.concat([non_fraud, fraud])
X = balanced_data.drop("Class", axis=1)
Y = balanced_data["Class"]
outlier_fraction = len(fraud) / float(len(balanced_data))
clflof = LocalOutlierFactor(n_neighbors=200, contamination=outlier_fraction)
y_pred = clflof.fit_predict(X)
y_pred = np.where(y_pred == 1, 0, 1)
n_errors = (y_pred != Y).sum()
print("Classification Report :")
print(classification_report(Y,y_pred))

Classification Report :
              precision    recall  f1-score   support

           0       0.92      0.92      0.92      5000
           1       0.21      0.21      0.21       492

    accuracy                           0.86      5492
   macro avg       0.57      0.57      0.57      5492
weighted avg       0.86      0.86      0.86      5492



In [10]:
data = df.drop(['Time'], axis=1)

In [11]:
from sklearn.preprocessing import StandardScaler
data['Amount'] = StandardScaler().fit_transform(data['Amount'].values.reshape(-1, 1))

In [12]:

non_fraud = data[data['Class'] == 0]
fraud = data[data['Class'] == 1]

df = pd.concat([non_fraud, fraud]).sample(frac=1).reset_index(drop=True)

X = df.drop(['Class'], axis=1).values
Y = df["Class"].values


In [13]:
X_train, X_test = train_test_split(data, test_size=0.2, random_state=42)
X_train_fraud = X_train[X_train.Class == 1]
X_train = X_train[X_train.Class == 0]
X_train = X_train.drop(['Class'], axis=1)
y_test = X_test['Class']
X_test = X_test.drop(['Class'], axis=1)
X_train = X_train.values
X_test = X_test.values
X_train.shape

(227451, 29)

In [14]:
from keras.layers import Input, Dense
from keras import regularizers
input_layer = Input(shape=(X.shape[1],))

encoded = Dense(100, activation='tanh', activity_regularizer=regularizers.l1(10e-5))(input_layer)
encoded = Dense(50, activation='relu')(encoded)

decoded = Dense(50, activation='tanh')(encoded)
decoded = Dense(100, activation='tanh')(decoded)

output_layer = Dense(X.shape[1], activation='relu')(decoded)

In [15]:
from keras.models import Model
autoencoder = Model(input_layer, output_layer)
autoencoder.compile(optimizer="adadelta", loss="mse")


In [24]:
from sklearn.preprocessing import MinMaxScaler
x = data.drop(["Class"], axis=1)
y = data["Class"].values

x_scale = MinMaxScaler().fit_transform(x.values)
x_norm, x_fraud = x_scale[y == 0], x_scale[y == 1]

autoencoder.fit(x_norm, x_norm, 
                batch_size = 128, epochs = 10, 
                shuffle = True, validation_split = 0.20);

Epoch 1/10
[1m1777/1777[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 3ms/step - loss: 0.2943 - val_loss: 0.2467
Epoch 2/10
[1m1777/1777[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 3ms/step - loss: 0.2381 - val_loss: 0.2117
Epoch 3/10
[1m1777/1777[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 3ms/step - loss: 0.2087 - val_loss: 0.1940
Epoch 4/10
[1m1777/1777[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 3ms/step - loss: 0.1939 - val_loss: 0.1849
Epoch 5/10
[1m1777/1777[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 3ms/step - loss: 0.1847 - val_loss: 0.1660
Epoch 6/10
[1m1777/1777[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 3ms/step - loss: 0.1649 - val_loss: 0.1563
Epoch 7/10
[1m1777/1777[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 3ms/step - loss: 0.1577 - val_loss: 0.1527
Epoch 8/10
[1m1777/1777[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 3ms/step - loss: 0.1545 - val_loss: 0.1504
Epoch 9/10
[1m1777/1777

In [25]:
from keras.models import Sequential
hidden_representation = Sequential()
hidden_representation.add(autoencoder.layers[0])
hidden_representation.add(autoencoder.layers[1])
hidden_representation.add(autoencoder.layers[2])
     

In [26]:
norm_hid_rep = hidden_representation.predict(x_norm[:5000])
fraud_hid_rep = hidden_representation.predict(x_fraud)

[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 


In [27]:
rep_x = np.append(norm_hid_rep, fraud_hid_rep, axis = 0)
y_n = np.zeros(norm_hid_rep.shape[0])
y_f = np.ones(fraud_hid_rep.shape[0])
rep_y = np.append(y_n, y_f)

In [28]:
train_x, val_x, train_y, val_y = train_test_split(rep_x, rep_y, test_size=0.25)

In [29]:
clf = LogisticRegression(solver="lbfgs").fit(train_x, train_y)
pred_y = clf.predict(val_x)

print ("")
print ("Classification Report: ")
print (classification_report(val_y, pred_y))

print ("")
print ("Accuracy Score: ", accuracy_score(val_y, pred_y))


Classification Report: 
              precision    recall  f1-score   support

         0.0       0.97      1.00      0.98      1250
         1.0       1.00      0.68      0.81       123

    accuracy                           0.97      1373
   macro avg       0.98      0.84      0.90      1373
weighted avg       0.97      0.97      0.97      1373


Accuracy Score:  0.9715950473415877
