In [2]:
import pandas as pd
# RobustScaler is used when the scaling must be robust to outliers:
# it centers on the median and scales by a quantile range (the IQR by default).
from sklearn.preprocessing import RobustScaler, MinMaxScaler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.svm import LinearSVC
from lightgbm import LGBMClassifier
from sklearn.neural_network import MLPClassifier
from catboost import CatBoostClassifier
from sklearn.metrics import classification_report
from keras.models import Sequential, load_model
from keras.layers import InputLayer, Dense, BatchNormalization
from keras.callbacks import ModelCheckpoint
from matplotlib import pyplot as plt
from hyperopt import tpe, hp, fmin, STATUS_OK,Trials
import warnings
warnings.filterwarnings("ignore")

def predict_class_nn(model, x, threshold=0.5):
  """Turn a model's predicted scores into hard 0/1 class labels.

  Args:
    model: Object exposing ``predict(x)`` whose result has a ``flatten()``
      method (e.g. a NumPy array of sigmoid outputs).
    x: Feature matrix, forwarded unchanged to ``model.predict``.
    threshold: Scores strictly greater than this value are labelled 1.
      Defaults to 0.5, the cut-off that used to be hard-coded.

  Returns:
    1-D integer array holding one 0/1 label per flattened score.
  """
  scores = model.predict(x).flatten()
  return (scores > threshold).astype(int)


# Credit-card fraud data set: 284807 rows x 31 columns.
df = pd.read_csv('../input/creditcardfraud/creditcard.csv')
# The target has two classes, Fraud (1) and Not Fraud (0):
#   class 0: 284315 rows
#   class 1:    492 rows
# Exploration helpers (disabled): df.head(), df['Class'].value_counts(),
# df.hist(bins=30, figsize=(30, 30)), df.describe().
# Author's observations from those plots: every column except Time is centred
# around 0 with a roughly normal shape, and most Amount values are small.

# Work on a copy so the raw frame stays available.
new_df = df.copy()
# Scalers expect a 2-D input, hence the double-bracket column selection.
new_df['Amount'] = RobustScaler().fit_transform(new_df[['Amount']].to_numpy())
new_df['Time'] = MinMaxScaler().fit_transform(new_df[['Time']].to_numpy())
# print(new_df.head())

# Last column is the label; everything before it is a feature.
X = new_df.iloc[:, :-1]
y = new_df.iloc[:, -1]

# 80/20 train/test split, stratified on the label.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.8, stratify=y, random_state=0
)
# Carve a further 10% of the training part out as a validation set.
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train, y_train, train_size=0.9, stratify=y_train, random_state=0
)

# Class balance of each split, in the order train, test, valid.
for split_labels in (y_train, y_test, y_valid):
  print(split_labels.value_counts())

# Fit a panel of classical classifiers on the (imbalanced) training split and
# report per-class precision / recall / F1 on the validation split.
# Every estimator is seeded with random_state=0 so that reruns are comparable.
logistic_model = LogisticRegression(solver='liblinear', random_state=0)
rf = RandomForestClassifier(max_depth=4, n_jobs=-1, random_state=0)
gbc = GradientBoostingClassifier(n_estimators=70, max_depth=3, random_state=0)
xgb = XGBClassifier(random_state=0)
svc = LinearSVC(class_weight='balanced', random_state=0)
lg = LGBMClassifier(random_state=0)
mlp = MLPClassifier(random_state=0)

for clf_title, clf in [
    ('LogReg', logistic_model),
    ('RF', rf),
    ('GradientBoost', gbc),
    ('XGBoost', xgb),
    ('SVC', svc),
    ('lgbm', lg),
    ('mlp', mlp),
]:
  clf.fit(X_train, y_train)
  print('**** {} *****'.format(clf_title))
  # Train accuracy used to be computed for LogReg and thrown away; show it.
  # With 492 frauds in 284807 rows, accuracy alone is dominated by the
  # majority class, so read it together with the report below.
  print('Train accuracy: {:.4f}'.format(clf.score(X_train, y_train)))
  print(classification_report(y_valid, clf.predict(X_valid), target_names=['Not Fraud', 'Fraud']))

# Small feed-forward network: inputs -> Dense(2, relu) -> BatchNorm -> Dense(1, sigmoid).
nn = Sequential()
nn.add(InputLayer((X_train.shape[1],)))
nn.add(Dense(2, 'relu'))
nn.add(BatchNormalization())
nn.add(Dense(1, 'sigmoid'))

# save_best_only keeps only the best epoch on disk, judged by the callback's
# monitored quantity (Keras default: val_loss).
checkpoint = ModelCheckpoint(filepath='nn.h5', save_best_only=True)
nn.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
# Keras documents `callbacks` as a list of callbacks; a bare callback object is
# rejected by some Keras versions, so always wrap it.
r = nn.fit(
    X_train,
    y_train,
    validation_data=(X_valid, y_valid),
    epochs=5,
    callbacks=[checkpoint],
)
# Evaluate the best checkpoint, not the weights left after the final epoch.
model = load_model('nn.h5')
print('**** nn *****')
print(classification_report(y_valid, predict_class_nn(model, X_valid),
                            target_names=['Not Fraud', 'Fraud']))


# Learning curves of the network: loss on top, accuracy underneath.
# The val_* series come from validation_data=(X_valid, y_valid), so they are
# labelled 'validation' (the notebook keeps a separate test split).
fig, (loss_ax, acc_ax) = plt.subplots(2, 1)

loss_ax.set_title('Loss for nn')
loss_ax.plot(r.history['loss'], label='train')
loss_ax.plot(r.history['val_loss'], label='validation')
loss_ax.legend()

acc_ax.set_title('Accuracy for nn')
acc_ax.plot(r.history['accuracy'], label='train')
acc_ax.plot(r.history['val_accuracy'], label='validation')
acc_ax.legend()

# Without this the lower title is drawn over the upper panel's tick labels.
fig.tight_layout()
plt.show()


In [4]:
from sklearn.ensemble import ExtraTreesClassifier

# Extra-Trees ensemble (1111 trees, all cores) fitted on the imbalanced
# training split and scored on the validation split.
ex = ExtraTreesClassifier(
    n_estimators=1111,
    n_jobs=-1,
    verbose=0,
    random_state=1,
)
ex.fit(X_train, y_train)
print('**** ExtraTrees *****')
print(
    classification_report(
        y_valid,
        ex.predict(X_valid),
        target_names=['Not Fraud', 'Fraud'],
    )
)


In [10]:
from imblearn.over_sampling import SMOTE
# Class balance of the training split before oversampling.
# `.sum()` on the boolean mask is vectorized; the built-in sum() walks the
# Series element by element.
print("Before OverSampling, counts of label '1': {}".format((y_train == 1).sum()))
print("Before OverSampling, counts of label '0': {} \n".format((y_train == 0).sum()))

# Oversample the training split only. Fitting SMOTE on the full X, y would
# synthesize minority rows from validation/test samples as well, and it would
# not match the "Before"/"train_X" figures printed around it.
sm = SMOTE(random_state=2)
X_res, y_res = sm.fit_resample(X_train, y_train)

print('After OverSampling, the shape of train_X: {}'.format(X_res.shape))
print('After OverSampling, the shape of train_y: {} \n'.format(y_res.shape))

print("After OverSampling, counts of label '1': {}".format((y_res == 1).sum()))
print("After OverSampling, counts of label '0': {}".format((y_res == 0).sum()))

In [11]:
# Build the "_res" splits without leaking synthetic samples into evaluation:
# SMOTE interpolates between neighbouring minority rows, so splitting AFTER
# oversampling puts near-copies of training rows into validation/test and
# makes those sets artificially balanced. Oversample the training split only
# and evaluate on the original, untouched validation and test rows.
X_train_res, y_train_res = SMOTE(random_state=2).fit_resample(X_train, y_train)
X_valid_res, y_valid_res = X_valid, y_valid
X_test_res, y_test_res = X_test, y_test
print(y_train_res.value_counts())
print(y_test_res.value_counts())
print(y_valid_res.value_counts())

In [14]:
# Extra-Trees ensemble (1111 trees, all cores) fitted on the resampled
# training split and scored on the resampled validation split.
ex = ExtraTreesClassifier(
    n_estimators=1111,
    n_jobs=-1,
    verbose=0,
    random_state=1,
)
ex.fit(X_train_res, y_train_res)
print('**** ExtraTrees *****')
print(
    classification_report(
        y_valid_res,
        ex.predict(X_valid_res),
        target_names=['Not Fraud', 'Fraud'],
    )
)


In [12]:
# Refit the same panel of classical classifiers on the resampled training
# split and report per-class precision / recall / F1 on X_valid_res.
# Every estimator is seeded with random_state=0 so that reruns are comparable.
logistic_model = LogisticRegression(solver='liblinear', random_state=0)
rf = RandomForestClassifier(max_depth=4, n_jobs=-1, random_state=0)
gbc = GradientBoostingClassifier(n_estimators=70, max_depth=3, random_state=0)
xgb = XGBClassifier(random_state=0)
svc = LinearSVC(class_weight='balanced', random_state=0)
lg = LGBMClassifier(random_state=0)
mlp = MLPClassifier(random_state=0)

for clf_title, clf in [
    ('LogReg', logistic_model),
    ('RF', rf),
    ('GradientBoost', gbc),
    ('XGBoost', xgb),
    ('SVC', svc),
    ('lgbm', lg),
    ('mlp', mlp),
]:
  clf.fit(X_train_res, y_train_res)
  print('**** {} *****'.format(clf_title))
  # Train accuracy used to be computed for LogReg and thrown away; show it.
  print('Train accuracy: {:.4f}'.format(clf.score(X_train_res, y_train_res)))
  print(classification_report(y_valid_res, clf.predict(X_valid_res), target_names=['Not Fraud', 'Fraud']))

# Same small network as before, now trained on the resampled data:
# inputs -> Dense(2, relu) -> BatchNorm -> Dense(1, sigmoid).
nn = Sequential()
nn.add(InputLayer((X_train_res.shape[1],)))
nn.add(Dense(2, 'relu'))
nn.add(BatchNormalization())
nn.add(Dense(1, 'sigmoid'))

# save_best_only keeps only the best epoch on disk, judged by the callback's
# monitored quantity (Keras default: val_loss). Note that 'nn.h5' is
# overwritten by every run that uses this file name.
checkpoint = ModelCheckpoint(filepath='nn.h5', save_best_only=True)
nn.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
# Keras documents `callbacks` as a list of callbacks; a bare callback object is
# rejected by some Keras versions, so always wrap it.
r = nn.fit(
    X_train_res,
    y_train_res,
    validation_data=(X_valid_res, y_valid_res),
    epochs=5,
    callbacks=[checkpoint],
)
# Evaluate the best checkpoint, not the weights left after the final epoch.
model = load_model('nn.h5')
print('**** nn *****')
print(classification_report(y_valid_res, predict_class_nn(model, X_valid_res),
                            target_names=['Not Fraud', 'Fraud']))


# Learning curves of the network trained on resampled data: loss on top,
# accuracy underneath. The val_* series come from
# validation_data=(X_valid_res, y_valid_res), so they are labelled
# 'validation' rather than 'test'.
fig, (loss_ax, acc_ax) = plt.subplots(2, 1)

loss_ax.set_title('Loss for nn')
loss_ax.plot(r.history['loss'], label='train')
loss_ax.plot(r.history['val_loss'], label='validation')
loss_ax.legend()

acc_ax.set_title('Accuracy for nn')
acc_ax.plot(r.history['accuracy'], label='train')
acc_ax.plot(r.history['val_accuracy'], label='validation')
acc_ax.legend()

# Without this the lower title is drawn over the upper panel's tick labels.
fig.tight_layout()
plt.show()


In [16]:
# Hold-out check: score the estimators fitted on the resampled training data
# (xgb, lg, mlp, ex) against X_test_res / y_test_res.
# `%time` is an IPython line magic, so this cell only runs inside
# IPython/Jupyter; it times each predict + classification_report + print call.
print('**** xgb *****')
%time print(classification_report(y_test_res, xgb.predict(X_test_res), target_names=['Not Fraud', 'Fraud']))
print('**** lgb *****')
%time print(classification_report(y_test_res, lg.predict(X_test_res), target_names=['Not Fraud', 'Fraud']))
print('**** mlp *****')
%time print(classification_report(y_test_res, mlp.predict(X_test_res), target_names=['Not Fraud', 'Fraud']))
print('**** ExtraTrees *****')
%time print(classification_report(y_test_res, ex.predict(X_test_res), target_names=['Not Fraud', 'Fraud']))
