In [1]:
import matplotlib.font_manager
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy import stats
from sklearn import svm
from sklearn.covariance import EllipticEnvelope
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor

# Seeded legacy NumPy generator; it is handed to estimators further down as
# their `random_state` so that repeated runs draw the same random numbers.
SEED = 42
rng = np.random.RandomState(SEED)

In [2]:
# Read the transaction table from its CSV file (path is relative to the
# notebook's working directory).
csv_path = 'data/transactions_small_clean.csv'
transactions = pd.read_csv(csv_path)

In [3]:
# Keep the 'Label' column aside as the ground truth before it is removed
# from the feature table.
labels = transactions.loc[:, 'Label']

In [4]:
# Remove the ground-truth column so the detectors only see features.
# A non-mutating drop with errors='ignore' replaces the in-place `del`:
# re-running this cell no longer raises KeyError once 'Label' is gone.
transactions = transactions.drop('Label', axis=1, errors='ignore')

In [5]:
# Feature matrix as a plain NumPy array: every column still present in
# `transactions` at this point, one row per record.
X_train = transactions.values

In [6]:
# Share of rows flagged in `labels`; reused below as the expected
# contamination rate of the detectors.
# Series.mean() is vectorised, whereas the builtin sum() walked the Series
# element by element in Python. Note that mean() skips NaN entries.
# Assumes labels are encoded 0 = normal, 1 = outlier -- TODO confirm.
outliers_fraction = float(labels.mean())
print(outliers_fraction)

0.000383585337276


# One-class SVM

In [7]:
# One-class SVM with an RBF kernel. `nu` is fixed at 0.1 here rather than
# derived from `outliers_fraction` -- NOTE(review): confirm this is intended.
svm_params = dict(nu=0.1, kernel="rbf", gamma='auto', verbose=True)
clf = svm.OneClassSVM(**svm_params)

In [None]:
# Fit the one-class SVM on the full (unlabelled) feature matrix; the fitted
# estimator is the cell's displayed value.
clf.fit(X=X_train)

[LibSVM]

In [None]:
# Score the same rows the model was fitted on; scikit-learn outlier
# detectors answer +1 for inliers and -1 for outliers.
pred = clf.predict(X=X_train)

In [None]:
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style='whitegrid', palette='muted', font_scale=1.5)
%matplotlib inline

conf_matrix = confusion_matrix(labels['Class'], pred)
plt.figure(figsize=(12, 12))
sns.heatmap(conf_matrix, xticklabels=[0, 1], yticklabels=[0, 1], annot=True, fmt="d");
plt.title("Confusion matrix")
plt.ylabel('True class')
plt.xlabel('Predicted class')
plt.show()

# Robust covariance

In [None]:
# Robust-covariance detector; the expected share of outliers is taken from
# the labelled fraction computed earlier.
clf_rc = EllipticEnvelope(
    contamination=outliers_fraction,
)

In [None]:
# Fit the elliptic envelope on the full feature matrix; the fitted estimator
# is the cell's displayed value.
clf_rc.fit(X=X_train)

In [None]:
# Raw predictions on the training rows: +1 for inliers, -1 for outliers.
pred_rc = clf_rc.predict(X=X_train)

In [None]:
# Remap the detector output (+1 inlier / -1 outlier) to 0 = inlier,
# 1 = outlier and report the total and per-class counts.
# The mapping is derived from the estimator instead of from the previous value
# of `pred_rc`: the old self-referential arithmetic produced 0.5 entries when
# the cell was run twice, and float labels under Python 3 true division.
pred_rc = (clf_rc.predict(X_train) == -1).astype(int)
print(len(pred_rc))
print(int((pred_rc == 0).sum()))
print(int((pred_rc == 1).sum()))

In [None]:
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style='whitegrid', palette='muted', font_scale=1.5)
%matplotlib inline

conf_matrix = confusion_matrix(labels['Class'], pred_rc)
plt.figure(figsize=(12, 12))
sns.heatmap(conf_matrix, xticklabels=[0, 1], yticklabels=[0, 1], annot=True, fmt="d");
plt.title("Confusion matrix")
plt.ylabel('True class')
plt.xlabel('Predicted class')
plt.show()

# Isolation Forest

In [None]:
# Isolation forest: each tree is built from at most 20000 sampled rows, the
# expected outlier share comes from the labelled fraction, and the shared
# seeded generator drives the randomness.
clf_if = IsolationForest(
    max_samples=20000,
    contamination=outliers_fraction,
    random_state=rng,
    verbose=True,
)

In [None]:
# Fit the isolation forest on the full feature matrix; the fitted estimator
# is the cell's displayed value.
clf_if.fit(X=X_train)

In [None]:
# Raw predictions on the training rows: +1 for inliers, -1 for outliers.
pred_if = clf_if.predict(X=X_train)

In [None]:
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style='whitegrid', palette='muted', font_scale=1.5)
%matplotlib inline

conf_matrix = confusion_matrix(labels['Class'], pred_if)
plt.figure(figsize=(12, 12))
sns.heatmap(conf_matrix, xticklabels=[0, 1], yticklabels=[0, 1], annot=True, fmt="d");
plt.title("Confusion matrix")
plt.ylabel('True class')
plt.xlabel('Predicted class')
plt.show()

# Local Outlier Factor

In [None]:
# Local Outlier Factor over the 35 nearest neighbours, with the expected
# outlier share taken from the labelled fraction.
# LocalOutlierFactor has no `verbose` parameter, so passing it raised
# TypeError at construction; the argument is dropped.
clf_lof = LocalOutlierFactor(n_neighbors=35, contamination=outliers_fraction)

In [None]:
# LOF is fitted and scored in one call on the training rows; the result is
# +1 for inliers and -1 for outliers.
pred_lof = clf_lof.fit_predict(X=X_train)

In [None]:
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style='whitegrid', palette='muted', font_scale=1.5)
%matplotlib inline

conf_matrix = confusion_matrix(labels['Class'], pred_lof)
plt.figure(figsize=(12, 12))
sns.heatmap(conf_matrix, xticklabels=[0, 1], yticklabels=[0, 1], annot=True, fmt="d");
plt.title("Confusion matrix")
plt.ylabel('True class')
plt.xlabel('Predicted class')
plt.show()