In [18]:
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report
from sklearn.neighbors import LocalOutlierFactor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
import numpy as np


# Fixed seed; passed as random_state to train_test_split so the split is repeatable.
SEED = 0

In [8]:
# Load creditcard.csv into a DataFrame; header=0 takes column names from the first row.
data = pd.read_csv("creditcard.csv", header=0)
# Optional sanity check of dtypes / non-null counts (left disabled):
#print(data.info())

In [9]:
# Report how many fully duplicated rows exist, then keep only the first
# occurrence of each and show the resulting frame dimensions.
n_duplicates = data.duplicated().sum()
print("Duplicates", n_duplicates)

data = data.drop_duplicates()

n_rows, n_cols = data.shape
print("Rows {}, Columns {}".format(n_rows, n_cols))

Duplicates 1081
Rows 283726, Columns 31


In [10]:
# Relabel for outlier detection: majority class 0 -> 1 (inlier) and
# minority class 1 -> -1 (outlier), the +1 / -1 convention used by
# scikit-learn outlier detectors (one-class SVM, LocalOutlierFactor).
#
# The mapping is not idempotent on its own: applied a second time it would send
# the already-converted majority label 1 to -1, leaving every row labelled -1.
# Guard on the presence of raw 0 labels so re-running this cell is a no-op.
# (Assumes the raw data contains at least one 0-labelled row, which the
# value counts below confirm for this dataset.)
if data['Class'].eq(0).any():
    data['Class'] = data['Class'].replace({0: 1, 1: -1})
#data['Class'].unique()
data['Class'].value_counts()

Class
 1    283253
-1       473
Name: count, dtype: int64

In [11]:
# Separate the frame by label (1 = majority, -1 = minority), then split each
# subset into a feature frame (all columns except "Class") and a label Series.
majority_rows = data[data["Class"] == 1]
minority_rows = data[data["Class"] == -1]

y_majority = majority_rows["Class"]
y_minority = minority_rows["Class"]

x_majority = majority_rows.drop(columns=["Class"])
x_minority = minority_rows.drop(columns=["Class"])

In [12]:
# Train on majority rows only: hold out 20% of the majority class, then append
# every minority row to that hold-out so the test set contains both labels.
X_train, X_holdout, y_train, y_holdout = train_test_split(
    x_majority,
    y_majority,
    test_size=0.2,
    random_state=SEED,
)

X_test = pd.concat([X_holdout, x_minority], axis=0)
y_test = pd.concat([y_holdout, y_minority], axis=0)


In [22]:
%%time
LOF = LocalOutlierFactor()
param_grid = {
    'n_neighbors': [5, 10, 20],
    'metric': ['euclidean'],
    'contamination': ['auto'],
    'novelty' : [False]
}
grid_best_lof = GridSearchCV(LOF, param_grid=param_grid, scoring='f1',return_train_score=True,verbose=3,n_jobs=-1,cv=2)
grid_search_lof = grid_best_lof.fit(X_train,np.ones(X_train.shape[0],dtype=np.int8))
print(grid_search_lof.best_params_)

Fitting 2 folds for each of 3 candidates, totalling 6 fits


Traceback (most recent call last):
  File "/Users/arjunramesh/Desktop/Spring23/Projects/jupyter_test/.venv/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
  File "/Users/arjunramesh/Desktop/Spring23/Projects/jupyter_test/.venv/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 234, in __call__
    return self._score(
  File "/Users/arjunramesh/Desktop/Spring23/Projects/jupyter_test/.venv/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 276, in _score
    y_pred = method_caller(estimator, "predict", X)
  File "/Users/arjunramesh/Desktop/Spring23/Projects/jupyter_test/.venv/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 73, in _cached_call
    return getattr(estimator, method)(*args, **kwargs)
  File "/Users/arjunramesh/Desktop/Spring23/Projects/jupyter_test/.venv/lib/python3.10/site-packages/sklearn/utils/_available_if.py", line 32, in __get__
    if not sel

[CV 2/2] END contamination=auto, metric=euclidean, n_neighbors=5, novelty=False;, score=(train=nan, test=nan) total time=  40.9s
[CV 2/2] END contamination=auto, metric=euclidean, n_neighbors=10, novelty=False;, score=(train=nan, test=nan) total time=  41.1s
[CV 1/2] END contamination=auto, metric=euclidean, n_neighbors=10, novelty=False;, score=(train=nan, test=nan) total time=  41.1s


Traceback (most recent call last):
  File "/Users/arjunramesh/Desktop/Spring23/Projects/jupyter_test/.venv/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
  File "/Users/arjunramesh/Desktop/Spring23/Projects/jupyter_test/.venv/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 234, in __call__
    return self._score(
  File "/Users/arjunramesh/Desktop/Spring23/Projects/jupyter_test/.venv/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 276, in _score
    y_pred = method_caller(estimator, "predict", X)
  File "/Users/arjunramesh/Desktop/Spring23/Projects/jupyter_test/.venv/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 73, in _cached_call
    return getattr(estimator, method)(*args, **kwargs)
  File "/Users/arjunramesh/Desktop/Spring23/Projects/jupyter_test/.venv/lib/python3.10/site-packages/sklearn/utils/_available_if.py", line 32, in __get__
    if not sel

[CV 1/2] END contamination=auto, metric=euclidean, n_neighbors=5, novelty=False;, score=(train=nan, test=nan) total time=  41.6s


Traceback (most recent call last):
  File "/Users/arjunramesh/Desktop/Spring23/Projects/jupyter_test/.venv/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
  File "/Users/arjunramesh/Desktop/Spring23/Projects/jupyter_test/.venv/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 234, in __call__
    return self._score(
  File "/Users/arjunramesh/Desktop/Spring23/Projects/jupyter_test/.venv/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 276, in _score
    y_pred = method_caller(estimator, "predict", X)
  File "/Users/arjunramesh/Desktop/Spring23/Projects/jupyter_test/.venv/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 73, in _cached_call
    return getattr(estimator, method)(*args, **kwargs)
  File "/Users/arjunramesh/Desktop/Spring23/Projects/jupyter_test/.venv/lib/python3.10/site-packages/sklearn/utils/_available_if.py", line 32, in __get__
    if not sel

[CV 1/2] END contamination=auto, metric=euclidean, n_neighbors=20, novelty=False;, score=(train=nan, test=nan) total time=  42.0s
[CV 2/2] END contamination=auto, metric=euclidean, n_neighbors=20, novelty=False;, score=(train=nan, test=nan) total time=  42.0s
{'contamination': 'auto', 'metric': 'euclidean', 'n_neighbors': 5, 'novelty': False}
CPU times: user 3min 7s, sys: 1.2 s, total: 3min 9s
Wall time: 1min 8s
