# Support Vector Classifier for Healthcare Fraud Detection
### Deborah Leong, Sam Nuzbrokh and Doug Devens

Import pandas and numpy packages to be able to transform and manipulation data.

In [2]:
import numpy as np
import pandas as pd

Read in modified feature file from post_providerinout_mods.py, with a large number of features.

In [3]:
x_train_inout_mod = pd.read_csv('x_train_inout_mod.csv')
x_train_inout_mod.columns

Index(['Unnamed: 0', 'Age_in', 'Age_out', 'AttendingPhysician_in',
       'AttendingPhysician_out', 'ClaimDays_in', 'ClaimDays_out',
       'DeductibleAmtPaid_in', 'DeductibleAmtPaid_out', 'Gender_in',
       'Gender_out', 'InscClaimAmtReimbursed_in', 'InscClaimAmtReimbursed_out',
       'NumChronics_in', 'NumChronics_out', 'NumDiag_in', 'NumDiag_out',
       'NumProc_in', 'NumProc_out', 'State_in', 'State_out', 'WhetherDead_in',
       'WhetherDead_out', 'ClaimDays_in_Range', 'ClaimDays_out_Range',
       'InscClaimAmtReimbursed_in_Range', 'InscClaimAmtReimbursed_out_Range',
       'NumChronics_in_Range', 'NumChronics_out_Range', 'NumDiag_in_Range',
       'NumDiag_out_Range', 'NumProc_in_Range', 'NumProc_out_Range',
       'Provider', 'PotentialFraud', 'docDegMax', 'docBtwnMean', 'docEignMean',
       'docMANN', 'patDegMax', 'patBtwnMean', 'patEignMean', 'patMANN',
       'ClmsPerPhysician_in', 'ClmsPerPhysician_out', 'ClmsPerPatient_in',
       'ClmsPerPatient_out', 'DrPerPatient_in

Remove junk columns created during read in.  Set 'potential fraud' as target (already converted to 1s and 0s.) Scale the input variables and ensure there are no NAs.  Split into test and train sets.

In [4]:
from sklearn import model_selection as ms
x_train_inout_mod = x_train_inout_mod.drop(columns = 'Unnamed: 0')
y = x_train_inout_mod['PotentialFraud']
X = x_train_inout_mod.drop(columns = ['Provider', 'PotentialFraud'])
X_scaled = (X-X.min(axis=0))/(X.max(axis=0)-X.min(axis=0))
X_scaled=X_scaled.fillna(0)
from sklearn import model_selection as ms
X_train, X_test, y_train, y_test = ms.train_test_split(X_scaled, y, test_size=0.20, random_state=42)

Import packages, including classification report and confusion matrix to be able to assess model classification performance.

In [5]:
from sklearn import ensemble
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix


Import Support Vector Machine, and set up dictionary of values to examine for kernals, scales and penalty parameter.  Set class weight in model parameters as balanced since the classification is so imbalanced (90/10)

In [18]:
from sklearn import svm
svm_model = svm.SVC(class_weight='balanced')

Fit model with default parameters and assess performance on test set.  Performance isn't outstanding.  We are using the F1 score as a proxy for overall performance given the imbalanced ratio of classes.

In [19]:
svm_model.fit(X_train,y_train)
print(classification_report(y_test,svm_model.predict(X_test)))

              precision    recall  f1-score   support

           0       0.99      0.88      0.93       977
           1       0.45      0.88      0.59       105

    accuracy                           0.88      1082
   macro avg       0.72      0.88      0.76      1082
weighted avg       0.93      0.88      0.90      1082



In [20]:
grid_para_svc = [
    {'C': np.linspace(0.75,3,8),
     'kernel': ['linear', 'rbf'],
     'gamma': ['scale']}]
grid_search_svm = ms.GridSearchCV(svm_model, grid_para_svc, scoring='f1_weighted', cv=5, iid=True,  n_jobs=3)
grid_search_svm.fit(X_train, y_train)



GridSearchCV(cv=5, error_score=nan,
             estimator=SVC(C=1.0, break_ties=False, cache_size=200,
                           class_weight='balanced', coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='scale', kernel='rbf', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid=True, n_jobs=3,
             param_grid=[{'C': array([0.75      , 1.07142857, 1.39285714, 1.71428571, 2.03571429,
       2.35714286, 2.67857143, 3.        ]),
                          'gamma': ['scale'], 'kernel': ['linear', 'rbf']}],
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='f1_weighted', verbose=0)

In [21]:
grid_search_svm.best_params_

{'C': 3.0, 'gamma': 'scale', 'kernel': 'linear'}

Noticed that optimal values were at ends of ranges specified, so will shift range.

In [23]:
grid_para_svc = [
    {'C': np.linspace(1.5,10,4),
     'kernel': ['rbf','linear'],
     'gamma': ['scale']}]
grid_search_svm = ms.GridSearchCV(svm_model, grid_para_svc, scoring='f1_weighted', cv=5,  n_jobs=3)
grid_search_svm.fit(X_train, y_train)

GridSearchCV(cv=5, error_score=nan,
             estimator=SVC(C=1.0, break_ties=False, cache_size=200,
                           class_weight='balanced', coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='scale', kernel='rbf', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='deprecated', n_jobs=3,
             param_grid=[{'C': array([ 1.5       ,  4.33333333,  7.16666667, 10.        ]),
                          'gamma': ['scale'], 'kernel': ['rbf', 'linear']}],
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='f1_weighted', verbose=0)

In [24]:
grid_search_svm.best_params_

{'C': 10.0, 'gamma': 'scale', 'kernel': 'rbf'}

Again see that optimal parameters are toward end of specified range, but will examine performance at these values.  Again, using confusion matrix and F1 as the target score to maximize for imbalanced classes

In [25]:
svm_model.set_params(C= grid_search_svm.best_params_['C'], gamma = grid_search_svm.best_params_['gamma'], \
                     kernel = grid_search_svm.best_params_['kernel'])
svm_model.fit(X_train, y_train)
print('   0    1    predicted is columns')
print(confusion_matrix(y_test, svm_model.predict(X_test)))
print(classification_report(y_test, svm_model.predict(X_test)))

   0    1    predicted is columns
[[887  90]
 [ 19  86]]
              precision    recall  f1-score   support

           0       0.98      0.91      0.94       977
           1       0.49      0.82      0.61       105

    accuracy                           0.90      1082
   macro avg       0.73      0.86      0.78      1082
weighted avg       0.93      0.90      0.91      1082



Decided to try one more grid search with higher C values.  This is a second iteration after a first (erased) iteration, to find an optimal C value for the rbf kernel. 

In [42]:
grid_para_svc = [
    {'C': [21,21.5,22],
     'kernel': ['rbf'],
     'gamma': ['scale','auto']}]
grid_search_svm = ms.GridSearchCV(svm_model, grid_para_svc, scoring='f1_weighted', cv=5,  n_jobs=3)
grid_search_svm.fit(X_train, y_train)

GridSearchCV(cv=5, error_score=nan,
             estimator=SVC(C=23, break_ties=False, cache_size=200,
                           class_weight='balanced', coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='scale', kernel='rbf', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='deprecated', n_jobs=3,
             param_grid=[{'C': [21, 21.5, 22], 'gamma': ['scale', 'auto'],
                          'kernel': ['rbf']}],
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='f1_weighted', verbose=0)

In [43]:
grid_search_svm.best_params_

{'C': 22, 'gamma': 'scale', 'kernel': 'rbf'}

In [49]:
svm_model.set_params(C= 22, gamma = grid_search_svm.best_params_['gamma'], \
                     kernel = grid_search_svm.best_params_['kernel'])
svm_model.fit(X_train, y_train)
print('   0    1    predicted is columns')
print(confusion_matrix(y_test, svm_model.predict(X_test)))
print(classification_report(y_test, svm_model.predict(X_test)))

   0    1    predicted is columns
[[897  80]
 [ 24  81]]
              precision    recall  f1-score   support

           0       0.97      0.92      0.95       977
           1       0.50      0.77      0.61       105

    accuracy                           0.90      1082
   macro avg       0.74      0.84      0.78      1082
weighted avg       0.93      0.90      0.91      1082



We see that the number of false positives has decreased, and F1 is unchanged.  However, we do not pursue this model since the number of false positives is still approximately equal to the number of true positives.