In [1]:
import sys
import pandas as pd
import numpy as np
import yaml

# Add the relative directory '../../../ludwig' to the module search path before importing
# LudwigModel (presumably a local Ludwig checkout — TODO confirm). A relative entry like
# this is resolved against the kernel's working directory, so the notebook must be run
# from its own folder for the import to find that directory.
sys.path.append('../../../ludwig')
from ludwig.api import LudwigModel

%load_ext autoreload
%autoreload 2

# Load Data

In [2]:
# Read the transactions CSV (path is relative to the kernel's working directory)
# into the DataFrame that every model below is trained and evaluated on.
input_data = pd.read_csv(filepath_or_buffer='creditcard.csv')

In [3]:
# Count rows per value of the Class column to show how imbalanced the labels are.
input_data['Class'].value_counts()

0    284315
1       492
Name: Class, dtype: int64

# Model 1
- No dataset manipulation

##### Initialize Config File

In [4]:
# Load the Model 1 config (passed to LudwigModel below) from its YAML file.
with open("model1_config.yaml", 'r') as stream:
    try:
        config1 = yaml.safe_load(stream)
    except yaml.YAMLError as exc:
        # Show the parser message, then re-raise. Swallowing the error would leave
        # config1 unset (or stale from an earlier run of this cell), and the failure
        # would only surface later as a confusing error in the training cell.
        print(exc)
        raise

##### Build Model

In [5]:
# Build Model 1 from config1, train it on the full DataFrame, then evaluate it.
# NOTE(review): evaluate() receives the same input_data that was passed to train(), so
# these metrics may include rows the model was fitted on — confirm which rows Ludwig
# actually scores here before comparing models.
model1 = LudwigModel(config1)
(stats1, preprocessed_data1, output_url1) = model1.train(dataset=input_data)
(
    evaluation_statistics1,
    predictions1,
    output_directory1,
) = model1.evaluate(dataset=input_data)



# Model 2
- Undersampling Majority
- Minority class set to 50% of majority class

##### Initialize Config File

In [6]:
# Load the Model 2 config (passed to LudwigModel below) from its YAML file.
with open("model2_config.yaml", 'r') as stream:
    try:
        config2 = yaml.safe_load(stream)
    except yaml.YAMLError as exc:
        # Show the parser message, then re-raise. Swallowing the error would leave
        # config2 unset (or stale from an earlier run of this cell), and the failure
        # would only surface later as a confusing error in the training cell.
        print(exc)
        raise

##### Build Model

In [7]:
# Build Model 2 from config2 (the section heading describes majority-class
# undersampling), train it on the full DataFrame, then evaluate it.
# NOTE(review): evaluate() receives the same input_data that was passed to train(), so
# these metrics may include rows the model was fitted on — confirm which rows Ludwig
# actually scores here before comparing models.
model2 = LudwigModel(config2)
(stats2, preprocessed_data2, output_url2) = model2.train(dataset=input_data)
(
    evaluation_statistics2,
    predictions2,
    output_directory2,
) = model2.evaluate(dataset=input_data)



# Model 3
- Minority class oversampling
- Minority class at 50% of majority class

##### Initialize Config File

In [8]:
# Load the Model 3 config (passed to LudwigModel below) from its YAML file.
with open("model3_config.yaml", 'r') as stream:
    try:
        config3 = yaml.safe_load(stream)
    except yaml.YAMLError as exc:
        # Show the parser message, then re-raise. Swallowing the error would leave
        # config3 unset (or stale from an earlier run of this cell), and the failure
        # would only surface later as a confusing error in the training cell.
        print(exc)
        raise

##### Build Model

In [9]:
# Build Model 3 from config3 (the section heading describes minority-class
# oversampling), train it on the full DataFrame, then evaluate it.
# NOTE(review): evaluate() receives the same input_data that was passed to train(), so
# these metrics may include rows the model was fitted on — confirm which rows Ludwig
# actually scores here before comparing models.
model3 = LudwigModel(config3)
(stats3, preprocessed_data3, output_url3) = model3.train(dataset=input_data)
(
    evaluation_statistics3,
    predictions3,
    output_directory3,
) = model3.evaluate(dataset=input_data)



# Evaluation

##### Model 1: No Dataset Balancing

In [42]:
# Display the metrics dictionary returned by model1.evaluate(dataset=input_data).
evaluation_statistics1

{'Class': {'loss': 0.16574536263942719,
  'accuracy': 0.9987359642982483,
  'roc_auc': 0.7724238038063049},
 'combined': {'loss': 0.042273156344890594}}

In [44]:
# Tally label values in element 2 of Model 1's preprocessed data (reported as the
# test set by the messages printed below).
# NOTE(review): 'Class_mZFLky' looks like an auto-generated name for the processed
# Class column — confirm it stays the same if the config or Ludwig version changes.
bins = np.bincount(preprocessed_data1[2].dataset['Class_mZFLky'])
majority_class_count1, minority_class_count1 = bins[0], bins[1]
print("Test Set Majority Class Count: {}".format(majority_class_count1))
print("Test Set Minority Class Count: {}".format(minority_class_count1))

Test Set Majority Class Count: 57020
Test Set Minority Class Count: 86


##### Model 2: Majority Undersampling

In [11]:
# Display the metrics dictionary returned by model2.evaluate(dataset=input_data).
evaluation_statistics2

{'Class': {'loss': 414.2647705078125,
  'accuracy': 0.8794037699699402,
  'roc_auc': 0.9176716804504395},
 'combined': {'loss': 0.6246074438095093}}

In [45]:
# Tally label values in element 2 of Model 2's preprocessed data (reported as the
# test set by the messages printed below).
# NOTE(review): 'Class_mZFLky' looks like an auto-generated name for the processed
# Class column — confirm it stays the same if the config or Ludwig version changes.
bins = np.bincount(preprocessed_data2[2].dataset['Class_mZFLky'])
majority_class_count2, minority_class_count2 = bins[0], bins[1]
print("Test Set Majority Class Count: {}".format(majority_class_count2))
print("Test Set Minority Class Count: {}".format(minority_class_count2))

Test Set Majority Class Count: 204
Test Set Minority Class Count: 104


##### Model 3: Minority Oversampling

In [12]:
# Display the metrics dictionary returned by model3.evaluate(dataset=input_data).
evaluation_statistics3

{'Class': {'loss': 1.2576910257339478,
  'accuracy': 0.952880859375,
  'roc_auc': 0.9804296493530273},
 'combined': {'loss': 0.21803104877471924}}

In [46]:
# Tally label values in element 2 of Model 3's preprocessed data (reported as the
# test set by the messages printed below).
# NOTE(review): 'Class_mZFLky' looks like an auto-generated name for the processed
# Class column — confirm it stays the same if the config or Ludwig version changes.
bins = np.bincount(preprocessed_data3[2].dataset['Class_mZFLky'])
majority_class_count3, minority_class_count3 = bins[0], bins[1]
print("Test Set Majority Class Count: {}".format(majority_class_count3))
print("Test Set Minority Class Count: {}".format(minority_class_count3))

Test Set Majority Class Count: 56906
Test Set Minority Class Count: 28445
