In [1]:
!pip install imblearn

Collecting imblearn
  Downloading imblearn-0.0-py2.py3-none-any.whl (1.9 kB)
Collecting imbalanced-learn (from imblearn)
  Downloading imbalanced_learn-0.11.0-py3-none-any.whl.metadata (8.3 kB)
Downloading imbalanced_learn-0.11.0-py3-none-any.whl (235 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m235.6/235.6 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m:00:01[0m
[?25hInstalling collected packages: imbalanced-learn, imblearn
Successfully installed imbalanced-learn-0.11.0 imblearn-0.0


In [2]:
import pandas as pd
import numpy as np
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from collections import Counter
import sagemaker
import boto3
import os

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml


In [3]:
# Read the pre-processed classification table from the working directory.
data = pd.read_csv("processed_data_classification_v2.csv", sep=",")

# Name of the S3 bucket later handed to the SageMaker session as its default bucket.
bucket_etl = "ofer-ml-data-etl"

In [4]:
# Keep a random 90% subsample of the rows (drawn without replacement).
# DataFrame.sample selects by position, so it stays correct even if the index
# contains duplicate labels, and random_state makes the draw repeatable.
# NOTE: this overwrites `data`; re-running this cell on its own subsamples the
# already-reduced frame again, so re-run the load cell first.
data = data.sample(n=int(0.9 * len(data)), replace=False, random_state=42)
len(data)

4950

In [5]:
# (rows, columns) of the frame after the 90% subsample.
data.shape

(4950, 50)

In [6]:
# Number of rows per value of the `fraudulent_provider` column (class balance check).
data['fraudulent_provider'].value_counts()

0    4497
1     453
Name: fraudulent_provider, dtype: int64

In [7]:
# The target is taken positionally: column 0 is the label, every other column
# is a model input.
label_column, feature_columns = data.columns[0], data.columns[1:]

# Materialise both parts as float32 NumPy arrays.
labels = data[label_column].values.astype('float32')
features = data[feature_columns].values.astype('float32')

In [8]:
# Hold out 10% of the rows as the test set. Stratifying on the label keeps the
# class ratio the same in both parts; random_state pins the shuffle so the
# split is reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    features, labels, test_size=0.1, stratify=labels, random_state=42)

In [9]:
# Rebalance the training split only (X_test / y_test are not touched here):
#   1. SMOTE over-samples the minority class up to a minority/majority ratio of 0.25
#   2. RandomUnderSampler then drops majority rows until the ratio is 1:1
# Both samplers are stochastic, so each gets a fixed random_state to make the
# resampled set reproducible.
over = SMOTE(sampling_strategy=0.25, random_state=42)
under = RandomUnderSampler(sampling_strategy=1, random_state=42)
steps = [('o', over), ('u', under)]
pipeline = Pipeline(steps=steps)
X_smote, y_smote = pipeline.fit_resample(X_train, y_train)

# check for imbalance again
print(sorted(Counter(y_smote).items()))

[(0.0, 1011), (1.0, 1011)]


In [10]:
# Carve 10% of the resampled data off as a validation set, stratified on the
# label; random_state pins the shuffle so the split is reproducible.
# NOTE(review): X_smote / y_smote are the output of the SMOTE + under-sampling
# pipeline, so the validation rows include synthetic samples and are balanced
# 1:1. Metrics on this set will not reflect the original class distribution;
# consider splitting off validation data before resampling.
X_smote_train, X_smote_validation, y_smote_train, y_smote_validation = train_test_split(
    X_smote, y_smote, test_size=0.1, stratify=y_smote, random_state=42)

In [11]:
def _label_first(y, X):
    """Return a 2-D array whose first column is ``y`` and whose remaining columns are ``X``."""
    y_as_column = y.reshape(len(y), 1)
    return np.concatenate((y_as_column, X), axis=1)


# Every split is laid out the same way: target in column 0, features after it.
trainX_concate = _label_first(y_smote_train, X_smote_train)
validationX_concate = _label_first(y_smote_validation, X_smote_validation)
testX_concate = _label_first(y_test, X_test)

# Wrap each array in a DataFrame with default integer index and column labels.
trainX = pd.DataFrame(trainX_concate)
validationX = pd.DataFrame(validationX_concate)
testX = pd.DataFrame(testX_concate)

In [12]:
# (rows, columns) of the resampled training split; column 0 is the label.
trainX.shape

(1819, 50)

In [13]:
# (rows, columns) of the held-out test split, built from y_test / X_test (not resampled).
testX.shape

(495, 50)

In [14]:
# (rows, columns) of the validation split, which was carved from the resampled data.
validationX.shape

(203, 50)

In [15]:
# Write each split to the working directory as a CSV without a header row and
# without the index column.
csv_outputs = {
    "cms_payment_train.csv": trainX,
    "cms_payment_validation.csv": validationX,
    "cms_payment_test.csv": testX,
}
for csv_name, split_frame in csv_outputs.items():
    split_frame.to_csv(csv_name, header=False, index=False)

In [16]:
# SageMaker session bound to the ETL bucket; `bucket` is the name it resolves to.
session = sagemaker.Session(default_bucket=bucket_etl)
bucket = session.default_bucket()
prefix = 'fraud-detect-demo'

# Upload training and validation data to a S3 bucket in client account.
# S3 object keys always use "/" as the separator, so the key is built
# explicitly rather than with os.path.join (whose separator is OS-dependent).
# A single Bucket handle is reused for both uploads.
s3_bucket = boto3.Session().resource('s3').Bucket(bucket)
for filename in ("cms_payment_train.csv", "cms_payment_validation.csv"):
    s3_bucket.Object(f"{prefix}/{filename}").upload_file(filename)

# Testing dataset is used in server account for testing purpose only
# (cms_payment_test.csv is not uploaded by this cell).

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml
