# Importing the necessary libraries

In [None]:
import pandas as pd
from dateutil.parser import parse
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns


# Reading the CreditCard dataset

In [None]:
df = pd.read_csv("creditcard.csv")

In [None]:
df.head()

Uploading the dataset to an S3 bucket for backup

In [None]:
import sagemaker

# Session object used for the S3 upload below.
sess = sagemaker.Session()

# Key prefix (folder-like path) under which the raw CSV is stored.
prefix = 'sagemaker/credit-card-transactions'

# No bucket is passed here -- presumably the session's default bucket is
# used; confirm against the printed URI.
uri = sess.upload_data(
    path="./creditcard.csv",
    key_prefix=prefix,
)
print(uri)

# Checking for null values, we have 0 null values

In [None]:
df.isna().sum()

# Exploratory Data Analysis

In [None]:
df.info()

In [None]:
display(df.describe())

The countplot below shows that the dataset is highly imbalanced: the vast majority of rows have Class value 0.

In [None]:
# Bar chart of how many rows fall into each value of the 'Class' label.
sns.countplot(data=df, x='Class')
plt.show()

Using pairplot to see the relationship between different variables 

In [None]:
sns.set(style="ticks")
sns.pairplot(df[["V1","V3","V8","Class"]], hue="Class")

Correlation Matrix

In [None]:
# Pairwise correlation between every pair of columns in df.
correlation_matrix = df.corr()

# The same column names label both axes of the heatmap.
axis_labels = correlation_matrix.columns.values
sns.heatmap(
    correlation_matrix,
    xticklabels=axis_labels,
    yticklabels=axis_labels,
)

# Feature Engineering

In [None]:
# Remove the 'Time' column from the feature set.
# errors='ignore' keeps this cell re-runnable: on a second execution the
# column is already gone and a plain drop would raise KeyError.
df = df.drop(columns=['Time'], errors='ignore')

In [None]:

def data_type(dataset):
    """Split the columns of ``dataset`` into numerical and categorical names.

    A column is numerical when its dtype is exactly ``float64`` or ``int64``;
    every other column is reported as categorical.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame whose columns are classified.

    Returns
    -------
    tuple of (list, list)
        ``(numerical, categorical)`` column names, each in column order.
    """
    numerical = []
    categorical = []
    # Inspect the frame that was passed in; the previous version read the
    # notebook-global `df`, so it ignored its own argument.
    for column, dtype in dataset.dtypes.items():
        if dtype == "float64" or dtype == "int64":
            numerical.append(column)
        else:
            categorical.append(column)
    return numerical, categorical

            
numerical,categorical=data_type(df)
#removing the binary columns from numerical list for scaling
def binary_columns(dataset):
    """Return the names of int/float columns whose values are all 0 or 1.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame to scan; only columns selected by
        ``select_dtypes(include=['int', 'float'])`` are considered.

    Returns
    -------
    list
        Column names, in column order, whose unique values are a subset of
        ``{0, 1}``. A column containing NaN is not reported.
    """
    binary_cols = []
    for col in dataset.select_dtypes(include=['int', 'float']).columns:
        # Read from the argument, not the notebook-global `df`.
        unique_values = dataset[col].unique()
        # np.isin is the supported replacement for the deprecated np.in1d.
        if np.isin(unique_values, [0, 1]).all():
            binary_cols.append(col)
    return binary_cols

binary_cols = binary_columns(df)

# Keep only the non-binary columns for scaling. Filtering (instead of
# list.remove) cannot raise ValueError when a binary column was never in
# `numerical`, e.g. an int32 column that data_type() classed as categorical.
numerical = [col for col in numerical if col not in binary_cols]

# Scaling the entire dataset

In [None]:
from sklearn.preprocessing import StandardScaler

def feature_scaling(dataset,numerical):
    """Standardise the given columns and return the result as a new frame.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Input data; it is left unmodified.
    numerical : list
        Names of the columns to pass through ``StandardScaler``.

    Returns
    -------
    pandas.DataFrame
        Copy of ``dataset`` in which the ``numerical`` columns are replaced
        by their standardised values; all other columns are unchanged.

    Notes
    -----
    NOTE(review): the notebook calls this on the full dataset before the
    train/test split, so the scaler statistics include the test rows.
    """
    # Work on a copy so the caller's frame is never mutated in place.
    scaled = dataset.copy()
    # Nothing to scale: StandardScaler rejects a zero-column input.
    if len(numerical) == 0:
        return scaled
    sc_x = StandardScaler()
    scaled[numerical] = sc_x.fit_transform(scaled[numerical])
    return scaled

df=feature_scaling(df,numerical)
    

In [None]:
df.columns

Splitting the data into input(X) and target(y) variables

In [None]:
X = df.drop(['Class'], axis=1)
y = df[['Class']]

# Splitting the data into train and test datasets

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing.
# stratify=y keeps the (very small) share of Class 1 rows the same in the
# train and test partitions; a purely random split of such an imbalanced
# label can leave either side with too few positive examples.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0, stratify=y)

print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)


# Need to apply balancing to this highly imbalanced dataset
1. On the original dataset, the training score is 99.9%.

    This score is misleading rather than reassuring: the model can reach it by (over)fitting to the majority class, i.e. by predicting 0 almost every time.
    This happens purely because more than 99% of the values of the Class attribute in the dataset are 0.

2. To tackle this problem, we will use the SMOTE oversampling method.
    
    Please keep in mind that we are not going with random undersampling or random oversampling.
    
    With random oversampling, we add randomly chosen copies of minority-class examples to the data.
    This may increase the likelihood of overfitting.
    
    With random undersampling, we delete data from the majority class.
    This can be highly problematic, as the loss of such data can make the decision boundary 
    between minority and majority instances harder to learn, resulting in a loss in classification performance.

3. Hence we are going with SMOTE (Synthetic Minority Over-sampling Technique).

    It is an oversampling technique in which synthetic samples are generated for the minority class.
    This algorithm helps to overcome the overfitting problem posed by random oversampling. 
    



In [None]:
from imblearn.over_sampling import SMOTE 

# Oversample the minority class on the TRAINING split only.
# sampling_strategy=0.9 and k_neighbors=3 are passed straight to SMOTE;
# random_state fixes the synthetic samples between runs.
sm = SMOTE(sampling_strategy = 0.9, k_neighbors = 3, random_state = 100) 
X_train_SMOTE, y_train_SMOTE = sm.fit_resample(X_train, y_train.values.ravel()) 

# Print the oversampling results
print(f"\n\t After applying SMOTE ,the shape of  X_train: {X_train_SMOTE.shape}") 
print(f"\n\t After applying SMOTE ,the shape of y_train: {y_train_SMOTE.shape}") 

# Count with the vectorised array method; the builtin sum() walks the
# array one Python object at a time, which is needlessly slow here.
print("After applying SMOTE, count '1': {}".format((y_train_SMOTE == 1).sum())) 
print("After applying SMOTE, count '0': {}".format((y_train_SMOTE == 0).sum()))

In [None]:
# Build self-contained train/test tables (features + 'Class' label).
# An explicit .copy() is taken because, depending on the pandas version,
# pd.DataFrame(frame) can share its data with `frame`; adding 'Class' must
# never leak the label into X_train_SMOTE / X_test.
# Passing columns= also restores the feature names if the resampler
# returned a plain array instead of a DataFrame.
training_df = pd.DataFrame(X_train_SMOTE, columns=X_train.columns).copy()
training_df['Class'] = y_train_SMOTE

testing_df = X_test.copy()
# y_test is a one-column DataFrame; assign the column itself (a Series
# sharing X_test's index) rather than the whole frame.
testing_df['Class'] = y_test['Class']

Writing the training and test data to CSV files

In [None]:

# index=False: the row index carries no information and would otherwise be
# written as an extra unnamed first column in each CSV.
training_df.to_csv('credit_card_train.csv', index=False)
testing_df.to_csv('credit_card_test.csv', index=False)


# Setting up the SageMaker session and the S3 bucket for the training and test data

In [None]:
import boto3

# SageMaker session plus the region and bucket derived from it.
sess = sagemaker.Session()
region = sess.boto_session.region_name
bucket = sess.default_bucket()  #  Bucket is a logical unit of storage in AWS S3

# Low-level SageMaker API client; used later to look up the training job.
m_boto3 = boto3.client('sagemaker')

print('Using bucket ' + bucket)

Uploading training and test data to S3



In [None]:
# Both files go to the same bucket and key prefix.
upload_target = dict(
    bucket=bucket,
    key_prefix='sagemaker/credit-card-transactions',
)

# upload_data returns the location of the uploaded object; these values are
# later handed to the estimator as the 'train' and 'test' channels.
trainpath = sess.upload_data(path='credit_card_train.csv', **upload_target)
testpath = sess.upload_data(path='credit_card_test.csv', **upload_target)

In [None]:
%%writefile script.py

import argparse
import os
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.externals import joblib
from sklearn.metrics import explained_variance_score, r2_score



# inference functions ---------------
def model_fn(model_dir):
    clf = joblib.load(os.path.join(model_dir, "model.joblib"))
    return clf

if __name__ =='__main__':

    print('extracting arguments')
    parser = argparse.ArgumentParser()

    # hyperparameters sent by the client are passed as command-line arguments to the script.
   
    parser.add_argument('--n-estimators', type=int, default=30)
    parser.add_argument('--max_leaf_nodes', type=int, default=5)
    parser.add_argument('--max_depth', type=int, default=2)
    parser.add_argument('--min_samples_split', type=int, default=3)
    parser.add_argument('--random_state', type=int, default=22)
    
    

    # Data, model, and output directories
    parser.add_argument('--model-dir', type=str, default=os.environ.get('SM_MODEL_DIR'))
    parser.add_argument('--train', type=str, default=os.environ.get('SM_CHANNEL_TRAIN'))
    parser.add_argument('--test', type=str, default=os.environ.get('SM_CHANNEL_TEST'))
    parser.add_argument('--train-file', type=str, default='credit_card_train.csv')
    parser.add_argument('--test-file', type=str, default='credit_card_test.csv')
    
    
    args, _ = parser.parse_known_args()
    
    print('reading data')
    train_df = pd.read_csv(os.path.join(args.train, args.train_file))
    test_df = pd.read_csv(os.path.join(args.test, args.test_file))

    print('building training and testing datasets')
    columns = ['V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10', 'V11',
       'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20', 'V21',
       'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'Amount']
    X_train = train_df[columns]
    X_test = test_df[columns]
    y_train = train_df['Class']
    y_test = test_df['Class']
    
    # train
   
    print('training model')
    model = RandomForestClassifier(
        n_estimators=args.n_estimators,
        max_leaf_nodes =args.max_leaf_nodes,
        max_depth=args.max_depth,
        min_samples_split=args.min_samples_split,
        random_state=args.random_state,
        n_jobs=1)
    
    model.fit(X_train, y_train)
     
    # persist model
    path = os.path.join(args.model_dir, "model.joblib")
    joblib.dump(model, path)
    print('model persisted at ' + path)
   

In [None]:
# Local smoke test of script.py using the CSV files in the current directory.
# The flag must be spelled --max_leaf_nodes (underscores), as declared in
# script.py; the dashed spelling is unknown to the parser and is silently
# dropped by parse_known_args. The last line carries no trailing backslash.
! python script.py --n-estimators 30 \
                   --max_leaf_nodes 5 \
                   --model-dir ./ \
                   --train ./ \
                   --test ./ \
                   --max_depth 2 \
                   --min_samples_split 3  \
                   --random_state 22

In [None]:
%%writefile requirements.txt

# NOTE(review): argparse is part of the Python standard library, so the
# entry below is likely unnecessary -- confirm before removing it.
# NOTE(review): no package is version-pinned, and the estimator cell in this
# notebook does not reference this file -- verify it is actually picked up.
scikit-learn
pandas
numpy
argparse
fsspec
s3fs

# Using sagemaker estimator to create a training job

In [None]:

from sagemaker.sklearn.estimator import SKLearn

# Estimator describing the training job: which script to run, with which
# role, on what hardware, and with which hyperparameters.
# NOTE(review): `train_instance_count` / `train_instance_type` look like the
# SageMaker Python SDK v1 spellings -- confirm which SDK version is targeted.
sklearn_estimator = SKLearn(
    entry_point='script.py',
    base_job_name='rf-scikit',
    framework_version='0.20.0',
    role=sagemaker.get_execution_role(),
    train_instance_type='ml.c5.2xlarge',
    train_instance_count=1,
    # hyperparameters: the keys match the flags declared in script.py
    hyperparameters={
        'n-estimators': 30,
        'max_leaf_nodes': 5,
        'max_depth': 2,
        'min_samples_split': 3,
        'random_state': 22,
    },
)

# Training the model by estimator

In [None]:


sklearn_estimator.fit({'train':trainpath, 'test': testpath})


# Creating the Model Artifact

In [None]:
# Look up the most recent training job and read the S3 location of the
# model artifact from its description.
latest_job_name = sklearn_estimator.latest_training_job.name
job_description = m_boto3.describe_training_job(TrainingJobName=latest_job_name)
artifact = job_description['ModelArtifacts']['S3ModelArtifacts']

print('Model artifact persisted at ' + artifact)

# Deploying the model

This will create an endpoint

In [None]:
predictor = sklearn_estimator.deploy(instance_type='ml.c5.4xlarge',initial_instance_count=1)

In [None]:
columns = ['V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10', 'V11',
       'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20', 'V21',
       'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'Amount']

# Send the test features to the endpoint in batches instead of one request.
# NOTE(review): real-time endpoints cap the size of a single request payload;
# the full test set (30% of the data x 29 float columns) may exceed that cap.
# Confirm the limit for the target account/region and tune BATCH_ROWS.
BATCH_ROWS = 10000
test_features = testing_df[columns].to_numpy()

# Row order is preserved, so `predictions` lines up with y_test.
predictions = np.concatenate([
    np.asarray(predictor.predict(test_features[start:start + BATCH_ROWS]))
    for start in range(0, len(test_features), BATCH_ROWS)
])

# Evaluating the model

In [None]:

from sklearn.metrics import confusion_matrix,classification_report,precision_recall_curve,roc_auc_score

# Compute both metrics first, then print them.
# NOTE(review): roc_auc_score receives the endpoint's `predictions`; if these
# are hard class labels rather than scores, the AUC is a coarse estimate.
clf_report = classification_report(y_test, predictions)
auc_score = roc_auc_score(y_test, predictions)

print(f"Classification Report :- \n {clf_report}")
print(f"AROC score :- \n {auc_score}")




The ROC AUC score has improved with this model, which shows that the model is predicting better now. We would like this score to be as close to 1 as possible.

# Confusion Matrix for the model trained on balanced data

In [None]:
sns.heatmap(confusion_matrix(y_test, predictions), annot = True,fmt ='.5g')

In [None]:

# Precision/recall pairs (and their thresholds) for the test predictions.
precision, recall, thresholds = precision_recall_curve(y_test, predictions)

fig, ax = plt.subplots()
ax.plot(recall, precision, color='blue')
# title and axis labels set in one call
ax.set(
    title='Precision-Recall Curve',
    xlabel='Recall',
    ylabel='Precision',
)
# display plot
plt.show()



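In the above curve, the point (1, 1) corresponds to perfect precision and recall.
The closer the curve gets to that point, the higher our precision and recall are and the better the model distinguishes the two classes. Note that the curve is built from `predictions`; if these are hard class labels rather than scores, the curve has only a few points and should be read with caution.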