## Scikit-Learn PCA and Logistic Regression Pipeline
### Using BREASTCANCER_VIEW from SAP Datasphere. This view has 569 records

## Install fedml aws library

In [None]:
pip install fedml-aws --force-reinstall

## Import Libraries 

In [None]:
from fedml_aws import DwcSagemaker
from fedml_aws import DbConnection
import numpy as np
import pandas as pd
import json

## Create a DwcSagemaker instance to access the library's functions

In [None]:
# Build the DwcSagemaker helper used by the training, deployment and
# inference cells below. Replace both placeholders with your own values.
dwcs = DwcSagemaker(
    prefix='<prefix>',
    bucket_name='<bucket_name>',
)

## Create DbConnection instance to get data from SAP Datasphere

Before running the following cell, you should have a config.json file in the same directory as this notebook with the specified values to allow you to access SAP Datasphere.

You should also have the following view `BREASTCANCER_VIEW` created in your SAP Datasphere. To gather this data, please refer to https://www.kaggle.com/uciml/breast-cancer-wisconsin-data

In [None]:
%%time
db = DbConnection()
res, column_headers = db.get_data_with_headers(table_name="BREASTCANCER_VIEW", size=1)
data = pd.DataFrame(res, columns=column_headers)
data

In [None]:
# Show the column names returned by the view.
data.columns

## Train Scikit-Learn Model
`train_data` is the data you want to train your model with. 

In order to deploy a model to AWS using the Scikit-learn Sagemaker SDK, you must have a script that tells Sagemaker how to train and deploy the model. The path to the script is passed to the `train_sklearn_model` function in the `train_script` parameter.

`instance_type` specifies how much computing power we want AWS to allocate for our services.

In [None]:
# Train on the full DataFrame using the training script described above.
# wait=True is passed so the call presumably blocks until training ends;
# n_components is forwarded as a hyperparameter (presumably the number of
# PCA components read by pca_pipeline_script.py — confirm in the script).
clf = dwcs.train_sklearn_model(
    data,
    train_script='pca_pipeline_script.py',
    instance_type='ml.c4.xlarge',
    wait=True,
    download_output=False,
    hyperparameters={'n_components': 3},
)

## Using the fedml_aws deploy to kyma function

In [None]:
# Store the AWS access key, secret key and region under the CLI profile
# 'sample-pr' (the same profile name is passed to deploy_to_kyma below).
# Replace the <...> placeholders locally and never commit real credentials.
!aws configure set aws_access_key_id '<aws_access_key_id>' --profile 'sample-pr'
!aws configure set aws_secret_access_key '<aws_secret_access_key>' --profile 'sample-pr'
!aws configure set region '<region>' --profile 'sample-pr'

In [None]:
# Deploy the trained estimator to Kyma using the AWS CLI profile configured
# in the previous cell.
dwcs.deploy_to_kyma(
    clf,
    initial_instance_count=1,
    profile_name='sample-pr',
)

## Using the fedml_aws invoke kyma endpoint function

In [None]:
# Build the inference sample: shuffle the rows, keep everything from
# position 500 onwards, and replace missing values with 0.
# random_state is fixed so a Restart & Run All sends the same rows to the
# endpoint every time. The steps are chained (no inplace=True) so the filled
# frame is a fresh object rather than a mutated slice of the shuffled frame.
org_data = (
    data.sample(frac=1, random_state=42)
    .reset_index(drop=True)
    .iloc[500:]
    .fillna(0)
)

# Split off the true label; X holds every remaining column.
y = org_data['diagnosis']
X = org_data.drop(['diagnosis'], axis=1)

In [None]:
# Send the feature frame, serialised as JSON, to the deployed Kyma endpoint.
# Replace '<endpoint>' with the URL of your deployment.
result = dwcs.invoke_kyma_endpoint(
    api='<endpoint>',
    payload=X.to_json(),
    content_type='application/json',
)

In [None]:
# Decode the response body to text. The guard keeps this cell idempotent:
# once `result` has been rebound to a str, re-running the cell would
# otherwise raise AttributeError because str has no `.content`.
if not isinstance(result, str):
    result = result.content.decode()

In [None]:
# Display the decoded response text.
result

## Write back to SAP Datasphere

In [None]:
# Column names of the feature frame, for reference when defining the table.
X.columns

In [None]:
# Column dtypes of the feature frame, for reference when choosing SQL types.
X.dtypes

In [None]:
# Create the target table for the write-back: an integer primary key, thirty
# FLOAT(2) measurement columns (ten statistics for each of the mean / se /
# worst groups), an integer column32, and the predicted diagnosis as text.
feature_stats = ('radius', 'texture', 'perimeter', 'area', 'smoothness',
                 'compactness', 'concavity', 'concave_points', 'symmetry',
                 'fractal_dimension')
feature_groups = ('mean', 'se', 'worst')

# Group-major order: all *_mean columns, then *_se, then *_worst.
float_columns_sql = ", ".join(
    stat + "_" + group + " FLOAT(2)"
    for group in feature_groups
    for stat in feature_stats
)
create_table_sql = (
    "CREATE TABLE PCA_Pipeline_Table (ID INTEGER PRIMARY KEY, "
    + float_columns_sql
    + ", column32 INTEGER, diagnosis_predict VARCHAR(100))"
)

db.create_table(create_table_sql)


In [None]:
# Turn the endpoint's response text into a flat list of prediction labels.
# The body is expected to be a JSON array (e.g. '["M", "B"]') — confirm
# against your endpoint. Parsing it as JSON avoids leaving quote characters
# inside each label, does not depend on the exact ", " separator, and maps
# an empty array to an empty list.
try:
    parsed_predictions = json.loads(result)
except ValueError:
    # Not valid JSON (for example a Python-style list with single quotes).
    parsed_predictions = None

if isinstance(parsed_predictions, list):
    # Keep labels as strings, matching the VARCHAR target column.
    res = [str(label) for label in parsed_predictions]
else:
    # Fallback: split manually, trimming whitespace and surrounding quotes.
    res = [
        item.strip().strip('\'"')
        for item in result.strip().strip('][').split(',')
        if item.strip()
    ]
res

In [None]:
# Attach the predictions as a new column; assign() returns a new frame, so X
# itself is left untouched.
dwc_data = X.assign(diagnosis_predict=res)

In [None]:
# Rename the columns positionally to the names used by PCA_Pipeline_Table:
# 'id', then ten statistics for each of the mean / se / worst groups, then
# 'column32' and 'diagnosis_predict' (33 names in total).
measurement_stats = ('radius', 'texture', 'perimeter', 'area', 'smoothness',
                     'compactness', 'concavity', 'concave_points', 'symmetry',
                     'fractal_dimension')
measurement_groups = ('mean', 'se', 'worst')

table_column_names = ['id']
table_column_names += [
    stat + '_' + group
    for group in measurement_groups
    for stat in measurement_stats
]
table_column_names += ['column32', 'diagnosis_predict']

dwc_data.columns = table_column_names

In [None]:
# Cast every column except the first ('id') and the last
# ('diagnosis_predict') to float64 in a single astype call.
# NOTE(review): this range includes 'column32', which the table declares as
# INTEGER — confirm the insert accepts float values there.
dwc_data = dwc_data.astype(
    {column_name: 'float64' for column_name in dwc_data.columns[1:-1]}
)

In [None]:
# Inspect the frame that will be written back.
dwc_data

In [None]:
# Check the dtypes after the float cast, before inserting.
dwc_data.dtypes

In [None]:
# Write the features plus predictions into PCA_Pipeline_Table, the table
# created earlier with db.create_table.
db.insert_into_table('PCA_Pipeline_Table', dwc_data)