### OCI Data Science - Useful Tips
<details>
<summary><font size="2">Check for Public Internet Access</font></summary>

```python
import requests
response = requests.get("https://oracle.com")
assert response.status_code==200, "Internet connection failed"
```
</details>
<details>
<summary><font size="2">Helpful Documentation </font></summary>
<ul><li><a href="https://docs.cloud.oracle.com/en-us/iaas/data-science/using/data-science.htm">Data Science Service Documentation</a></li>
<li><a href="https://docs.cloud.oracle.com/iaas/tools/ads-sdk/latest/index.html">ADS documentation</a></li>
</ul>
</details>
<details>
<summary><font size="2">Typical Cell Imports and Settings for ADS</font></summary>

```python
%load_ext autoreload
%autoreload 2
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

import logging
logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.ERROR)

import ads
from ads.dataset.factory import DatasetFactory
from ads.automl.provider import OracleAutoMLProvider
from ads.automl.driver import AutoML
from ads.evaluations.evaluator import ADSEvaluator
from ads.common.data import ADSData
from ads.explanations.explainer import ADSExplainer
from ads.explanations.mlx_global_explainer import MLXGlobalExplainer
from ads.explanations.mlx_local_explainer import MLXLocalExplainer
from ads.catalog.model import ModelCatalog
from ads.common.model_artifact import ModelArtifact
```
</details>
<details>
<summary><font size="2">Useful Environment Variables</font></summary>

```python
import os
print(os.environ["NB_SESSION_COMPARTMENT_OCID"])
print(os.environ["PROJECT_OCID"])
print(os.environ["USER_OCID"])
print(os.environ["TENANCY_OCID"])
print(os.environ["NB_REGION"])
```
</details>

In [1]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

import gzip
import pickle
import logging
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from ads.dataset.factory import DatasetFactory
from ads.automl.provider import OracleAutoMLProvider
from ads.automl.driver import AutoML
from ads.evaluations.evaluator import ADSEvaluator

# Configure seaborn first: sns.set() re-applies seaborn's context and style
# rcParams, so matplotlib overrides written before it can be overwritten
# (font.size in particular is recomputed from font_scale).
# The two back-to-back sns.set() calls are merged into one; the second call
# re-applied every setting of the first anyway.
sns.set(color_codes=True, font_scale=1.5)
sns.set_palette("bright")
sns.set_style("whitegrid")

# Matplotlib overrides go last so that these are the values that stick.
plt.rcParams.update({'figure.figsize': [10, 7], 'font.size': 15})

In [2]:
# Show the kernel's working directory; the relative CSV path used when loading the data resolves against it.
!pwd

/home/datascience/readmission/model_build


In [3]:
# Load the feature CSV; the path is relative to the notebook's working directory.
df = pd.read_csv("../feature_csvs/RAP_DF.csv")

# Optional one-off dump of the column dtypes for offline inspection:
# pd.DataFrame({'Data type': df.dtypes}).to_csv("./tmp.csv")

# Numeric-only view of the frame; comparing its columns with df's columns
# reveals which columns are non-numeric.
numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
newdf = df.select_dtypes(include=numerics)

# A bare expression is only rendered when it is the last statement of a cell,
# so the shape check lives here instead of mid-cell, where its value was discarded.
df.shape

In [4]:
# Columns of df that are absent from the numeric-only view, in their original order.
# Selecting from the column Index directly avoids materialising a sub-DataFrame.
df.columns[~df.columns.isin(newdf.columns)]

Index(['PATIENT', 'READMISSION', 'RACE', 'ETHNICITY', 'GENDER'], dtype='object')

In [5]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,PATIENT,READMISSION,allergy_1191,allergy_5640,allergy_7984,allergy_10831,allergy_25037,allergy_29046,allergy_3718001,allergy_84489001,...,Procedures_763302001,Procedures_866148006,Procedures_868187001,Procedures_112001000119100,Procedures_112011000119102,Procedures_426701000119108,Procedures_428211000124100,Procedures_449381000124108,Procedures_454711000124102,Procedures_16335031000119103
0,0360958b-500a-8b24-07dc-c6ec34186b7e,N,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,56d5a077-96c5-a053-a55c-292484e5fd67,N,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,11a9315b-b71a-d87e-56cf-4ffb471b2523,N,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,223aa945-f14b-6006-7e7e-98ba1fb22cb6,N,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,01318a0b-4cbb-3467-9605-dc8aeb7df935,N,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
# One-hot encode the string-valued columns. drop_first=True drops the first
# level of every encoded column, which is why READMISSION ends up as the single
# indicator column READMISSION_Y.
categorical_cols = ['READMISSION', 'RACE', 'ETHNICITY', 'GENDER']

df2 = pd.get_dummies(
    data=df,
    columns=categorical_cols,
    drop_first=True,
)

In [7]:
# Shape of the one-hot encoded frame (the PATIENT column is still present at this point).
df2.shape

(26596, 1211)

In [8]:
# from sklearn.preprocessing import LabelEncoder, OneHotEncoder
# from sklearn.model_selection import train_test_split
# from sklearn.preprocessing import StandardScaler
# from sklearn.metrics import confusion_matrix, roc_auc_score

In [9]:
# y = df2['READMISSION_Y'].values

# X = df2.drop(['READMISSION_Y','PATIENT'], axis=1)

# # Split data to train and test on 70-30 ratio
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,stratify=y)

In [10]:
# Remove the PATIENT identifier column before modelling. Rebinding the name
# (instead of an in-place drop) together with errors='ignore' keeps this cell
# safe to re-run: the in-place version raised KeyError on a second execution.
df2 = df2.drop(columns=['PATIENT'], errors='ignore')

In [11]:
# Inspect the columns that remain after dropping PATIENT.
df2.columns

Index(['allergy_1191', 'allergy_5640', 'allergy_7984', 'allergy_10831',
       'allergy_25037', 'allergy_29046', 'allergy_3718001', 'allergy_84489001',
       'allergy_102263004', 'allergy_111088007',
       ...
       'Procedures_454711000124102', 'Procedures_16335031000119103',
       'READMISSION_Y', 'RACE_black', 'RACE_hawaiian', 'RACE_native',
       'RACE_other', 'RACE_white', 'ETHNICITY_nonhispanic', 'GENDER_M'],
      dtype='object', length=1210)

In [12]:
# Convert the encoded pandas DataFrame into an ADS dataset, declaring
# READMISSION_Y (the one-hot indicator produced from READMISSION) as the target.
ds = DatasetFactory.from_dataframe(df2,target="READMISSION_Y")

loop1:   0%|          | 0/4 [00:00<?, ?it/s]

In [13]:
%%time
# Hold out 20% of the dataset as `test`; AutoML is later trained on `train` only.
train, test = ds.train_test_split(test_size=0.2)

CPU times: user 713 ms, sys: 101 ms, total: 814 ms
Wall time: 820 ms


In [14]:
%%time
from ads.automl.provider import OracleAutoMLProvider
from ads.automl.driver import AutoML
from ads.dataset.factory import DatasetFactory
## USE ADS for model building
import ads
import logging
import seaborn as sns
from ads.evaluations.evaluator import ADSEvaluator

CPU times: user 27 µs, sys: 0 ns, total: 27 µs
Wall time: 32.7 µs


In [15]:
# Wall-clock marker printed before the AutoML training cell runs.
import datetime
ct = datetime.datetime.now()
print("current time:-", ct)

current time:- 2022-05-05 04:04:29.389344


In [16]:
# Second wall-clock marker (same statements as the previous cell), still before training.
ct = datetime.datetime.now()
print("current time:-", ct)

current time:- 2022-05-05 04:04:29.530431


In [17]:
%%time
# this is the default AutoML provider for regression and classification problem types.
# over time Oracle will introduce other providers for other training tasks.
ml_engine = OracleAutoMLProvider()
oracle_automl = AutoML(train, provider=ml_engine)
automl_model1, baseline1 = oracle_automl.train(time_budget=600)

INFO:xengine:All work stopped
INFO:xengine:All work stopped


0,1
Training Dataset size,"(21276, 1209)"
Validation Dataset size,
CV,5
Target variable,READMISSION_Y
Optimization Metric,neg_log_loss
Initial number of Features,1209
Selected number of Features,445
Selected Features,"Index(['allergy_84489001', 'allergy_256277009', 'allergy_260147004',  'careplan_134435003', 'careplan_170836005', 'careplan_182964004',  'careplan_395082007', 'careplan_412776001', 'careplan_699728000',  'careplan_711282006',  ...  'Procedures_763302001', 'Procedures_866148006',  'Procedures_112011000119102', 'Procedures_428211000124100',  'Procedures_454711000124102', 'Procedures_16335031000119103',  'RACE_black', 'RACE_white', 'ETHNICITY_nonhispanic', 'GENDER_M'],  dtype='object', length=445)"
Selected Algorithm,XGBClassifier
End-to-end Elapsed Time (seconds),645.8849


Rank based on Performance,Algorithm,#Samples,#Features,Mean Validation Score,Hyperparameters,CPU Time,Memory Usage
2,XGBClassifier_AVGRanking_FS,21276,445,-0.0727,"{'booster': 'gbtree', 'learning_rate': 0.1, 'max_depth': 3, 'min_child_weight': 1, 'n_estimators': 100, 'reg_alpha': 0, 'reg_lambda': 1}",93.0637,0.1410
3,XGBClassifier_AVGRanking_FS,21276,148,-0.0728,"{'booster': 'gbtree', 'learning_rate': 0.1, 'max_depth': 3, 'min_child_weight': 1, 'n_estimators': 100, 'reg_alpha': 0, 'reg_lambda': 1}",34.5068,0.0746
4,XGBClassifier_AVGRanking_FS,21276,178,-0.0729,"{'booster': 'gbtree', 'learning_rate': 0.1, 'max_depth': 3, 'min_child_weight': 1, 'n_estimators': 100, 'reg_alpha': 0, 'reg_lambda': 1}",39.0373,0.0069
5,XGBClassifier_AVGRanking_FS,21276,257,-0.0729,"{'booster': 'gbtree', 'learning_rate': 0.1, 'max_depth': 3, 'min_child_weight': 1, 'n_estimators': 100, 'reg_alpha': 0, 'reg_lambda': 1}",55.3526,0.0664
6,XGBClassifier_AVGRanking_FS,21276,123,-0.0730,"{'booster': 'gbtree', 'learning_rate': 0.1, 'max_depth': 3, 'min_child_weight': 1, 'n_estimators': 100, 'reg_alpha': 0, 'reg_lambda': 1}",28.9912,0.0000
...,...,...,...,...,...,...,...
163,XGBClassifier_MIClassification_FS,21276,1,-0.2960,"{'booster': 'gbtree', 'learning_rate': 0.1, 'max_depth': 3, 'min_child_weight': 1, 'n_estimators': 100, 'reg_alpha': 0, 'reg_lambda': 1}",4.0374,0.0746
164,XGBClassifier_AdaBoostClassifier_FS,21276,1,-0.3954,"{'booster': 'gbtree', 'learning_rate': 0.1, 'max_depth': 3, 'min_child_weight': 1, 'n_estimators': 100, 'reg_alpha': 0, 'reg_lambda': 1}",3.2579,0.0000
165,TorchMLPClassifier_AS,5000,1179,-0.4390,"{'activation': 'relu', 'class_weight': None, 'dropout': 0.1, 'l2_reg': 0.0, 'nr_layers': 1, 'nr_neurons': 100, 'optimizer': 'adam'}",136.7920,0.0603
166,DecisionTreeClassifier_AS,5000,1179,-0.6216,"{'class_weight': None, 'max_features': 1.0, 'min_samples_leaf': 0.000625, 'min_samples_split': 0.00125}",3.3190,0.0002


CPU times: user 5min 1s, sys: 1min 33s, total: 6min 35s
Wall time: 10min 50s


In [18]:
# %%time
# n_jobs=-1
# ml_engine = OracleAutoMLProvider(n_jobs=n_jobs,  loglevel=logging.INFO) 
# ml_engine.automl.init(
#     engine='local',
#     engine_opts={'n_jobs': n_jobs, 'model_n_jobs': 1},
#     loglevel=logging.INFO,
# )
# oracle_automl = AutoML(train, provider=ml_engine)
# automl_model2, _ = oracle_automl.train(time_budget=1800)

In [19]:
# Wall-clock marker printed after the AutoML training cell has finished.
ct = datetime.datetime.now()
print("current time:-", ct)

current time:- 2022-05-05 04:15:21.175001


In [20]:
# Display the repr of the estimator returned by AutoML training.
automl_model1

Pipeline(model_list=['DecisionTreeClassifier', 'ExtraTreesClassifier',
                     'GaussianNB', 'LGBMClassifier', 'LogisticRegression',
                     'RandomForestClassifier', 'SVC', 'XGBClassifier',
                     'TorchMLPClassifier'])

In [21]:
# Wall-clock marker printed after inspecting the trained pipeline.
ct = datetime.datetime.now()
print("current time:-", ct)

current time:- 2022-05-05 04:15:56.551419


In [5]:
# Use below code for detailed Automl logging information
# %%time
# n_jobs=12
# ml_engine = OracleAutoMLProvider(n_jobs=n_jobs,  loglevel=logging.INFO) 
# ml_engine.automl.init(
#     engine='local',
#     engine_opts={'n_jobs': n_jobs, 'model_n_jobs': 4},
#     loglevel=logging.INFO,
# )
# oracle_automl = AutoML(train, provider=ml_engine)
# automl_model2, _ = oracle_automl.train()

In [22]:
%%time
import ads 
# Select resource-principal authentication for the ADS calls that follow.
ads.set_auth(auth='resource_principal')

CPU times: user 63 µs, sys: 0 ns, total: 63 µs
Wall time: 90.8 µs


In [24]:
from ads.model.framework.automl_model import AutoMLModel

# Directory that will hold the model artifact files.
# NOTE(review): absolute path tied to this notebook session's home directory.
artifact_dir = "/home/datascience/readmission/Automl_artifacts"

# Wrap the trained AutoML estimator so it can be prepared, verified, saved and deployed.
automl_model = AutoMLModel(
    artifact_dir=artifact_dir,
    estimator=automl_model1,
)

In [25]:
from ads.common.model_metadata import UseCaseType

# Training and inference use the same conda environment.
conda_env = "generalml_p37_cpu_v1"

# Generate the model artifact in artifact_dir; force_overwrite=True replaces
# whatever a previous run left there.
automl_model.prepare(
    inference_conda_env=conda_env,
    training_conda_env=conda_env,
    use_case_type=UseCaseType.BINARY_CLASSIFICATION,
    X_sample=test.X,
    force_overwrite=True,
    training_id=None,
)



In [26]:
# Smoke-test the prepared artifact by scoring the first 10 rows of the test features.
automl_model.verify(test.X.iloc[:10])

INFO:xengine:All work stopped
INFO:xengine:All work stopped
INFO:automl.xengine:Using Single Node XEngine with n_jobs: 1
INFO:automl.xengine:Max timeout per task is set to 1500
INFO:automl.xengine:local xengine initialization: <multiprocessing.pool.Pool object at 0x7fbfa5221110>
Start loading model.pkl from model directory /home/datascience/readmission/Automl_artifacts ...
Model is successfully loaded.
INFO:automl.preprocessing:transform: After dropping constant and mangled columns Index(['diagnosis1_66857006', 'diagnosis1_160701002', 'diagnosis1_840544004',
       'diagnosis2_7200002.0', 'diagnosis2_24079001.0',
       'diagnosis2_840539006.0', 'diagnosis3_431855005.0',
       'diagnosis3_449868002.0', 'diagnosis3_132281000119108.0',
       'Condition_160701002', 'Medications_205923', 'Medications_212446',
       'Medications_238100', 'Medications_309045', 'Medications_311700',
       'Medications_312617', 'Medications_542347', 'Medications_562366',
       'Medications_1234995', 'Medi

{'prediction': [0, 0, 0, 0, 0, 0, 0, 0, 1, 0]}

In [27]:
# Save the prepared model under a human-readable display name; the return value is kept as model_id.
model_id = automl_model.save(display_name='Synthea - Readmission Prediction Model')

INFO:xengine:All work stopped
INFO:xengine:All work stopped
INFO:automl.xengine:Using Single Node XEngine with n_jobs: 1
INFO:automl.xengine:Max timeout per task is set to 1500
INFO:automl.xengine:local xengine initialization: <multiprocessing.pool.Pool object at 0x7fbe4163d510>
Start loading model.pkl from model directory /home/datascience/readmission/Automl_artifacts ...
Model is successfully loaded.
['input_schema.json', 'score.py', 'model.pkl', 'runtime.yaml']


loop1:   0%|          | 0/5 [00:00<?, ?it/s]

artifact:/tmp/saved_model_3510befb-6f0c-4d14-a6b0-88d6f776d6a9.zip


In [28]:
# Deploy the saved model. The display name previously read "Syntea"; it is
# corrected to "Synthea" so it matches the name given to the saved model.
deploy = automl_model.deploy(display_name='Synthea - Readmission Prediction Deployment')

loop1:   0%|          | 0/6 [00:00<?, ?it/s]