### OCI Data Science - Useful Tips
<details>
<summary><font size="2">Check for Public Internet Access</font></summary>

```python
import requests
response = requests.get("https://oracle.com")
assert response.status_code==200, "Internet connection failed"
```
</details>
<details>
<summary><font size="2">Helpful Documentation </font></summary>
<ul><li><a href="https://docs.cloud.oracle.com/en-us/iaas/data-science/using/data-science.htm">Data Science Service Documentation</a></li>
<li><a href="https://docs.cloud.oracle.com/iaas/tools/ads-sdk/latest/index.html">ADS documentation</a></li>
</ul>
</details>
<details>
<summary><font size="2">Typical Cell Imports and Settings for ADS</font></summary>

```python
%load_ext autoreload
%autoreload 2
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

import logging
logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.ERROR)

import ads
from ads.dataset.factory import DatasetFactory
from ads.automl.provider import OracleAutoMLProvider
from ads.automl.driver import AutoML
from ads.evaluations.evaluator import ADSEvaluator
from ads.common.data import ADSData
from ads.explanations.explainer import ADSExplainer
from ads.explanations.mlx_global_explainer import MLXGlobalExplainer
from ads.explanations.mlx_local_explainer import MLXLocalExplainer
from ads.catalog.model import ModelCatalog
from ads.common.model_artifact import ModelArtifact
```
</details>
<details>
<summary><font size="2">Useful Environment Variables</font></summary>

```python
import os
print(os.environ["NB_SESSION_COMPARTMENT_OCID"])
print(os.environ["PROJECT_OCID"])
print(os.environ["USER_OCID"])
print(os.environ["TENANCY_OCID"])
print(os.environ["NB_REGION"])
```
</details>

In [1]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

import gzip
import pickle
import logging
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from ads.dataset.factory import DatasetFactory
from ads.automl.provider import OracleAutoMLProvider
from ads.automl.driver import AutoML
from ads.evaluations.evaluator import ADSEvaluator

# Apply the seaborn theme before touching matplotlib's rcParams directly:
# sns.set() rewrites rcParams (font sizes included), so an explicit
# plt.rcParams override only survives when it is assigned afterwards.
# One sns.set() call is enough -- every call re-applies the whole theme, so a
# color_codes-only call followed by a font_scale call collapses into this one.
sns.set(color_codes=True, font_scale=1.5)
sns.set_palette("bright")
sns.set_style("whitegrid")

# Notebook-wide figure defaults, assigned last so the theme cannot clobber them.
plt.rcParams['figure.figsize'] = [10, 7]
plt.rcParams['font.size'] = 15

In [2]:
# Show the current working directory; the relative CSV path read later in
# this notebook is resolved against it.
!pwd

/home/datascience/readmission/model_build


In [3]:
# Load the feature table from its CSV export.
df = pd.read_csv("../feature_csvs/RAP_DF.csv")

# Integer and floating-point dtype names (16/32/64-bit), used to pick out the
# numeric columns of the frame.
numerics = [f"{kind}{bits}" for kind in ("int", "float") for bits in (16, 32, 64)]
newdf = df.select_dtypes(include=numerics)

In [4]:
# List the columns of df that the numeric-dtype selection left out of newdf,
# keeping df's original column order.
df.columns[~df.columns.isin(newdf.columns)]

Index(['PATIENT', 'READMISSION', 'RACE', 'ETHNICITY', 'GENDER'], dtype='object')

In [5]:
# Preview the first rows of the raw feature table.
df.head()

Unnamed: 0,PATIENT,READMISSION,allergy_1191,allergy_5640,allergy_7984,allergy_10831,allergy_25037,allergy_29046,allergy_3718001,allergy_84489001,...,Procedures_763302001,Procedures_866148006,Procedures_868187001,Procedures_112001000119100,Procedures_112011000119102,Procedures_426701000119108,Procedures_428211000124100,Procedures_449381000124108,Procedures_454711000124102,Procedures_16335031000119103
0,0360958b-500a-8b24-07dc-c6ec34186b7e,N,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,56d5a077-96c5-a053-a55c-292484e5fd67,N,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,11a9315b-b71a-d87e-56cf-4ffb471b2523,N,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,223aa945-f14b-6006-7e7e-98ba1fb22cb6,N,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,01318a0b-4cbb-3467-9605-dc8aeb7df935,N,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
# Columns to one-hot encode. READMISSION is among them, so the frame gains a
# READMISSION_* indicator column in place of the original text column.
categorical_cols = ['READMISSION', 'RACE', 'ETHNICITY', 'GENDER']

# drop_first=True omits the first level of every encoded column, leaving
# k-1 indicator columns for a column with k categories.
df2 = pd.get_dummies(
    df,
    columns=categorical_cols,
    drop_first=True,
)

In [7]:
# Display (rows, columns) of the one-hot-encoded frame.
df2.shape

(26596, 1211)

In [8]:
# from sklearn.preprocessing import LabelEncoder, OneHotEncoder
# from sklearn.model_selection import train_test_split
# from sklearn.preprocessing import StandardScaler
# from sklearn.metrics import confusion_matrix, roc_auc_score

In [9]:
# y = df2['READMISSION_Y'].values

# X = df2.drop(['READMISSION_Y','PATIENT'], axis=1)

# # Split data to train and test on 70-30 ratio
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,stratify=y)

In [8]:
# Remove the PATIENT identifier column from the encoded frame.
# Rebinding the name (instead of inplace=True) together with errors='ignore'
# keeps this cell idempotent: running it a second time no longer raises
# KeyError once the column is already gone.
df2 = df2.drop(columns=['PATIENT'], errors='ignore')

In [9]:
# Inspect the column labels that remain after encoding and dropping PATIENT.
df2.columns

Index(['allergy_1191', 'allergy_5640', 'allergy_7984', 'allergy_10831',
       'allergy_25037', 'allergy_29046', 'allergy_3718001', 'allergy_84489001',
       'allergy_102263004', 'allergy_111088007',
       ...
       'Procedures_454711000124102', 'Procedures_16335031000119103',
       'READMISSION_Y', 'RACE_black', 'RACE_hawaiian', 'RACE_native',
       'RACE_other', 'RACE_white', 'ETHNICITY_nonhispanic', 'GENDER_M'],
      dtype='object', length=1210)

In [10]:
# Wrap the pandas frame in an ADS dataset, with READMISSION_Y as the target column.
ds = DatasetFactory.from_dataframe(df2, target="READMISSION_Y")

loop1:   0%|          | 0/4 [00:00<?, ?it/s]

In [11]:
%%time
# Split the ADS dataset into a training part and a test part (test_size=0.2).
train, test = ds.train_test_split(test_size=0.2)

CPU times: user 1.12 s, sys: 968 ms, total: 2.09 s
Wall time: 2.5 s


In [12]:
%%time
from ads.automl.provider import OracleAutoMLProvider
from ads.automl.driver import AutoML
from ads.dataset.factory import DatasetFactory
## USE ADS for model building
import ads
import logging
import seaborn as sns
from ads.evaluations.evaluator import ADSEvaluator

CPU times: user 40 µs, sys: 4 µs, total: 44 µs
Wall time: 50.5 µs


In [13]:
import datetime

# Timestamp checkpoint: capture the current time in ct and print it.
ct = datetime.datetime.now()
print(f"current time:- {ct}")

current time:- 2022-09-27 18:32:30.514795


In [14]:
# Timestamp checkpoint: capture the current time in ct and print it.
ct = datetime.datetime.now()
print(f"current time:- {ct}")

current time:- 2022-09-27 18:32:37.211942


In [15]:
%%time
# this is the default AutoML provider for regression and classification problem types.
# over time Oracle will introduce other providers for other training tasks.
ml_engine = OracleAutoMLProvider()
# Bind the training split to the provider created above.
oracle_automl = AutoML(train, provider=ml_engine)
# train() hands back two objects, kept here as the tuned model and a baseline.
# NOTE(review): time_budget=600 is presumably a limit in seconds -- confirm against the ADS docs.
automl_model1, baseline1 = oracle_automl.train(time_budget=600)

       'allergy_25037', 'allergy_29046', 'allergy_3718001', 'allergy_84489001',
       'allergy_102263004', 'allergy_111088007',
       ...
       'Procedures_449381000124108', 'Procedures_454711000124102',
       'Procedures_16335031000119103', 'RACE_black', 'RACE_hawaiian',
       'RACE_native', 'RACE_other', 'RACE_white', 'ETHNICITY_nonhispanic',
       'GENDER_M'],
      dtype='object', length=1173)
INFO:xengine:All work stopped
INFO:xengine:All work stopped
ERROR:ads.common:Unfortunately, there were no trials found, so we cannot visualize it.
CPU times: user 1min 7s, sys: 10.7 s, total: 1min 17s
Wall time: 11min 40s


In [27]:
# Show the framework, estimator class and parameters of the trained model.
automl_model1.summary()

Framework: lightgbm.sklearn
Estimator class: LGBMClassifier
Model Parameters: {'boosting_type': 'gbdt', 'class_weight': 'balanced', 'colsample_bytree': 1.0, 'importance_type': 'split', 'learning_rate': 0.1, 'max_depth': -1, 'min_child_samples': 20, 'min_child_weight': 0.001, 'min_split_gain': 0.0, 'n_estimators': 100, 'n_jobs': 2, 'num_leaves': 31, 'objective': None, 'random_state': 7, 'reg_alpha': 0, 'reg_lambda': 1, 'silent': 'warn', 'subsample': 1.0, 'subsample_for_bin': 200000, 'subsample_freq': 0}



In [17]:
# Timestamp checkpoint: capture the current time in ct and print it.
ct = datetime.datetime.now()
print(f"current time:- {ct}")

current time:- 2022-09-27 18:44:22.179427


In [19]:
# Timestamp checkpoint: capture the current time in ct and print it.
ct = datetime.datetime.now()
print(f"current time:- {ct}")

current time:- 2022-09-27 18:45:14.793762


In [20]:
%%time
import ads 
ads.set_auth(auth='resource_principal')

CPU times: user 98 µs, sys: 0 ns, total: 98 µs
Wall time: 117 µs


In [21]:
from ads.model.framework.automl_model import AutoMLModel

# Wrap the trained estimator in an ADS model object tied to artifact_dir.
artifact_dir = "/home/datascience/readmission/Automl_artifacts"
automl_model = AutoMLModel(
    estimator=automl_model1,
    artifact_dir=artifact_dir,
)

In [22]:
from ads.common.model_metadata import UseCaseType

# Prepare the model artifact for a binary-classification use case, using the
# same conda environment for training and inference and the test features as
# the input sample. force_overwrite=True is passed so an existing artifact
# directory does not block the call.
automl_model.prepare(
    inference_conda_env="generalml_p37_cpu_v1",
    training_conda_env="generalml_p37_cpu_v1",
    use_case_type=UseCaseType.BINARY_CLASSIFICATION,
    X_sample=test.X,
    force_overwrite=True,
    training_id=None,
)



In [26]:
# Run the prepared artifact locally on the first ten rows of the test features.
automl_model.verify(test.X.head(10))

INFO:xengine:All work stopped
INFO:xengine:All work stopped
INFO:automl.xengine:Using Single Node XEngine with n_jobs: 1
INFO:automl.xengine:Max timeout per task is set to 1500
INFO:automl.xengine:local xengine initialization: <multiprocessing.pool.Pool object at 0x7fbfa5221110>
Start loading model.pkl from model directory /home/datascience/readmission/Automl_artifacts ...
Model is successfully loaded.
INFO:automl.preprocessing:transform: After dropping constant and mangled columns Index(['diagnosis1_66857006', 'diagnosis1_160701002', 'diagnosis1_840544004',
       'diagnosis2_7200002.0', 'diagnosis2_24079001.0',
       'diagnosis2_840539006.0', 'diagnosis3_431855005.0',
       'diagnosis3_449868002.0', 'diagnosis3_132281000119108.0',
       'Condition_160701002', 'Medications_205923', 'Medications_212446',
       'Medications_238100', 'Medications_309045', 'Medications_311700',
       'Medications_312617', 'Medications_542347', 'Medications_562366',
       'Medications_1234995', 'Medi

{'prediction': [0, 0, 0, 0, 0, 0, 0, 0, 1, 0]}

In [23]:
# Save the prepared model under the given display name; the value returned by
# save() is kept as model_id.
model_id = automl_model.save(display_name='Synthea - Readmission Prediction Model')

INFO:xengine:All work stopped
INFO:xengine:All work stopped
INFO:automl.xengine:Using Single Node XEngine with n_jobs: 1
INFO:automl.xengine:Max timeout per task is set to 1500
INFO:automl.xengine:local xengine initialization: <multiprocessing.pool.Pool object at 0x7f4e1959fe10>
Start loading model.pkl from model directory /home/datascience/readmission/Automl_artifacts ...
Model is successfully loaded.
['input_schema.json', 'test_json_output.json', 'score.py', 'model.pkl', 'runtime.yaml']


loop1:   0%|          | 0/5 [00:00<?, ?it/s]

artifact:/tmp/saved_model_77a59e4c-9855-4302-a25f-3cffb754bf47.zip


In [24]:
# Deploy the model. The display name uses the same 'Synthea' spelling as the
# display name given to the saved model, so the two resources are easy to match.
deploy = automl_model.deploy(display_name='Synthea - Readmission Prediction Deployment')

loop1:   0%|          | 0/6 [00:00<?, ?it/s]