In [1]:
##This notebook uses xgboost with grid search

In [2]:
dataset = spark.table("mldata_ventilation_status")

In [3]:
label = "ventilation_status"
allColumns = dataset.columns

categoricalColumns = [
  "City",
  "Marital",
  "Race",
  "Ethnicity",
  "Gender",
  "Suffix",
  "County",
#   "State",
  "smoker_status"
]

# numericalColumns = ["num1",
#                     "num2"]

ignoreColumns = ["Patient", "dataset", "State", label]

numericalColumns = list(set(allColumns) - set(categoricalColumns) - set(ignoreColumns))
# print(numericalColumns)

#categoricalColumnsclassVec = [c + "classVec" for c in categoricalColumns]

stages = []

In [4]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import OneHotEncoder, OneHotEncoderEstimator, StringIndexer, VectorAssembler
from pyspark.sql.functions import *

for categoricalColumn in categoricalColumns:
  print(categoricalColumn)
  ## Category Indexing with StringIndexer
  stringIndexer = StringIndexer(inputCol=categoricalColumn, outputCol = categoricalColumn+"Index").setHandleInvalid("skip")
  ## Use OneHotEncoder to convert categorical variables into binary SparseVectors
  #encoder = OneHotEncoder(inputCol=categoricalColumn+"Index", outputCol=categoricalColumn+"classVec")
  ## Add stages
  stages += [stringIndexer]

In [5]:
prepPipeline = Pipeline().setStages(stages)
# pipelineModel = prepPipeline.fit(train)
pipelineModel = prepPipeline.fit(dataset)
dataset = pipelineModel.transform(dataset)

In [6]:
train = dataset.filter(col("dataset") == "train")
train = train.filter(col("ventilation_status").isNotNull())
training, validation = train.randomSplit([0.8, 0.2], seed = 123)


test = dataset.filter(col("dataset") == "test")

In [7]:
print(training.count())
print(training.select("Patient").distinct().count())
print(validation.count())
print(validation.select("Patient").distinct().count())

In [8]:
target = 'ventilation_status'
ignoreColumns = ["Patient", "dataset", "State", 'label']
ignoreColumns2 = ["Patient", "dataset", "State", 'label',target]
columnCast=list(set(dataset.columns) - set(categoricalColumns) - set(ignoreColumns))
predictors=list(set(dataset.columns) - set(categoricalColumns) - set(ignoreColumns2))

In [9]:
import pandas as pd

trainCast=training.select(*(col(c).cast("float").alias(c) for c in columnCast)).toPandas()

In [10]:
validationCast=validation.select(*(col(c).cast("float").alias(c) for c in columnCast)).toPandas()

In [11]:
import xgboost as xgb

In [12]:
xgb_params = {
    'learning_rate':    [.03, 0.05, .07],
    'max_depth':        [5, 6,7],
    'min_child_weight': [4,3,2],
    'colsample_bytree': [.3,.5,.7],
    'subsample':       [0.8, 1],
    'n_estimators':     [100],
    'eval_metric': ['auc'],
    'early_stopping_rounds': [10],
    'objective':['binary:logistic']
}

In [13]:
xgb1=xgb.XGBClassifier()

In [14]:
from sklearn.model_selection import GridSearchCV

xgb_grid = GridSearchCV(xgb1,
                        xgb_params,
                        cv = 2,
                        n_jobs = 5,
                        verbose=False)

In [15]:
train2=trainCast[predictors]
valid=validationCast[predictors]

#y_train = train.target
#y_valid = valid.target
y_train=trainCast.ventilation_status
y_valid=validationCast.ventilation_status

In [16]:
xgb_grid.fit(train2,
         y_train)

In [17]:
print(xgb_grid.best_score_)
print(xgb_grid.best_params_)

In [18]:
from sklearn.metrics import roc_auc_score

pred_valid=xgb_grid.predict_proba(valid)[:,1]
auc_valid=roc_auc_score(y_valid, pred_valid)

In [19]:
pred_train=xgb_grid.predict_proba(train2)[:,1]
auc_train=roc_auc_score(y_train, pred_train)

In [20]:
from sklearn.metrics import average_precision_score

pr_valid=average_precision_score(y_valid, pred_valid)
pr_train=average_precision_score(y_train, pred_train)

In [21]:
print(auc_train)
print(auc_valid)

In [22]:
print(pr_train)
print(pr_valid)

In [23]:
import pickle

with open('/dbfs/mnt/data/ml/ventilationstatus/xg/model.pkl', 'wb') as f:
    pickle.dump(xgb_grid, f)

display(dbutils.fs.ls("/mnt/data/ml/ventilationstatus/xg/"))

path,name,size
dbfs:/mnt/data/ml/ventilationstatus/xg/model.pkl,model.pkl,250683
dbfs:/mnt/data/ml/ventilationstatus/xg/placeholder.txt,placeholder.txt,0


In [24]:
xgb_grid.best_estimator_.get_params()

In [25]:
feature_importances = pd.DataFrame({'Feature': valid.columns,
                                   'Importance': xgb_grid.best_estimator_.feature_importances_}).sort_values(by=['Importance'], ascending = False)

feature_importances

Unnamed: 0,Feature,Importance
352,6298-4_Min,0.025339
804,age_years,0.022874
189,Revenue,0.018139
92,QALY_Min,0.017974
277,condition_126906006,0.016433
516,QALY_Max,0.013441
158,procedure_265764009,0.012892
483,condition_424132000,0.011206
681,condition_64859006,0.010236
430,condition_92691004,0.010151


In [26]:
from pyspark.ml import Pipeline, PipelineModel
from pyspark.ml.tuning import CrossValidatorModel
from pyspark.sql.functions import *
from pyspark.sql.types import IntegerType, FloatType
## Function to extract probability from array
getprob = udf(lambda v:float(v[1]),FloatType())

## Load in COVID-19 Predictions
cv19test = spark.table("mldata_covid19_status").filter(col("dataset") == "test")

pipelineModel = PipelineModel.load("/mnt/data/ml/covid19status/pipeline")
cv19test = pipelineModel.transform(cv19test)

cvModel = CrossValidatorModel.load("/mnt/data/ml/covid19status/rf")
cv19test = cvModel.bestModel.transform(cv19test)

cv19test = cv19test.select(col("Patient"),         
                           getprob(col("probability")).alias("probability"),
                           col("prediction"))

###########################################################

testCast = test.select(*(col(c).cast("float").alias(c) for c in columnCast)).toPandas()

pred_test = xgb_grid.predict_proba(testCast[predictors])[:,1]

test_output = test.select("Patient").toPandas()
test_output["probability"] = pred_test

test_output = spark.createDataFrame(test_output) \
                   .join(cv19test.drop("probability").withColumnRenamed("prediction", "cv_prediction"), on=["Patient"], how="left") \
                   .withColumn("probability", when(col("cv_prediction") == 0, lit(0)).otherwise(col("probability"))) \
                   .drop("cv_prediction")

test_output.coalesce(1) \
      .write.format("com.databricks.spark.csv") \
      .option("header", "false") \
      .save("/mnt/data/scored/ventilationstatus/xgboost/")

display(test_output)

Patient,probability
046495b9-e1cd-47df-bc24-995e9807248f,0.009627958
076ba84e-6fd5-4024-92cc-28c3f10e42cc,0.0
08170598-af43-40c5-b1f6-f9e2a61261b0,0.0
08b13997-1dd1-4447-9da7-8e831ec12116,0.00778181
09f29ef9-559d-4d7e-abb2-b112fe29a441,0.0324391
0a18e6ae-4b9d-4ef9-9906-cdb803c7ca44,0.0
0deb965a-447e-4d2e-a751-ef1bda118122,0.0
1010bd6b-5789-4165-9a1e-e9be0db8373e,0.106460154
123bf5e8-493e-487e-9e99-03f5858ddeca,0.0
125885b6-2f5e-4ee3-ac59-d6b88afb1e59,0.0
