In [1]:
##This notebook uses xgboost with grid search

In [2]:
dataset = spark.table("mldata_days_hospitalized")

In [3]:
label = "days_hospitalized"
allColumns = dataset.columns

categoricalColumns = [
  "City",
  "Marital",
  "Race",
  "Ethnicity",
  "Gender",
  "Suffix",
  "County",
#   "State",
  "smoker_status"
]

# numericalColumns = ["num1",
#                     "num2"]

ignoreColumns = ["Patient", "dataset", "State", label]

numericalColumns = list(set(allColumns) - set(categoricalColumns) - set(ignoreColumns))
# print(numericalColumns)

#categoricalColumnsclassVec = [c + "classVec" for c in categoricalColumns]

stages = []

In [4]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import OneHotEncoder, OneHotEncoderEstimator, StringIndexer, VectorAssembler
from pyspark.sql.functions import *

for categoricalColumn in categoricalColumns:
  print(categoricalColumn)
  ## Category Indexing with StringIndexer
  stringIndexer = StringIndexer(inputCol=categoricalColumn, outputCol = categoricalColumn+"Index").setHandleInvalid("skip")
  ## Use OneHotEncoder to convert categorical variables into binary SparseVectors
  #encoder = OneHotEncoder(inputCol=categoricalColumn+"Index", outputCol=categoricalColumn+"classVec")
  ## Add stages
  stages += [stringIndexer]

In [5]:
prepPipeline = Pipeline().setStages(stages)
# pipelineModel = prepPipeline.fit(train)
pipelineModel = prepPipeline.fit(dataset)
dataset = pipelineModel.transform(dataset)

In [6]:
train = dataset.filter(col("dataset") == "train")
train = train.filter(col("days_hospitalized").isNotNull())
training, validation = train.randomSplit([0.8, 0.2], seed = 123)


test = dataset.filter(col("dataset") == "test")

In [7]:
print(training.count())
print(training.select("Patient").distinct().count())
print(validation.count())
print(validation.select("Patient").distinct().count())

In [8]:
target = 'days_hospitalized'
ignoreColumns = ["Patient", "dataset", "State", 'label']
ignoreColumns2 = ["Patient", "dataset", "State", 'label',target]
columnCast=list(set(dataset.columns) - set(categoricalColumns) - set(ignoreColumns))
predictors=list(set(dataset.columns) - set(categoricalColumns) - set(ignoreColumns2))

In [9]:
import pandas as pd

trainCast=training.select(*(col(c).cast("float").alias(c) for c in columnCast)).toPandas()

In [10]:
validationCast=validation.select(*(col(c).cast("float").alias(c) for c in columnCast)).toPandas()

In [11]:
import xgboost as xgb

In [12]:
xgb_params = {
    'learning_rate':    [.03, 0.05, .07],
    'max_depth':        [5, 6,7],
    'min_child_weight': [4,3,2],
    'colsample_bytree': [.3,.5,.7],
    'subsample':       [0.8, 1],
    'n_estimators':     [100],
    'eval_metric': ['auc'],
    'early_stopping_rounds': [10],
    'objective':['reg:squarederror']
}

In [13]:
xgb1=xgb.XGBRegressor()

In [14]:
from sklearn.model_selection import GridSearchCV

xgb_grid = GridSearchCV(xgb1,
                        xgb_params,
                        cv = 2,
                        n_jobs = 5,
                        verbose=False)

In [15]:
train2=trainCast[predictors]
valid=validationCast[predictors]

#y_train = train.target
#y_valid = valid.target
y_train=trainCast.days_hospitalized
y_valid=validationCast.days_hospitalized

In [16]:
xgb_grid.fit(train2,
         y_train)

In [17]:
print(xgb_grid.best_score_)
print(xgb_grid.best_params_)

In [18]:
from sklearn.metrics import accuracy_score, mean_squared_error, r2_score

pred_valid = xgb_grid.predict(valid)
mse_valid = mean_squared_error(y_valid, pred_valid)
r2_valid = r2_score(y_valid, pred_valid)

pred_train = xgb_grid.predict(train2)
mse_train = mean_squared_error(y_train, pred_train)
r2_train = r2_score(y_train, pred_train)

print("Mean Sq. Error:\nValidation:", mse_valid, "\nTrain:", mse_train)
print("R2:\nValidation:", r2_valid, "\nTrain:", r2_train)

In [19]:
import pickle

with open('/dbfs/mnt/data/ml/dayshospitalized/xg/model.pkl', 'wb') as f:
    pickle.dump(xgb_grid, f)

display(dbutils.fs.ls("/mnt/data/ml/dayshospitalized/xg/"))

path,name,size
dbfs:/mnt/data/ml/dayshospitalized/xg/model.pkl,model.pkl,273943
dbfs:/mnt/data/ml/dayshospitalized/xg/placeholder.txt,placeholder.txt,0


In [20]:
xgb_grid.best_estimator_.get_params()

In [21]:
feature_importances = pd.DataFrame({'Feature': valid.columns,
                                   'Importance': xgb_grid.best_estimator_.feature_importances_}).sort_values(by=['Importance'], ascending = False)

feature_importances

Unnamed: 0,Feature,Importance
804,age_years,0.034940
92,QALY_Min,0.014294
537,careplan_736252007,0.013973
669,Healthcare_Coverage,0.013780
563,QALY_Avg,0.012041
516,QALY_Max,0.011030
742,6299-2_StdDev,0.010921
430,condition_92691004,0.010662
800,medication_752899,0.010040
784,2069-3_StdDev,0.008447


In [22]:
from pyspark.ml import Pipeline, PipelineModel
from pyspark.ml.tuning import CrossValidatorModel
from pyspark.sql.functions import *
from pyspark.sql.types import IntegerType, FloatType
## Function to extract probability from array
getprob = udf(lambda v:float(v[1]),FloatType())

## Load in COVID-19 Predictions
cv19test = spark.table("mldata_covid19_status").filter(col("dataset") == "test")

pipelineModel = PipelineModel.load("/mnt/data/ml/covid19status/pipeline")
cv19test = pipelineModel.transform(cv19test)

cvModel = CrossValidatorModel.load("/mnt/data/ml/covid19status/rf")
cv19test = cvModel.bestModel.transform(cv19test)

cv19test = cv19test.select(col("Patient"),         
                           getprob(col("probability")).alias("probability"),
                           col("prediction"))

###########################################################

testCast = test.select(*(col(c).cast("float").alias(c) for c in columnCast)).toPandas()

pred_test = xgb_grid.predict(testCast[predictors])

test_output = test.select("Patient").toPandas()
test_output["prediction"] = pred_test

test_output = spark.createDataFrame(test_output) \
                   .join(cv19test.drop("probability").withColumnRenamed("prediction", "cv_prediction"), on=["Patient"], how="left") \
                   .withColumn("prediction", when(col("cv_prediction") == 0, lit(0)).otherwise(col("prediction"))) \
                   .drop("cv_prediction")

test_output.coalesce(1) \
      .write.format("com.databricks.spark.csv") \
      .option("header", "false") \
      .save("/mnt/data/scored/dayshospitalized/xgboost/")

display(test_output)

Patient,prediction
00f594e1-6b73-40cb-a028-cffe86b12e94,14.556954
01812846-a928-49f5-ad25-494db0eb5205,10.901575
046c66fb-1f52-4f4d-9b6d-2cfa7a2a5605,0.0
04a71615-ff6f-4a9f-a824-126b9173f2f6,14.757245
050e5edf-e422-473c-8f61-2f9ea2049d8d,0.0
05c2dcc7-ef23-4f90-b676-55f227b6b08b,14.726102
06405f69-b67f-451f-aa96-4c686baef513,14.322564
08461ddc-12df-458e-9fe7-2a97e8d325ae,14.453214
090f97f6-8f7a-4fdc-b4e3-48a4fb0c298a,0.0
09a804cc-7874-4626-8d6b-ea3b15bebacf,14.556954
