In [1]:
import h2o
import pandas as pd
from h2o.estimators import H2OGradientBoostingEstimator


def drop_columns(df, lst):
    """
    Return a copy of a Pandas DataFrame without the specified columns.

    The input DataFrame is left untouched; the result is a new object.

    :param df: DataFrame to take the columns from (not modified)
    :param lst: A list of strings, denoting columns in the DataFrame
    :return: New DataFrame, with specified columns removed.
    :raises KeyError: if any name in ``lst`` is not a column of ``df``
    """
    # No ``inplace=True``: mutating the caller's frame contradicted the
    # documented "new DataFrame" contract and surprised callers that kept
    # a reference to the original.
    return df.drop(lst, axis=1)


def get_training_columns(df, target):
    """
    Build the list of predictor column names for training.

    :param df: frame exposing a ``columns`` attribute (an H2OFrame in this script)
    :param target: name of the target column to leave out
    :return: list of all column names in ``df`` other than ``target``, in original order
    """
    predictors = []
    for name in df.columns:
        # Keep every column except the one being predicted.
        if name != target:
            predictors.append(name)
    return predictors


if __name__ == '__main__':
    # Script flow: load the CSV, trim it to the modelling columns, clean the
    # age and label columns, then train a multinomial GBM on an H2O cluster.

    # The file's own header row is skipped (skiprows=1) and replaced by these
    # names.
    column_names = [
        'ImageIndex', 'FindingLabels', 'Follow-up',
        'PatientID', 'PatientAge', 'PatientGender',
        'ViewPosition', 'OriginalImage[Width',
        'Height]',
        'OriginalImagePixelSpacing[x',
        'y]',
        'Unnamed',
    ]
    data = pd.read_csv("../Data_Entry_2017.csv", skiprows=1, names=column_names, low_memory=False)

    # Keep only the columns used for modelling.
    data = drop_columns(data, ['Follow-up', 'Unnamed', 'OriginalImage[Width',
                               'Height]', 'ImageIndex', 'PatientID',
                               'OriginalImagePixelSpacing[x', 'y]'])

    # Remove any non-digit character in Patient Age (e.g. a trailing unit
    # letter such as "058Y") and convert to integer. The previous
    # str -> int round trip removed nothing and failed on such values.
    # NOTE(review): the unit itself is discarded - confirm all ages are in years.
    data['PatientAge'] = (
        data['PatientAge']
        .astype(str)
        .str.extract(r'(\d+)', expand=False)
        .astype(int)
    )

    # Remove all categories right of the first '|' delimiter in labels
    # Reduces categories from 709 to 15
    # The vectorised .str accessor propagates missing labels as NaN instead
    # of raising AttributeError.
    data['FindingLabels'] = data['FindingLabels'].str.split('|').str[0]

    h2o.init()

    data = h2o.H2OFrame(data)

    # A multinomial model needs a categorical response; make that explicit
    # rather than relying on H2O's column type inference.
    data['FindingLabels'] = data['FindingLabels'].asfactor()

    # 60% train / 20% validation / remaining 20% test; fixed seed for
    # reproducible splits.
    train, valid, test = data.split_frame(ratios=[0.6, 0.2], seed=8)

    training_columns = get_training_columns(train, "FindingLabels")

    dis_gbm = H2OGradientBoostingEstimator(
        ntrees=1000,
        distribution='multinomial',
        max_depth=2,
        learn_rate=0.001,
        balance_classes=True,
        # stopping_metric="logloss",
    )

    dis_gbm.train(x=training_columns, y='FindingLabels', training_frame=train, validation_frame=valid)

    # Optional follow-ups, left disabled as in the original script:
    # dis_gbm.auc(valid=True)
    # performance = dis_gbm.model_performance(test_data=test)

    # h2o.cluster().shutdown()


Checking whether there is an H2O instance running at http://localhost:54321 ..... not found.
Attempting to start a local H2O server...
; Java HotSpot(TM) 64-Bit Server VM (build 25.171-b11, mixed mode)
  Starting server from C:\Users\PrashantAgrawal\Anaconda3\lib\site-packages\h2o\backend\bin\h2o.jar
  Ice root: C:\Users\PRASHA~1\AppData\Local\Temp\tmpeg5es335
  JVM stdout: C:\Users\PRASHA~1\AppData\Local\Temp\tmpeg5es335\h2o_PrashantAgrawal_started_from_python.out
  JVM stderr: C:\Users\PRASHA~1\AppData\Local\Temp\tmpeg5es335\h2o_PrashantAgrawal_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.


0,1
H2O cluster uptime:,06 secs
H2O cluster timezone:,Asia/Kolkata
H2O data parsing timezone:,UTC
H2O cluster version:,3.26.0.10
H2O cluster version age:,24 days
H2O cluster name:,H2O_from_python_PrashantAgrawal_jud9na
H2O cluster total nodes:,1
H2O cluster free memory:,1.754 Gb
H2O cluster total cores:,4
H2O cluster allowed cores:,4


Parse progress: |█████████████████████████████████████████████████████████| 100%
gbm Model Build progress: |███████████████████████████████████████████████| 100%
