An implementation for porting to other platforms and discussion (this is not to do exploratory analysis but rather to consider the APIs and technologies involved - it is not intended to be a good or reference solution to this problem). 

In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import numpy as np

Obtain the data from Google Cloud Storage buckets

In [2]:
! wget https://storage.googleapis.com/bdt-spark-store/external_sources.csv -O gcs_external_sources.csv

--2020-11-29 09:03:38--  https://storage.googleapis.com/bdt-spark-store/external_sources.csv
Resolving storage.googleapis.com (storage.googleapis.com)... 216.58.223.144, 172.217.170.16
Connecting to storage.googleapis.com (storage.googleapis.com)|216.58.223.144|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 15503836 (15M) [text/csv]
Saving to: ‘gcs_external_sources.csv’


2020-11-29 09:03:46 (2.28 MB/s) - ‘gcs_external_sources.csv’ saved [15503836/15503836]



In [3]:
! wget https://storage.googleapis.com/bdt-spark-store/internal_data.csv -O gcs_internal_data.csv

--2020-11-29 09:03:50--  https://storage.googleapis.com/bdt-spark-store/internal_data.csv
Resolving storage.googleapis.com (storage.googleapis.com)... 216.58.223.144, 172.217.170.16
Connecting to storage.googleapis.com (storage.googleapis.com)|216.58.223.144|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 152978396 (146M) [text/csv]
Saving to: ‘gcs_internal_data.csv’


2020-11-29 09:04:52 (2.39 MB/s) - ‘gcs_internal_data.csv’ saved [152978396/152978396]



Read in data sources

In [2]:
from pyspark.sql import SparkSession
# import pandas as pd 
spark = SparkSession.builder.appName('panda-and-spark').getOrCreate()

In [3]:
# df_data = pd.read_csv('gcs_internal_data.csv')
df_data = spark.read.format("csv").option("header","true").load("gcs_internal_data.csv")
# df_ext = pd.read_csv('gcs_external_sources.csv')
df_ext = spark.read.format("csv").option("header","true").load("gcs_external_sources.csv")


Join them on their common identifier key

In [4]:
df_full = df_data.join(df_ext, df_ext.SK_ID_CURR == df_data.SK_ID_CURR)
# df_full.show(1)

We will filter a few features out for the sake of this example

In [5]:
columns_extract = ['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3',
                  'DAYS_BIRTH', 'DAYS_EMPLOYED', 'NAME_EDUCATION_TYPE',
                  'DAYS_ID_PUBLISH', 'CODE_GENDER', 'AMT_ANNUITY',
                  'DAYS_REGISTRATION', 'AMT_GOODS_PRICE', 'AMT_CREDIT',
                  'ORGANIZATION_TYPE', 'DAYS_LAST_PHONE_CHANGE',
                  'NAME_INCOME_TYPE', 'AMT_INCOME_TOTAL', 'OWN_CAR_AGE', 'TARGET']
categorical_columns= ['NAME_INCOME_TYPE','ORGANIZATION_TYPE','CODE_GENDER','NAME_EDUCATION_TYPE']
non_cate_cols= [i for i in columns_extract if i not in categorical_columns ]

df = df_full[columns_extract]

Let's obtain a train and test split

In [6]:
train, test = df.randomSplit([0.8,0.2],seed=101)

In [7]:
df.count()-test.count()

246057

In [8]:
zz=train[['TARGET']].groupby('TARGET').count()
zz.withColumn('Value_split_train',zz['count']/train.count()).show()

+------+------+-------------------+
|TARGET| count|  Value_split_train|
+------+------+-------------------+
|     0|226229| 0.9194170456438955|
|     1| 19828|0.08058295435610448|
+------+------+-------------------+



In [9]:
zz1=test[['TARGET']].groupby('TARGET').count()
zz1.withColumn('Value_split_test',zz1['count']/test.count()).show()

+------+-----+-------------------+
|TARGET|count|   Value_split_test|
+------+-----+-------------------+
|     0|56457|  0.918687148110782|
|     1| 4997|0.08131285188921795|
+------+-----+-------------------+



Handle the categorical variables

In [10]:
# One hot encode the train set 
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler

# The index of string vlaues multiple columns
indexers = [
    StringIndexer(inputCol=c, outputCol="{0}_indexed".format(c))
    for c in categorical_columns
]

# The encode of indexed vlaues multiple columns
encoders = [OneHotEncoder(dropLast=False,inputCol=indexer.getOutputCol(),
            outputCol="{0}_encoded".format(indexer.getOutputCol())) 
    for indexer in indexers
]

# Vectorizing encoded values
assembler = VectorAssembler(inputCols=[encoder.getOutputCol() for encoder in encoders],outputCol="features")

pipeline = Pipeline(stages=indexers + encoders)
model=pipeline.fit(train)
transformed = model.transform(train)
# transformed.show(5)

actualCol = transformed.columns
newCols=[]
#Get c and its repective index. One hot encoder will put those on same index in vector
for i in categorical_columns:
    colIdx = transformed.select(i,i+"_indexed").distinct().rdd.collectAsMap()
    colIdx =  sorted((value, i+"_" + key) for (key, value) in colIdx.items())
    newCols += list(map(lambda x: x[1], colIdx))
#     allColNames += newCols
allColNames = actualCol+newCols

def extract(row):
    return tuple(map(lambda x: row[x], row.__fields__)) + tuple(row.NAME_INCOME_TYPE_indexed_encoded.toArray().tolist())+ tuple(row.ORGANIZATION_TYPE_indexed_encoded.toArray().tolist())+ tuple(row.CODE_GENDER_indexed_encoded.toArray().tolist())+ tuple(row.NAME_EDUCATION_TYPE_indexed_encoded.toArray().tolist())

result = transformed.rdd.map(extract).toDF(allColNames,sampleRatio=0.2)
for col in newCols:
    result = result.withColumn(col, result[col].cast("int"))
final_df_cols=non_cate_cols+newCols
res_train=result[final_df_cols]
# res2.show(1)

In [11]:
# One hot encode the test set 
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler

# The index of string vlaues multiple columns
indexers = [
    StringIndexer(inputCol=c, outputCol="{0}_indexed".format(c))
    for c in categorical_columns
]

# The encode of indexed vlaues multiple columns
encoders = [OneHotEncoder(dropLast=False,inputCol=indexer.getOutputCol(),
            outputCol="{0}_encoded".format(indexer.getOutputCol())) 
    for indexer in indexers
]

# Vectorizing encoded values
assembler = VectorAssembler(inputCols=[encoder.getOutputCol() for encoder in encoders],outputCol="features")

pipeline = Pipeline(stages=indexers + encoders)
model=pipeline.fit(test)
transformed = model.transform(test)
# transformed.show(5)

actualCol = transformed.columns
newCols=[]
#Get c and its repective index. One hot encoder will put those on same index in vector
for i in categorical_columns:
    colIdx = transformed.select(i,i+"_indexed").distinct().rdd.collectAsMap()
    colIdx =  sorted((value, i+"_" + key) for (key, value) in colIdx.items())
    newCols += list(map(lambda x: x[1], colIdx))
#     allColNames += newCols
allColNames = actualCol+newCols

def extract(row):
    return tuple(map(lambda x: row[x], row.__fields__)) + tuple(row.NAME_INCOME_TYPE_indexed_encoded.toArray().tolist())+ tuple(row.ORGANIZATION_TYPE_indexed_encoded.toArray().tolist())+ tuple(row.CODE_GENDER_indexed_encoded.toArray().tolist())+ tuple(row.NAME_EDUCATION_TYPE_indexed_encoded.toArray().tolist())

result = transformed.rdd.map(extract).toDF(allColNames,sampleRatio=0.2)
for col in newCols:
    result = result.withColumn(col, result[col].cast("int"))
final_df_cols=non_cate_cols+newCols
res_test=result[final_df_cols]

# res2.show(1)

In [12]:
print(("test",res_test.count(), len(res_test.columns)))
print(("train",res_train.count(), len(res_train.columns)))

('test', 61454, 88)
('train', 246057, 88)


Align the training and test data (as the test data may not have the same columns in the encoding)

In [45]:
final_features=list(set(res_train.schema.names).intersection(res_train.schema.names))
train=res_train[final_features]
test=res_test[final_features]

print(("test",test.count(), len(test.columns)))
print(("train",train.count(), len(train.columns)))

('test', 61454, 88)
('train', 246057, 88)


Get labels from data

In [46]:
train_labels = train['TARGET']
test_labels = test['TARGET']

Fill in missing data and scale

In [47]:
# # Drop the target from the training data -- cannot implement otherwise the random forest will not work
# if 'TARGET' in train.schema.names:
#     train = train.drop(*['TARGET'])
#     test = test.drop(*['TARGET'])

    
# # Feature names
# features = list(train.schema.names)



In [48]:
# [item[0] for item in train.dtypes if item[1].startswith('string') ]

for col in ['EXT_SOURCE_3',
 'OWN_CAR_AGE',
 'AMT_ANNUITY',
 'AMT_CREDIT',
 'DAYS_BIRTH',
 'DAYS_ID_PUBLISH',
 'DAYS_REGISTRATION',
 'DAYS_EMPLOYED',
 'EXT_SOURCE_1',
 'AMT_GOODS_PRICE',
 'DAYS_LAST_PHONE_CHANGE',
 'AMT_INCOME_TOTAL',
 'EXT_SOURCE_2','TARGET']:
    test = test.withColumn(col, test[col].cast("int"))
    train = train.withColumn(col, train[col].cast("int"))

In [49]:
from pyspark.ml.feature import Imputer

imputer= Imputer(inputCols=train.schema.names, outputCols=train.schema.names )

train=imputer.fit(train).transform(train)
test=imputer.fit(test).transform(test)

In [50]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import StandardScaler
columns_to_scale = ['EXT_SOURCE_3',
 'OWN_CAR_AGE',
 'AMT_ANNUITY',
 'AMT_CREDIT',
 'DAYS_BIRTH',
 'DAYS_ID_PUBLISH',
 'DAYS_REGISTRATION',
 'DAYS_EMPLOYED',
 'EXT_SOURCE_1',
 'AMT_GOODS_PRICE',
 'DAYS_LAST_PHONE_CHANGE',
 'AMT_INCOME_TOTAL',
 'EXT_SOURCE_2']
assemblers = [VectorAssembler(inputCols=[col], outputCol=col + '_Vec' ) for col in columns_to_scale]
scalers = [StandardScaler(inputCol=col+ '_Vec'  , outputCol=col + '_Scaled' ) for col in columns_to_scale]
pipeline = Pipeline(stages=assemblers + scalers)
scalerModel = pipeline.fit(train)

In [51]:
train = scalerModel.transform(train)
test = scalerModel.transform(test)

In [52]:
new=[]
for i in ['EXT_SOURCE_3',
 'OWN_CAR_AGE',
 'AMT_ANNUITY',
 'AMT_CREDIT',
 'DAYS_BIRTH',
 'DAYS_ID_PUBLISH',
 'DAYS_REGISTRATION',
 'DAYS_EMPLOYED',
 'EXT_SOURCE_1',
 'AMT_GOODS_PRICE',
 'DAYS_LAST_PHONE_CHANGE',
 'AMT_INCOME_TOTAL',
 'EXT_SOURCE_2']:
    new+=[i]
    new+=[i+'_Vec']

In [53]:
train=train.drop(*new)
test=test.drop(*new)

In [54]:
train.schema.names

['NAME_EDUCATION_TYPE_Lower secondary',
 'ORGANIZATION_TYPE_Mobile',
 'NAME_EDUCATION_TYPE_Secondary / secondary special',
 'ORGANIZATION_TYPE_Electricity',
 'ORGANIZATION_TYPE_Trade: type 7',
 'ORGANIZATION_TYPE_Business Entity Type 3',
 'ORGANIZATION_TYPE_Trade: type 5',
 'ORGANIZATION_TYPE_Postal',
 'NAME_INCOME_TYPE_Businessman',
 'ORGANIZATION_TYPE_Construction',
 'ORGANIZATION_TYPE_Industry: type 5',
 'ORGANIZATION_TYPE_Insurance',
 'ORGANIZATION_TYPE_Industry: type 4',
 'NAME_INCOME_TYPE_Pensioner',
 'ORGANIZATION_TYPE_Industry: type 1',
 'ORGANIZATION_TYPE_Culture',
 'ORGANIZATION_TYPE_Industry: type 3',
 'ORGANIZATION_TYPE_Realtor',
 'ORGANIZATION_TYPE_Transport: type 1',
 'ORGANIZATION_TYPE_University',
 'ORGANIZATION_TYPE_Restaurant',
 'NAME_EDUCATION_TYPE_Incomplete higher',
 'ORGANIZATION_TYPE_Trade: type 4',
 'ORGANIZATION_TYPE_Military',
 'ORGANIZATION_TYPE_Trade: type 6',
 'NAME_EDUCATION_TYPE_Academic degree',
 'ORGANIZATION_TYPE_Transport: type 3',
 'ORGANIZATION_TYPE

In [55]:
from functools import reduce

oldColumns = train.schema.names
newColumns = ['NAME_EDUCATION_TYPE_Lower secondary',
 'ORGANIZATION_TYPE_Mobile',
 'NAME_EDUCATION_TYPE_Secondary / secondary special',
 'ORGANIZATION_TYPE_Electricity',
 'ORGANIZATION_TYPE_Trade: type 7',
 'ORGANIZATION_TYPE_Business Entity Type 3',
 'ORGANIZATION_TYPE_Trade: type 5',
 'ORGANIZATION_TYPE_Postal',
 'NAME_INCOME_TYPE_Businessman',
 'ORGANIZATION_TYPE_Construction',
 'ORGANIZATION_TYPE_Industry: type 5',
 'ORGANIZATION_TYPE_Insurance',
 'ORGANIZATION_TYPE_Industry: type 4',
 'NAME_INCOME_TYPE_Pensioner',
 'ORGANIZATION_TYPE_Industry: type 1',
 'ORGANIZATION_TYPE_Culture',
 'ORGANIZATION_TYPE_Industry: type 3',
 'ORGANIZATION_TYPE_Realtor',
 'ORGANIZATION_TYPE_Transport: type 1',
 'ORGANIZATION_TYPE_University',
 'ORGANIZATION_TYPE_Restaurant',
 'NAME_EDUCATION_TYPE_Incomplete higher',
 'ORGANIZATION_TYPE_Trade: type 4',
 'ORGANIZATION_TYPE_Military',
 'ORGANIZATION_TYPE_Trade: type 6',
 'NAME_EDUCATION_TYPE_Academic degree',
 'ORGANIZATION_TYPE_Transport: type 3',
 'ORGANIZATION_TYPE_Trade: type 2',
 'ORGANIZATION_TYPE_Religion',
 'ORGANIZATION_TYPE_Legal Services',
 'CODE_GENDER_F',
 'ORGANIZATION_TYPE_Security',
 'ORGANIZATION_TYPE_Self-employed',
 'ORGANIZATION_TYPE_Cleaning',
 'ORGANIZATION_TYPE_Hotel',
 'ORGANIZATION_TYPE_Transport: type 4',
 'ORGANIZATION_TYPE_Other',
 'ORGANIZATION_TYPE_Business Entity Type 2',
 'ORGANIZATION_TYPE_Industry: type 11',
 'ORGANIZATION_TYPE_Industry: type 13',
 'ORGANIZATION_TYPE_Trade: type 1',
 'NAME_INCOME_TYPE_Commercial associate',
 'ORGANIZATION_TYPE_Services',
 'ORGANIZATION_TYPE_School',
 'ORGANIZATION_TYPE_Security Ministries',
 'NAME_INCOME_TYPE_Student',
 'ORGANIZATION_TYPE_Emergency',
 'CODE_GENDER_XNA',
 'ORGANIZATION_TYPE_Housing',
 'ORGANIZATION_TYPE_Government',
 'ORGANIZATION_TYPE_Industry: type 7',
 'ORGANIZATION_TYPE_Business Entity Type 1',
 'CODE_GENDER_M',
 'ORGANIZATION_TYPE_Industry: type 2',
 'ORGANIZATION_TYPE_Police',
 'ORGANIZATION_TYPE_Industry: type 9',
 'ORGANIZATION_TYPE_Industry: type 10',
 'NAME_EDUCATION_TYPE_Higher education',
 'ORGANIZATION_TYPE_Kindergarten',
 'ORGANIZATION_TYPE_XNA',
 'ORGANIZATION_TYPE_Bank',
 'ORGANIZATION_TYPE_Medicine',
 'ORGANIZATION_TYPE_Industry: type 6',
 'TARGET',
 'NAME_INCOME_TYPE_Unemployed',
 'ORGANIZATION_TYPE_Industry: type 8',
 'NAME_INCOME_TYPE_Working',
 'ORGANIZATION_TYPE_Trade: type 3',
 'ORGANIZATION_TYPE_Transport: type 2',
 'ORGANIZATION_TYPE_Advertising',
 'ORGANIZATION_TYPE_Agriculture',
 'NAME_INCOME_TYPE_Maternity leave',
 'NAME_INCOME_TYPE_State servant',
 'ORGANIZATION_TYPE_Telecom',
 'ORGANIZATION_TYPE_Industry: type 12',
 'EXT_SOURCE_3',
 'OWN_CAR_AGE',
 'AMT_ANNUITY',
 'AMT_CREDIT',
 'DAYS_BIRTH',
 'DAYS_ID_PUBLISH',
 'DAYS_REGISTRATION',
 'DAYS_EMPLOYED',
 'EXT_SOURCE_1',
 'AMT_GOODS_PRICE',
 'DAYS_LAST_PHONE_CHANGE',
 'AMT_INCOME_TOTAL',
 'EXT_SOURCE_2']

train = reduce(lambda train, idx: train.withColumnRenamed(oldColumns[idx], newColumns[idx]), range(len(oldColumns)), train)

oldColumns = test.schema.names
test = reduce(lambda test, idx: test.withColumnRenamed(oldColumns[idx], newColumns[idx]), range(len(oldColumns)), test)


In [56]:
print(("test",test.count(), len(test.columns)))
print(("train",train.count(), len(train.columns)))

('test', 61454, 88)
('train', 246057, 88)


In [59]:
[i for i in train.schema.names if i not in ['TARGET']]

['TARGET']

In [58]:
feature_list=[i for i in final_features if i not in ['TARGET']]

Fit random forest

In [336]:
from pyspark.ml.classification import RandomForestClassifier

assembler = VectorAssembler(inputCols=feature_list,outputCol="features")



# Train a RandomForest model.
rf = RandomForestClassifier(labelCol="TARGET", featuresCol='features', numTrees=100)

pipeline = Pipeline(stages=[assembler , rf])

model=pipeline.fit(train)


In [60]:
train_pred=model.transform(train)
test_pred=model.transform(test)

IllegalArgumentException: NAME_INCOME_TYPE_indexed does not exist. Available: NAME_EDUCATION_TYPE_Lower secondary, ORGANIZATION_TYPE_Mobile, NAME_EDUCATION_TYPE_Secondary / secondary special, ORGANIZATION_TYPE_Electricity, ORGANIZATION_TYPE_Trade: type 7, ORGANIZATION_TYPE_Business Entity Type 3, ORGANIZATION_TYPE_Trade: type 5, ORGANIZATION_TYPE_Postal, NAME_INCOME_TYPE_Businessman, ORGANIZATION_TYPE_Construction, ORGANIZATION_TYPE_Industry: type 5, ORGANIZATION_TYPE_Insurance, ORGANIZATION_TYPE_Industry: type 4, NAME_INCOME_TYPE_Pensioner, ORGANIZATION_TYPE_Industry: type 1, ORGANIZATION_TYPE_Culture, ORGANIZATION_TYPE_Industry: type 3, ORGANIZATION_TYPE_Realtor, ORGANIZATION_TYPE_Transport: type 1, ORGANIZATION_TYPE_University, ORGANIZATION_TYPE_Restaurant, NAME_EDUCATION_TYPE_Incomplete higher, ORGANIZATION_TYPE_Trade: type 4, ORGANIZATION_TYPE_Military, ORGANIZATION_TYPE_Trade: type 6, NAME_EDUCATION_TYPE_Academic degree, ORGANIZATION_TYPE_Transport: type 3, ORGANIZATION_TYPE_Trade: type 2, ORGANIZATION_TYPE_Religion, ORGANIZATION_TYPE_Legal Services, CODE_GENDER_F, ORGANIZATION_TYPE_Security, ORGANIZATION_TYPE_Self-employed, ORGANIZATION_TYPE_Cleaning, ORGANIZATION_TYPE_Hotel, ORGANIZATION_TYPE_Transport: type 4, ORGANIZATION_TYPE_Other, ORGANIZATION_TYPE_Business Entity Type 2, ORGANIZATION_TYPE_Industry: type 11, ORGANIZATION_TYPE_Industry: type 13, ORGANIZATION_TYPE_Trade: type 1, NAME_INCOME_TYPE_Commercial associate, ORGANIZATION_TYPE_Services, ORGANIZATION_TYPE_School, ORGANIZATION_TYPE_Security Ministries, NAME_INCOME_TYPE_Student, ORGANIZATION_TYPE_Emergency, CODE_GENDER_XNA, ORGANIZATION_TYPE_Housing, ORGANIZATION_TYPE_Government, ORGANIZATION_TYPE_Industry: type 7, ORGANIZATION_TYPE_Business Entity Type 1, CODE_GENDER_M, ORGANIZATION_TYPE_Industry: type 2, ORGANIZATION_TYPE_Police, ORGANIZATION_TYPE_Industry: type 9, ORGANIZATION_TYPE_Industry: type 10, NAME_EDUCATION_TYPE_Higher education, ORGANIZATION_TYPE_Kindergarten, ORGANIZATION_TYPE_XNA, ORGANIZATION_TYPE_Bank, ORGANIZATION_TYPE_Medicine, ORGANIZATION_TYPE_Industry: type 6, TARGET, NAME_INCOME_TYPE_Unemployed, ORGANIZATION_TYPE_Industry: type 8, NAME_INCOME_TYPE_Working, ORGANIZATION_TYPE_Trade: type 3, ORGANIZATION_TYPE_Transport: type 2, ORGANIZATION_TYPE_Advertising, ORGANIZATION_TYPE_Agriculture, NAME_INCOME_TYPE_Maternity leave, NAME_INCOME_TYPE_State servant, ORGANIZATION_TYPE_Telecom, ORGANIZATION_TYPE_Industry: type 12, EXT_SOURCE_3, OWN_CAR_AGE, AMT_ANNUITY, AMT_CREDIT, DAYS_BIRTH, DAYS_ID_PUBLISH, DAYS_REGISTRATION, DAYS_EMPLOYED, EXT_SOURCE_1, AMT_GOODS_PRICE, DAYS_LAST_PHONE_CHANGE, AMT_INCOME_TOTAL, EXT_SOURCE_2

In [348]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

evaluator = MulticlassClassificationEvaluator(
    labelCol="TARGET", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(test_pred)
print("Test Accuracy = %g" % (accuracy))



Test Accuracy = 0.918687


In [26]:
from  pyspark import SparkContext as sc
from pysparkling import H2OContext

In [64]:
train.schema.names

['NAME_EDUCATION_TYPE_Lower secondary',
 'ORGANIZATION_TYPE_Mobile',
 'NAME_EDUCATION_TYPE_Secondary / secondary special',
 'ORGANIZATION_TYPE_Electricity',
 'ORGANIZATION_TYPE_Trade: type 7',
 'ORGANIZATION_TYPE_Business Entity Type 3',
 'ORGANIZATION_TYPE_Trade: type 5',
 'ORGANIZATION_TYPE_Postal',
 'NAME_INCOME_TYPE_Businessman',
 'ORGANIZATION_TYPE_Construction',
 'ORGANIZATION_TYPE_Industry: type 5',
 'ORGANIZATION_TYPE_Insurance',
 'ORGANIZATION_TYPE_Industry: type 4',
 'NAME_INCOME_TYPE_Pensioner',
 'ORGANIZATION_TYPE_Industry: type 1',
 'ORGANIZATION_TYPE_Culture',
 'ORGANIZATION_TYPE_Industry: type 3',
 'ORGANIZATION_TYPE_Realtor',
 'ORGANIZATION_TYPE_Transport: type 1',
 'ORGANIZATION_TYPE_University',
 'ORGANIZATION_TYPE_Restaurant',
 'NAME_EDUCATION_TYPE_Incomplete higher',
 'ORGANIZATION_TYPE_Trade: type 4',
 'ORGANIZATION_TYPE_Military',
 'ORGANIZATION_TYPE_Trade: type 6',
 'NAME_EDUCATION_TYPE_Academic degree',
 'ORGANIZATION_TYPE_Transport: type 3',
 'ORGANIZATION_TYPE

In [368]:
from pysparkling import *
conf = H2OConf().setExternalClusterMode().useManualClusterStart().setCloudName("test")
hc = H2OContext.getOrCreate(conf)

NameError: name 'H2OConf' is not defined

In [360]:
import h2o

In [361]:
h2o.init()

Checking whether there is an H2O instance running at http://localhost:54321 ..... not found.
Attempting to start a local H2O server...
  Java Version: openjdk version "14.0.2" 2020-07-14; OpenJDK Runtime Environment (build 14.0.2+12-46); OpenJDK 64-Bit Server VM (build 14.0.2+12-46, mixed mode, sharing)
  Starting server from /Users/mac/opt/anaconda3/lib/python3.7/site-packages/h2o/backend/bin/h2o.jar
  Ice root: /var/folders/l8/r180_12s5ys0mkl6_ccbmr1w0000gn/T/tmpt75_vjr6
  JVM stdout: /var/folders/l8/r180_12s5ys0mkl6_ccbmr1w0000gn/T/tmpt75_vjr6/h2o_mac_started_from_python.out
  JVM stderr: /var/folders/l8/r180_12s5ys0mkl6_ccbmr1w0000gn/T/tmpt75_vjr6/h2o_mac_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.


0,1
H2O_cluster_uptime:,25 secs
H2O_cluster_timezone:,Africa/Johannesburg
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.32.0.2
H2O_cluster_version_age:,12 days
H2O_cluster_name:,H2O_from_python_mac_1s2t4q
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,1 Gb
H2O_cluster_total_cores:,4
H2O_cluster_allowed_cores:,4


In [31]:
from pysparkling import *
import h2o

from pysparkling import H2OConf
conf = H2OConf()

hc = H2OContext.getOrCreate()

In [67]:
final_features=list(set(res_train.schema.names).intersection(res_train.schema.names))
train=res_train[final_features]
test=res_test[final_features]

print(("test",test.count(), len(test.columns)))
print(("train",train.count(), len(train.columns)))

('test', 61454, 88)
('train', 246057, 88)


In [68]:
training_frame=hc.asH2OFrame(train, "training_frame")

In [69]:
test_frame=hc.asH2OFrame(test, "test_frame")

In [70]:
from h2o.estimators import H2ORandomForestEstimator
cars_drf = H2ORandomForestEstimator(ntrees=10,
                                    max_depth=1)
cars_drf.train(x=final_features,
               y='TARGET',
               training_frame=training_frame,
               validation_frame=test_frame)

# Eval performance:
perf = cars_drf.model_performance()

# Generate predictions on a validation set (if necessary):
pred = cars_drf.predict(test_frame)

drf Model Build progress: |███████████████████████████████████████████████| 100%
drf prediction progress: |████████████████████████████████████████████████| 100%


Test/Validation dataset column 'DAYS_BIRTH' has levels not trained on: ["-23934", "-24244", "-24252", "-24268", "-24372", "-24383", "-24451", "-24492", "-24529", "-24812", "-24894", "-24898", "-24902", "-24919", "-24932", "-24948", "-24949", "-24972", "-25032", "-25038", "-25051", "-25055", "-25083", "-25088", "-25097", "-25099", "-25100", "-25106", "-25127", "-25144", "-25147", "-25162", "-25163", "-25177", "-25179", "-25195", "-25229", "-7489", "-7685", "-7733", "-7737", "-7792", "-7814", "-7827", "-7839", "-7853", "-7861", "-7865", "-7889", "-7934", "-7956", "-8001", "-8052"]
Test/Validation dataset column 'EXT_SOURCE_1' has levels not trained on: ["0.017755658754766125", "0.017896804814895563", "0.02023765065731696", "0.02093879214083812", "0.021482517959622718", "0.022287942588261167", "0.02319484055591637", "0.02339798210758813", "0.023940919450124918", "0.02469824960666477", "0.025381860851011042", "0.02606749992830337", "0.02614211022100112", "0.026502548887837485", "0.02765348

Test/Validation dataset column 'AMT_INCOME_TOTAL' has levels not trained on: ["100278.0", "100786.5", "101362.5", "101740.5", "103761.0", "103801.5", "103815.0", "104247.0", "104701.5", "105475.5", "105975.0", "106348.5", "106371.0", "106713.0", "106875.0", "106920.0", "107397.0", "108765.0", "109458.0", "109467.0", "110025.0", "110187.0", "110331.0", "110475.0", "111217.5", "111640.5", "112158.0", "112230.0", "112266.0", "112428.0", "112608.0", "112810.5", "112833.0", "113238.0", "113301.0", "113742.0", "114210.0", "114336.0", "114349.5", "114457.5", "114840.0", "115290.0", "115533.0", "115560.0", "116604.0", "116775.0", "117319.5", "118039.5", "118197.0", "118498.5", "118975.5", "119119.5", "1192500.0", "119407.5", "121198.5", "122202.0", "124438.5", "125698.5", "125730.0", "126261.0", "126450.0", "126733.5", "127125.0", "127449.0", "1282500.0", "128997.0", "129222.0", "1305000.0", "1306831.5", "131098.5", "131854.5", "131913.0", "132372.0", "133330.5", "133461.0", "133497.0", "13441

Test/Validation dataset column 'DAYS_ID_PUBLISH' has levels not trained on: ["-5610", "-5736", "-5780", "-5793", "-5810", "-5854", "-5879", "-5893", "-5897", "-5924", "-5941", "-6032", "-6084", "-6097", "-6103", "-6105", "-6116", "-6129", "-6140", "-6162", "-6175", "-6179", "-6189", "-6194", "-6227", "-6232", "-6233", "-6255", "-6256", "-6337", "-7197"]
Test/Validation dataset column 'EXT_SOURCE_2' has levels not trained on: ["0.0001103535288487474", "0.00012773044217350148", "0.00013539730138290343", "0.00013952356243617458", "0.00015072853769404026", "0.00018543947562433137", "0.0001880483908008136", "0.0001976469211371752", "0.00025490122916898254", "0.000258922632297468", "0.00027811010649830326", "0.00028877920701127327", "0.00029262398230360287", "0.000292694358934605", "0.00030493435584740604", "0.0003080768544818169", "0.0003199441528716373", "0.0003324428040691888", "0.0003506946172084826", "0.00036481651740081215", "0.00039023266208445867", "0.00040487377831855965", "0.000409

Test/Validation dataset column 'OWN_CAR_AGE' has levels not trained on: ["56.0"]
Test/Validation dataset column 'EXT_SOURCE_3' has levels not trained on: ["0.011345719434837441", "0.012715923858768621", "0.014148265518207309", "0.018924392893755082", "0.020455029866158526", "0.029283298591354017", "0.043226523922461416", "0.8581775591924884", "0.8724558162271476"]
Test/Validation dataset column 'DAYS_REGISTRATION' has levels not trained on: ["-12029.0", "-12125.0", "-12178.0", "-12191.0", "-12220.0", "-12403.0", "-12446.0", "-12573.0", "-12577.0", "-12921.0", "-13007.0", "-13008.0", "-13095.0", "-13116.0", "-13185.0", "-13271.0", "-13281.0", "-13347.0", "-13424.0", "-13433.0", "-13533.0", "-13549.0", "-13557.0", "-13559.0", "-13608.0", "-13610.0", "-13612.0", "-13680.0", "-13694.0", "-13737.0", "-13758.0", "-13777.0", "-13780.0", "-13815.0", "-13823.0", "-13843.0", "-13858.0", "-13867.0", "-13875.0", "-13888.0", "-13896.0", "-13898.0", "-13930.0", "-13938.0", "-13952.0", "-13958.0", "-

Test/Validation dataset column 'AMT_ANNUITY' has levels not trained on: ["100458.0", "101493.0", "10228.5", "102451.5", "103392.0", "103788.0", "104017.5", "105201.0", "105340.5", "105511.5", "106452.0", "107257.5", "107356.5", "108369.0", "10885.5", "109809.0", "110655.0", "111730.5", "112324.5", "114673.5", "11578.5", "122040.0", "123669.0", "124119.0", "124141.5", "130923.0", "131535.0", "133848.0", "13999.5", "16191.0", "169231.5", "170986.5", "17343.0", "173574.0", "17392.5", "2052.0", "20533.5", "206464.5", "213160.5", "22207.5", "22869.0", "2299.5", "2511.0", "26361.0", "2673.0", "27765.0", "27886.5", "28246.5", "28512.0", "28629.0", "2880.0", "30190.5", "30249.0", "30343.5", "3132.0", "3145.5", "31725.0", "32062.5", "3213.0", "32575.5", "3316.5", "3352.5", "3388.5", "34146.0", "3415.5", "3447.0", "34789.5", "3541.5", "35532.0", "36247.5", "36283.5", "3636.0", "36711.0", "36783.0", "36792.0", "36855.0", "3712.5", "37300.5", "37318.5", "37503.0", "37656.0", "37696.5", "3775.5", "

In [73]:
perf.accuracy


ModelMetricsBinomial: drf
** Reported on train data. **

MSE: 0.07362411505571519
RMSE: 0.2713376403223762
LogLoss: 0.277216875992202
Mean Per-Class Error: 0.42541784900790636
AUC: 0.601095493236822
AUCPR: 0.11653151614365388
Gini: 0.20219098647364397

Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.08359766500806728: 


Unnamed: 0,Unnamed: 1,0,1,Error,Rate
0,0,167352.0,56478.0,0.2523,(56478.0/223830.0)
1,1,11851.0,7767.0,0.6041,(11851.0/19618.0)
2,Total,179203.0,64245.0,0.2807,(68329.0/243448.0)



Maximum Metrics: Maximum metrics at their respective thresholds


Unnamed: 0,metric,threshold,value,idx
0,max f1,0.083598,0.185231,195.0
1,max f2,0.074784,0.31341,301.0
2,max f0point5,0.091454,0.155758,137.0
3,max accuracy,0.152697,0.919416,1.0
4,max precision,0.152697,0.5,1.0
5,max recall,0.054017,1.0,399.0
6,max specificity,0.157113,0.999987,0.0
7,max absolute_mcc,0.089764,0.092678,149.0
8,max min_per_class_accuracy,0.079856,0.572128,238.0
9,max mean_per_class_accuracy,0.081303,0.574582,219.0



Gains/Lift Table: Avg response rate:  8,06 %, avg score:  7,97 %


Unnamed: 0,group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,score,cumulative_response_rate,cumulative_score,capture_rate,cumulative_capture_rate,gain,cumulative_gain,kolmogorov_smirnov
0,1,0.010457,0.114434,1.977429,1.977429,0.159347,0.125524,0.159347,0.125524,0.020678,0.020678,97.742894,97.742894,0.011117
1,2,0.020016,0.108119,1.904701,1.942696,0.153486,0.111191,0.156548,0.118679,0.018207,0.038884,90.470051,94.26965,0.020522
2,3,0.030042,0.105433,2.017121,1.967535,0.162546,0.10686,0.15855,0.114734,0.020224,0.059108,101.71214,96.753501,0.031614
3,4,0.040019,0.10278,1.951159,1.963452,0.15723,0.104164,0.158221,0.112099,0.019467,0.078576,95.115883,96.345219,0.041936
4,5,0.050037,0.101064,1.817386,1.934208,0.14645,0.101776,0.155864,0.110032,0.018207,0.096782,81.738564,93.420803,0.050842
5,6,0.10005,0.094574,1.739518,1.836887,0.140176,0.09739,0.148022,0.103712,0.086998,0.183781,73.95183,83.688689,0.091069
6,7,0.150197,0.089411,1.543779,1.739026,0.124402,0.091874,0.140136,0.09976,0.077416,0.261196,54.377936,73.902576,0.120728
7,8,0.200628,0.086201,1.182055,1.599021,0.095253,0.087634,0.128854,0.096712,0.059613,0.320809,18.205452,59.902138,0.130714
8,9,0.300008,0.082402,1.1388,1.44657,0.091768,0.084069,0.116569,0.092524,0.113173,0.433982,13.880016,44.657026,0.145717
9,10,0.400029,0.080186,1.001902,1.335387,0.080736,0.081243,0.107609,0.089703,0.100212,0.534194,0.190241,33.538748,0.145924





<bound method H2OBinomialModelMetrics.accuracy of >