In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from pandas.plotting import scatter_matrix
from sklearn.feature_selection import VarianceThreshold
from sklearn.metrics import accuracy_score, auc, confusion_matrix, f1_score, precision_score, recall_score, roc_curve
from sklearn.feature_selection import SelectKBest
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import cross_val_score
import h2o
from h2o.estimators import H2ORandomForestEstimator

In [2]:
# Creación de un cluster local H2O

h2o.init(ip = "localhost",
         # -1 indica que se empleen todos los cores disponibles.
         nthreads = -1,
         # Máxima memoria disponible para el cluster.
         max_mem_size = "4g")

Checking whether there is an H2O instance running at http://localhost:54321..... not found.
Attempting to start a local H2O server...
; OpenJDK 64-Bit Server VM JBR-11.0.13.7-1751.21-jcef (build 11.0.13+7-b1751.21, mixed mode)
  Starting server from C:\Users\ferli\anaconda3\Lib\site-packages\h2o\backend\bin\h2o.jar
  Ice root: C:\Users\ferli\AppData\Local\Temp\tmpswxkrlfp
  JVM stdout: C:\Users\ferli\AppData\Local\Temp\tmpswxkrlfp\h2o_ferli_started_from_python.out
  JVM stderr: C:\Users\ferli\AppData\Local\Temp\tmpswxkrlfp\h2o_ferli_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.


0,1
H2O_cluster_uptime:,05 secs
H2O_cluster_timezone:,Europe/Paris
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.40.0.1
H2O_cluster_version_age:,2 months
H2O_cluster_name:,H2O_from_python_ferli_qphrze
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,4 Gb
H2O_cluster_total_cores:,8
H2O_cluster_allowed_cores:,8


In [3]:
inputs = pd.read_csv('training_set_values.csv')
inputs.set_index('id', inplace = True)
inputs.head()

Unnamed: 0_level_0,amount_tsh,date_recorded,funder,gps_height,installer,longitude,latitude,wpt_name,num_private,basin,...,payment_type,water_quality,quality_group,quantity,quantity_group,source,source_type,source_class,waterpoint_type,waterpoint_type_group
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
69572,6000.0,2011-03-14,Roman,1390,Roman,34.938093,-9.856322,none,0,Lake Nyasa,...,annually,soft,good,enough,enough,spring,spring,groundwater,communal standpipe,communal standpipe
8776,0.0,2013-03-06,Grumeti,1399,GRUMETI,34.698766,-2.147466,Zahanati,0,Lake Victoria,...,never pay,soft,good,insufficient,insufficient,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe
34310,25.0,2013-02-25,Lottery Club,686,World vision,37.460664,-3.821329,Kwa Mahundi,0,Pangani,...,per bucket,soft,good,enough,enough,dam,dam,surface,communal standpipe multiple,communal standpipe
67743,0.0,2013-01-28,Unicef,263,UNICEF,38.486161,-11.155298,Zahanati Ya Nanyumbu,0,Ruvuma / Southern Coast,...,never pay,soft,good,dry,dry,machine dbh,borehole,groundwater,communal standpipe multiple,communal standpipe
19728,0.0,2011-07-13,Action In A,0,Artisan,31.130847,-1.825359,Shuleni,0,Lake Victoria,...,never pay,soft,good,seasonal,seasonal,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe


In [4]:
obj = pd.read_csv('training_set_labels.csv')
obj.set_index('id', inplace = True)
obj

Unnamed: 0_level_0,status_group
id,Unnamed: 1_level_1
69572,functional
8776,functional
34310,functional
67743,non functional
19728,functional
...,...
60739,functional
27263,functional
37057,functional
31282,functional


In [5]:
predictors = pd.read_csv('test_set_values.csv')
predictors.set_index('id', inplace = True)
predictors.head()

Unnamed: 0_level_0,amount_tsh,date_recorded,funder,gps_height,installer,longitude,latitude,wpt_name,num_private,basin,...,payment_type,water_quality,quality_group,quantity,quantity_group,source,source_type,source_class,waterpoint_type,waterpoint_type_group
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
50785,0.0,2013-02-04,Dmdd,1996,DMDD,35.290799,-4.059696,Dinamu Secondary School,0,Internal,...,never pay,soft,good,seasonal,seasonal,rainwater harvesting,rainwater harvesting,surface,other,other
51630,0.0,2013-02-04,Government Of Tanzania,1569,DWE,36.656709,-3.309214,Kimnyak,0,Pangani,...,never pay,soft,good,insufficient,insufficient,spring,spring,groundwater,communal standpipe,communal standpipe
17168,0.0,2013-02-01,,1567,,34.767863,-5.004344,Puma Secondary,0,Internal,...,never pay,soft,good,insufficient,insufficient,rainwater harvesting,rainwater harvesting,surface,other,other
45559,0.0,2013-01-22,Finn Water,267,FINN WATER,38.058046,-9.418672,Kwa Mzee Pange,0,Ruvuma / Southern Coast,...,unknown,soft,good,dry,dry,shallow well,shallow well,groundwater,other,other
49871,500.0,2013-03-27,Bruder,1260,BRUDER,35.006123,-10.950412,Kwa Mzee Turuka,0,Ruvuma / Southern Coast,...,monthly,soft,good,enough,enough,spring,spring,groundwater,communal standpipe,communal standpipe


In [95]:
out = pd.read_csv('SubmissionFormat.csv')
out.set_index('id', inplace = True)
out.head()

Unnamed: 0_level_0,status_group
id,Unnamed: 1_level_1
50785,predicted label
51630,predicted label
17168,predicted label
45559,predicted label
49871,predicted label


In [8]:
inputs['type'] = 'train'
predictors['type'] = 'predict'

In [9]:
data = pd.concat([inputs, predictors], sort = False, ignore_index=False)
data.columns= data.columns.str.lower()
data

Unnamed: 0_level_0,amount_tsh,date_recorded,funder,gps_height,installer,longitude,latitude,wpt_name,num_private,basin,...,water_quality,quality_group,quantity,quantity_group,source,source_type,source_class,waterpoint_type,waterpoint_type_group,type
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
69572,6000.0,2011-03-14,Roman,1390,Roman,34.938093,-9.856322,none,0,Lake Nyasa,...,soft,good,enough,enough,spring,spring,groundwater,communal standpipe,communal standpipe,train
8776,0.0,2013-03-06,Grumeti,1399,GRUMETI,34.698766,-2.147466,Zahanati,0,Lake Victoria,...,soft,good,insufficient,insufficient,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe,train
34310,25.0,2013-02-25,Lottery Club,686,World vision,37.460664,-3.821329,Kwa Mahundi,0,Pangani,...,soft,good,enough,enough,dam,dam,surface,communal standpipe multiple,communal standpipe,train
67743,0.0,2013-01-28,Unicef,263,UNICEF,38.486161,-11.155298,Zahanati Ya Nanyumbu,0,Ruvuma / Southern Coast,...,soft,good,dry,dry,machine dbh,borehole,groundwater,communal standpipe multiple,communal standpipe,train
19728,0.0,2011-07-13,Action In A,0,Artisan,31.130847,-1.825359,Shuleni,0,Lake Victoria,...,soft,good,seasonal,seasonal,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe,train
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39307,0.0,2011-02-24,Danida,34,Da,38.852669,-6.582841,Kwambwezi,0,Wami / Ruvu,...,soft,good,enough,enough,river,river/lake,surface,communal standpipe,communal standpipe,predict
18990,1000.0,2011-03-21,Hiap,0,HIAP,37.451633,-5.350428,Bonde La Mkondoa,0,Pangani,...,salty,salty,insufficient,insufficient,shallow well,shallow well,groundwater,hand pump,hand pump,predict
28749,0.0,2013-03-04,,1476,,34.739804,-4.585587,Bwawani,0,Internal,...,soft,good,insufficient,insufficient,dam,dam,surface,communal standpipe,communal standpipe,predict
33492,0.0,2013-02-18,Germany,998,DWE,35.432732,-10.584159,Kwa John,0,Lake Nyasa,...,soft,good,insufficient,insufficient,river,river/lake,surface,communal standpipe,communal standpipe,predict


In [10]:
del inputs
del predictors

In [11]:
data['date_recorded'] = pd.to_datetime(data['date_recorded'])
data.date_recorded

id
69572   2011-03-14
8776    2013-03-06
34310   2013-02-25
67743   2013-01-28
19728   2011-07-13
           ...    
39307   2011-02-24
18990   2011-03-21
28749   2013-03-04
33492   2013-02-18
68707   2013-02-13
Name: date_recorded, Length: 74250, dtype: datetime64[ns]

In [12]:
data['antiguedad'] = (data['date_recorded'].dt.year - data['construction_year'])
data['antiguedad']

id
69572      12
8776        3
34310       4
67743      27
19728    2011
         ... 
39307      23
18990      17
28749       3
33492       4
68707       5
Name: antiguedad, Length: 74250, dtype: int64

In [13]:
data = data.drop(['date_recorded'], axis = 1)
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 74250 entries, 69572 to 68707
Data columns (total 40 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   amount_tsh             74250 non-null  float64
 1   funder                 69746 non-null  object 
 2   gps_height             74250 non-null  int64  
 3   installer              69718 non-null  object 
 4   longitude              74250 non-null  float64
 5   latitude               74250 non-null  float64
 6   wpt_name               74250 non-null  object 
 7   num_private            74250 non-null  int64  
 8   basin                  74250 non-null  object 
 9   subvillage             73780 non-null  object 
 10  region                 74250 non-null  object 
 11  region_code            74250 non-null  int64  
 12  district_code          74250 non-null  int64  
 13  lga                    74250 non-null  object 
 14  ward                   74250 non-null  object 
 15

In [14]:
data_2 = pd.merge(data, obj, on='id', how='left')
data_2

Unnamed: 0_level_0,amount_tsh,funder,gps_height,installer,longitude,latitude,wpt_name,num_private,basin,subvillage,...,quantity,quantity_group,source,source_type,source_class,waterpoint_type,waterpoint_type_group,type,antiguedad,status_group
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
69572,6000.0,Roman,1390,Roman,34.938093,-9.856322,none,0,Lake Nyasa,Mnyusi B,...,enough,enough,spring,spring,groundwater,communal standpipe,communal standpipe,train,12,functional
8776,0.0,Grumeti,1399,GRUMETI,34.698766,-2.147466,Zahanati,0,Lake Victoria,Nyamara,...,insufficient,insufficient,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe,train,3,functional
34310,25.0,Lottery Club,686,World vision,37.460664,-3.821329,Kwa Mahundi,0,Pangani,Majengo,...,enough,enough,dam,dam,surface,communal standpipe multiple,communal standpipe,train,4,functional
67743,0.0,Unicef,263,UNICEF,38.486161,-11.155298,Zahanati Ya Nanyumbu,0,Ruvuma / Southern Coast,Mahakamani,...,dry,dry,machine dbh,borehole,groundwater,communal standpipe multiple,communal standpipe,train,27,non functional
19728,0.0,Action In A,0,Artisan,31.130847,-1.825359,Shuleni,0,Lake Victoria,Kyanyamisa,...,seasonal,seasonal,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe,train,2011,functional
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39307,0.0,Danida,34,Da,38.852669,-6.582841,Kwambwezi,0,Wami / Ruvu,Yombo,...,enough,enough,river,river/lake,surface,communal standpipe,communal standpipe,predict,23,
18990,1000.0,Hiap,0,HIAP,37.451633,-5.350428,Bonde La Mkondoa,0,Pangani,Mkondoa,...,insufficient,insufficient,shallow well,shallow well,groundwater,hand pump,hand pump,predict,17,
28749,0.0,,1476,,34.739804,-4.585587,Bwawani,0,Internal,Juhudi,...,insufficient,insufficient,dam,dam,surface,communal standpipe,communal standpipe,predict,3,
33492,0.0,Germany,998,DWE,35.432732,-10.584159,Kwa John,0,Lake Nyasa,Namakinga B,...,insufficient,insufficient,river,river/lake,surface,communal standpipe,communal standpipe,predict,4,


In [32]:
df_h2o = h2o.H2OFrame(python_obj = data_2, destination_frame = "df_h2o")

Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%


In [33]:
df_h2o.types

{'amount_tsh': 'real',
 'funder': 'enum',
 'gps_height': 'int',
 'installer': 'enum',
 'longitude': 'real',
 'latitude': 'real',
 'wpt_name': 'enum',
 'num_private': 'int',
 'basin': 'enum',
 'subvillage': 'enum',
 'region': 'enum',
 'region_code': 'int',
 'district_code': 'int',
 'lga': 'enum',
 'ward': 'enum',
 'population': 'int',
 'public_meeting': 'enum',
 'recorded_by': 'enum',
 'scheme_management': 'enum',
 'scheme_name': 'enum',
 'permit': 'enum',
 'construction_year': 'int',
 'extraction_type': 'enum',
 'extraction_type_group': 'enum',
 'extraction_type_class': 'enum',
 'management': 'enum',
 'management_group': 'enum',
 'payment': 'enum',
 'payment_type': 'enum',
 'water_quality': 'enum',
 'quality_group': 'enum',
 'quantity': 'enum',
 'quantity_group': 'enum',
 'source': 'enum',
 'source_type': 'enum',
 'source_class': 'enum',
 'waterpoint_type': 'enum',
 'waterpoint_type_group': 'enum',
 'type': 'enum',
 'antiguedad': 'int',
 'status_group': 'enum'}

In [34]:
df_h2o.head()

ValueError: could not convert string to float: 'True'

ValueError: could not convert string to float: 'True'

In [35]:
df_h2o.describe()

Unnamed: 0,amount_tsh,funder,gps_height,installer,longitude,latitude,wpt_name,num_private,basin,subvillage,region,region_code,district_code,lga,ward,population,public_meeting,recorded_by,scheme_management,scheme_name,permit,construction_year,extraction_type,extraction_type_group,extraction_type_class,management,management_group,payment,payment_type,water_quality,quality_group,quantity,quantity_group,source,source_type,source_class,waterpoint_type,waterpoint_type_group,type,antiguedad,status_group
type,real,enum,int,enum,real,real,enum,int,enum,enum,enum,int,int,enum,enum,int,enum,enum,enum,enum,enum,int,enum,enum,enum,enum,enum,enum,enum,enum,enum,enum,enum,enum,enum,enum,enum,enum,enum,int,enum
mins,0.0,,-90.0,,0.0,-11.64944018,,0.0,,,,1.0,0.0,,,0.0,,,,,,0.0,,,,,,,,,,,,,,,,,,-7.0,
mean,318.68570437710434,,665.6673131313141,,34.07426239589904,-5.7017710265220325,,0.4623299663299663,,,,15.265414141414148,5.629077441077439,,,180.75082828282842,,,,,,1298.4636498316515,,,,,,,,,,,,,,,,,,713.4549090909094,
maxs,350000.0,,2777.0,,40.34519307,-2e-08,,1776.0,,,,99.0,80.0,,,30500.0,,,,,,2013.0,,,,,,,,,,,,,,,,,,2013.0,
sigma,2906.7623643510474,,692.7610332644816,,6.5725188106281465,2.94496912260335,,11.537878627650082,,,,17.508906841729914,9.64163557030592,,,471.08612000428775,,,,,,952.3493754104975,,,,,,,,,,,,,,,,,,952.0115205105405,
zeros,52049,,25649,,2269,0,,73299,,,,0,27,,,26834,,,,,,25969,,,,,,,,,,,,,,,,,,729,
missing,0,980,0,980,0,0,2,0,0,0,0,0,0,0,0,0,0,0,1,794,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
0,6000.0,Roman,1390.0,Roman,34.93809275,-9.85632177,none,0.0,Lake Nyasa,Mnyusi B,Iringa,11.0,5.0,Ludewa,Mundindi,109.0,True,GeoData Consultants Ltd,VWC,Roman,False,1999.0,gravity,gravity,gravity,vwc,user-group,pay annually,annually,soft,good,enough,enough,spring,spring,groundwater,communal standpipe,communal standpipe,train,12.0,functional
1,0.0,Grumeti,1399.0,GRUMETI,34.6987661,-2.14746569,Zahanati,0.0,Lake Victoria,Nyamara,Mara,20.0,2.0,Serengeti,Natta,280.0,,GeoData Consultants Ltd,Other,,True,2010.0,gravity,gravity,gravity,wug,user-group,never pay,never pay,soft,good,insufficient,insufficient,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe,train,3.0,functional
2,25.0,Lottery Club,686.0,World vision,37.46066446,-3.82132853,Kwa Mahundi,0.0,Pangani,Majengo,Manyara,21.0,4.0,Simanjiro,Ngorika,250.0,True,GeoData Consultants Ltd,VWC,Nyumba ya mungu pipe scheme,True,2009.0,gravity,gravity,gravity,vwc,user-group,pay per bucket,per bucket,soft,good,enough,enough,dam,dam,surface,communal standpipe multiple,communal standpipe,train,4.0,functional


In [36]:
df_h2o.shape

(74250, 41)

#### Muestreo

In [46]:
df2_h2o = df_h2o[df_h2o['type']=='train']
df2_h2o.shape

(59400, 41)

In [47]:
df_train_h2o, df_test_h2o = df2_h2o.split_frame(ratios=[0.8],destination_frames= ["df_train_h2o",
                                                        "df_test_h2o"],
                                                        seed = 123)

In [48]:
from h2o.estimators.gbm import H2OGradientBoostingEstimator

H2OGradientBoostingEstimator(fold_assignment = "stratified", nfolds = 5, seed = 1234)

In [58]:
model_h2o = H2ORandomForestEstimator(ntrees=10,
                                    max_depth=5,
                                    min_rows=10,
                                    calibrate_model=False,
                                    calibration_frame=df_test_h2o,
                                    binomial_double_trees=False)

In [59]:
df_train_h2o['status_group'].asfactor()

status_group
functional
functional
functional
non functional
functional
functional
non functional
non functional
functional
functional


In [60]:
model_h2o.train(x=df_train_h2o.drop('status_group').col_names,
               y='status_group',
               training_frame=df_train_h2o,
               validation_frame=df_test_h2o)

drf Model Build progress: |



██████████████████████████████████████████████████████| (done) 100%


Unnamed: 0,number_of_trees,number_of_internal_trees,model_size_in_bytes,min_depth,max_depth,mean_depth,min_leaves,max_leaves,mean_leaves
,10.0,30.0,15871.0,5.0,5.0,5.0,29.0,32.0,30.766666

functional,functional needs repair,nan,non functional,Error,Rate
24395.0,33.0,0.0,1153.0,0.0463625,"1,186 / 25,581"
2940.0,143.0,0.0,346.0,0.9582969,"3,286 / 3,429"
0.0,0.0,0.0,0.0,,0 / 0
8021.0,69.0,0.0,9933.0,0.4488709,"8,090 / 18,023"
35356.0,245.0,0.0,11432.0,0.2670891,"12,562 / 47,033"

k,hit_ratio
1,0.7329109
2,0.9392129
3,1.0
4,1.0

functional,functional needs repair,nan,non functional,Error,Rate
6164.0,5.0,0.0,261.0,0.0413686,"266 / 6,430"
736.0,33.0,0.0,83.0,0.9612676,819 / 852
0.0,0.0,0.0,0.0,,0 / 0
2042.0,11.0,0.0,2543.0,0.4466928,"2,053 / 4,596"
8942.0,49.0,0.0,2887.0,0.2641859,"3,138 / 11,878"

k,hit_ratio
1,0.7358141
2,0.9404782
3,1.0
4,1.0

Unnamed: 0,timestamp,duration,number_of_trees,training_rmse,training_logloss,training_classification_error,training_auc,training_pr_auc,validation_rmse,validation_logloss,validation_classification_error,validation_auc,validation_pr_auc
,2023-04-09 17:00:38,0.155 sec,0.0,,,,,,,,,,
,2023-04-09 17:00:39,1.039 sec,1.0,0.4786261,0.6944503,0.2709136,,,0.4801228,0.7112308,0.2752989,,
,2023-04-09 17:00:39,1.403 sec,2.0,0.4697876,0.6643203,0.2712168,,,0.466701,0.6486516,0.2748779,,
,2023-04-09 17:00:39,1.665 sec,3.0,0.4694635,0.6628614,0.2716912,,,0.464201,0.6416344,0.2700791,,
,2023-04-09 17:00:40,1.986 sec,4.0,0.4684252,0.6584042,0.2702328,,,0.4629965,0.6387293,0.2661222,,
,2023-04-09 17:00:40,2.303 sec,5.0,0.4666637,0.6530965,0.270092,,,0.4617734,0.635392,0.2673851,,
,2023-04-09 17:00:40,2.496 sec,6.0,0.4663196,0.6523301,0.2694681,,,0.4608598,0.6328368,0.2683954,,
,2023-04-09 17:00:40,2.596 sec,7.0,0.4657716,0.6495432,0.2689542,,,0.4607838,0.633204,0.2658697,,
,2023-04-09 17:00:40,2.692 sec,8.0,0.4654053,0.6470975,0.2695425,,,0.4611087,0.6339009,0.2675535,,
,2023-04-09 17:00:41,2.847 sec,9.0,0.4645824,0.6443364,0.2688876,,,0.460472,0.632252,0.267806,,

variable,relative_importance,scaled_importance,percentage
quantity_group,14308.2109375,1.0,0.2534605
quantity,10577.0175781,0.7392271,0.1873648
lga,7384.0712891,0.5160723,0.1308039
extraction_type_group,4887.1884766,0.3415653,0.0865733
extraction_type,4645.9653320,0.3247062,0.0823002
waterpoint_type,2188.9504395,0.1529856,0.0387758
waterpoint_type_group,2084.7983398,0.1457064,0.0369308
extraction_type_class,1431.4245605,0.1000422,0.0253567
payment_type,1235.8486328,0.0863734,0.0218922
antiguedad,1146.3652344,0.0801194,0.0203071


In [65]:
predictors = df_h2o[df_h2o['type']=='predict']
predictors = predictors.drop('status_group', axis = 1)
predictors.shape

(14850, 40)

In [66]:
predictions = model_h2o.predict(predictors)

drf prediction progress: |███████████████████████████████████████████████████████| (done) 100%


In [67]:
predictions

predict,functional,functional needs repair,nan,non functional
non functional,0.246071,0.0357345,0,0.718195
functional,0.703613,0.0552083,0,0.241179
non functional,0.324566,0.0828655,0,0.592569
non functional,0.0119527,0.0161843,0,0.971863
functional,0.783594,0.0932738,0,0.123132
functional,0.711717,0.0498857,0,0.238397
functional,0.604928,0.0158229,0,0.379249
non functional,0.0774847,0.0142709,0,0.908244
functional,0.509702,0.127725,0,0.362573
functional,0.846237,0.0312429,0,0.122521


In [85]:
out['status_group']

id
50785    predicted label
51630    predicted label
17168    predicted label
45559    predicted label
49871    predicted label
              ...       
39307    predicted label
18990    predicted label
28749    predicted label
33492    predicted label
68707    predicted label
Name: status_group, Length: 14850, dtype: object

In [96]:
predictions['predict']

predict
non functional
functional
non functional
non functional
functional
functional
functional
non functional
functional
functional


In [97]:
out['status_group'] = predictions['predict'].as_data_frame()['predict'].values

In [98]:
out['status_group']

id
50785    non functional
51630        functional
17168    non functional
45559    non functional
49871        functional
              ...      
39307        functional
18990        functional
28749        functional
33492        functional
68707    non functional
Name: status_group, Length: 14850, dtype: object

In [99]:
out.head()

Unnamed: 0_level_0,status_group
id,Unnamed: 1_level_1
50785,non functional
51630,functional
17168,non functional
45559,non functional
49871,functional


In [100]:
out.to_csv('submission_h2o.csv', index=True)