In [61]:
import pandas as pd
import numpy as np 
from pygam import LinearGAM, GAM, s, f, te, l
from sklearn.metrics import mean_squared_error
from math import sqrt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
import re
from sklearn.model_selection import GridSearchCV
import h2o
from h2o.estimators import H2OGradientBoostingEstimator
from h2o.estimators.random_forest import H2ORandomForestEstimator
from h2o.grid.grid_search import H2OGridSearch
from scipy.optimize import linprog

In [22]:
# Read the training table (per the file name, already imputed) from a path
# relative to the notebook's working directory.
train_df = pd.read_csv("Data/train_imputed.csv")

In [23]:
# Add the base-10 log of the target `y` as a new column; the GBM cells fit on
# this column and map predictions back to the original scale with `10 ** pred`.
# NOTE(review): np.log10 gives -inf/NaN for y <= 0 — TODO confirm y is strictly positive.
train_df['y_log10'] = np.log10(train_df['y'])

In [54]:
# Random-discrete hyper-parameter search for an H2O GBM that predicts log10(y).
# The grid is sorted by MSE (ascending), and the first model is scored on a 20%
# hold-out after mapping its predictions back to the original scale of y.
#
# NOTE(review): the recorded run of this cell failed inside H2O with a JVM heap
# assertion (MEM_MAX ~3.7 GB) while the grid was building. Presumably the deep
# trees in the search space combined with 5-fold CV and retained CV predictions
# exhausted the cluster — confirm, then give the cluster more memory or narrow
# `max_depth` before re-running.
df = train_df.copy()

h2o.init()

h2o_df = h2o.H2OFrame(df)

# Fixed seed so the 80/20 split is repeatable.
train, test = h2o_df.split_frame(ratios=[.8], seed=1234)

response = 'y_log10'

# Every column except the raw target and its log transform is a candidate feature.
predictors = df.columns[~df.columns.isin(['y', response])]

hyper_params_tune = {'max_depth' : list(range(1,40,1)),
            'sample_rate': [x/100. for x in range(20,101)],
            'col_sample_rate' : [x/100. for x in range(20,101)],
            'col_sample_rate_per_tree': [x/100. for x in range(20,101)],
            'col_sample_rate_change_per_level': [x/100. for x in range(90,111)],
            'min_rows': [1,2,4,8,16,25],
            'nbins': [2**x for x in range(4,9)],
            'nbins_cats': [2**x for x in range(4,12)],
            'min_split_improvement': [0,1e-8,1e-6,1e-4],
            'histogram_type': ["UniformAdaptive","QuantilesGlobal","RoundRobin"]}

search_criteria_tune = {'strategy': "RandomDiscrete",
                   'max_runtime_secs': 10000,  ## cap on the whole search: 10000 s (about 2 h 47 min)
                   'max_models': 100,  ## build no more than 100 models
                   'seed' : 1234,
                   'stopping_rounds' : 5,
                   'stopping_metric' : "rmse",
                   'stopping_tolerance': 1e-5
                   }

# Base estimator whose fixed settings are shared by every model in the grid.
# It gets its own name so the grid object below does not overwrite it.
gbm_base = H2OGradientBoostingEstimator(nfolds=5,
                                        learn_rate=0.05,
                                        learn_rate_annealing = 0.99,
                                        score_tree_interval = 10,
                                        stopping_rounds = 5,
                                        stopping_metric = "rmse",
                                        stopping_tolerance = 1e-5,
                                        ntrees= 1000,
                                        seed=1111,
                                        keep_cross_validation_predictions = True,
                                        distribution ='gaussian')

# `h2ogbm` still ends up bound to the grid, as before.
h2ogbm = H2OGridSearch(gbm_base, grid_id='gbm_.{0}'.format(response), hyper_params=hyper_params_tune,
                       search_criteria = search_criteria_tune)

h2ogbm.train(x=predictors, y=response, training_frame=train, seed=1111)

# Lowest MSE first, so index 0 is the best model found.
gbm_gridperf1 = h2ogbm.get_grid(sort_by='mse', decreasing=False)

bestgbm = gbm_gridperf1.models[0]

pred = h2o.as_list(bestgbm.predict(test[:, list(predictors)]), use_pandas=True)

# Undo the log10 transform so the error is measured in the units of `y`.
pred = np.array(10 ** pred['predict'])

# sqrt(MSE) instead of `squared=False`: that keyword was deprecated in
# scikit-learn 1.4 and later removed, while this form works on every version.
rmse = sqrt(mean_squared_error(h2o.as_list(test['y']), pred))
print('RMSE for {0}: {1}'.format('y',rmse))


Checking whether there is an H2O instance running at http://localhost:54321 . connected.


0,1
H2O_cluster_uptime:,2 hours 18 mins
H2O_cluster_timezone:,Etc/UTC
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.30.0.3
H2O_cluster_version_age:,1 month and 3 days
H2O_cluster_name:,H2O_from_python_datageek_q5dlzr
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,1.616 Gb
H2O_cluster_total_cores:,2
H2O_cluster_allowed_cores:,2


Parse progress: |█████████████████████████████████████████████████████████| 100%
gbm Grid Build progress: |███████████████████████████████████ (failed)


OSError: Job with key $03017f00000132d4ffffffff$_abf6e611d996edd7e158786044cc407d failed with an exception: java.lang.AssertionError: I am really confused about the heap usage; MEM_MAX=3739746304 heapUsedGC=4003317248
stacktrace: 
java.lang.AssertionError: I am really confused about the heap usage; MEM_MAX=3739746304 heapUsedGC=4003317248
	at water.MemoryManager.set_goals(MemoryManager.java:98)
	at water.MemoryManager.malloc(MemoryManager.java:267)
	at water.MemoryManager.malloc(MemoryManager.java:223)
	at water.MemoryManager.malloc8d(MemoryManager.java:289)
	at hex.tree.DTree.findBestSplitPoint(DTree.java:860)
	at hex.tree.DTree$DecidedNode$FindSplits.computeSplit(DTree.java:539)
	at hex.tree.DTree$DecidedNode$FindSplits.compute(DTree.java:522)
	at jsr166y.RecursiveAction.exec(RecursiveAction.java:160)
	at jsr166y.ForkJoinTask.doExec(ForkJoinTask.java:263)
	at jsr166y.ForkJoinTask.doJoin(ForkJoinTask.java:344)
	at jsr166y.ForkJoinTask.invokeAll(ForkJoinTask.java:806)
	at hex.tree.DTree$DecidedNode.bestCol(DTree.java:500)
	at hex.tree.DTree$DecidedNode.<init>(DTree.java:547)
	at hex.tree.SharedTree.makeDecided(SharedTree.java:675)
	at hex.tree.SharedTree$ScoreBuildOneTree.onCompletion(SharedTree.java:598)
	at jsr166y.CountedCompleter.__tryComplete(CountedCompleter.java:425)
	at jsr166y.CountedCompleter.tryComplete(CountedCompleter.java:383)
	at water.LocalMR.compute2(LocalMR.java:91)
	at water.H2O$H2OCountedCompleter.compute(H2O.java:1557)
	at jsr166y.CountedCompleter.exec(CountedCompleter.java:468)
	at jsr166y.ForkJoinTask.doExec(ForkJoinTask.java:263)
	at jsr166y.ForkJoinPool$WorkQueue.popAndExecAll(ForkJoinPool.java:904)
	at jsr166y.ForkJoinPool$WorkQueue.runTask(ForkJoinPool.java:977)
	at jsr166y.ForkJoinPool.runWorker(ForkJoinPool.java:1477)
	at jsr166y.ForkJoinWorkerThread.run(ForkJoinWorkerThread.java:104)


  """Entry point for launching an IPython kernel.


H2O session _sid_97c5 closed.


In [57]:
# Fit one GBM with fixed hyper-parameters on log10(y) using 25 selected
# features, then report hold-out RMSE in the original units of y.
h2o.init()
df = train_df.copy()


h2o_df = h2o.H2OFrame(df)

# Fixed seed so the 80/20 split is repeatable.
train, test = h2o_df.split_frame(ratios=[.8], seed=1234)

# NOTE(review): `importance` is not defined in any cell visible here.
# Presumably it is a ranked sequence of column names — confirm it is created
# earlier so this cell survives Restart & Run All.
predictors = list(importance[:25])
response = 'y_log10'

# Build and train the model:
pros_gbm = H2OGradientBoostingEstimator(
                                        nfolds=5,
                                        seed=1111,
                                        keep_cross_validation_predictions = True,
                                        max_depth = 7,
                                        sample_rate = 0.64,
                                        col_sample_rate = 0.6,
                                        col_sample_rate_per_tree = 0.89,
                                        col_sample_rate_change_per_level = 1.0,
                                        min_rows = 1.0,
                                        nbins = 16,
                                        nbins_cats = 256,
                                        min_split_improvement = 0.0,
                                        histogram_type = 'UniformAdaptive',
                                        learn_rate=0.05,
                                        learn_rate_annealing = 0.99,
                                        score_tree_interval = 10,
                                        stopping_rounds = 5,
                                        stopping_metric = "rmse",
                                        stopping_tolerance = 1e-4,
                                        ntrees= 1000,
                                        distribution ='gaussian'
                                       )
pros_gbm.train(x=predictors, y=response, training_frame=train)

pred = h2o.as_list(pros_gbm.predict(test[:, list(predictors)]), use_pandas=True)

# Undo the log10 transform so the error is measured in the units of `y`.
pred = np.array(10 ** pred['predict'])

# sqrt(MSE) instead of `squared=False`: that keyword was deprecated in
# scikit-learn 1.4 and later removed, while this form works on every version.
rmse = sqrt(mean_squared_error(h2o.as_list(test['y']), pred))
print('RMSE for {0}: {1}'.format('y',rmse))

Checking whether there is an H2O instance running at http://localhost:54321 ..... not found.
Attempting to start a local H2O server...
  Java Version: openjdk version "1.8.0_252"; OpenJDK Runtime Environment (build 1.8.0_252-8u252-b09-1~18.04-b09); OpenJDK 64-Bit Server VM (build 25.252-b09, mixed mode)
  Starting server from /home/datageek/.local/lib/python3.6/site-packages/h2o/backend/bin/h2o.jar
  Ice root: /tmp/tmpqu_vesm2
  JVM stdout: /tmp/tmpqu_vesm2/h2o_datageek_started_from_python.out
  JVM stderr: /tmp/tmpqu_vesm2/h2o_datageek_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.


0,1
H2O_cluster_uptime:,01 secs
H2O_cluster_timezone:,Etc/UTC
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.30.0.3
H2O_cluster_version_age:,1 month and 3 days
H2O_cluster_name:,H2O_from_python_datageek_c9ibdf
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,3.483 Gb
H2O_cluster_total_cores:,2
H2O_cluster_allowed_cores:,2


Parse progress: |█████████████████████████████████████████████████████████| 100%
gbm Model Build progress: |███████████████████████████████████████████████| 100%
gbm prediction progress: |████████████████████████████████████████████████| 100%
RMSE for y: 0.007716646339543673


In [59]:
# Refit the fixed-parameter GBM on the full training frame (no hold-out),
# then score the test file and store predictions in the original units of y.
pros_gbm.train(x=predictors, y=response, training_frame=h2o_df)

test_df = pd.read_csv("Data/test_imputed.csv")

# `df` ends up bound to the H2O copy of the test table, as before.
df = h2o.H2OFrame(test_df.copy())

pred = np.array(
    10 ** h2o.as_list(
        pros_gbm.predict(df[:, list(predictors)]),
        use_pandas=True,
    )['predict']
)
test_df['pred_y'] = pred

gbm Model Build progress: |███████████████████████████████████████████████| 100%
Parse progress: |█████████████████████████████████████████████████████████| 100%
gbm prediction progress: |████████████████████████████████████████████████| 100%


In [99]:
# Inputs for the linear program solved in the following cell.
# c   : per-row objective weight, (3 - ln(pred_y + 0.01)) squared
# E   : 1 where 'existence expectancy index' is below 0.7, else 0
# Tot : all-ones row used for the sum(x) equality constraint
c = np.square(3 - np.log(test_df['pred_y'] + 0.01)).values
E = test_df['existence expectancy index'].lt(0.7).astype(int).values
Tot = np.ones(test_df.shape[0])

In [100]:
# Maximise c·x (linprog minimises, hence -c) subject to:
#   E·x >= 5000      (written as -E·x <= -5000)
#   sum(x) == 50000
#   0 <= x_i <= 100 for every row
res = linprog(
    -c,
    A_ub=np.array([-E]),
    b_ub=np.array([-5000]),
    A_eq=np.array([Tot]),
    b_eq=np.array([50000]),
    bounds=(0, 100),
)

# Fail loudly if the solve was infeasible or did not converge, so a bad
# solution cannot flow silently into the submission file.
if not res.success:
    raise RuntimeError('linprog did not find an optimal allocation: {0}'.format(res.message))

In [101]:
# Show the solved allocation, one value per test row. In the recorded output
# the entries sit within ~1e-4 of the bounds 0 and 100 rather than exactly on
# them — presumably solver tolerance; confirm before treating them as exact.
res.x

array([9.99999681e+01, 9.99999694e+01, 9.99999694e+01, 9.99999694e+01,
       9.99999728e+01, 9.99999717e+01, 9.99999720e+01, 9.99999682e+01,
       9.99999680e+01, 9.99999687e+01, 9.99999704e+01, 9.99999730e+01,
       9.99999681e+01, 9.99999699e+01, 9.99999723e+01, 9.99999705e+01,
       9.99999730e+01, 9.99999720e+01, 9.99999700e+01, 9.99999698e+01,
       9.99999704e+01, 9.99999728e+01, 9.99999707e+01, 9.99999729e+01,
       9.99999683e+01, 9.99999730e+01, 9.99999721e+01, 9.99999704e+01,
       9.99999687e+01, 9.99999714e+01, 9.99999692e+01, 9.99999694e+01,
       9.99999686e+01, 2.27721989e-05, 9.99999732e+01, 9.99999701e+01,
       9.99999694e+01, 9.99999707e+01, 9.99999729e+01, 9.99999721e+01,
       9.99999692e+01, 9.99999730e+01, 9.99999696e+01, 9.99999684e+01,
       2.93388717e-05, 9.99999722e+01, 9.99999697e+01, 9.99999722e+01,
       9.99999699e+01, 9.99999732e+01, 9.99999689e+01, 9.99999686e+01,
       9.99999730e+01, 9.99999708e+01, 9.99999727e+01, 9.99999681e+01,
      

In [102]:
# Assemble the submission table: row id, model prediction on the original
# scale of y, and the LP allocation for that row.
submission_columns = {
    'index': test_df.index,
    'pred': test_df['pred_y'],
    'opt_pred': res.x,
}
submission1 = pd.DataFrame(submission_columns)

In [103]:
# Write the submission without the DataFrame index; the row id is already
# carried by the explicit 'index' column.
submission1.to_csv("Data/sub1.csv", index = False)