This script shows how to find the weights for a weighted average (blend) of several models' predictions by minimizing the mean absolute error.

In [None]:
import numpy as np
np.random.seed(123)
import pandas as pd
import xgboost as xgb
import gc
from datetime import datetime
from scipy.optimize import minimize
from sklearn.metrics import mean_absolute_error


In [None]:
# Training data; it must contain the 'loss' column the blend is scored against.
train = pd.read_csv("./input/train.csv")

# Per-model predictions on the training rows (file names suggest out-of-bag /
# out-of-fold output -- TODO confirm). Order matters: later code refers to
# these frames positionally as xgb1..xgb6.
# NOTE(review): unlike the Keras files below, these are not sorted by 'id';
# presumably they are already in id order -- verify.
xgb_oob_paths = (
    "./xgboost/oob_xgb_fairobj_1124.252199_2016-12-09-22-05.csv",
    "./xgboost/oob_power3_xgb_fairobj_1125.110344_2016-12-08-22-18.csv",
    "./xgboost/oob_xgb_fairobj_1124.738622_2016-12-08-03-06.csv",
    "./xgboost/oob_xgb_fairobj_1125.099072_2016-12-07-21-03.csv",
    "./xgboost/oob_power3_xgb_fairobj_1124.879544_2016-12-08-07-07.csv",
    "./xgboost/oob_xgb_fairobj_1124.456032_2016-12-08-14-34.csv",
)
xgb1, xgb2, xgb3, xgb4, xgb5, xgb6 = [
    pd.read_csv(path) for path in xgb_oob_paths
]

# Keras predictions are explicitly re-ordered by 'id' after loading.
keras_oob_paths = (
    "./keras/preds_oob1130.58667325_from fourm.csv",
    "./keras/preds_oob1130.400_different seed.csv",
    "./keras/preds_oob1131.39946641_seed400-250-50_no_early_stop.csv",
    "./keras/preds_oob1132.36796706_no_log_400_200_50.csv",
)
keras1, keras2, keras3, keras4 = [
    pd.read_csv(path).sort_values('id') for path in keras_oob_paths
]

In [None]:
def mae_func(weights):
    """Objective for scipy's ``minimize``: MAE of the weighted blend.

    ``weights`` arrives as a numpy array with one entry per element of the
    module-level ``predictions`` list. Each prediction is scaled by its
    weight, the scaled predictions are summed, and the result is scored
    against the module-level ``Y_values`` with ``mean_absolute_error``.
    """
    blended = sum(w * pred for w, pred in zip(weights, predictions))
    return mean_absolute_error(Y_values, blended)

# def mae_func(weights):
#     ''' scipy minimize will pass the weights as a numpy array '''
#     final_prediction = 0
#     for weight, prediction in zip(weights, predictions):
#             final_prediction += prediction**weight

#     final_prediction /= len(weights)
#     return mean_absolute_error(Y_values, final_prediction)

def timer(start_time=None):
    """Start a stopwatch or report the time elapsed since it was started.

    Called with no (or a falsy) ``start_time`` it returns ``datetime.now()``,
    which the caller keeps as the start mark. Called with that mark it prints
    the elapsed hours, minutes and seconds and returns ``None``.
    """
    if not start_time:
        return datetime.now()

    elapsed = (datetime.now() - start_time).total_seconds()
    hours, remainder = divmod(elapsed, 3600)
    minutes, seconds = divmod(remainder, 60)
    print(' Time taken: %i hours %i minutes and %s seconds.' % (hours, minutes, round(seconds, 2)))

In [None]:
# Ground-truth targets that every candidate blend is scored against.
Y_values = train['loss'].values

# predictions: one array per model, filled in below.
# lls / wghts: the objective value and the weight vector found by each
# optimisation run, kept in matching order.
predictions, lls, wghts = [], [], []

In [None]:
# Collect every model's 'loss' column as a plain numpy array. The order here
# (six XGBoost models, then four Keras models) fixes which weight belongs to
# which model.
predictions.extend(
    np.array(frame.loss)
    for frame in (
        xgb1, xgb2, xgb3, xgb4, xgb5, xgb6,
        keras1, keras2, keras3, keras4,
    )
)


In [None]:
# Random-restart search for the blend weights: run the optimiser 100 times
# from different random starting points and keep the run with the lowest MAE.
start_time = timer(None)

# Every weight is box-constrained to [0, 1]; the bounds do not change between
# restarts, so build them once.
bounds = [(0, 1)] * len(predictions)

for i in range(100):
    if i % 50 == 0:
        # Lightweight progress report every 50 restarts.
        print(' Restart {iteration} of 100'.format(iteration=i))

    starting_values = np.random.uniform(size=len(predictions))

    # L-BFGS-B accepts box bounds only, so no constraint on the sum of the
    # weights is applied here; the weights are not forced to sum to 1.
    res = minimize(mae_func, starting_values, method='L-BFGS-B',
                   bounds=bounds, options={'disp': False, 'maxiter': 10000})

    lls.append(res['fun'])
    wghts.append(res['x'])
    # Uncomment the next line if you want to see the weights and scores calculated in real time
    #print('Weights: {weights}  Score: {score}'.format(weights=res['x'], score=res['fun']))

# Pick the restart with the smallest objective value and its weight vector.
bestSC = np.min(lls)
bestWght = wghts[np.argmin(lls)]

print('\n Ensemble Score: {best_score}'.format(best_score=bestSC))
print('\n Best Weights: {weights}'.format(weights=bestWght))

timer(start_time)

In [None]:
# Test-set predictions from the same ten models, in the same order as the
# training-side predictions so that each learned weight lines up with its model.
# NOTE(review): the XGBoost files are not sorted by 'id' while the Keras files
# are; presumably the XGBoost files are already in id order -- verify.
xgb_test_paths = (
    "./xgboost/submission_5fold-average-xgb_fairobj_1124.252199_2016-12-02-11-19.csv",
    "./xgboost/test_power3_xgb_fairobj_1125.110344_2016-12-08-22-18.csv",
    "./xgboost/test_xgb_fairobj_1124.738622_2016-12-08-03-06.csv",
    "./xgboost/test_xgb_fairobj_1125.099072_2016-12-07-21-03.csv",
    "./xgboost/test_power3_xgb_fairobj_1124.879544_2016-12-08-07-07.csv",
    "./xgboost/test_xgb_fairobj_1124.456032_2016-12-08-14-34.csv",
)
xgb1_test, xgb2_test, xgb3_test, xgb4_test, xgb5_test, xgb6_test = [
    pd.read_csv(path) for path in xgb_test_paths
]

keras_test_paths = (
    "./keras/submission_keras_shift_perm1130.58667325.csv",
    "./keras/submission_keras_shift_perm1130.400.csv",
    "./keras/submission_keras_shift_perm1131.39946641.csv",
    "./keras/submission_keras_shift_perm1132.36796706.csv",
)
keras1_test, keras2_test, keras3_test, keras4_test = [
    pd.read_csv(path).sort_values('id') for path in keras_test_paths
]


In [None]:
# Apply the best weights to the test predictions. The frames are listed in the
# same order the weights were learned in, and the terms are added left to right.
test_frames = (
    xgb1_test, xgb2_test, xgb3_test, xgb4_test, xgb5_test, xgb6_test,
    keras1_test, keras2_test, keras3_test, keras4_test,
)
weighted_terms = [
    weight * np.array(frame.loss)
    for weight, frame in zip(bestWght, test_frames)
]
combined = weighted_terms[0]
for term in weighted_terms[1:]:
    combined = combined + term


In [None]:
# Pair the blended predictions with the ids from the test file.
# NOTE(review): this assumes the rows of test.csv are in the same order as the
# prediction arrays -- confirm.
test_frame = pd.read_csv('./data_prep/input/test.csv')
ids = test_frame['id']
df = pd.DataFrame({'id': ids, 'loss': combined})


In [None]:
# Write the submission; the best ensemble score is embedded in the file name.
submission_name = 'allstate' + str(bestSC) + '.csv'
df.to_csv(submission_name, index=False)