In [99]:
# Libraries
import numpy as np
import pandas as pd
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from sklearn.preprocessing import StandardScaler
import plotly.graph_objs as go
from plotly.subplots import make_subplots
from plotly.offline import init_notebook_mode
from sklearn.model_selection import train_test_split

init_notebook_mode(connected=True)

import seaborn as sns
import matplotlib.pyplot as plt
% matplotlib inline

from tqdm import tqdm

tqdm.pandas()

import warnings

warnings.filterwarnings("ignore")
import gc

from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder

import optuna

bold = ['\033[1m', '\033[0m']

UsageError: Line magic function `%` not found.


We will be predicting  the median house value for California districts, expressed in hundreds of thousands of dollars ($100,000). The independent variables at our disposal are:

MedInc - median income in block group
HouseAge - median house age in block group
AveRooms - average number of rooms per household
AveBedrms - average number of bedrooms per household
Population - block group population
AveOccup - average number of household members
Latitude - block group latitude
Longitude - block group longitude
The evaluation metric is going the be the standard Root Mean Squared Error (RMSE) and the useful thing to keep in mind about this metric, as it involves a squared term, is that outliers, or predictions that err a lot, are disproportionately penalized!

In [64]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')



In [65]:
from sklearn.datasets import fetch_california_housing as fch

sklearn_df = pd.DataFrame(fch()['data'], columns=fch()['feature_names'])
sklearn_df['MedHouseVal'] = fch()['target']

# Show all columns
pd.set_option('display.max_columns', None)

In [66]:
sklearn_df['is_generated'] = 0
train['is_generated'] = 1
test['is_generated'] = 1

In [67]:
train.drop('id', axis=1, inplace=True)
train = pd.concat([train, sklearn_df],
                  ignore_index=True)
test.drop('id', axis=1, inplace=True)

In [68]:
train

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedHouseVal,is_generated
0,2.3859,15.0,3.827160,1.112100,1280.0,2.486989,34.60,-120.12,0.980,1
1,3.7188,17.0,6.013373,1.054217,1504.0,3.813084,38.69,-121.22,0.946,1
2,4.7750,27.0,6.535604,1.103175,1061.0,2.464602,34.71,-120.45,1.576,1
3,2.4138,16.0,3.350203,0.965432,1255.0,2.089286,32.66,-117.09,1.336,1
4,3.7500,52.0,4.284404,1.069246,1793.0,1.604790,37.80,-122.41,4.500,1
...,...,...,...,...,...,...,...,...,...,...
57772,1.5603,25.0,5.045455,1.133333,845.0,2.560606,39.48,-121.09,0.781,0
57773,2.5568,18.0,6.114035,1.315789,356.0,3.122807,39.49,-121.21,0.771,0
57774,1.7000,17.0,5.205543,1.120092,1007.0,2.325635,39.43,-121.22,0.923,0
57775,1.8672,18.0,5.329513,1.171920,741.0,2.123209,39.43,-121.32,0.847,0


In [69]:
def crt_crds(df):
    df['rot_15_x'] = (np.cos(np.radians(15)) * df['Longitude']) + (np.sin(np.radians(15)) * df['Latitude'])

    df['rot_15_y'] = (np.cos(np.radians(15)) * df['Latitude']) + (np.sin(np.radians(15)) * df['Longitude'])

    df['rot_30_x'] = (np.cos(np.radians(30)) * df['Longitude']) + (np.sin(np.radians(30)) * df['Latitude'])

    df['rot_30_y'] = (np.cos(np.radians(30)) * df['Latitude']) + (np.sin(np.radians(30)) * df['Longitude'])

    df['rot_45_x'] = (np.cos(np.radians(45)) * df['Longitude']) + (np.sin(np.radians(45)) * df['Latitude'])
    return df


train = crt_crds(train)
test = crt_crds(test)

In [70]:
train

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedHouseVal,is_generated,rot_15_x,rot_15_y,rot_30_x,rot_30_y,rot_45_x
0,2.3859,15.0,3.827160,1.112100,1280.0,2.486989,34.60,-120.12,0.980,1,-107.071871,2.331690,-86.726972,-30.095521,-60.471772
1,3.7188,17.0,6.013373,1.054217,1504.0,3.813084,38.69,-121.22,0.946,1,-107.075820,5.997626,-85.634599,-27.103477,-58.357523
2,4.7750,27.0,6.535604,1.103175,1061.0,2.464602,34.71,-120.45,1.576,1,-107.362157,2.352531,-86.957760,-30.165258,-60.627335
3,2.4138,16.0,3.350203,0.965432,1255.0,2.089286,32.66,-117.09,1.336,1,-104.647225,1.242015,-85.072915,-30.260610,-59.701026
4,3.7500,52.0,4.284404,1.069246,1793.0,1.604790,37.80,-122.41,4.500,1,-108.455620,4.829957,-87.110170,-28.469240,-59.828305
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
57772,1.5603,25.0,5.045455,1.133333,845.0,2.560606,39.48,-121.09,0.781,0,-106.745782,6.794353,-85.127016,-26.354317,-57.706984
57773,2.5568,18.0,6.114035,1.315789,356.0,3.122807,39.49,-121.21,0.771,0,-106.859105,6.772954,-85.225939,-26.405657,-57.784766
57774,1.7000,17.0,5.205543,1.120092,1007.0,2.325635,39.43,-121.22,0.923,0,-106.884294,6.712411,-85.264599,-26.462618,-57.834264
57775,1.8672,18.0,5.329513,1.171920,741.0,2.123209,39.43,-121.32,0.847,0,-106.980886,6.686529,-85.351202,-26.512618,-57.904974


In [71]:
import reverse_geocoder as rg


def geocoder(df):
    coordinates = list(zip(df['Latitude'], df['Longitude']))
    results = rg.search(coordinates)
    return results


results = geocoder(train)
train['place'] = [x['admin2'] for x in results]
results = geocoder(test)
test['place'] = [x['admin2'] for x in results]

places = ['Los Angeles County', 'Orange County', 'Kern County',
          'Alameda County', 'San Francisco County', 'Ventura County',
          'Santa Clara County', 'Fresno County', 'Santa Barbara County',
          'Contra Costa County', 'Yolo County', 'Monterey County',
          'Riverside County', 'Napa County']


def replace(x):
    if x in places:
        return x
    else:
        return 'Other'


train['place'] = train['place'].apply(lambda x: replace(x))
test['place'] = test['place'].apply(lambda x: replace(x))

le = LabelEncoder()
train['place'] = le.fit_transform(train['place'])
test['place'] = le.transform(test['place'])

In [72]:
train

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedHouseVal,is_generated,rot_15_x,rot_15_y,rot_30_x,rot_30_y,rot_45_x,place
0,2.3859,15.0,3.827160,1.112100,1280.0,2.486989,34.60,-120.12,0.980,1,-107.071871,2.331690,-86.726972,-30.095521,-60.471772,11
1,3.7188,17.0,6.013373,1.054217,1504.0,3.813084,38.69,-121.22,0.946,1,-107.075820,5.997626,-85.634599,-27.103477,-58.357523,8
2,4.7750,27.0,6.535604,1.103175,1061.0,2.464602,34.71,-120.45,1.576,1,-107.362157,2.352531,-86.957760,-30.165258,-60.627335,11
3,2.4138,16.0,3.350203,0.965432,1255.0,2.089286,32.66,-117.09,1.336,1,-104.647225,1.242015,-85.072915,-30.260610,-59.701026,8
4,3.7500,52.0,4.284404,1.069246,1793.0,1.604790,37.80,-122.41,4.500,1,-108.455620,4.829957,-87.110170,-28.469240,-59.828305,10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
57772,1.5603,25.0,5.045455,1.133333,845.0,2.560606,39.48,-121.09,0.781,0,-106.745782,6.794353,-85.127016,-26.354317,-57.706984,8
57773,2.5568,18.0,6.114035,1.315789,356.0,3.122807,39.49,-121.21,0.771,0,-106.859105,6.772954,-85.225939,-26.405657,-57.784766,8
57774,1.7000,17.0,5.205543,1.120092,1007.0,2.325635,39.43,-121.22,0.923,0,-106.884294,6.712411,-85.264599,-26.462618,-57.834264,8
57775,1.8672,18.0,5.329513,1.171920,741.0,2.123209,39.43,-121.32,0.847,0,-106.980886,6.686529,-85.351202,-26.512618,-57.904974,8


In [73]:
from sklearn.decomposition import PCA


def pca_crds(df):
    coordinates = df[['Latitude', 'Longitude']].values
    pca_obj = PCA().fit(coordinates)
    df['pca_lat'] = pca_obj.transform(df[['Latitude', 'Longitude']])[:, 0]
    df['pca_lon'] = pca_obj.transform(df[['Latitude', 'Longitude']])[:, 1]
    return df


train = pca_crds(train)
test = pca_crds(test)

In [74]:
train

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedHouseVal,is_generated,rot_15_x,rot_15_y,rot_30_x,rot_30_y,rot_45_x,place,pca_lat,pca_lon
0,2.3859,15.0,3.827160,1.112100,1280.0,2.486989,34.60,-120.12,0.980,1,-107.071871,2.331690,-86.726972,-30.095521,-60.471772,11,-0.339172,-1.087686
1,3.7188,17.0,6.013373,1.054217,1504.0,3.813084,38.69,-121.22,0.946,1,-107.075820,5.997626,-85.634599,-27.103477,-58.357523,8,3.394240,0.912247
2,4.7750,27.0,6.535604,1.103175,1061.0,2.464602,34.71,-120.45,1.576,1,-107.362157,2.352531,-86.957760,-30.165258,-60.627335,11,-0.032996,-1.252781
3,2.4138,16.0,3.350203,0.965432,1255.0,2.089286,32.66,-117.09,1.336,1,-104.647225,1.242015,-85.072915,-30.260610,-59.701026,8,-3.828020,-0.208802
4,3.7500,52.0,4.284404,1.069246,1793.0,1.604790,37.80,-122.41,4.500,1,-108.455620,4.829957,-87.110170,-28.469240,-59.828305,10,3.560861,-0.564383
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
57772,1.5603,25.0,5.045455,1.133333,845.0,2.560606,39.48,-121.09,0.781,0,-106.745782,6.794353,-85.127016,-26.354317,-57.706984,8,3.880794,1.548066
57773,2.5568,18.0,6.114035,1.315789,356.0,3.122807,39.49,-121.21,0.771,0,-106.859105,6.772954,-85.225939,-26.405657,-57.784766,8,3.970272,1.467483
57774,1.7000,17.0,5.205543,1.120092,1007.0,2.325635,39.43,-121.22,0.923,0,-106.884294,6.712411,-85.264599,-26.462618,-57.834264,8,3.933406,1.419101
57775,1.8672,18.0,5.329513,1.171920,741.0,2.123209,39.43,-121.32,0.847,0,-106.980886,6.686529,-85.351202,-26.512618,-57.904974,8,4.001899,1.346241


In [93]:
X = train.drop('MedHouseVal', axis=1)
y_log = np.log1p(train["MedHouseVal"].values)
y = train['MedHouseVal']


In [94]:
Scaler = StandardScaler()
fin_train_scale = Scaler.fit_transform(X)
fin_test_scale = Scaler.transform(test)

In [95]:
feature = []
for col in X.columns:
    feature.append(col)

In [96]:
feature

['MedInc',
 'HouseAge',
 'AveRooms',
 'AveBedrms',
 'Population',
 'AveOccup',
 'Latitude',
 'Longitude',
 'is_generated',
 'rot_15_x',
 'rot_15_y',
 'rot_30_x',
 'rot_30_y',
 'rot_45_x',
 'place',
 'pca_lat',
 'pca_lon']

In [98]:
k = KFold(n_splits=10, random_state=75, shuffle=True)
for train_index, val_index in k.split(train):
    X_train, X_test = train[feature].loc[train_index], train[feature].loc[val_index]
    y_train, y_test = train['MedHouseVal'][train_index], train['MedHouseVal'][val_index]


IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices

In [100]:
X_train, X_test, y_train, y_test = train_test_split(fin_train_scale, y, test_size=0.33, random_state=42)

In [101]:
lgb = LGBMRegressor(max_depth=9,
                    learning_rate=0.01,
                    min_data_in_leaf=36,
                    num_leaves=100,
                    feature_fraction=0.8,
                    bagging_fraction=0.89,
                    bagging_freq=5,
                    lambda_l2=28,
                    seed=75,
                    objective='regression',
                    boosting_type='gbdt',
                    device='gpu',
                    gpu_platform_id=0,
                    gpu_device_id=0,
                    n_jobs=-1,
                    metric='rmse',
                    verbose=-1,
                    n_estimators=7000)

In [104]:
lgb1 = LGBMRegressor(objective='regression', num_leaves=100, learning_rate=0.001, bagging_fraction=0.6,
                     feature_fraction=0.6, bagging_frequency=6, bagging_seed=42, verbosity=-1, random_state=17,
                     n_jobs=-1, metric='rmse', n_estimators=15000, seed=42)

In [103]:
lgb.fit(X_train, y_train, eval_metric='rmse', eval_set=[(X_test, y_test)], early_stopping_rounds=30,
        verbose=150)

[150]	valid_0's rmse: 0.614631
[300]	valid_0's rmse: 0.540496
[450]	valid_0's rmse: 0.522555
[600]	valid_0's rmse: 0.51686
[750]	valid_0's rmse: 0.514298
[900]	valid_0's rmse: 0.512529
[1050]	valid_0's rmse: 0.51134
[1200]	valid_0's rmse: 0.510416
[1350]	valid_0's rmse: 0.509622
[1500]	valid_0's rmse: 0.509035
[1650]	valid_0's rmse: 0.508774
[1800]	valid_0's rmse: 0.508565
[1950]	valid_0's rmse: 0.508252


In [105]:
lgb1.fit(X_train, y_train, eval_metric='rmse', eval_set=[(X_test, y_test)], early_stopping_rounds=30,
         verbose=150)

[150]	valid_0's rmse: 1.04414
[300]	valid_0's rmse: 0.952137
[450]	valid_0's rmse: 0.874998
[600]	valid_0's rmse: 0.809618
[750]	valid_0's rmse: 0.755429
[900]	valid_0's rmse: 0.710203
[1050]	valid_0's rmse: 0.673152
[1200]	valid_0's rmse: 0.643002
[1350]	valid_0's rmse: 0.618779
[1500]	valid_0's rmse: 0.598936
[1650]	valid_0's rmse: 0.582873
[1800]	valid_0's rmse: 0.569937
[1950]	valid_0's rmse: 0.559477
[2100]	valid_0's rmse: 0.551156
[2250]	valid_0's rmse: 0.544359
[2400]	valid_0's rmse: 0.53885
[2550]	valid_0's rmse: 0.534361
[2700]	valid_0's rmse: 0.530662
[2850]	valid_0's rmse: 0.527618
[3000]	valid_0's rmse: 0.525128
[3150]	valid_0's rmse: 0.523017
[3300]	valid_0's rmse: 0.521217
[3450]	valid_0's rmse: 0.519751
[3600]	valid_0's rmse: 0.518463
[3750]	valid_0's rmse: 0.517331
[3900]	valid_0's rmse: 0.516356
[4050]	valid_0's rmse: 0.515449
[4200]	valid_0's rmse: 0.514627
[4350]	valid_0's rmse: 0.513949
[4500]	valid_0's rmse: 0.513359
[4650]	valid_0's rmse: 0.512842
[4800]	valid_0's

In [106]:
cat = CatBoostRegressor(depth=9,
                        learning_rate=0.01,
                        rsm=0.88,
                        subsample=0.795,
                        l2_leaf_reg=8,
                        min_data_in_leaf=35,
                        random_strength=0.63,
                        use_best_model=True,
                        task_type='CPU',
                        bootstrap_type='Bernoulli',
                        grow_policy='SymmetricTree',
                        random_seed=75,
                        loss_function='RMSE',
                        eval_metric='RMSE')

In [110]:
cat1 = CatBoostRegressor(iterations=15000,
                         learning_rate=0.003,
                         depth=10,
                         eval_metric='RMSE',
                         random_seed=75,
                         bagging_temperature=0.3,
                         od_type='Iter',
                         metric_period=50,
                         od_wait=20)

In [111]:
cat1.fit(X_train, y_train, verbose=50, eval_set=(X_test, y_test), early_stopping_rounds=100,
         use_best_model=True)



0:	learn: 1.1557854	test: 1.1523079	best: 1.1523079 (0)	total: 146ms	remaining: 36m 34s
50:	learn: 1.0513553	test: 1.0472439	best: 1.0472439 (50)	total: 4.35s	remaining: 21m 16s
100:	learn: 0.9646138	test: 0.9600564	best: 0.9600564 (100)	total: 8.04s	remaining: 19m 46s
150:	learn: 0.8919139	test: 0.8870986	best: 0.8870986 (150)	total: 11.5s	remaining: 18m 48s
200:	learn: 0.8319769	test: 0.8270404	best: 0.8270404 (200)	total: 14.9s	remaining: 18m 16s
250:	learn: 0.7824394	test: 0.7773969	best: 0.7773969 (250)	total: 18.2s	remaining: 17m 50s
300:	learn: 0.7414740	test: 0.7366264	best: 0.7366264 (300)	total: 22s	remaining: 17m 53s
350:	learn: 0.7081529	test: 0.7034850	best: 0.7034850 (350)	total: 25.5s	remaining: 17m 45s
400:	learn: 0.6806777	test: 0.6762672	best: 0.6762672 (400)	total: 28.9s	remaining: 17m 33s
450:	learn: 0.6584338	test: 0.6542907	best: 0.6542907 (450)	total: 32.3s	remaining: 17m 20s
500:	learn: 0.6399216	test: 0.6361228	best: 0.6361228 (500)	total: 35.9s	remaining: 17m 

<catboost.core.CatBoostRegressor at 0x246be059390>

In [118]:
xgb = XGBRegressor(max_depth=9,
                   booster='gbtree',
                   eta=0.01,
                   colsample_bytree=0.66,
                   subsample=0.76,
                   min_child_weight=22, gamma=1,

                   seed=75,
                   objective='reg:squarederror',
                   eval_metric='rmse',
                   l1=16,
                   n_estimators = 15000
                   )

In [119]:
xgb.fit(X_train, y_train, eval_set=[(X_test, y_test)], early_stopping_rounds=50, verbose=30)

Parameters: { "l1" } are not used.

[0]	validation_0-rmse:1.93330
[30]	validation_0-rmse:1.48929
[60]	validation_0-rmse:1.16819
[90]	validation_0-rmse:0.94252
[120]	validation_0-rmse:0.78856
[150]	validation_0-rmse:0.68611
[180]	validation_0-rmse:0.62076
[210]	validation_0-rmse:0.57937
[240]	validation_0-rmse:0.55407
[270]	validation_0-rmse:0.53858
[300]	validation_0-rmse:0.52946
[330]	validation_0-rmse:0.52354
[360]	validation_0-rmse:0.51958
[390]	validation_0-rmse:0.51691
[420]	validation_0-rmse:0.51494
[450]	validation_0-rmse:0.51374
[480]	validation_0-rmse:0.51287
[510]	validation_0-rmse:0.51216
[540]	validation_0-rmse:0.51166
[570]	validation_0-rmse:0.51109
[600]	validation_0-rmse:0.51084
[630]	validation_0-rmse:0.51049
[660]	validation_0-rmse:0.51024
[690]	validation_0-rmse:0.51004
[720]	validation_0-rmse:0.50981
[750]	validation_0-rmse:0.50967
[780]	validation_0-rmse:0.50946
[810]	validation_0-rmse:0.50919
[840]	validation_0-rmse:0.50900
[870]	validation_0-rmse:0.50882
[900]	val

In [120]:
xgb1 = XGBRegressor(objective='reg:linear',
                   eval_metric='rmse',
                   eta=0.001,
                   max_depth=10,
                   subsample=0.6,
                   colsample_bytree=0.6,
                   alpha=0.001,
                   random_state=17,
                   silent=True,
                   n_estimators=15000,
                   n_jobs=-1,
                   seed=75)

In [121]:
xgb1.fit(X_train, y_train, eval_set=[(X_test, y_test)], early_stopping_rounds=100, verbose=150)

Parameters: { "silent" } are not used.

[0]	validation_0-rmse:1.94918
[150]	validation_0-rmse:1.70824
[300]	validation_0-rmse:1.50294
[450]	validation_0-rmse:1.32887
[600]	validation_0-rmse:1.18092
[750]	validation_0-rmse:1.05665
[900]	validation_0-rmse:0.95255
[1050]	validation_0-rmse:0.86650
[1200]	validation_0-rmse:0.79523
[1350]	validation_0-rmse:0.73722
[1500]	validation_0-rmse:0.69002
[1650]	validation_0-rmse:0.65196
[1800]	validation_0-rmse:0.62157
[1950]	validation_0-rmse:0.59749
[2100]	validation_0-rmse:0.57858
[2250]	validation_0-rmse:0.56366
[2400]	validation_0-rmse:0.55192
[2550]	validation_0-rmse:0.54276
[2700]	validation_0-rmse:0.53563
[2850]	validation_0-rmse:0.53002
[3000]	validation_0-rmse:0.52565
[3150]	validation_0-rmse:0.52214
[3300]	validation_0-rmse:0.51943
[3450]	validation_0-rmse:0.51722
[3600]	validation_0-rmse:0.51546
[3750]	validation_0-rmse:0.51401
[3900]	validation_0-rmse:0.51282
[4050]	validation_0-rmse:0.51186
[4200]	validation_0-rmse:0.51105
[4350]	valid

In [125]:
fin_test_scale

array([[-1.17058158,  0.64814781, -0.16468447, ...,  0.54025721,
         1.61407688,  2.22713352],
       [-1.34357023, -0.40240812, -0.60461587, ..., -0.73737375,
        -0.73103603, -0.36730672],
       [ 2.10069904, -0.48322012,  1.05636256, ...,  0.22084947,
        -0.93384109, -0.1458883 ],
       ...,
       [-0.63208641, -1.04890408, -0.37490787, ...,  0.54025721,
         0.79383901,  0.73656544],
       [ 1.83512339, -1.53377605,  1.27028853, ...,  0.22084947,
        -0.8775127 , -0.30238446],
       [ 1.01614725,  0.24408783,  0.41879847, ...,  0.54025721,
         1.30419579, -0.93964227]])

In [145]:
pred_test_XGB = xgb1.predict(fin_test_scale)
pred_test_LGBM = lgb1.predict(fin_test_scale)
pred_test_cat = cat1.predict(fin_test_scale)

In [139]:
sub = pd.read_csv('sample_submission.csv')

In [140]:
sub

Unnamed: 0,id,MedHouseVal
0,37137,2.079751
1,37138,2.079751
2,37139,2.079751
3,37140,2.079751
4,37141,2.079751
...,...,...
24754,61891,2.079751
24755,61892,2.079751
24756,61893,2.079751
24757,61894,2.079751


In [146]:
pd.DataFrame({"id": sub.iloc[:,0].tolist(), "MedHouseVal":pred_test_XGB}).to_csv("submission5.csv", index=False)
pd.DataFrame({"id": sub.iloc[:,0].tolist(), "MedHouseVal":pred_test_LGBM}).to_csv("submission6.csv", index=False)
pd.DataFrame({"id": sub.iloc[:,0].tolist(), "MedHouseVal":pred_test_cat}).to_csv("submission7.csv", index=False)

In [141]:
sub_lgb = pd.DataFrame()
sub_lgb["MedHouseVal"] = pred_test_LGBM

sub_xgb = pd.DataFrame()
sub_xgb["MedHouseVal"] = pred_test_XGB

sub_cat = pd.DataFrame()
sub_cat["MedHouseVal"] = pred_test_cat


In [143]:
sub["MedHouseVal"] = (sub_xgb["MedHouseVal"] * 0.4 + sub_lgb["MedHouseVal"] * 0.4 + sub_cat["MedHouseVal"] * 0.2)

In [144]:
sub.to_csv('sub_final1_scaled_blend.csv', index=False)