In [2]:
import pandas as pd
import numpy as np

import xgboost as xgb

from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split

In [2]:
# Single random seed, reused by the train/test split and by the XGBoost parameters.
seed = 42

In [3]:
# Load the California housing dataset; with as_frame=True, `.data` is a DataFrame
# of features and `.target` is a Series.
california_housing = fetch_california_housing(as_frame=True)

In [6]:
# Inspect column dtypes and non-null counts of the feature frame.
california_housing.data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   MedInc      20640 non-null  float64
 1   HouseAge    20640 non-null  float64
 2   AveRooms    20640 non-null  float64
 3   AveBedrms   20640 non-null  float64
 4   Population  20640 non-null  float64
 5   AveOccup    20640 non-null  float64
 6   Latitude    20640 non-null  float64
 7   Longitude   20640 non-null  float64
dtypes: float64(8)
memory usage: 1.3 MB


In [5]:
# Per-feature summary statistics (count, mean, std, min, quartiles, max).
california_housing.data.describe()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
count,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0
mean,3.870671,28.639486,5.429,1.096675,1425.476744,3.070655,35.631861,-119.569704
std,1.899822,12.585558,2.474173,0.473911,1132.462122,10.38605,2.135952,2.003532
min,0.4999,1.0,0.846154,0.333333,3.0,0.692308,32.54,-124.35
25%,2.5634,18.0,4.440716,1.006079,787.0,2.429741,33.93,-121.8
50%,3.5348,29.0,5.229129,1.04878,1166.0,2.818116,34.26,-118.49
75%,4.74325,37.0,6.052381,1.099526,1725.0,3.282261,37.71,-118.01
max,15.0001,52.0,141.909091,34.066667,35682.0,1243.333333,41.95,-114.31


In [7]:
# Regression target (Series named MedHouseVal), aligned row-for-row with the features.
california_housing.target

0        4.526
1        3.585
2        3.521
3        3.413
4        3.422
         ...  
20635    0.781
20636    0.771
20637    0.923
20638    0.847
20639    0.894
Name: MedHouseVal, Length: 20640, dtype: float64

In [8]:
# Hold out 33.3% of the rows; the split is made reproducible through `seed`.
X_train, X_test, y_train, y_test = train_test_split(
    california_housing.data,
    california_housing.target,
    test_size=0.333,
    random_state=seed,
)

In [9]:
# Hyperparameters for the native XGBoost API: squared-error regression,
# scored with RMSE, using a small learning rate and 50% row subsampling.
params = {
    "nthread": 8,
    "eta": 0.03,
    "max_depth": 8,
    "objective": "reg:squarederror",
    "subsample": 0.5,
    "eval_metric": "rmse",
    "random_state": seed,
}

# xgb.train fills this dict with the per-round metric history of each watchlist entry.
progress = {}
d_train = xgb.DMatrix(X_train, label=y_train)
d_test = xgb.DMatrix(X_test, label=y_test)

# Per the XGBoost docs, early stopping monitors the LAST entry of this list,
# i.e. the held-out split labelled "eval".
watchlist = [(d_train, "train"), (d_test, "eval")]

# NOTE(review): the held-out split doubles as the early-stopping set, so the
# reported eval-rmse is optimistic as a generalisation estimate. Consider
# carving a separate validation split out of the training data.
# NOTE(review): per the XGBoost docs, xgb.train returns the booster from the
# last boosting round, not the best one; see `XGB_model.best_iteration`.
#
# `evals` and the following arguments are keyword-only in xgb.train's declared
# signature; passing `evals` positionally triggers a deprecation warning on
# newer XGBoost releases, so everything after `d_train` is passed by keyword.
XGB_model = xgb.train(
    params,
    d_train,
    num_boost_round=50000,  # upper bound; early stopping ends training sooner
    evals=watchlist,
    evals_result=progress,
    verbose_eval=500,  # log metrics every 500 rounds
    early_stopping_rounds=100,
)

[0]	train-rmse:1.89387	eval-rmse:1.89985
[500]	train-rmse:0.23178	eval-rmse:0.45522
[1000]	train-rmse:0.14289	eval-rmse:0.44902
[1170]	train-rmse:0.12298	eval-rmse:0.44885


In [11]:
# Write the trained booster to my_model.json (path is relative to the current working directory).
XGB_model.save_model("my_model.json")

In [46]:
# First row of the training features, as a Series.
# NOTE(review): despite the name `test`, this sample comes from X_train, so the
# model has already seen it during training.
test = X_train.iloc[0]

In [72]:
# Turn the single-sample Series into a one-row DataFrame (features as columns).
# NOTE(review): despite the name `train`, this holds just that one sample.
train = test.to_frame().T

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
15462,3.625,6.0,4.618012,1.124224,810.0,2.515528,33.17,-117.22


In [48]:
# Target value of the first training row; presumably kept for comparison with
# the model's prediction on that same row.
true = y_train.iloc[0]

In [97]:
# Fresh, empty Booster that the saved model file is loaded into afterwards.
model_xgb2 = xgb.Booster()

In [106]:
# Restore the booster from the same my_model.json path that save_model wrote to.
model_xgb2.load_model("my_model.json")

In [107]:
# Sanity check that the object exists; this prints only the default object
# representation, not any model details.
print(model_xgb2)

<xgboost.core.Booster object at 0x000001EE9B9A6EB0>


In [96]:
# Wrap the one-row frame in a DMatrix, then score it with the reloaded booster.
single_row = xgb.DMatrix(train)
predicted_values = model_xgb2.predict(single_row)

# Show the lone prediction as a plain Python float.
float(predicted_values[0])

1.5040373802185059

In [88]:
# Hand-made sample: one value per feature, in the dataset's column order.
# NOTE(review): these are arbitrary placeholders. Latitude=6 and Longitude=7,
# for example, lie far outside the ranges reported by describe(), so any
# prediction on this row is an extrapolation.
donner = [4, 3, 1, 2, 5, 4, 6, 7]

# Reuse the dataset object loaded earlier in the notebook rather than calling
# fetch_california_housing a second time just to read the column names.
cols = list(california_housing.data.columns)

In [89]:
# Convert the hand-made feature list to a NumPy array.
final = np.array(donner)

In [91]:
# Wrap the array as a one-row DataFrame whose columns are the dataset's feature names.
data_unseen = pd.DataFrame([final], columns = cols)

In [93]:
# Display the hand-made sample to confirm the column/value alignment.
data_unseen

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,4,3,1,2,5,4,6,7
