# AIS Summer Comp (Advay Vyas)

### Imports and data

In [5]:
# imports
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
import pandas as pd
import numpy as np
from math import sqrt
from xgboost import XGBRegressor

In [6]:
# load data
train_file_path = '../input/ais-summer-comp-2022/train.csv'
all_data = pd.read_csv(train_file_path)

In [7]:
# get target
y = all_data['Median House Price']
y.head()

0    455800
1    365700
2    298300
3    334200
4    219100
Name: Median House Price, dtype: int64

In [8]:
# creating X
features = ['Population', 'Median Age', 'Unemployment Rate', 'Median Income', 'Poverty Rate']
X = all_data[features]
X.head()

Unnamed: 0,Population,Median Age,Unemployment Rate,Median Income,Poverty Rate
0,840562,35.6,5.1,7.8,26.5
1,913161,37.5,4.0,6.2,54.5
2,768917,33.0,6.6,15.2,24.5
3,710626,42.1,3.1,4.6,53.2
4,791257,38.3,3.5,8.1,47.9


### XGBoost

#### Initalize data splits and model

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.25, random_state = 42)
xgb = XGBRegressor(booster = 'gbtree', random_state = 0, max_delta_step = 0)

#### List of parameters

In [10]:
from pprint import pprint

print('Parameters in use:\n')
pprint(xgb.get_params())

Parameters in use:

{'base_score': None,
 'booster': 'gbtree',
 'callbacks': None,
 'colsample_bylevel': None,
 'colsample_bynode': None,
 'colsample_bytree': None,
 'early_stopping_rounds': None,
 'enable_categorical': False,
 'eval_metric': None,
 'gamma': None,
 'gpu_id': None,
 'grow_policy': None,
 'importance_type': None,
 'interaction_constraints': None,
 'learning_rate': None,
 'max_bin': None,
 'max_cat_to_onehot': None,
 'max_delta_step': 0,
 'max_depth': None,
 'max_leaves': None,
 'min_child_weight': None,
 'missing': nan,
 'monotone_constraints': None,
 'n_estimators': 100,
 'n_jobs': None,
 'num_parallel_tree': None,
 'objective': 'reg:squarederror',
 'predictor': None,
 'random_state': 0,
 'reg_alpha': None,
 'reg_lambda': None,
 'sampling_method': None,
 'scale_pos_weight': None,
 'subsample': None,
 'tree_method': None,
 'validate_parameters': None,
 'verbosity': None}


#### Hyperparameter grid values

In [11]:
from sklearn.model_selection import RandomizedSearchCV

subsample = [float(x) for x in np.linspace(start = 0.4, stop = 1, num = 3)]
n_estimators = [int(x) for x in np.linspace(start = 100, stop = 450, num = 5)]
min_child_weight = [int(x) for x in np.linspace(start = 0, stop = 2, num = 3)]
max_depth = [int(x) for x in np.linspace(start = 1, stop = 7, num = 5)]
learning_rate = [float(x) for x in np.linspace(start = 0.1, stop = 0.7, num = 4)]
gamma = [int(x) for x in np.linspace(start = 0, stop = 5, num = 5)]
colsample_bytree = [float(x) for x in np.linspace(start = 0.1, stop = 1, num = 5)]
colsample_bylevel = [float(x) for x in np.linspace(start = 0.1, stop = 1, num = 5)]
colsample_bynode = [float(x) for x in np.linspace(start = 0.1, stop = 1, num = 5)]

#### Creating random grid to use in tuning with k-fold cross validation

In [12]:
random_grid = {
    'colsample_bytree': colsample_bytree,
    'colsample_bylevel': colsample_bylevel,
    'colsample_bynode': colsample_bynode,
    'n_estimators': n_estimators,
    'subsample': subsample,
    'max_depth': max_depth,
    'learning_rate': learning_rate,
    'gamma': gamma,
    'min_child_weight': min_child_weight}
               
pprint(random_grid)

{'colsample_bylevel': [0.1, 0.325, 0.55, 0.775, 1.0],
 'colsample_bynode': [0.1, 0.325, 0.55, 0.775, 1.0],
 'colsample_bytree': [0.1, 0.325, 0.55, 0.775, 1.0],
 'gamma': [0, 1, 2, 3, 5],
 'learning_rate': [0.1, 0.3, 0.5, 0.7],
 'max_depth': [1, 2, 4, 5, 7],
 'min_child_weight': [0, 1, 2],
 'n_estimators': [100, 187, 275, 362, 450],
 'subsample': [0.4, 0.7, 1.0]}


In [23]:
xgb_random = RandomizedSearchCV(estimator = xgb, param_distributions = random_grid, n_iter = 150, cv = 5, verbose = 2, 
    random_state = 0, n_jobs = -1) 
xgb_random.fit(X_train, y_train, verbose=False)
xgb_random.best_params_

Fitting 5 folds for each of 150 candidates, totalling 750 fits
[CV] END colsample_bylevel=0.55, colsample_bynode=0.775, colsample_bytree=0.55, gamma=5, learning_rate=0.5, max_depth=5, min_child_weight=1, n_estimators=362, subsample=1.0; total time=   1.2s
[CV] END colsample_bylevel=0.775, colsample_bynode=1.0, colsample_bytree=0.325, gamma=5, learning_rate=0.3, max_depth=1, min_child_weight=0, n_estimators=187, subsample=0.7; total time=   0.6s
[CV] END colsample_bylevel=0.325, colsample_bynode=0.1, colsample_bytree=0.325, gamma=1, learning_rate=0.1, max_depth=2, min_child_weight=0, n_estimators=275, subsample=0.7; total time=   1.0s
[CV] END colsample_bylevel=0.325, colsample_bynode=0.1, colsample_bytree=0.325, gamma=1, learning_rate=0.1, max_depth=2, min_child_weight=0, n_estimators=275, subsample=0.7; total time=   1.0s
[CV] END colsample_bylevel=0.325, colsample_bynode=0.325, colsample_bytree=0.775, gamma=5, learning_rate=0.1, max_depth=7, min_child_weight=2, n_estimators=187, subs

{'subsample': 0.4,
 'n_estimators': 100,
 'min_child_weight': 0,
 'max_depth': 1,
 'learning_rate': 0.1,
 'gamma': 1,
 'colsample_bytree': 0.55,
 'colsample_bynode': 1.0,
 'colsample_bylevel': 0.55}


[CV] END colsample_bylevel=0.775, colsample_bynode=1.0, colsample_bytree=0.775, gamma=5, learning_rate=0.7, max_depth=1, min_child_weight=1, n_estimators=450, subsample=0.4; total time=   1.5s
[CV] END colsample_bylevel=0.1, colsample_bynode=0.1, colsample_bytree=0.55, gamma=5, learning_rate=0.7, max_depth=5, min_child_weight=1, n_estimators=187, subsample=0.7; total time=   0.7s
[CV] END colsample_bylevel=0.325, colsample_bynode=0.1, colsample_bytree=0.55, gamma=3, learning_rate=0.1, max_depth=5, min_child_weight=0, n_estimators=450, subsample=1.0; total time=   1.6s
[CV] END colsample_bylevel=0.55, colsample_bynode=0.55, colsample_bytree=0.325, gamma=1, learning_rate=0.5, max_depth=7, min_child_weight=1, n_estimators=275, subsample=0.4; total time=   1.0s
[CV] END colsample_bylevel=0.55, colsample_bynode=0.55, colsample_bytree=0.325, gamma=1, learning_rate=0.5, max_depth=7, min_child_weight=1, n_estimators=275, subsample=0.4; total time=   1.0s
[CV] END colsample_bylevel=0.775, cols

#### Evaluating the results of k-fold cross-validation

In [14]:
def evaluate(model, X_test, y_test):
    predictions = model.predict(X_test)
    error = sqrt(mean_squared_error(y_test, predictions))
    print('RMSE')
    print(error)

In [22]:
baseline_model = XGBRegressor(random_state = 42, booster='gbtree', reg_alpha = 10, 
 subsample = 0.667, n_estimators = 427, min_child_weight = 0, max_depth = 3, 
    max_delta_step = 0, learning_rate = 0.105, gamma = 4, sampling_method = 'gradient_based', 
tree_method = 'gpu_hist')
baseline_model.fit(X_train, y_train, early_stopping_rounds=5, eval_set=[(X_test, y_test)], verbose=False)
base_accuracy = evaluate(baseline_model, X_test, y_test)

RMSE
126511.43075965442




In [16]:
refined_model = XGBRegressor(random_state = 0, booster='gbtree', 
subsample = 0.4, n_estimators = 100, min_child_weight = 0, max_depth = 1,
max_delta_step = 0, learning_rate = 0.1, gamma = 1, colsample_bytree = 0.55, 
colsample_bynode = 1.0, colsample_bylevel = 0.55, sampling_method = 'gradient_based', 
tree_method = 'gpu_hist')
refined_model.fit(X_train, y_train, early_stopping_rounds=5, eval_set=[(X_test, y_test)], verbose=False)
refined_accuracy = evaluate(refined_model, X_test, y_test)



RMSE
177551.43963218894


### Predicting competition data

In [24]:
# reading competition data
competition_data_path = '../input/ais-summer-comp-2022/evaluation_input.csv'
competition_data = pd.read_csv(competition_data_path)
competition_data.head()

Unnamed: 0,ID,Population,Median Age,Unemployment Rate,Median Income,Poverty Rate
0,TX35,857654,33.0,4.1,18.4,25.2
1,PR16,678333,43.0,4.6,13.5,28.3
2,NY4,730314,40.4,3.6,5.7,43.6
3,OR1,858875,38.0,3.8,8.9,40.7
4,GA8,706237,37.6,5.3,17.3,22.7


In [25]:
# creating competition X
competition_X = competition_data[features]
competition_X.head()

Unnamed: 0,Population,Median Age,Unemployment Rate,Median Income,Poverty Rate
0,857654,33.0,4.1,18.4,25.2
1,678333,43.0,4.6,13.5,28.3
2,730314,40.4,3.6,5.7,43.6
3,858875,38.0,3.8,8.9,40.7
4,706237,37.6,5.3,17.3,22.7


In [26]:
# competition predictions
competition_preds = baseline_model.predict(competition_X)
print(competition_preds)

[ 304751.8    120980.84   405587.28   321851.44   158552.61   231876.9
  284866.97   317118.25   431714.1     95182.74   415547.03   321851.44
  429956.44   374514.28   169515.86   196658.53   349846.28   329454.3
  196647.19   246710.38   364559.78   152910.4    224420.47   411784.
  538398.44   727809.     536325.9    380898.53   506050.06   172938.02
  318956.12   531291.7    216958.92   186621.98   217216.94   310970.6
  167761.88   373627.03   339528.3    246860.84   213961.7     90994.62
  220285.3    439628.97   374079.75   461585.25   362868.25   335326.38
  283948.78    76253.695  223130.58  1032382.44   336494.84   173474.33
  278129.22   190100.9    308258.03   175997.61   341537.72   274918.47
  422854.3    289173.25   406307.16   190755.66   150784.33   447834.06
  242755.12   437601.     390831.16   366123.06   272105.3    186438.28
  382499.34   173569.05   233962.55   346504.9    332026.4    184527.67
  166859.44   186613.8    165350.14   159871.06   230864.14   309169.

### Generating submission

In [37]:
# Creating and outputting DataFrame
output = pd.DataFrame({'ID': competition_data.ID, 'Median House Price': competition_preds})
output.to_csv('Vyas_Advay_answer.csv', index=False)