# AIS Summer Comp (Advay Vyas)

### Imports and data

In [1]:
# imports
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
import pandas as pd
import numpy as np
from math import sqrt
from xgboost import XGBRegressor

  from pandas import MultiIndex, Int64Index


In [2]:
# load data
train_file_path = 'C:\\Users\\advay\\OneDrive\\Desktop\\Coding\\VSCode\\Python\\AIS Summer Comp 2022\\train.csv'
all_data = pd.read_csv(train_file_path)

In [3]:
# get target
y = all_data['Median House Price']
y.head()

0    455800
1    365700
2    298300
3    334200
4    219100
Name: Median House Price, dtype: int64

In [4]:
# creating X
features = ['Population', 'Median Age', 'Unemployment Rate', 'Median Income', 'Poverty Rate']
X = all_data[features]
X.head()

Unnamed: 0,Population,Median Age,Unemployment Rate,Median Income,Poverty Rate
0,840562,35.6,5.1,7.8,26.5
1,913161,37.5,4.0,6.2,54.5
2,768917,33.0,6.6,15.2,24.5
3,710626,42.1,3.1,4.6,53.2
4,791257,38.3,3.5,8.1,47.9


### XGBoost

#### Initalize data splits and model

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.25, random_state = 42)
xgb = XGBRegressor(booster = 'gbtree', random_state = 0, max_delta_step = 0)

#### List of parameters

In [6]:
from pprint import pprint

print('Parameters in use:\n')
pprint(xgb.get_params())

Parameters in use:

{'base_score': None,
 'booster': 'gbtree',
 'colsample_bylevel': None,
 'colsample_bynode': None,
 'colsample_bytree': None,
 'enable_categorical': False,
 'gamma': None,
 'gpu_id': None,
 'importance_type': None,
 'interaction_constraints': None,
 'learning_rate': None,
 'max_delta_step': 0,
 'max_depth': None,
 'min_child_weight': None,
 'missing': nan,
 'monotone_constraints': None,
 'n_estimators': 100,
 'n_jobs': None,
 'num_parallel_tree': None,
 'objective': 'reg:squarederror',
 'predictor': None,
 'random_state': 0,
 'reg_alpha': None,
 'reg_lambda': None,
 'scale_pos_weight': None,
 'subsample': None,
 'tree_method': None,
 'validate_parameters': None,
 'verbosity': None}


#### Hyperparameter grid values

In [7]:
from sklearn.model_selection import RandomizedSearchCV

subsample = [float(x) for x in np.linspace(start = 0.4, stop = 1, num = 3)]
n_estimators = [int(x) for x in np.linspace(start = 100, stop = 450, num = 5)]
min_child_weight = [int(x) for x in np.linspace(start = 0, stop = 2, num = 1)]
max_depth = [int(x) for x in np.linspace(start = 2, stop = 7, num = 4)]
learning_rate = [float(x) for x in np.linspace(start = 0.1, stop = 0.7, num = 4)]
gamma = [int(x) for x in np.linspace(start = 0, stop = 5, num = 3)]
colsample_bytree = [float(x) for x in np.linspace(start = 0.1, stop = 1, num = 5)]
colsample_bylevel = [float(x) for x in np.linspace(start = 0.1, stop = 1, num = 5)]
colsample_bynode = [float(x) for x in np.linspace(start = 0.1, stop = 1, num = 5)]

#### Creating random grid to use in tuning with k-fold cross validation

In [8]:
random_grid = {
    'colsample_bytree': colsample_bytree,
    'colsample_bylevel': colsample_bylevel,
    'colsample_bynode': colsample_bynode,
    'n_estimators': n_estimators,
    'subsample': subsample,
    'max_depth': max_depth,
    'learning_rate': learning_rate,
    'gamma': gamma,
    'min_child_weight': min_child_weight}
               
pprint(random_grid)

{'colsample_bylevel': [0.1, 0.325, 0.55, 0.775, 1.0],
 'colsample_bynode': [0.1, 0.325, 0.55, 0.775, 1.0],
 'colsample_bytree': [0.1, 0.325, 0.55, 0.775, 1.0],
 'gamma': [0, 2, 5],
 'learning_rate': [0.1, 0.3, 0.5, 0.7],
 'max_depth': [2, 3, 5, 7],
 'min_child_weight': [0],
 'n_estimators': [100, 187, 275, 362, 450],
 'subsample': [0.4, 0.7, 1.0]}


In [10]:
xgb_random = RandomizedSearchCV(estimator = xgb, param_distributions = random_grid, n_iter = 150, cv = 5, verbose = 2, 
    random_state = 0, n_jobs = -1) 
xgb_random.fit(X_train, y_train)
xgb_random.best_params_

Fitting 5 folds for each of 150 candidates, totalling 750 fits


  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


{'subsample': 0.7,
 'n_estimators': 100,
 'min_child_weight': 0,
 'max_depth': 2,
 'learning_rate': 0.1,
 'gamma': 2,
 'colsample_bytree': 1.0,
 'colsample_bynode': 0.775,
 'colsample_bylevel': 0.325}

#### Evaluating the results of k-fold cross-validation

In [11]:
def evaluate(model, X_test, y_test):
    predictions = model.predict(X_test)
    error = sqrt(mean_squared_error(y_test, predictions))
    print('RMSE')
    print(error)

In [32]:
baseline_model = XGBRegressor(random_state = 42, booster='gbtree', reg_alpha = 10, 
 subsample = 0.667, n_estimators = 427, min_child_weight = 0, max_depth = 3, 
    max_delta_step = 0, learning_rate = 0.105, gamma = 4)
baseline_model.fit(X_train, y_train, early_stopping_rounds=5, eval_set=[(X_test, y_test)], verbose=False)
base_accuracy = evaluate(baseline_model, X_test, y_test)

RMSE
132535.80874358775


  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


In [36]:
refined_model = XGBRegressor(random_state = 0, booster='gbtree', reg_alpha = 10,
subsample = 0.7, n_estimators = 100, min_child_weight = 0, max_depth = 2, learning_rate = 0.1, gamma = 4, colsample_bytree = 1.0, 
colsample_bynode = 0.775, colsample_bylevel = 0.325, sampling_method = 'gradient_based', tree_method = 'gpu_hist')
refined_model.fit(X_train, y_train, early_stopping_rounds=5, eval_set=[(X_test, y_test)], verbose=False)
refined_accuracy = evaluate(refined_model, X_test, y_test)

  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


XGBoostError: [19:57:32] d:\bld\xgboost-split_1645118015404\work\src\common\common.h:157: XGBoost version not compiled with GPU support.

### Predicting competition data

In [28]:
# reading competition data
competition_data_path = 'C:\\Users\\advay\\OneDrive\\Desktop\\Coding\\VSCode\\Python\\AIS Summer Comp 2022\\evaluation_input.csv'
competition_data = pd.read_csv(competition_data_path)
competition_data.head()

Unnamed: 0,ID,Population,Median Age,Unemployment Rate,Median Income,Poverty Rate
0,TX35,857654,33.0,4.1,18.4,25.2
1,PR16,678333,43.0,4.6,13.5,28.3
2,NY4,730314,40.4,3.6,5.7,43.6
3,OR1,858875,38.0,3.8,8.9,40.7
4,GA8,706237,37.6,5.3,17.3,22.7


In [29]:
# creating competition X
competition_X = competition_data[features]
competition_X.head()

Unnamed: 0,Population,Median Age,Unemployment Rate,Median Income,Poverty Rate
0,857654,33.0,4.1,18.4,25.2
1,678333,43.0,4.6,13.5,28.3
2,730314,40.4,3.6,5.7,43.6
3,858875,38.0,3.8,8.9,40.7
4,706237,37.6,5.3,17.3,22.7


In [30]:
# competition predictions
competition_preds = baseline_model.predict(competition_X)
print(competition_preds)

[ 242104.73  137402.25  406398.44  284415.7   153550.64  189986.22
  173408.2   290630.62  446218.28  122003.24  341582.94  320728.56
  434516.28  311182.75  173995.58  171936.22  368468.16  243160.31
  184441.94  244976.    261576.36  161537.48  184535.19  553461.7
  526604.4   983565.    549755.3   474035.88  487509.5   284098.34
  254975.72  518901.38  205313.81  190137.9   194070.62  259962.55
  174862.19  313680.97  263887.75  267634.1   189374.47  119116.19
  188177.22  429844.28  442395.84  407811.84  288517.62  351631.2
  323660.7   114532.35  208044.1   860465.6   295123.5   176938.47
  239438.47  184441.94  308530.16  173849.62  278671.06  228590.14
  401095.2   327085.88  351694.97  175656.92  159106.81  435577.94
  212398.25  456862.4   362873.94  324804.3   228590.14  178267.44
  259518.4   157822.5   252032.    322267.72  294541.44  175043.4
  186542.9   190339.25  166072.47  178659.56  171218.38  314035.7
  619760.1   227309.67  205122.61  314976.94  189681.19  278811.9


  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


### Generating submission

In [31]:
# Creating and outputting DataFrame
output = pd.DataFrame({'ID': competition_data.ID, 'Median House Price': competition_preds})
output.to_csv('Vyas_Advay_answer.csv', index=False)