# AIS Summer Comp (Advay Vyas)

### Imports and data

In [68]:
# imports
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
import pandas as pd
import numpy as np
from math import sqrt
from xgboost import XGBRegressor
from xgboost import XGBClassifier

In [3]:
# load data
train_file_path = 'C:\\Users\\advay\\OneDrive\\Desktop\\Coding\\VSCode\\Python\\AIS Summer Comp 2022\\train.csv'
all_data = pd.read_csv(train_file_path)

In [4]:
# get target
y = all_data['Median House Price']
y.head()

0    455800
1    365700
2    298300
3    334200
4    219100
Name: Median House Price, dtype: int64

In [5]:
# creating X
features = ['Population', 'Median Age', 'Unemployment Rate', 'Median Income', 'Poverty Rate']
X = all_data[features]
X.head()

Unnamed: 0,Population,Median Age,Unemployment Rate,Median Income,Poverty Rate
0,840562,35.6,5.1,7.8,26.5
1,913161,37.5,4.0,6.2,54.5
2,768917,33.0,6.6,15.2,24.5
3,710626,42.1,3.1,4.6,53.2
4,791257,38.3,3.5,8.1,47.9


### XGBoost

#### Initalize data splits and model

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.25, random_state = 42)
xgb = XGBRegressor()

#### List of parameters

In [7]:
from pprint import pprint

print('Parameters in use:\n')
pprint(xgb.get_params())

Parameters in use:

{'base_score': None,
 'booster': None,
 'colsample_bylevel': None,
 'colsample_bynode': None,
 'colsample_bytree': None,
 'enable_categorical': False,
 'gamma': None,
 'gpu_id': None,
 'importance_type': None,
 'interaction_constraints': None,
 'learning_rate': None,
 'max_delta_step': None,
 'max_depth': None,
 'min_child_weight': None,
 'missing': nan,
 'monotone_constraints': None,
 'n_estimators': 100,
 'n_jobs': None,
 'num_parallel_tree': None,
 'objective': 'reg:squarederror',
 'predictor': None,
 'random_state': None,
 'reg_alpha': None,
 'reg_lambda': None,
 'scale_pos_weight': None,
 'subsample': None,
 'tree_method': None,
 'validate_parameters': None,
 'verbosity': None}


#### Hyperparameter grid values

In [70]:
from sklearn.model_selection import RandomizedSearchCV

# tree amounts
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# features to consider at each split
max_features = ['auto', 'sqrt']
# max levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None) # got to consider none
# samples to split a node
min_samples_split = [2, 5, 10]
# samples at each leaf node
min_samples_leaf = [1, 2, 4]
# method of selecting samples
bootstrap = [True, False]

#### Creating random grid to use in tuning with k-fold cross validation

In [71]:
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
               
pprint(random_grid)

{'bootstrap': [True, False],
 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None],
 'max_features': ['auto', 'sqrt'],
 'min_samples_leaf': [1, 2, 4],
 'min_samples_split': [2, 5, 10],
 'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]}


In [66]:
rf_random = RandomizedSearchCV(estimator = xgb, param_distributions = random_grid, n_iter = 150, cv = 5, verbose = 2, random_state = 0, n_jobs = -1) 
rf_random.fit(X_train, y_train)
rf_random.best_params_

ModuleNotFoundError: No module named 'sklearn.model_selection.RandomizedSearchCV'

#### Evaluating the results of k fold cross validation

In [15]:
def evaluate(model, X_test, y_test):
    predictions = model.predict(X_test)
    errors = abs(predictions - y_test)
    mape = 100 * np.mean(errors / y_test)
    accuracy = 100 - mape
    print('Model Performance')
    print('Average Error: {:0.4f} degrees.'.format(np.mean(errors)))
    print('Accuracy = {:0.2f}%.'.format(accuracy))
    
    return accuracy

In [52]:
model_1 = XGBRegressor(random_state = 0, learning_rate=0.05, booster='gbtree', gamma=0, max_depth=8, reg_alpha = 10)
model_1.fit(X_train, y_train, early_stopping_rounds=5, eval_set=[(X_test, y_test)], verbose=False)
base_accuracy = evaluate(model_1, X_test, y_test)

Model Performance
Average Error: 92203.9758 degrees.
Accuracy = 70.54%.


  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


In [73]:
model_2 = XGBRegressor(random_state = 0, learning_rate=0.05, booster='gbtree', gamma=0, max_depth=8, reg_alpha = 10, sampling_method = 'gradient_based')
model_2.fit(X_train, y_train, early_stopping_rounds=5, eval_set=[(X_test, y_test)], verbose=False)
base_accuracy = evaluate(model_2, X_test, y_test)

Model Performance
Average Error: 92203.9758 degrees.
Accuracy = 70.54%.


  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


#### XGB Classifier Params
alpha=10, base_score=0.5, booster='gbtree', colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1, 
enable_categorical=False, gamma=0, gpu_id=-1, importance_type=None, interaction_constraints='', learning_rate=0.05, max_delta_step=0, max_depth=4, 
min_child_weight=1, monotone_constraints='()', n_estimators=100, n_jobs=4, num_parallel_tree=1, objective='reg:squarederror', predictor='auto', random_state=0, 
reg_alpha=10, reg_lambda=1, scale_pos_weight=None, subsample=1, tree_method='exact', validate_parameters=1, verbosity=None)

#### Fitting to full data

In [74]:
final_model = XGBRegressor(random_state = 0, learning_rate=0.05, booster='gbtree', gamma=0, max_depth=8, reg_alpha = 10)
final_model.fit(X, y)

  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
             gamma=0, gpu_id=-1, importance_type=None,
             interaction_constraints='', learning_rate=0.05, max_delta_step=0,
             max_depth=8, min_child_weight=1, missing=nan,
             monotone_constraints='()', n_estimators=100, n_jobs=4,
             num_parallel_tree=1, predictor='auto', random_state=0,
             reg_alpha=10, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None)

### Predicting competition data

In [77]:
# reading competition data
competition_data_path = 'C:\\Users\\advay\\OneDrive\\Desktop\\Coding\\VSCode\\Python\\AIS Summer Comp 2022\\evaluation_input.csv'
competition_data = pd.read_csv(competition_data_path)
competition_data.head()

Unnamed: 0,ID,Population,Median Age,Unemployment Rate,Median Income,Poverty Rate
0,TX35,857654,33.0,4.1,18.4,25.2
1,PR16,678333,43.0,4.6,13.5,28.3
2,NY4,730314,40.4,3.6,5.7,43.6
3,OR1,858875,38.0,3.8,8.9,40.7
4,GA8,706237,37.6,5.3,17.3,22.7


In [78]:
# creating competition X
competition_X = competition_data[features]
competition_X.head()

Unnamed: 0,Population,Median Age,Unemployment Rate,Median Income,Poverty Rate
0,857654,33.0,4.1,18.4,25.2
1,678333,43.0,4.6,13.5,28.3
2,730314,40.4,3.6,5.7,43.6
3,858875,38.0,3.8,8.9,40.7
4,706237,37.6,5.3,17.3,22.7


In [79]:
# competition predictions
competition_preds = final_model.predict(competition_X)
print(competition_preds)

  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


[ 170054.19   185777.47   445998.     334994.3    150495.78   167919.05
  167966.14   259780.17   295587.6    129098.26   314600.94   352641.12
  280114.2    329130.4    190284.95   149959.58   290851.34   213194.33
  218852.72   186237.38   179311.47   152072.1    191479.81   400469.34
  626850.94  1130300.1    515343.56   283574.56   791248.2    172859.42
  159976.75   683149.75   158307.42   166834.84   182132.14   261185.17
  219730.89   238873.     180310.19   248995.67   189568.64   130228.96
  190415.45   660478.6    552254.75   656501.4    296849.7    360725.38
  542216.5    137095.48   182946.2   1298041.6    211471.34   209593.02
  180508.53   181921.81   297525.2    165735.98   127789.19   219593.55
  661635.44   250669.83   429406.72   185505.23   154445.52   530817.8
  179133.77   360793.94   310970.16   459657.84   189487.22   213267.39
  292229.47   147321.2    173158.14   380360.12   210139.5    156097.1
  152603.19   198817.11   148904.44   148827.77   164575.3    3270

### Generating submission

In [80]:
# Creating and outputting DataFrame
output = pd.DataFrame({'ID': competition_data.ID, 'Median House Price': competition_preds})
output.to_csv('Vyas_Advay_answer.csv', index=False)