# AIS Summer Comp (Advay Vyas)

### Imports and data

In [57]:
# imports
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
from math import sqrt

In [58]:
# load data
train_file_path = 'C:\\Users\\advay\\OneDrive\\Coding\\VSCode\\Python\\AIS Summer Comp 2022\\train.csv'
all_data = pd.read_csv(train_file_path)

In [59]:
# get target
y = all_data['Median House Price']
y.head()

0    455800
1    365700
2    298300
3    334200
4    219100
Name: Median House Price, dtype: int64

In [60]:
# creating X
features = ['Population', 'Median Age', 'Unemployment Rate', 'Median Income', 'Poverty Rate']
X = all_data[features]
X.head()

Unnamed: 0,Population,Median Age,Unemployment Rate,Median Income,Poverty Rate
0,840562,35.6,5.1,7.8,26.5
1,913161,37.5,4.0,6.2,54.5
2,768917,33.0,6.6,15.2,24.5
3,710626,42.1,3.1,4.6,53.2
4,791257,38.3,3.5,8.1,47.9


### Improved Random Forest Model

#### Initalize data splits and model

In [61]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.25, random_state = 42)
rf = RandomForestRegressor()

#### List of parameters

In [62]:
from pprint import pprint

print('Parameters in use:\n')
pprint(rf.get_params())

Parameters in use:

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'criterion': 'squared_error',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}


#### Hyperparameter grid values

In [63]:
from sklearn.model_selection import RandomizedSearchCV

# tree amounts
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# features to consider at each split
max_features = ['auto', 'sqrt']
# max levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None) # got to consider none
# samples to split a node
min_samples_split = [2, 5, 10]
# samples at each leaf node
min_samples_leaf = [1, 2, 4]
# method of selecting samples
bootstrap = [True, False]

#### Creating random grid to use in tuning with k-fold cross validation

In [64]:
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
               
pprint(random_grid)

{'bootstrap': [True, False],
 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None],
 'max_features': ['auto', 'sqrt'],
 'min_samples_leaf': [1, 2, 4],
 'min_samples_split': [2, 5, 10],
 'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]}


In [65]:
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 150, cv = 5, verbose = 2, random_state = 42, n_jobs = -1) 
rf_random.fit(X_train, y_train)
rf_random.best_params_

Fitting 5 folds for each of 150 candidates, totalling 750 fits


{'n_estimators': 400,
 'min_samples_split': 10,
 'min_samples_leaf': 4,
 'max_features': 'auto',
 'max_depth': 70,
 'bootstrap': True}

#### Evaluating the results of k fold cross validation

In [80]:
def evaluate(model, X_test, y_test):
    predictions = model.predict(X_test)
    errors = abs(predictions - y_test)
    mape = 100 * np.mean(errors / y_test)
    accuracy = 100 - mape
    print('Model Performance')
    print('Average Error: {:0.4f} degrees.'.format(np.mean(errors)))
    print('Accuracy = {:0.2f}%.'.format(accuracy))
    
    return accuracy

In [81]:
base_model = RandomForestRegressor(random_state = 42)
base_model.fit(X_train, y_train)
base_accuracy = evaluate(base_model, X_test, y_test)

Model Performance
Average Error: 96124.5000 degrees.
Accuracy = 66.15%.


In [82]:
refined_model = RandomForestRegressor(max_depth=70, min_samples_leaf=4, min_samples_split=10, n_estimators=400, random_state=42)
refined_model.fit(X_train, y_train)
refined_accuracy = evaluate(base_model, X_test, y_test)

Model Performance
Average Error: 96124.5000 degrees.
Accuracy = 66.15%.


In [83]:
print('Improvement of {:0.2f}%.'.format( 100 * (refined_accuracy - base_accuracy) / base_accuracy))

Improvement of 0.00%.


#### Fitting to full data

In [84]:
final_model = RandomForestRegressor(max_depth=70, min_samples_leaf=4, min_samples_split=10, n_estimators=400, random_state=42)
final_model.fit(X, y)

RandomForestRegressor(max_depth=70, min_samples_leaf=4, min_samples_split=10,
                      n_estimators=400, random_state=42)

### Predicting competition data

In [85]:
# reading competition data
competition_data_path = 'C:\\Users\\advay\\OneDrive\\Coding\\VSCode\\Python\\AIS Summer Comp 2022\\evaluation_input.csv'
competition_data = pd.read_csv(competition_data_path)
competition_data.head()

Unnamed: 0,ID,Population,Median Age,Unemployment Rate,Median Income,Poverty Rate
0,TX35,857654,33.0,4.1,18.4,25.2
1,PR16,678333,43.0,4.6,13.5,28.3
2,NY4,730314,40.4,3.6,5.7,43.6
3,OR1,858875,38.0,3.8,8.9,40.7
4,GA8,706237,37.6,5.3,17.3,22.7


In [86]:
# creating competition X
competition_X = competition_data[features]
competition_X.head()

Unnamed: 0,Population,Median Age,Unemployment Rate,Median Income,Poverty Rate
0,857654,33.0,4.1,18.4,25.2
1,678333,43.0,4.6,13.5,28.3
2,730314,40.4,3.6,5.7,43.6
3,858875,38.0,3.8,8.9,40.7
4,706237,37.6,5.3,17.3,22.7


In [87]:
# competition predictions
competition_preds = final_model.predict(competition_X)
print(competition_preds)

[193249.51548148 237721.40803926 478975.69949667 320124.80892851
 156849.2338875  200951.62058296 172187.55201298 338899.48185265
 417914.72407568 129995.75902112 429584.71022842 407015.34672895
 441886.89403398 323884.80216579 178634.79749317 170749.89739273
 462490.74561248 221874.50560001 208878.68392856 218366.62476213
 275369.39844947 166813.1411724  197437.00200655 456215.7190537
 536185.65971929 932533.60078622 591783.80312667 435653.09218044
 562563.77274085 246985.03245629 247057.15363913 546137.32752482
 184261.63426294 163501.48447158 189730.96670745 307633.8027697
 188746.24041595 354539.23847177 222993.21902552 271598.80927795
 204064.65599373 132491.27227859 213727.3281055  460581.00875062
 280544.81405032 483254.25870245 304319.25556382 334564.53386097
 383504.4219283  131863.6669991  212137.97664783 963122.4619854
 382516.23433878 182382.40622362 245479.2448357  204421.47640075
 314923.00538692 178016.82106963 367209.62948284 224186.02102753
 459397.59793977 348434.2199

### Generating submission

In [88]:
# Creating and outputting DataFrame
output = pd.DataFrame({'ID': competition_data.ID, 'Median House Price': competition_preds})
output.to_csv('Vyas_Advay_answer.csv', index=False)