# AIS Summer Comp (Advay Vyas)

### Imports and data

In [90]:
# imports
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn import preprocessing
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import pandas as pd
import numpy as np
import pprint
from math import sqrt

In [91]:
# load data
train_file_path = 'C:\\Users\\advay\\OneDrive\\Desktop\\Coding\\VSCode\\Python\\AIS Summer Comp 2022\\train.csv'
all_data = pd.read_csv(train_file_path)
all_data.head()

Unnamed: 0,ID,Population,Median Age,Unemployment Rate,Median Income,Poverty Rate,Median House Price
0,CA42,840562,35.6,5.1,7.8,26.5,455800
1,TX3,913161,37.5,4.0,6.2,54.5,365700
2,CA22,768917,33.0,6.6,15.2,24.5,298300
3,IL6,710626,42.1,3.1,4.6,53.2,334200
4,IN5,791257,38.3,3.5,8.1,47.9,219100


In [92]:
# get target
y = all_data['Median House Price']
y.head()

0    455800
1    365700
2    298300
3    334200
4    219100
Name: Median House Price, dtype: int64

In [93]:
# creating X
features = ['Population', 'Median Age', 'Unemployment Rate', 'Median Income', 'Poverty Rate']
X = all_data[features]
X.head()

Unnamed: 0,Population,Median Age,Unemployment Rate,Median Income,Poverty Rate
0,840562,35.6,5.1,7.8,26.5
1,913161,37.5,4.0,6.2,54.5
2,768917,33.0,6.6,15.2,24.5
3,710626,42.1,3.1,4.6,53.2
4,791257,38.3,3.5,8.1,47.9


### Refined RandomForestRegressor Pipeline

#### Pipeline

In [139]:
# set up data distrubtion
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.25, random_state = 42)

In [140]:
scaler = preprocessing.StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [141]:
# create model
model = RandomForestRegressor(random_state = 42, criterion = 'squared_error')

In [142]:
# create pipeline
rf_pipeline = make_pipeline(preprocessing.StandardScaler(), model)

#### Hyperparameter grid values

In [105]:
from sklearn.model_selection import RandomizedSearchCV

# max features
max_features = ['sqrt', 'log2', None]

# bootstrap
bootstrap = ['True', 'False']

# tree depth
max_depth = [int(x) for x in np.linspace(start = 1, stop = 10, num = 5)]
max_depth.append(None)

# max leaf nodes
max_leaf_nodes = [int(x) for x in np.linspace(start = 2, stop = 20, num = 8)]
max_leaf_nodes.append(None)

# min samples leaf
min_samples_leaf = [int(x) for x in np.linspace(start = 2, stop = 10, num = 5)]
min_samples_leaf.append(1)

# min samples split
min_samples_split = [int(x) for x in np.linspace(start = 3, stop = 10, num = 5)]
min_samples_split.append(2)

# number of trees
n_estimators = [int(x) for x in np.linspace(start = 50, stop = 450, num = 8)]
n_estimators.append(100)

#### Creating hyperparameter grid to use in tuning with k-fold cross validation

In [106]:
# create hyperparameter grid
hyperparameters = {
 'randomforestregressor__bootstrap': bootstrap,
 'randomforestregressor__max_depth': max_depth,
 'randomforestregressor__max_features': max_features,
 'randomforestregressor__max_leaf_nodes': max_leaf_nodes,
 'randomforestregressor__min_samples_leaf': min_samples_leaf,
 'randomforestregressor__min_samples_split': min_samples_split,
 'randomforestregressor__n_estimators': n_estimators}
               
pprint.pprint(hyperparameters)

{'randomforestregressor__bootstrap': ['True', 'False'],
 'randomforestregressor__max_depth': [1, 3, 5, 7, 10, None],
 'randomforestregressor__max_features': ['sqrt', 'log2', None],
 'randomforestregressor__max_leaf_nodes': [2, 4, 7, 9, 12, 14, 17, 20, None],
 'randomforestregressor__min_samples_leaf': [2, 4, 6, 8, 10, 1],
 'randomforestregressor__min_samples_split': [3, 4, 6, 8, 10, 2],
 'randomforestregressor__n_estimators': [50,
                                         107,
                                         164,
                                         221,
                                         278,
                                         335,
                                         392,
                                         450,
                                         100]}


#### K-fold CV with GridSearchCV and RF Pipeline

In [107]:
# fit and tune model
rf_clf = RandomizedSearchCV(rf_pipeline, hyperparameters, n_iter = 150, cv = 5)
rf_clf.fit(X_train, y_train)      

print('Best Parameters')
print(rf_clf.best_params_)
print(rf_clf.refit)

Best Parameters
{'randomforestregressor__n_estimators': 107, 'randomforestregressor__min_samples_split': 8, 'randomforestregressor__min_samples_leaf': 4, 'randomforestregressor__max_leaf_nodes': 17, 'randomforestregressor__max_features': None, 'randomforestregressor__max_depth': 5, 'randomforestregressor__bootstrap': 'False'}
True


#### Evaluating the results of k-fold cross-validation

In [145]:
def evaluate(model, X_test, y_test):
    predictions = model.predict(X_test)
    error = sqrt(mean_squared_error(y_test, predictions))
    print(error)

    return error

In [146]:
baseline_model = RandomForestRegressor(random_state = 42, criterion = 'squared_error', max_features = None)
baseline_model.fit(X_train, y_train)
print('RMSE (lower is better)')
base_accuracy = evaluate(baseline_model, X_test, y_test)

RMSE (lower is better)
135361.16594452044


In [147]:
print('RMSE (lower is better)')
refined_model = RandomForestRegressor(random_state = 42, criterion = 'squared_error', n_estimators = 107, min_samples_split = 8,
    min_samples_leaf = 4, max_leaf_nodes = 17, max_features = None, max_depth = 5, bootstrap = True)
refined_model.fit(X_train, y_train)
refined_accuracy = evaluate(refined_model, X_test, y_test)

RMSE (lower is better)
144752.48983315096


In [111]:
print('RMSE Errors (higher difference is better)')
print(base_accuracy - refined_accuracy)

RMSE Errors (higher difference is better)
-9612.54400596692


### Predicting competition data

In [151]:
# reading competition data
competition_data_path = 'C:\\Users\\advay\\OneDrive\\Desktop\\Coding\\VSCode\\Python\\AIS Summer Comp 2022\\evaluation_input.csv'
competition_data = pd.read_csv(competition_data_path)
competition_data.head()

Unnamed: 0,ID,Population,Median Age,Unemployment Rate,Median Income,Poverty Rate
0,TX35,857654,33.0,4.1,18.4,25.2
1,PR16,678333,43.0,4.6,13.5,28.3
2,NY4,730314,40.4,3.6,5.7,43.6
3,OR1,858875,38.0,3.8,8.9,40.7
4,GA8,706237,37.6,5.3,17.3,22.7


In [152]:
# creating competition X
competition_X = competition_data[features]
#competition_X_scaled = scaler.transform(competition_X)
competition_X.head()

Unnamed: 0,Population,Median Age,Unemployment Rate,Median Income,Poverty Rate
0,857654,33.0,4.1,18.4,25.2
1,678333,43.0,4.6,13.5,28.3
2,730314,40.4,3.6,5.7,43.6
3,858875,38.0,3.8,8.9,40.7
4,706237,37.6,5.3,17.3,22.7


In [153]:
scaler = preprocessing.StandardScaler().fit(X)
X_scaled = scaler.transform(X)
competition_X_scaled = scaler.transform(competition_X)
pprint.pprint(competition_X_scaled)
baseline_model.fit(X_scaled, y)

array([[ 1.66326066, -1.59456436, -0.38139992,  1.33522784, -0.71656292],
       [-1.15862523,  1.12676742, -0.03372451,  0.28081746, -0.44403868],
       [-0.34062594,  0.41922116, -0.72907533, -1.39763172,  0.90100029],
       ...,
       [ 0.4741631 , -0.50603164, -0.8681455 ,  0.62511472, -0.84842948],
       [ 0.61925368,  0.11987466, -1.00721566, -0.70903718,  2.04384385],
       [-0.47632183,  0.55528775, -0.24232976, -0.75207434, -0.25063439]])


RandomForestRegressor(max_features=None, random_state=42)

In [154]:
# competition predictions
competition_preds = baseline_model.predict(competition_X_scaled)
print(competition_preds)

[ 182758.  263926.  439692.  330045.  156362.  193470.  173912.  316124.
  398793.  126642.  547971.  425030.  414446.  316522.  182470.  168764.
  391867.  205511.  218936.  211084.  262031.  165863.  186071.  486949.
  528147.  969253.  574135.  394498.  564091.  253096.  227566.  517046.
  187086.  166492.  189315.  350592.  199473.  305660.  203876.  252290.
  197280.  131279.  203845.  432877.  273584.  515212.  285186.  325011.
  400072.  130705.  208811. 1253987.  345099.  188900.  244877.  199469.
  296049.  190940.  311728.  211912.  460753.  333006.  419263.  183283.
  158642.  504869.  227464.  473040.  559669.  449238.  235308.  199875.
  277206.  152869.  203563.  405730.  278860.  156889.  158504.  201672.
  157335.  179648.  158843.  387098.  711594.  234988.  185472.  407945.
  193137.  287224.  149336.  463815.  364091.  413299.  179221.  168073.
  404656.  164883.  313913.  272291.  287867.  437874.  133049.  173307.
  278506.  245373.  439329.  182042.  169265.  2618

### Generating submission

In [155]:
# Creating and outputting DataFrame
output = pd.DataFrame({'ID': competition_data.ID, 'Median House Price': competition_preds})
output.to_csv('Vyas_Advay_answer.csv', index=False)