# AIS Summer Comp (Advay Vyas)

### Imports and data

In [22]:
# imports
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn import preprocessing
from sklearn.pipeline import make_pipeline
import pandas as pd
import numpy as np
import pprint
from math import sqrt

In [23]:
# load data
train_file_path = 'C:\\Users\\advay\\OneDrive\\Desktop\\Coding\\VSCode\\Python\\AIS Summer Comp 2022\\train.csv'
all_data = pd.read_csv(train_file_path)

In [24]:
# get target
y = all_data['Median House Price']
y.head()

0    455800
1    365700
2    298300
3    334200
4    219100
Name: Median House Price, dtype: int64

In [25]:
# creating X
features = ['Population', 'Median Age', 'Unemployment Rate', 'Median Income', 'Poverty Rate']
X = all_data[features]
X.head()

Unnamed: 0,Population,Median Age,Unemployment Rate,Median Income,Poverty Rate
0,840562,35.6,5.1,7.8,26.5
1,913161,37.5,4.0,6.2,54.5
2,768917,33.0,6.6,15.2,24.5
3,710626,42.1,3.1,4.6,53.2
4,791257,38.3,3.5,8.1,47.9


### Refined RandomForestRegressor Pipeline

#### Pipeline

In [26]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.25, random_state = 42)
                    
# preprocess and scale data
scaler = preprocessing.StandardScaler().fit(X_train) #create scaler
X_train_scaled = scaler.transform(X_train)
print("\nMean of scaled training data:")
print(X_train_scaled.mean(axis=0))
print("\nStandard Dev of scaled training data:")
print(X_train_scaled.std(axis=0))


Mean of scaled training data:
[ 7.51991062e-16  1.14278957e-15 -4.07081776e-17  2.36847579e-17
  2.30926389e-16]

Standard Dev of scaled training data:
[1. 1. 1. 1. 1.]


In [27]:
# transform test set using training set scale
X_test_scaled = scaler.transform(X_test)
print("\nMean of scaled test data:")
print (X_test_scaled.mean(axis=0))
print("\nStandard Dev of scaled test data:")
print (X_test_scaled.std(axis=0))


Mean of scaled test data:
[ 0.02296455 -0.13318823 -0.19505616 -0.21431031 -0.03409505]

Standard Dev of scaled test data:
[0.79937603 0.89947398 1.05992033 0.82536742 0.85708767]


In [77]:
# create pipeline
rf_pipeline = make_pipeline(preprocessing.StandardScaler(), RandomForestRegressor(random_state = 42, criterion = 'squared_error', 
    max_features = 'auto'))

#### Hyperparameter grid values

In [80]:
from sklearn.model_selection import RandomizedSearchCV

# max features
max_features = ['auto', 'sqrt','log2']

# bootstrap
bootstrap = ['True', 'False']

# tree depth
max_depth = [int(x) for x in np.linspace(start = 1, stop = 10, num = 5)]

# max leaf nodes
max_leaf_nodes = [int(x) for x in np.linspace(start = 2, stop = 20, num = 8)]

# min samples leaf
min_samples_leaf = [int(x) for x in np.linspace(start = 1, stop = 10, num = 3)]
min_samples_leaf.append(1)

# min samples split
min_samples_split = [int(x) for x in np.linspace(start = 2, stop = 10, num = 3)]
min_samples_split.append(2)

# number of trees
n_estimators = [int(x) for x in np.linspace(start = 80, stop = 450, num = 6)]
n_estimators.append(100)

#### Creating hyperparameter grid to use in tuning with k-fold cross validation

In [81]:
# create hyperparameter grid
hyperparameters = {
 'randomforestregressor__bootstrap': bootstrap,
 'randomforestregressor__max_depth': max_depth,
 'randomforestregressor__max_features': max_features,
 'randomforestregressor__max_leaf_nodes': max_leaf_nodes,
 'randomforestregressor__min_samples_leaf': min_samples_leaf,
 'randomforestregressor__min_samples_split': min_samples_split,
 'randomforestregressor__n_estimators': n_estimators}
               
pprint.pprint(hyperparameters)

{'randomforestregressor__bootstrap': ['True', 'False'],
 'randomforestregressor__max_depth': [1, 3, 5, 7, 10],
 'randomforestregressor__max_features': ['auto', 'sqrt', 'log2'],
 'randomforestregressor__max_leaf_nodes': [2, 4, 7, 9, 12, 14, 17, 20],
 'randomforestregressor__min_samples_leaf': [1, 5, 10, 1],
 'randomforestregressor__min_samples_split': [2, 6, 10, 2],
 'randomforestregressor__n_estimators': [80, 154, 228, 302, 376, 450, 100]}


#### K-fold CV with GridSearchCV and RF Pipeline

In [82]:
# fit and tune model
rf_clf = RandomizedSearchCV(rf_pipeline, hyperparameters, n_iter = 120, cv=6)
rf_clf.fit(X_train, y_train)      

print('Best Parameters')
print(rf_clf.best_params_)
print(rf_clf.refit)

Best Parameters
{'randomforestregressor__n_estimators': 302, 'randomforestregressor__min_samples_split': 2, 'randomforestregressor__min_samples_leaf': 5, 'randomforestregressor__max_leaf_nodes': 20, 'randomforestregressor__max_features': 'auto', 'randomforestregressor__max_depth': 10, 'randomforestregressor__bootstrap': 'False'}
True


#### Evaluating the results of k-fold cross-validation

In [71]:
def evaluate(model, X_test, y_test):
    predictions = model.predict(X_test)
    error = sqrt(mean_squared_error(y_test, predictions))
    print('RMSE:')
    print(error)

    return error

In [120]:
baseline_model = RandomForestRegressor(random_state = 42, criterion = 'squared_error', max_features = 'auto')
baseline_model.fit(X_train, y_train)
print('RMSE (lower is better)')
base_accuracy = evaluate(baseline_model, X_test, y_test)

RMSE (lower is better)
RMSE:
135361.16594452044


In [104]:
print('RMSE (lower is better)')
refined_accuracy = evaluate(rf_clf, X_test, y_test)

RMSE (lower is better)
RMSE:
151183.01954848226


In [93]:
print('RMSE Errors (higher difference is better)')
print(base_accuracy - refined_accuracy)

RMSE Errors (higher difference is better)
15543.30939102138


### Predicting competition data

In [114]:
# reading competition data
competition_data_path = 'C:\\Users\\advay\\OneDrive\\Desktop\\Coding\\VSCode\\Python\\AIS Summer Comp 2022\\evaluation_input.csv'
competition_data = pd.read_csv(competition_data_path)
competition_data.head()

Unnamed: 0,ID,Population,Median Age,Unemployment Rate,Median Income,Poverty Rate
0,TX35,857654,33.0,4.1,18.4,25.2
1,PR16,678333,43.0,4.6,13.5,28.3
2,NY4,730314,40.4,3.6,5.7,43.6
3,OR1,858875,38.0,3.8,8.9,40.7
4,GA8,706237,37.6,5.3,17.3,22.7


In [115]:
# creating competition X
competition_X = competition_data[features]
#competition_X_scaled = scaler.transform(competition_X)
competition_X.head()

Unnamed: 0,Population,Median Age,Unemployment Rate,Median Income,Poverty Rate
0,857654,33.0,4.1,18.4,25.2
1,678333,43.0,4.6,13.5,28.3
2,730314,40.4,3.6,5.7,43.6
3,858875,38.0,3.8,8.9,40.7
4,706237,37.6,5.3,17.3,22.7


In [116]:
# competition predictions
competition_preds = baseline_model.predict(competition_X)
print(competition_preds)

[ 199716.  159026.  479865.  328395.  162920.  202302.  194956.  395264.
  481639.  120628.  507208.  381868.  417949.  301795.  196560.  185826.
  457843.  212968.  191326.  203425.  238714.  176273.  212747.  477321.
  557063.  903735.  574812.  495227.  568278.  239760.  228184.  548812.
  257471.  178570.  192478.  328287.  225128.  337965.  238341.  273579.
  203047.  130227.  194391.  426190.  302372.  685082.  323584.  410770.
  337241.  123834.  206586. 1128846.  353978.  211349.  241544.  193687.
  298589.  196074.  268660.  204637.  456492.  361299.  408279.  187955.
  158743.  518792.  212613.  516614.  539083.  456403.  212396.  214390.
  318117.  163435.  246298.  341729.  286460.  161597.  169225.  228541.
  162796.  178022.  164762.  386969.  725449.  274553.  228506.  386719.
  203242.  314184.  153379.  471874.  376827.  425912.  180429.  188884.
  328949.  176227.  393931.  260271.  282788.  575535.  130177.  181770.
  352969.  212230.  454265.  187162.  173176.  2647

### Generating submission

In [117]:
# Creating and outputting DataFrame
output = pd.DataFrame({'ID': competition_data.ID, 'Median House Price': competition_preds})
output.to_csv('Vyas_Advay_answer.csv', index=False)