# AIS Summer Comp (Advay Vyas)

### Imports and data

In [22]:
# imports
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn import preprocessing
from sklearn.pipeline import make_pipeline
import pandas as pd
import numpy as np
import pprint
from math import sqrt

In [23]:
# load data
train_file_path = 'C:\\Users\\advay\\OneDrive\\Desktop\\Coding\\VSCode\\Python\\AIS Summer Comp 2022\\train.csv'
all_data = pd.read_csv(train_file_path)

In [24]:
# get target
y = all_data['Median House Price']
y.head()

0    455800
1    365700
2    298300
3    334200
4    219100
Name: Median House Price, dtype: int64

In [25]:
# creating X
features = ['Population', 'Median Age', 'Unemployment Rate', 'Median Income', 'Poverty Rate']
X = all_data[features]
X.head()

Unnamed: 0,Population,Median Age,Unemployment Rate,Median Income,Poverty Rate
0,840562,35.6,5.1,7.8,26.5
1,913161,37.5,4.0,6.2,54.5
2,768917,33.0,6.6,15.2,24.5
3,710626,42.1,3.1,4.6,53.2
4,791257,38.3,3.5,8.1,47.9


### Refined RandomForestRegressor Pipeline

#### Pipeline

In [26]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.25, random_state = 42)
                    
# preprocess and scale data
scaler = preprocessing.StandardScaler().fit(X_train) #create scaler
X_train_scaled = scaler.transform(X_train)
print("\nMean of scaled training data:")
print(X_train_scaled.mean(axis=0))
print("\nStandard Dev of scaled training data:")
print(X_train_scaled.std(axis=0))


Mean of scaled training data:
[ 7.51991062e-16  1.14278957e-15 -4.07081776e-17  2.36847579e-17
  2.30926389e-16]

Standard Dev of scaled training data:
[1. 1. 1. 1. 1.]


In [27]:
# transform test set using training set scale
X_test_scaled = scaler.transform(X_test)
print("\nMean of scaled test data:")
print (X_test_scaled.mean(axis=0))
print("\nStandard Dev of scaled test data:")
print (X_test_scaled.std(axis=0))


Mean of scaled test data:
[ 0.02296455 -0.13318823 -0.19505616 -0.21431031 -0.03409505]

Standard Dev of scaled test data:
[0.79937603 0.89947398 1.05992033 0.82536742 0.85708767]


In [28]:
# create pipeline
rf_pipeline = make_pipeline(preprocessing.StandardScaler(), RandomForestRegressor(random_state = 42, criterion = 'squared_error'))

#### Hyperparameter grid values

In [29]:
from sklearn.model_selection import RandomizedSearchCV

# max features
max_features = ['auto', 'sqrt', 'log']

# bootstrap
bootstrap = ['True', 'False']

# tree depth
max_depth = [int(x) for x in np.linspace(start = 1, stop = 10, num = 5)]
max_depth.append('None')

# max features
max_features = [float(x) for x in np.linspace(start = 0, stop = 5, num = 6)]
max_features.append('None')

# max leaf nodes
max_leaf_nodes = [int(x) for x in np.linspace(start = 1, stop = 20, num = 8)]
max_leaf_nodes.append('None')

# min samples leaf
min_samples_leaf = [int(x) for x in np.linspace(start = 1, stop = 10, num = 3)]
min_samples_leaf.append(1)

# min samples split
min_samples_split = [int(x) for x in np.linspace(start = 2, stop = 10, num = 3)]
min_samples_split.append(2)

# number of trees
n_estimators = [int(x) for x in np.linspace(start = 80, stop = 450, num = 6)]

#### Creating hyperparameter grid to use in tuning with k-fold cross validation

In [34]:
# create hyperparameter grid
hyperparameters = {
 'randomforestregressor__bootstrap': bootstrap,
 'randomforestregressor__max_depth': max_depth,
 'randomforestregressor__max_features': max_features,
 'randomforestregressor__max_leaf_nodes': max_leaf_nodes,
 'randomforestregressor__min_samples_leaf': min_samples_leaf,
 'randomforestregressor__min_samples_split': min_samples_split,
 'randomforestregressor__n_estimators': n_estimators}
               
pprint.pprint(hyperparameters)

{'randomforestregressor__bootstrap': ['True', 'False'],
 'randomforestregressor__max_depth': [1, 3, 5, 7, 10, 'None'],
 'randomforestregressor__max_features': [0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 'None'],
 'randomforestregressor__max_leaf_nodes': [1, 3, 6, 9, 11, 14, 17, 20, 'None'],
 'randomforestregressor__min_samples_leaf': [1, 5, 10, 1],
 'randomforestregressor__min_samples_split': [2, 6, 10, 2],
 'randomforestregressor__n_estimators': [80, 154, 228, 302, 376, 450]}


#### K-fold CV with GridSearchCV and RF Pipeline

In [37]:
# fit and tune model
rf_clf = GridSearchCV(rf_pipeline, hyperparameters, cv=10)
rf_clf.fit(X_train, y_train)      

print('Best Parameters')
print(rf_clf.best_params_)
rf_clf.refit
print(rf_clf.refit)

#### Evaluating the results of k-fold cross-validation

In [None]:
def evaluate(model, X_test, y_test):
    predictions = model.predict(X_test)
    error = sqrt(mean_squared_error(y_test, predictions))
    print('RMSE:')
    print(error)

In [None]:
baseline_model = RandomForestRegressor(random_state = 42)
baseline_model.fit(X_train_scaled, y_train)
print('RMSE (lower is better)')
base_accuracy = evaluate(baseline_model, X_test_scaled, y_test)

In [None]:
print('RMSE (lower is better)')
refined_accuracy = evaluate(rf_clf, X_test, y_test)

In [None]:
print('RMSE Errors (higher difference is better)')
print(refined_accuracy - base_accuracy)
print('Percentage difference')
print(((refined_accuracy - base_accuracy) / base_accuracy) + '%')

### Predicting competition data

In [None]:
# reading competition data
competition_data_path = 'C:\\Users\\advay\\OneDrive\\Desktop\\Coding\\VSCode\\Python\\AIS Summer Comp 2022\\evaluation_input.csv'
competition_data = pd.read_csv(competition_data_path)
competition_data.head()

In [None]:
# creating competition X
competition_X = competition_data[features]
competition_X.head()

In [None]:
# competition predictions
competition_preds = rf_clf.predict(competition_X)
print(competition_preds)

### Generating submission

In [None]:
# Creating and outputting DataFrame
output = pd.DataFrame({'ID': competition_data.ID, 'Median House Price': competition_preds})
output.to_csv('Vyas_Advay_answer.csv', index=False)