# CSE 151A Group Project: Tree Models

In [None]:
# All of our imports
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import math
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import mean_squared_error
from sklearn.feature_selection import RFECV
from sklearn.preprocessing import PolynomialFeatures
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from scipy.stats import uniform, poisson
import warnings

Run this cell to suppress warnings while the rest of the notebook executes.

In [None]:
warnings.filterwarnings('ignore')

First, we load the processed data that will be used to train the models.

In [None]:
# Read the preprocessed feature table and target table from the working
# directory (features first, then targets).
input_df, output_df = (
    pd.read_csv(csv_name)
    for csv_name in ('processed_input.csv', 'processed_output.csv')
)
# Render both frames with the notebook's rich display, in the same order.
display(input_df, output_df)

In [None]:
# Two-stage split with a shared seed: first hold out 20% of the rows as the
# test set, then hold out 20% of the remainder as the validation set
# (roughly 64% train / 16% validation / 20% test overall).
X_trainval, X_test, y_trainval, y_test = train_test_split(
    input_df, output_df, test_size=0.2, random_state=42
)
X_train, X_val, y_train, y_val = train_test_split(
    X_trainval, y_trainval, test_size=0.2, random_state=42
)

# Report the (rows, columns) of every partition as a quick sanity check.
print(f'Training data shapes: {X_train.shape}, {y_train.shape}')
print(f'Validation data shapes: {X_val.shape}, {y_val.shape}')
print(f'Test data shape: {X_test.shape}, {y_test.shape}')

In [None]:
# Accumulators for the log-MSE of each model, one list per data partition.
# Each name is bound to its own fresh list.
validation_error_list, training_error_list, testing_error_list = [], [], []

In [None]:
# Baseline model: a decision tree with default hyperparameters.
# random_state is fixed because scikit-learn permutes the candidate features at
# every split, so ties between equally good splits can otherwise be broken
# differently from run to run.
decision_tree_model = DecisionTreeRegressor(random_state=42)
decision_tree_model.fit(X_train, y_train)

In [None]:
# Baseline errors: mean squared error of the untuned decision tree on each
# partition. The natural log of every MSE is also appended to the matching
# running list.
# NOTE(review): np.log yields -inf when an MSE is exactly 0, which an
# unconstrained tree may reach on its own training data — confirm that is
# acceptable wherever these lists are consumed.
yhat_train = decision_tree_model.predict(X_train)
yhat_test = decision_tree_model.predict(X_test)
yhat_val = decision_tree_model.predict(X_val)

train_error = mean_squared_error(y_true=y_train, y_pred=yhat_train)
test_error = mean_squared_error(y_true=y_test, y_pred=yhat_test)
val_error = mean_squared_error(y_true=y_val, y_pred=yhat_val)

training_error_list.append(np.log(train_error))
testing_error_list.append(np.log(test_error))
validation_error_list.append(np.log(val_error))

print('Training Error:',train_error)
print('Testing Error:',test_error)
print('Validation Error:',val_error)

In [None]:
# Search space for the decision tree. Every numeric range is sampled with a
# step of 2 to keep the grid small.
parameters = {
    'criterion': ['friedman_mse', 'squared_error'],
    'max_depth': list(range(1, 15, 2)),          # 1, 3, ..., 13
    'min_samples_split': list(range(2, 11, 2)),  # 2, 4, ..., 10
    'max_leaf_nodes': list(range(3, 20, 2)),     # 3, 5, ..., 19
}

# Exhaustive grid search scored by negative MSE with 10-fold cross-validation,
# run on a single worker. Nothing is fitted until .fit() is called.
tuning_model = GridSearchCV(
    estimator=decision_tree_model,
    param_grid=parameters,
    scoring='neg_mean_squared_error',
    cv=10,
    verbose=0,
    n_jobs=1,
)

In [None]:
# Run the cross-validated grid search on the training partition only; the
# validation and test partitions are not passed to the search.
tuning_model.fit(X=X_train, y=y_train)

In [None]:
# Show the hyperparameter combination that achieved the best mean
# cross-validated score in the decision-tree grid search.
tuning_model.best_params_

In [None]:
# Refit a decision tree on the training partition with the hyperparameters the
# grid search selected. They are read from tuning_model.best_params_ instead of
# being typed in by hand, so this cell cannot drift out of sync with the search
# when the notebook is re-run. random_state is fixed so the refit is
# reproducible.
tuned_decision_tree_model = DecisionTreeRegressor(
    **tuning_model.best_params_,
    random_state=42,
)
tuned_decision_tree_model.fit(X_train, y_train)

In [None]:
# Errors of the tuned decision tree: mean squared error on each partition.
# The natural log of every MSE is appended to the matching running list.
yhat_train = tuned_decision_tree_model.predict(X_train)
yhat_test = tuned_decision_tree_model.predict(X_test)
yhat_val = tuned_decision_tree_model.predict(X_val)

train_error = mean_squared_error(y_true=y_train, y_pred=yhat_train)
test_error = mean_squared_error(y_true=y_test, y_pred=yhat_test)
val_error = mean_squared_error(y_true=y_val, y_pred=yhat_val)

training_error_list.append(np.log(train_error))
testing_error_list.append(np.log(test_error))
validation_error_list.append(np.log(val_error))

print('Training Error:',train_error)
print('Testing Error:',test_error)
print('Validation Error:',val_error)

In [None]:
# Random forest model: grid search over forest size and tree-shape limits.
param_grid = {
    'n_estimators': [50, 100],  # Number of trees in the forest
    'max_depth': [10, 25],  # Maximum depth of the tree
    'min_samples_split': [2, 5, 10],  # Minimum number of samples required to split an internal node
    'min_samples_leaf': [1, 3]  # Minimum number of samples required to be at a leaf node
}
# Random forests bootstrap rows and subsample features, so the seed is fixed to
# make the cross-validation scores (and the winning parameters) reproducible.
rf = RandomForestRegressor(random_state=42)
# 3-fold CV scored by negative MSE, using all available cores.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2, scoring='neg_mean_squared_error')
grid_search.fit(X_train, y_train)

In [None]:
# Show the best hyperparameter combination found by the grid search most
# recently bound to `grid_search` (the random-forest search when the notebook
# is run top to bottom).
grid_search.best_params_

In [None]:
# Final random forest, fitted on the training partition.
# NOTE(review): the hyperparameters are typed in by hand, presumably copied
# from the grid search output above — re-check them if the search is re-run.
# random_state is fixed so the fitted forest (and the errors reported for it)
# are reproducible across runs.
tuned_grid_search_model = RandomForestRegressor(
    max_depth=25,
    min_samples_leaf=1,
    min_samples_split=2,
    n_estimators=100,
    random_state=42,
)
tuned_grid_search_model.fit(X_train, y_train)

In [None]:
# Mean squared error of the tuned random forest on the train, test and
# validation partitions.
yhat_train = tuned_grid_search_model.predict(X_train)
yhat_test = tuned_grid_search_model.predict(X_test)
yhat_val = tuned_grid_search_model.predict(X_val)

train_error = mean_squared_error(y_true=y_train, y_pred=yhat_train)
test_error = mean_squared_error(y_true=y_test, y_pred=yhat_test)
val_error = mean_squared_error(y_true=y_val, y_pred=yhat_val)

print('Training Error:',train_error)
print('Testing Error:',test_error)
print('Validation Error:',val_error)

Now, we will try out a few gradient boosting libraries to see if we can improve upon our current error. We'll start with the most popular one: XGBoost. Gradient boosting is an ensemble method that trains multiple shallow decision trees in sequence, with each new tree correcting the mistakes of the trees before it.


In [None]:
pip install xgboost

In [None]:
# Gradient boosting models

# xgboost is imported here, not in the top import cell, because it is only
# installed by the cell directly above. GridSearchCV is already imported at the
# top of the notebook, so it is not imported again.
import xgboost as xgb

model = xgb.XGBRegressor()

# Search space for the XGBoost regressor.
param_grid = {
    'learning_rate': [0.05, 0.1, 0.2],
    'max_depth': [5,7,9],
    'n_estimators': [100, 200, 300],
    'subsample': [0.6, 0.7, 0.9],
    'colsample_bytree': [0.7, 0.9, 0.95],
    'gamma': [0, 0.1],
}

# Score by negative MSE, the metric used by the other grid searches and by
# every error report in this notebook. Without `scoring`, GridSearchCV falls
# back to the estimator's own score (R^2 for regressors).
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=2,
    n_jobs=-1,
    verbose=2,
)
grid_search.fit(X_train, y_train)

# "Best Score" is the mean cross-validated negative MSE (closer to 0 is better).
print("Best Parameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)

These are the best hyperparameters found by the grid search above:

In [None]:
# Hyperparameters for the final XGBoost model.
# NOTE(review): presumably copied by hand from the grid search output above —
# re-check them if the search is re-run.
xgb_final_params = {
    'learning_rate': 0.1,
    'max_depth': 7,
    'n_estimators': 200,
    'colsample_bytree': 0.7,
    'subsample': 0.9,
    'gamma': 0,
}
xgbmodel = xgb.XGBRegressor(**xgb_final_params)
xgbmodel.fit(X_train, y_train)

# Persist the fitted model to the working directory.
# NOTE(review): the path has no file extension; confirm which on-disk format
# the installed XGBoost version selects in that case.
xgbmodel.save_model('part_2_xgb')

In [None]:
# Mean squared error of the fitted XGBoost model on the train, test and
# validation partitions.
yhat_train = xgbmodel.predict(X_train)
yhat_test = xgbmodel.predict(X_test)
yhat_val = xgbmodel.predict(X_val)

train_error = mean_squared_error(y_true=y_train, y_pred=yhat_train)
test_error = mean_squared_error(y_true=y_test, y_pred=yhat_test)
val_error = mean_squared_error(y_true=y_val, y_pred=yhat_val)

print('Training Error:',train_error)
print('Testing Error:',test_error)
print('Validation Error:',val_error)

Next, we try scikit-learn's built-in gradient boosting regressor:

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

# Seeded so the cross-validation scores are reproducible across runs.
model = GradientBoostingRegressor(random_state=42)

# Search space for scikit-learn's gradient boosting regressor.
param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.1, 0.05, 0.01],
    'max_depth': [3, 4, 5],
}

# Perform Grid Search
# Scored by negative MSE, the metric used by the decision-tree and
# random-forest searches and by every error report in this notebook. Without
# `scoring`, GridSearchCV falls back to the estimator's own score (R^2).
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=2,
    n_jobs=-1,
    verbose=2,
)
grid_search.fit(X_train, y_train)

# View Results
# "Best Score" is the mean cross-validated negative MSE (closer to 0 is better).
print("Best Parameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)

In [None]:
# Final scikit-learn gradient boosting model, fitted on the training partition.
# NOTE(review): the hyperparameters are typed in by hand, presumably copied
# from the grid search output above — re-check them if the search is re-run.
# random_state is fixed so the fitted model (and the errors reported for it)
# are reproducible across runs.
gbmodel = GradientBoostingRegressor(
    learning_rate=0.05,
    max_depth=5,
    n_estimators=300,
    random_state=42,
)
gbmodel.fit(X_train, y_train)

In [None]:
# Mean squared error of the fitted gradient boosting model on the train, test
# and validation partitions.
yhat_train = gbmodel.predict(X_train)
yhat_test = gbmodel.predict(X_test)
yhat_val = gbmodel.predict(X_val)

train_error = mean_squared_error(y_true=y_train, y_pred=yhat_train)
test_error = mean_squared_error(y_true=y_test, y_pred=yhat_test)
val_error = mean_squared_error(y_true=y_val, y_pred=yhat_val)

print('Training Error:',train_error)
print('Testing Error:',test_error)
print('Validation Error:',val_error)