<a href="https://colab.research.google.com/github/ChitandaMayaka/CMPE255-Team-Project/blob/main/TrainingAndEvaluation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Model Training, Evaluation and Comparison
1. Models:
  - Linear Regression (baseline)
  - Support Vector Machines
  - Random Forest 
  - XGBoost
  - LightGBM
2. Evaluation:
  RMSE on the target variable (Price). For cross-validation we use the default 5 folds (cv=5).
3. Comparison:
  Compare the training time, CV RMSE, training RMSE, and test RMSE across the different models.


## Load data

In [None]:
import numpy as np
import pandas as pd
import requests
from time import time

In [None]:
# Load data
def load_dataset(fn, url, timeout=30):
    """Download ``url`` and save the response body to the local file ``fn``.

    Parameters
    ----------
    fn : str
        Path of the local file to create or overwrite (opened in binary mode).
    url : str
        Address of the resource to fetch with an HTTP GET request.
    timeout : float, optional
        Seconds to wait for the server before giving up, so a stalled
        connection cannot hang the notebook indefinitely.

    Raises
    ------
    requests.HTTPError
        If the server responds with an error status (4xx/5xx).
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly instead of silently saving an error page as a "CSV".
    response.raise_for_status()
    # The context manager guarantees the file is flushed and closed.
    with open(fn, 'wb') as file:
        file.write(response.content)

In [None]:
# Fetch each train/test split from the project's GitHub repository into the
# working directory, keeping the remote file name as the local file name.
DATASET_BASE_URL = 'https://raw.githubusercontent.com/ChitandaMayaka/CMPE255-Team-Project/main/dataset/'
for split_file in ('x_train.csv', 'x_test.csv', 'y_train.csv', 'y_test.csv'):
    load_dataset(split_file, DATASET_BASE_URL + split_file)

In [None]:
# Read the four downloaded CSV files into DataFrames (features and targets
# for the train and test splits).
X_train, X_test, y_train, y_test = (
    pd.read_csv(csv_name)
    for csv_name in ('x_train.csv', 'x_test.csv', 'y_train.csv', 'y_test.csv')
)

In [18]:
# import evaluation metrics and cross validation, grid search
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import cross_val_score, cross_validate, GridSearchCV, RandomizedSearchCV

In [None]:
def getRMSE(y_true, y_pred):
  """Return the root-mean-squared error between ``y_true`` and ``y_pred``.

  Computed as the square root of scikit-learn's ``mean_squared_error``.
  """
  mse = mean_squared_error(y_true, y_pred)
  return mse ** 0.5

In [None]:
# Dictionary collecting the metrics of every model, keyed by model name.
# Example entry:
#   {"Linear Regression": {"training_time": ..., "cv_rmse": ...,
#                          "train_rmse": ..., "test_rmse": ...,
#                          "train_r2": ..., "test_r2": ...}}
results = {}

## Linear Regression

In [None]:
# use linear regression as a basic model for baseline
from sklearn.linear_model import LinearRegression

In [None]:
# Baseline model: linear regression with default settings.
linreg = LinearRegression()

# Cross-validated RMSE on the training set. The scorer returns the negated
# RMSE, so the sign is flipped after averaging over the folds.
linreg_cv_rmse = -np.mean(
    cross_val_score(linreg, X_train, y_train, scoring='neg_root_mean_squared_error')
)

# Time only the final fit on the full training set.
start_t = time()
linreg.fit(X_train, y_train)
linreg_training_time = time() - start_t

# Predictions on both splits, then RMSE and R^2 for each.
linreg_y_pred_train, linreg_y_pred_test = linreg.predict(X_train), linreg.predict(X_test)
linreg_rmse_train, linreg_rmse_test = (
    getRMSE(y_train, linreg_y_pred_train),
    getRMSE(y_test, linreg_y_pred_test),
)
linreg_r2_train, linreg_r2_test = (
    r2_score(y_train, linreg_y_pred_train),
    r2_score(y_test, linreg_y_pred_test),
)

# Collect the metrics and register them in the shared results dictionary.
linreg_result = dict(
    training_time=linreg_training_time,
    cv_rmse=linreg_cv_rmse,
    train_rmse=linreg_rmse_train,
    test_rmse=linreg_rmse_test,
    train_r2=linreg_r2_train,
    test_r2=linreg_r2_test,
)
results["Linear Regression"] = linreg_result

In [None]:
# Show the collected linear-regression metrics.
print("Linear Regression:", linreg_result)

Linear Regression: {'training_time': 0.02353811264038086, 'cv_rmse': 8783.593741764049, 'train_rmse': 8769.258643615587, 'test_rmse': 8908.23874588955, 'train_r2': 0.41518502942025926, 'test_r2': 0.3993815949713295}


## Support Vector Machines

In [None]:
from sklearn.svm import SVR

### Grid Search

In [None]:
# Hyper-parameter grid searched for the SVR model.
# NOTE(review): a wider grid (kernels linear/poly/rbf/sigmoid,
# C in [5, 10, 30, 70], gamma in ['auto', 'scale']) was left commented out
# here -- presumably trimmed to keep the search time manageable; confirm.
svr_param_grid = dict(
    kernel=['poly'],
    C=[10, 70],
)

In [None]:
# Exhaustive search over svr_param_grid, scored by negated RMSE.
svr_gs = GridSearchCV(
    estimator=SVR(),
    param_grid=svr_param_grid,
    scoring='neg_root_mean_squared_error',
)
# The target DataFrame is flattened to 1-D before being passed to the SVR fit.
svr_gs.fit(X_train, np.ravel(y_train.values))

GridSearchCV(cv=None, error_score=nan,
             estimator=SVR(C=1.0, cache_size=200, coef0=0.0, degree=3,
                           epsilon=0.1, gamma='scale', kernel='rbf',
                           max_iter=-1, shrinking=True, tol=0.001,
                           verbose=False),
             iid='deprecated', n_jobs=None,
             param_grid={'C': [10, 70], 'kernel': ['poly']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='neg_root_mean_squared_error', verbose=0)

In [None]:
# The search scores with negated RMSE, so flip the sign to report plain RMSE.
svr_cv_rmse = -svr_gs.best_score_
svr_best_params = svr_gs.best_params_
print("CV RMSE:{}, Best parameters:{}".format(svr_cv_rmse, svr_best_params))

CV RMSE:7532.576659307827, Best prameters:{'C': 70, 'kernel': 'poly'}


### Best Parameters Model

In [None]:
# Refit an SVR with the best hyper-parameters found by the grid search,
# timing only the fit on the full training set.
svr = SVR(**svr_best_params)
start_t = time()
svr.fit(X_train, y_train.values.ravel())
svr_training_time = time() - start_t

# Predictions on both splits, then RMSE and R^2 for each.
svr_y_pred_train, svr_y_pred_test = svr.predict(X_train), svr.predict(X_test)
svr_rmse_train, svr_rmse_test = (
    getRMSE(y_train, svr_y_pred_train),
    getRMSE(y_test, svr_y_pred_test),
)
svr_r2_train, svr_r2_test = (
    r2_score(y_train, svr_y_pred_train),
    r2_score(y_test, svr_y_pred_test),
)

# Collect the metrics (CV RMSE comes from the grid search) and register them.
svr_result = dict(
    training_time=svr_training_time,
    cv_rmse=svr_cv_rmse,
    train_rmse=svr_rmse_train,
    test_rmse=svr_rmse_test,
    train_r2=svr_r2_train,
    test_r2=svr_r2_test,
)
results["Support Vector Regression"] = svr_result

In [None]:
# Show the collected SVR metrics.
print("Support Vector Regression:", svr_result)

Support Vector Regression: {'training_time': 246.25625371932983, 'cv_rmse': 7532.576659307827, 'train_rmse': 7485.888133912164, 'test_rmse': 7761.023758893805, 'train_r2': 0.5738334209721521, 'test_r2': 0.544117452437032}


## RandomForest

In [13]:
from sklearn.ensemble import RandomForestRegressor

### Grid Search

In [None]:
# Search space for the random forest.
# `max_samples` is only valid when bootstrap sampling is enabled:
# RandomForestRegressor raises if bootstrap=False is combined with a
# non-None max_samples, which would make those sampled candidates fail and
# score NaN. The space is therefore split into two sub-grids, which
# RandomizedSearchCV accepts as a list of dicts.
rf_shared_params = {'n_estimators': [100, 300, 500, 700],
                    'max_depth': [4, 5, 6, 7]}
rf_params = [
    {**rf_shared_params, 'bootstrap': [True], 'max_samples': [0.7, 0.8, 0.9]},
    {**rf_shared_params, 'bootstrap': [False]},
]

In [12]:
# Randomized search over rf_params, scored by negated RMSE.
# Both the candidate sampling and the forests themselves are stochastic, so
# both are seeded to make the search reproducible on a fresh run.
rf_rs = RandomizedSearchCV(
    RandomForestRegressor(random_state=42),
    rf_params,
    scoring='neg_root_mean_squared_error',
    random_state=42,
)
rf_rs.fit(X_train, y_train.values.ravel())

RandomizedSearchCV(cv=None, error_score=nan,
                   estimator=RandomForestRegressor(bootstrap=True,
                                                   ccp_alpha=0.0,
                                                   criterion='mse',
                                                   max_depth=None,
                                                   max_features='auto',
                                                   max_leaf_nodes=None,
                                                   max_samples=None,
                                                   min_impurity_decrease=0.0,
                                                   min_impurity_split=None,
                                                   min_samples_leaf=1,
                                                   min_samples_split=2,
                                                   min_weight_fraction_leaf=0.0,
                                                   n_estimators=100,
                           

In [15]:
# The search scores with negated RMSE, so flip the sign to report plain RMSE.
rf_rs_rmse = -rf_rs.best_score_
rf_best_params = rf_rs.best_params_
print("RS RMSE:{}, Best parameters:{}".format(rf_rs_rmse, rf_best_params))

RS RMSE:6413.147594926005, Best prameters:{'n_estimators': 100, 'max_samples': 0.8, 'max_depth': 7, 'bootstrap': True}


### Best parameters model

In [19]:
# Train and test a random forest with the best parameters from the search.
# The forest is seeded so the reported metrics are reproducible; the seed is
# merged first so a `random_state` in rf_best_params (if ever searched) wins.
rf_reg = RandomForestRegressor(**{'random_state': 42, **rf_best_params})

# Cross-validated RMSE on the training set (scorer is negated RMSE).
rf_cv_rmse = -cross_val_score(rf_reg, X_train, y_train.values.ravel(), scoring='neg_root_mean_squared_error').mean()

# Time only the final fit on the full training set.
start_time = time()
rf_reg.fit(X_train, y_train.values.ravel())
rf_training_time = time() - start_time

# Predictions on both splits, then RMSE and R^2 for each.
rf_y_pred_train = rf_reg.predict(X_train)
rf_y_pred_test = rf_reg.predict(X_test)
rf_rmse_train = getRMSE(y_train, rf_y_pred_train)
rf_rmse_test = getRMSE(y_test, rf_y_pred_test)
rf_r2_train = r2_score(y_train, rf_y_pred_train)
rf_r2_test = r2_score(y_test, rf_y_pred_test)

# Collect the metrics and register them in the shared results dictionary.
rf_result = {"training_time":rf_training_time,
                 "cv_rmse":rf_cv_rmse,
                 "train_rmse":rf_rmse_train,
                 "test_rmse":rf_rmse_test,
                 "train_r2":rf_r2_train,
                 "test_r2":rf_r2_test,
                 }
results["RandomForest Regression"] = rf_result

In [20]:
# Show the collected random-forest metrics.
print("RandomForest Regression:", rf_result)

RandomForest Regression: {'training_time': 10.284673690795898, 'cv_rmse': 6403.91323953507, 'train_rmse': 6173.5237313886155, 'test_rmse': 6906.983298038885, 'train_r2': 0.7101595692916947, 'test_r2': 0.6389297120664961}
