In [2]:
import pandas as pd

from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn import linear_model
from sklearn.ensemble import RandomForestRegressor

from utils import split_data, normalize, dimensionality_reduction, fit_model, evaluate_model

In [3]:
# Load the training set; the path is relative to the notebook's working directory.
training_df = pd.read_csv('training.csv')

In [4]:
def model_selection(models, training_df):
    '''
    Score every candidate model on one shared train/validation split.

    The data is split once and the two feature sets are passed through the
    `normalize` and `dimensionality_reduction` helpers before any model is
    fitted, so every candidate is fitted and evaluated on identical inputs.

    Parameters
    ----------
    models : iterable
        Candidate models; each one is handed to `fit_model` in turn.
    training_df :
        Data handed to `split_data` (in this notebook, the frame read from
        training.csv).

    Returns
    -------
    list
        One `evaluate_model` result per candidate, in the order given.
    '''
    features_train, features_val, target_train, target_val = split_data(training_df)
    features_train, features_val = normalize(features_train, features_val)
    features_train, features_val = dimensionality_reduction(features_train, features_val)

    # Each candidate is fitted and then immediately evaluated before moving on to the next one.
    return [
        evaluate_model(
            fit_model(candidate, features_train, target_train),
            features_val,
            target_val,
        )
        for candidate in models
    ]
    


In [5]:
# Candidate regressors to compare; order here is the order of the reported results.
models = [
    LinearRegression(),  # linear regression
    SVR(),  # support vector regression (library-default settings)
    linear_model.Lasso(alpha=0.1),  # lasso regression
    RandomForestRegressor(n_estimators=100, random_state=0),  # random forest regression, fixed seed
]


In [6]:
# Fit and evaluate every candidate on the loaded training data; one result per model, in list order.
model_results = model_selection(models, training_df)

In [7]:
# Header first, then one line per model result; print() stringifies each item itself.
print("Results of model training:", *model_results, sep="\n")

Results of model training:
LinearRegression RMSE = 22.2516
SVR RMSE = 17.4962
Lasso RMSE = 22.247
RandomForestRegressor RMSE = 10.3225


In [8]:
# Conclusions:
# Linear regression performed the worst (RMSE 22.2516). Two possible reasons: the relationship we are trying to model is non-linear, and the model may be overfitting.
# Lasso regression performed only marginally better than linear regression (RMSE 22.247 vs. 22.2516). This could be because the model is more resilient to overfitting; however, it is susceptible to leaving out key features, since only one feature is selected from a group of correlated features.
# Support vector regression performed a lot better than linear and lasso regression (RMSE 17.4962), possibly because it is resilient to outliers; however, it is possible that noise is still affecting the model.
# Random forest regression performed the best (RMSE 10.3225), likely because it is good at modelling non-linear relationships.


