In [1]:
# from google.colab import drive
# drive.mount('/content/drive')

In [30]:
import pandas as pd
from sklearn.svm import LinearSVC
from sklearn.svm import SVR
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split, cross_val_score 
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.datasets import load_iris
from sklearn.feature_selection import SelectFromModel
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error, r2_score
from xgboost.sklearn import XGBRegressor
import numpy as np
from sklearn.cluster import DBSCAN



pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 100)

## **Read data:**

In [19]:
X_train = pd.read_pickle("Selected_X_train.pkl")
X_test = pd.read_pickle("Selected_X_test.pkl")
y_train = pd.read_pickle("Selected_y_train.pkl")
y_test = pd.read_pickle("Selected_y_test.pkl")

In [31]:
# check same length of samples and labels
# train
assert X_train.shape[0] == y_train.shape[0]
# test
assert X_test.shape[0] == y_test.shape[0]

# check same features
assert X_train.shape[1] == X_test.shape[1]

## **Baseline prediction:**

In [32]:
y_baseline = np.full(y_test.shape,y_train.mean())
print(f'{np.sqrt(mean_squared_error(y_test, y_baseline))}')

1.4742915570115163


## **Linear Regression (Normalized)**

In [33]:
from sklearn.linear_model import LinearRegression
reg = LinearRegression(normalize=True).fit(X_train, y_train)
y_pred_lin_reg_norm = reg.predict(X_test)

In [34]:
print(f'{np.sqrt(mean_squared_error(y_test, y_pred_lin_reg_norm))}')

1.4646936533737782


## **Linear Regression (Un-Normalized)**

In [35]:
from sklearn.linear_model import LinearRegression
reg = LinearRegression(normalize=False).fit(X_train, y_train)
y_pred_lin_reg = reg.predict(X_test)
print(f'{np.sqrt(mean_squared_error(y_test, y_pred_lin_reg))}')

1.4646936533737784


## **ElasticNet**

In [26]:
from sklearn.linear_model import ElasticNet
from sklearn.model_selection import RandomizedSearchCV

# Use randomized search to tune the parameters:

params = {"l1_ratio": np.arange(0,1.01,0.05),
                  "alpha": [0.00001,0.00003,0.0001,0.0003,0.001,0.003,0.01,0.03,0.1,0.3,1,3,10,30,100,300],
                  "warm_start": [True,False],
                  "selection": ['random', 'cyclic']}

eNet = ElasticNet()
elastic_net_rnd = RandomizedSearchCV(eNet, params, scoring='r2',
                                     n_jobs = -1, verbose = 2)
elastic_net_rnd.fit(X_train, y_train)
y_pred_elasticnet = elastic_net_rnd.predict(X_test)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   20.9s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:   25.0s finished


In [27]:
print(f'{np.sqrt(mean_squared_error(y_test, y_pred_elasticnet))}')

1.4636898895535482


## **Random Forest Regressor**

In [None]:
# Create the parameter grid based on the results of random search 
# Consider using tree pruning
rf = RandomForestRegressor()
params = {
    'bootstrap': [True],
    'max_depth': np.arange(2,50,7),
    'max_features': np.arange(5,115,10),
    'min_samples_leaf': np.arange(2,10,1),
    'min_samples_split': np.arange(2,70,5),
    'n_estimators': [100, 200, 300, 1000]
}
rnd_forest_src = RandomizedSearchCV(rf, params, scoring='r2', n_jobs = -1, verbose = 2)

# Fit the grid search to the data
rnd_forest_src.fit(X_train, y_train)
print(rnd_forest_src.best_params_)
best_grid = rnd_forest_src.best_estimator_
y_pred_rnd_forest = rnd_forest_src.predict(X_test)
print(f'{np.sqrt(mean_squared_error(y_test, y_pred_rnd_forest))}')

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


## **XGBoost Regressor**

In [None]:
# Various hyper-parameters to tune
xgb = XGBRegressor()
params = {'objective':['reg:squarederror'],
              'learning_rate': [0.001,0.003,0.01,0.03,0.1,0.3,1,3,10],
              'max_depth': np.arange(2,50,7),
              'subsample': [1], # Reduce to prevent overfitting
              'colsample_bytree': [1]} # Reduct to prevent overfitting

xgb_rnd = RandomizedSearchCV(xgb, params, scoring='r2', verbose = 0)

xgb_rnd.fit(X_train,y_train)

print(xgb_rnd.best_score_)
print(xgb_rnd.best_params_)

  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \


In [None]:
y_pred_xgb = xgb_rnd.predict(X_test)
print(f'{np.sqrt(mean_squared_error(y_test, y_pred_xgb))}')

## **Neural Network**

In [36]:
from keras.models import Model, Sequential
from keras.layers import Activation, Dense, LSTM, Embedding, TimeDistributed, recurrent
from keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasRegressor
from keras import regularizers

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [37]:
# Neural network
nn = Sequential()
nn.add(Dense(128, input_dim=X_train.shape[1], activation='relu',kernel_regularizer=regularizers.l2(0.0001)))
nn.add(Dense(64, activation='relu',kernel_regularizer=regularizers.l2(0.0001)))
nn.add(Dense(1,activation='linear', kernel_regularizer=regularizers.l2(0.0001)))

In [39]:
nn.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 128)               24064     
_________________________________________________________________
dense_2 (Dense)              (None, 64)                8256      
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 65        
Total params: 32,385
Trainable params: 32,385
Non-trainable params: 0
_________________________________________________________________


In [42]:
nn.compile(loss='mean_squared_error', optimizer='adam')

In [46]:
nn.fit(X_train, y_train, epochs=500, batch_size=100,validation_split = 0.2)

Train on 59556 samples, validate on 14889 samples
Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500
Epoch 23/500
Epoch 24/500
Epoch 25/500
Epoch 26/500
Epoch 27/500
Epoch 28/500
Epoch 29/500
Epoch 30/500
Epoch 31/500
Epoch 32/500
Epoch 33/500
Epoch 34/500
Epoch 35/500
Epoch 36/500
Epoch 37/500
Epoch 38/500
Epoch 39/500
Epoch 40/500
Epoch 41/500
Epoch 42/500
Epoch 43/500
Epoch 44/500
Epoch 45/500
Epoch 46/500
Epoch 47/500
Epoch 48/500
Epoch 49/500
Epoch 50/500
Epoch 51/500
Epoch 52/500
Epoch 53/500
Epoch 54/500
Epoch 55/500
Epoch 56/500
Epoch 57/500
Epoch 58/500
Epoch 59/500
Epoch 60/500
Epoch 61/500
Epoch 62/500
Epoch 63/500
Epoch 64/500
Epoch 65/500
Epoch 66/500
Epoch 67/500
Epoch 68/500
Epoch 69/500
Epoch 70/500
Epoch 71/500
Epoch 72/500
Epoch 73/500
Epoch 74/5

<keras.callbacks.callbacks.History at 0x7f55053669e8>

In [49]:
y_pred_nn = nn.predict(X_test)

In [50]:
print(f'{np.sqrt(mean_squared_error(y_test, y_pred_nn))}')

1.6164601135317884
