In [22]:
import numpy as np # arrays & loading data

from sklearn.linear_model import LinearRegression # linear regression class from sklearn library
from sklearn.model_selection import train_test_split # we will split the data in 3 (training + cross validation + testing data)
from sklearn.preprocessing import StandardScaler, PolynomialFeatures # z-score normalization & polynomial features classes
from sklearn.metrics import mean_squared_error 

In [23]:
# Read the hotel dataset: skip the header row and keep only CSV columns 1 through 4
data = np.loadtxt(
    './hotel_data2.csv',
    delimiter=',',
    skiprows=1,
    usecols=range(1, 5),
)

# Of the loaded columns, the first three are the input features and the fourth is the target
X, y = data[:, :3], data[:, 3]

print(f"shape of input X is: {X.shape}")
print(f"shape of output y is: {y.shape}")



shape of input X is: (50, 3)
shape of output y is: (50,)


In [24]:
# Three-way split of the data: training (60%), cross validation (20%) and test (20%)

# Step 1: keep 60% for training and hold back the remaining 40%
X_train, X_holdout, y_train, y_holdout = train_test_split(
    X, y, test_size=0.40, random_state=1
)

# Step 2: cut the held-back 40% in half -> CV SET (20%) and TEST SET (20%)
X_cv, X_test, y_cv, y_test = train_test_split(
    X_holdout, y_holdout, test_size=0.50, random_state=1
)

# the intermediate hold-out arrays are only needed for the second split
del X_holdout, y_holdout

print(f"training input shape:{X_train.shape}")
print(f"training output shape:{y_train.shape}")
print(f"cv input shape:{X_cv.shape}")
print(f"cv output shape:{y_cv.shape}")
print(f"test input shape:{X_test.shape}")
print(f"test output shape:{y_test.shape}")

training input shape:(30, 3)
training output shape:(30,)
cv input shape:(10, 3)
cv output shape:(10,)
test input shape:(10, 3)
test output shape:(10,)


In [46]:
# TRAINING DATA OPERATIONS
# z-score normalization: learn the statistics from the training set only,
# then use them to scale that same training set
standard_scaler = StandardScaler()
X_train_scaled = standard_scaler.fit(X_train).transform(X_train)

In [47]:
# TRAINING DATA OPERATIONS
# create the linear regression estimator
linear_model = LinearRegression()

# fit it ON THE SCALED TRAINING SET (inputs) against the training targets
linear_model.fit(X=X_train_scaled, y=y_train)

In [48]:
# TRAINING DATA OPERATIONS
# measure how well the linear model fits the data it was trained on, using MSE (mean squared error)

# predictions for the same scaled training inputs the model was fitted on
y_predicted = linear_model.predict(X=X_train_scaled)

# the scikit-learn result is divided by 2 to match the cost formula used here
# (per the original author's note: the first formula in the README file)
print(f"Training set MSE : {mean_squared_error(y_train, y_predicted)/2}")

Training set MSE : 346.51383183786123


In [49]:
# CV DATA OPERATIONS
# Scale the CV SET with transform(), NOT fit_transform(): the scaler must reuse the
# mean & standard deviation already learned from the TRAINING SET
X_cv_scaled = standard_scaler.transform(X=X_cv)

# predict on the scaled CV inputs and report the halved MSE, same as for the training set
y_predicted = linear_model.predict(X=X_cv_scaled)
print(f"Cv set MSE : {mean_squared_error(y_cv, y_predicted)/2}")

Cv set MSE : 934.086751266797


In [50]:
# Experiment: extend the inputs with polynomial features
# TRAINING DATA OPERATIONS
# degree-2 expansion without the constant (bias) column
polynomial = PolynomialFeatures(degree=2, include_bias=False)

# The expansion keeps the original features and appends every degree-2 term:
# the squares AND the pairwise products (3 input columns -> 9 output columns)
X_train_poly = polynomial.fit(X_train).transform(X_train)

# preview the first 10 expanded rows
print(X_train_poly[0:10])

[[8.700e+00 2.600e+03 2.800e+01 7.569e+01 2.262e+04 2.436e+02 6.760e+06
  7.280e+04 7.840e+02]
 [8.100e+00 2.200e+03 1.600e+01 6.561e+01 1.782e+04 1.296e+02 4.840e+06
  3.520e+04 2.560e+02]
 [8.200e+00 2.000e+03 2.400e+01 6.724e+01 1.640e+04 1.968e+02 4.000e+06
  4.800e+04 5.760e+02]
 [7.600e+00 1.700e+03 1.800e+01 5.776e+01 1.292e+04 1.368e+02 2.890e+06
  3.060e+04 3.240e+02]
 [7.700e+00 1.500e+03 2.000e+01 5.929e+01 1.155e+04 1.540e+02 2.250e+06
  3.000e+04 4.000e+02]
 [8.700e+00 2.300e+03 3.600e+01 7.569e+01 2.001e+04 3.132e+02 5.290e+06
  8.280e+04 1.296e+03]
 [8.100e+00 2.000e+03 1.900e+01 6.561e+01 1.620e+04 1.539e+02 4.000e+06
  3.800e+04 3.610e+02]
 [8.500e+00 2.000e+03 2.200e+01 7.225e+01 1.700e+04 1.870e+02 4.000e+06
  4.400e+04 4.840e+02]
 [8.300e+00 2.200e+03 2.800e+01 6.889e+01 1.826e+04 2.324e+02 4.840e+06
  6.160e+04 7.840e+02]
 [9.000e+00 3.300e+03 1.500e+01 8.100e+01 2.970e+04 1.350e+02 1.089e+07
  4.950e+04 2.250e+02]]


In [44]:
# TRAINING DATA OPERATIONS
# z-score the expanded features with a separate scaler, fitted on the
# polynomial training matrix (same procedure as for the plain features)
scaler_poly = StandardScaler()
scaler_poly.fit(X_train_poly)
X_train_poly_scaled = scaler_poly.transform(X_train_poly)

# preview the first 5 scaled rows
print(X_train_poly_scaled[0:5])

[[ 1.14771057  1.48400874  1.41859873  1.17738786  1.50371607  1.51642351
   1.46953655  2.10510201  1.32629063]
 [-0.04643916  0.48914255 -0.94573249 -0.07862544  0.33961138 -0.83664745
   0.37804325 -0.46615415 -0.82445093]
 [ 0.1525858  -0.00829055  0.63048832  0.12447989 -0.0047696   0.55042596
  -0.09948507  0.40916709  0.4790288 ]
 [-1.04156393 -0.7544402  -0.55167728 -1.05677071 -0.8487455  -0.68803245
  -0.73050463 -0.78072273 -0.54746149]
 [-0.84253898 -1.2518733  -0.15762208 -0.86612583 -1.18100038 -0.3330077
  -1.09433573 -0.82175341 -0.23788505]]


In [53]:
# TRAINING DATA OPERATIONS
# Repeat the fit / predict / MSE steps of the plain linear model, this time on the
# scaled polynomial features. fit() returns the estimator itself, so it can be chained.
model = LinearRegression().fit(X_train_poly_scaled, y_train)

# training-set predictions and the halved MSE, reported as for the first model
y_predicted = model.predict(X=X_train_poly_scaled)
print(f"Training set MSE : {mean_squared_error(y_train, y_predicted)/2}")

Training set MSE : 182.5426973445668


In [54]:
# CV DATA OPERATIONS

# Push the CV inputs through the already-fitted pipeline: polynomial expansion first,
# then the scaler that was fitted on the polynomial training features (transform() only)
X_cv_poly = polynomial.transform(X=X_cv)
X_cv_poly_scaled = scaler_poly.transform(X=X_cv_poly)

# predict with the polynomial model and report the halved MSE on the CV set
y_cv_predicted = model.predict(X=X_cv_poly_scaled)
print(f"Cross validation MSE: {mean_squared_error(y_cv, y_cv_predicted) / 2}")

Cross validation MSE: 1741.2751689416066
