In [0]:
import pandas as pd
import numpy as np

In [0]:
# Read master_final.csv into the working DataFrame `df`.
df = pd.read_csv(filepath_or_buffer='master_final.csv')

In [0]:
# Delete these columns from `df` in place; none of them is referenced by the
# cells that follow.
for unused_column in ['hpg_store_id', 'earlist_open_date', 'dayofyear',
                      'day_of_week', 'day', 'latitude', 'longitude']:
    del df[unused_column]

In [0]:
# Replace every missing value in the frame with 0.
df = df.fillna(value=0)

In [0]:
# One-hot encode the listed columns: each is replaced in the result by one
# indicator column per distinct value.
df1 = pd.get_dummies(
    data=df,
    columns=[
        'air_genre_name',
        'air_area_name',
        'month',
        'year',
        'dayofweek',
        'week_of_month',
        'air_store_id',
    ],
)

In [0]:
# Keep only the rows dated strictly before 2017-04-23.
df2 = df1.loc[
    pd.to_datetime(df1['calendar_date']).lt(pd.to_datetime('2017-04-23'))
]

In [0]:
# Date-based hold-out split at 2017-03-15: rows before that date go to the
# training set, rows on or after it go to the test set.
train = df2.loc[
    pd.to_datetime(df2['calendar_date']).lt(pd.to_datetime('2017-03-15'))
]
test = df2.loc[
    pd.to_datetime(df2['calendar_date']).ge(pd.to_datetime('2017-03-15'))
]

In [0]:
# calendar_date was only needed for the date filters; delete it in place from
# both splits before they are cast to numeric.
for split_frame in (test, train):
    del split_frame['calendar_date']

In [0]:
# Cast every remaining column of both splits to float64.
train = train.astype(dtype='float64')
test = test.astype(dtype='float64')

In [0]:
# Separate the target column 'visitors' from each split. pop() returns the
# column and removes it from the frame in place.
y_train = train.pop('visitors')
y_test = test.pop('visitors')

In [0]:
# Feature-matrix names for the two splits. These are aliases of the same
# objects as `train` / `test`, not copies.
X_train, X_test = train, test

In [0]:
# Keep the test rows whose closed_flag is anything other than 2.
new_x_test = X_test.loc[X_test['closed_flag'].ne(2)]

In [0]:
# Preview the first five rows of the filtered test features.
new_x_test.head(n=5)

In [0]:
# Preview the first five test targets (before the closed_flag filter is applied).
y_test.head(n=5)

In [0]:
# Select the targets whose index labels are still present in the filtered
# test features, so that X and y stay row-aligned.
new_y_test = y_test.loc[new_x_test.index]
new_y_test.head(n=5)

In [0]:
# Summary statistics of the filtered test targets.
new_y_test.describe()

In [0]:
from sklearn.preprocessing import StandardScaler
from sklearn import linear_model
from sklearn import ensemble
from sklearn import neighbors
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_squared_log_error

In [0]:
# Fit ordinary least squares on the training split and predict the filtered
# test rows.
lin_reg_pred = (
    linear_model.LinearRegression()
    .fit(X=X_train, y=y_train)
    .predict(X=new_x_test)
)

In [0]:
# Floor negative predictions at zero, in place. The log-based error computed
# in the metrics cell cannot be evaluated on negative values.
lin_reg_pred[lin_reg_pred < 0] = 0

In [0]:
# Report hold-out metrics for the linear-regression predictions.
print('Linear Regression', 'Mean Squared Error', sep='\n')
print(mean_squared_error(y_true=new_y_test, y_pred=lin_reg_pred))
print('Root Mean Squared Log Error')
print(np.sqrt(mean_squared_log_error(y_true=new_y_test, y_pred=lin_reg_pred)))
print('r^2')
print(r2_score(y_true=new_y_test, y_pred=lin_reg_pred))

In [0]:
# Hyper-parameter grid searched for the gradient boosting regressor.
# The loss is intentionally left out of the grid: least squares is already the
# estimator's default, and the name of that option changed between
# scikit-learn releases ('ls' was replaced by 'squared_error'), so pinning
# 'ls' makes the fit fail on current versions.
gbr_hps = {
    'n_estimators': [100, 200],
    'max_depth': [3, 5],
    'min_samples_split': [3, 5],
    'learning_rate': [.05, 0.1, 0.15],
}

gbr = ensemble.GradientBoostingRegressor()
# 5-fold cross-validation; candidates are ranked by negated mean squared error.
gbr_grid = GridSearchCV(gbr, gbr_hps, cv = 5, scoring = 'neg_mean_squared_error')

In [0]:
# Run the grid search on the training split, then predict the filtered test
# rows with the fitted search object.
gbr_pred = (
    gbr_grid
    .fit(X=X_train, y=y_train)
    .predict(X=new_x_test)
)

In [0]:
# Floor negative predictions at zero, in place. The log-based error computed
# in the metrics cell cannot be evaluated on negative values.
# (The previous loop assigned to an undefined name, `gbr_y_pred`, and would
# raise NameError on the first negative prediction.)
gbr_pred[gbr_pred < 0] = 0

In [0]:
# Report hold-out metrics for the gradient-boosting predictions.
# Every metric is scored against new_y_test, the targets aligned with the
# filtered rows that gbr_pred was computed on.
print('Gradient Boosting Regression')
print('Mean Squared Error')
print(mean_squared_error(new_y_test, gbr_pred))
print('Root Mean Squared Log Error')
print(np.sqrt(mean_squared_log_error(new_y_test, gbr_pred)))
print('r^2')
# Must use new_y_test here: the unfiltered y_test is not row-aligned with
# gbr_pred whenever the closed_flag filter removed rows.
print(r2_score(new_y_test, gbr_pred))
print('Best Params:')
print(gbr_grid.best_params_)

In [0]:
# Standardize features for the scale-sensitive models (ridge, lasso, k-NN).
# The scaler learns its mean/variance from the training split only; the test
# split is transformed with those same statistics so both live in the same
# feature space and no test-set information leaks into preprocessing.
sc = StandardScaler()
X_train_norm = sc.fit_transform(X_train)
new_x_test_norm = sc.transform(new_x_test)

In [0]:
# Ridge regression with the regularization strength chosen by 5-fold CV from
# the listed candidates. The constructor keyword is `alphas` (plural);
# `alpha` is not an accepted argument and raises TypeError.
ridge_reg = linear_model.RidgeCV(alphas = [0.001, 0.01, 0.1, 1], cv = 5)

In [0]:
# Fit ridge on the standardized training features and predict the
# standardized, filtered test rows.
ridge_pred = (
    ridge_reg
    .fit(X=X_train_norm, y=y_train)
    .predict(X=new_x_test_norm)
)

In [0]:
# Floor negative predictions at zero, in place. The log-based error computed
# in the metrics cell cannot be evaluated on negative values.
ridge_pred[ridge_pred < 0] = 0

In [0]:
# Report hold-out metrics for the ridge predictions.
print('Ridge Regression', 'Mean Squared Error', sep='\n')
print(mean_squared_error(y_true=new_y_test, y_pred=ridge_pred))
print('Root Mean Squared Log Error')
print(np.sqrt(mean_squared_log_error(y_true=new_y_test, y_pred=ridge_pred)))
print('r^2')
print(r2_score(y_true=new_y_test, y_pred=ridge_pred))

In [0]:
# Lasso regression with the regularization strength chosen by 5-fold CV from
# the listed candidates. The constructor keyword is `alphas` (plural);
# `alpha` is not an accepted argument and raises TypeError.
lasso_reg = linear_model.LassoCV(alphas = [0.001, 0.01, 0.1, 1], cv = 5)

In [0]:
# Fit lasso on the standardized training features and predict the
# standardized, filtered test rows.
lasso_pred = (
    lasso_reg
    .fit(X=X_train_norm, y=y_train)
    .predict(X=new_x_test_norm)
)

In [0]:
# Floor negative lasso predictions at zero, in place. The log-based error
# computed in the metrics cell cannot be evaluated on negative values.
# (The previous loop iterated over and clamped ridge_pred, leaving lasso_pred
# untouched.)
lasso_pred[lasso_pred < 0] = 0

In [0]:
# Report hold-out metrics for the lasso predictions.
print('Lasso Regression', 'Mean Squared Error', sep='\n')
print(mean_squared_error(y_true=new_y_test, y_pred=lasso_pred))
print('Root Mean Squared Log Error')
print(np.sqrt(mean_squared_log_error(y_true=new_y_test, y_pred=lasso_pred)))
print('r^2')
print(r2_score(y_true=new_y_test, y_pred=lasso_pred))

In [0]:
# k-NN regressor and the grid to search: neighbour counts 1 through 24,
# crossed with both weighting schemes.
knnR = neighbors.KNeighborsRegressor()
k_range = [k for k in range(1, 25)]
weight_options = ["uniform", "distance"]
param_grid = {'n_neighbors': k_range, 'weights': weight_options}

In [0]:
# 5-fold cross-validated grid search; candidates are ranked by negated mean
# squared error.
knn_grid = GridSearchCV(
    estimator=knnR,
    param_grid=param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
)

In [0]:
# Run the grid search on the standardized training features, then predict the
# standardized, filtered test rows with the fitted search object.
knn_pred = (
    knn_grid
    .fit(X=X_train_norm, y=y_train)
    .predict(X=new_x_test_norm)
)

In [0]:
# Floor negative predictions at zero, in place. The log-based error computed
# in the metrics cell cannot be evaluated on negative values.
knn_pred[knn_pred < 0] = 0

In [0]:
# Report hold-out metrics for the k-NN predictions, followed by the
# hyper-parameters the grid search selected.
print('k-NN Regression', 'Mean Squared Error', sep='\n')
print(mean_squared_error(y_true=new_y_test, y_pred=knn_pred))
print('Root Mean Squared Log Error')
print(np.sqrt(mean_squared_log_error(y_true=new_y_test, y_pred=knn_pred)))
print('r^2')
print(r2_score(y_true=new_y_test, y_pred=knn_pred))
print('Best Params:')
print(knn_grid.best_params_)