In [1]:
#imports
import pandas as pd
import numpy as np
import os
import pickle
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import __version__ as sklearn_version
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV, learning_curve
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectKBest, f_regression
import datetime

from library.sb_utils import save_file

In [2]:
# Load data: read the step-3 feature file into a DataFrame.
ski_data = pd.read_csv('../clean_data/ski_data_step3_features.csv')
# Transpose the first ten rows so each resort is shown as a column and the
# many feature columns are listed as rows.
ski_data.head(10).T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
Name,Alyeska Resort,Eaglecrest Ski Area,Hilltop Ski Area,Arizona Snowbowl,Sunrise Park Resort,Yosemite Ski & Snowboard Area,Dodge Ridge,Donner Ski Ranch,Mammoth Mountain Ski Area,Mt. Shasta Ski Park
Region,Alaska,Alaska,Alaska,Arizona,Arizona,Northern California,Sierra Nevada,Sierra Nevada,Sierra Nevada,Sierra Nevada
state,Alaska,Alaska,Alaska,Arizona,Arizona,California,California,California,California,California
summit_elev,3939,2600,2090,11500,11100,7800,8200,8012,11053,6890
vertical_drop,2500,1540,294,2300,1800,600,1600,750,3100,1435
base_elev,250,1200,1796,9200,9200,7200,6600,7031,7953,5500
trams,1,0,0,0,0,0,0,0,3,0
fastSixes,0,0,0,1,0,0,0,0,2,0
fastQuads,2,0,0,0,1,0,0,0,9,0
quad,2,0,0,2,2,0,1,0,1,0


In [3]:
# Big Mountain data: keep only the row(s) whose Name is 'Big Mountain Resort'.
big_mountain = ski_data.loc[ski_data['Name'] == 'Big Mountain Resort']

In [4]:
big_mountain.T

Unnamed: 0,124
Name,Big Mountain Resort
Region,Montana
state,Montana
summit_elev,6817
vertical_drop,2353
base_elev,4464
trams,0
fastSixes,0
fastQuads,3
quad,2


In [5]:
ski_data.shape

(277, 36)

In [6]:
# Remove Big Mountain Resort from the modelling data (it was saved separately
# as big_mountain). NOTE(review): presumably so the model is not fitted on the
# resort whose price is being assessed -- confirm.
ski_data = ski_data.loc[ski_data['Name'] != 'Big Mountain Resort']

In [7]:
ski_data.shape

(276, 36)

In [8]:
# Train/test split
# Hold back part of the data so the model can be scored on rows it was not
# trained on. Row counts a 70/30 partition would give:
0.7 * len(ski_data), 0.3 * len(ski_data)

(193.2, 82.8)

In [9]:
# Features are every column except the target 'AdultWeekend'; 30% of the rows
# go to the test set, and random_state fixes the shuffle so the split repeats.
X_train, X_test, y_train, y_test = train_test_split(
    ski_data.drop(columns='AdultWeekend'),
    ski_data['AdultWeekend'],
    test_size=0.3,
    random_state=47,
)

In [10]:
X_train.shape, X_test.shape

((193, 35), (83, 35))

In [11]:
y_train.shape, y_test.shape

((193,), (83,))

In [12]:
# Save the 'Name', 'state' and 'Region' columns from the train/test data into
# names_train and names_test, then drop those columns from X_train and X_test.
names_list = ['Name', 'state', 'Region']
names_train = X_train[names_list]
names_test = X_test[names_list]
# Rebind to the frames returned by drop() instead of using inplace=True, so the
# objects produced by train_test_split are never mutated behind the reader's
# back; X_train / X_test end up with exactly the same columns as before.
X_train = X_train.drop(columns=names_list)
X_test = X_test.drop(columns=names_list)

In [13]:
X_train.dtypes

summit_elev                             int64
vertical_drop                           int64
base_elev                               int64
trams                                   int64
fastSixes                               int64
fastQuads                               int64
quad                                    int64
triple                                  int64
double                                  int64
surface                                 int64
total_chairs                            int64
Runs                                  float64
TerrainParks                          float64
LongestRun_mi                         float64
SkiableTerrain_ac                     float64
Snow Making_ac                        float64
daysOpenLastYear                      float64
yearsOpen                             float64
averageSnowfall                       float64
projectedDaysOpen                     float64
NightSkiing_ac                        float64
resorts_per_state                 

In [14]:
X_test.dtypes

summit_elev                             int64
vertical_drop                           int64
base_elev                               int64
trams                                   int64
fastSixes                               int64
fastQuads                               int64
quad                                    int64
triple                                  int64
double                                  int64
surface                                 int64
total_chairs                            int64
Runs                                  float64
TerrainParks                          float64
LongestRun_mi                         float64
SkiableTerrain_ac                     float64
Snow Making_ac                        float64
daysOpenLastYear                      float64
yearsOpen                             float64
averageSnowfall                       float64
projectedDaysOpen                     float64
NightSkiing_ac                        float64
resorts_per_state                 

In [15]:
# Baseline before any real model: predict the same value for every resort.
# That value is the mean of the training target, y_train.
train_mean = y_train.mean()
train_mean

63.811088082901556

In [16]:
# Same baseline via scikit-learn: a DummyRegressor with strategy='mean',
# fitted on the training data.
dumb_reg = DummyRegressor(strategy='mean').fit(X_train, y_train)
# Display the constant the regressor learned, for comparison with train_mean.
dumb_reg.constant_

array([[63.81108808]])

In [17]:
#Calculate the R^2 as defined above
def r_squared(y, ypred):
    """R-squared score.

    Calculate the R-squared, or coefficient of determination, of the input:
    1 - SS_res / SS_tot, where SS_res is the sum of squared differences
    between y and ypred, and SS_tot is the sum of squared differences between
    y and the mean of y.

    Both inputs are converted to float NumPy arrays before any arithmetic, so
    lists, tuples, arrays and pandas Series are all accepted, and observed and
    predicted values are always paired by position (never re-aligned by pandas
    index labels).

    Arguments:
    y -- the observed values (array-like of numbers)
    ypred -- the predicted values (array-like of numbers, same length as y,
             or a scalar that broadcasts against y)

    Returns:
    The R-squared as a NumPy float. NaN inputs propagate to the result. If y
    has no variation SS_tot is zero, so the result is not finite.
    """
    # Coerce up front: makes plain sequences usable and strips any pandas
    # index so the subtractions below are strictly element-by-position.
    y = np.asarray(y, dtype=float)
    ypred = np.asarray(ypred, dtype=float)
    ybar = y.mean()
    sum_sq_tot = np.sum((y - ybar)**2) #total sum of squares error
    sum_sq_res = np.sum((y - ypred)**2) #residual sum of squares error
    return 1.0 - sum_sq_res / sum_sq_tot

In [18]:
# Baseline predictions built by hand: one copy of the training mean for each
# training row. Show the first five.
y_tr_pred_ = np.full(len(y_train), train_mean)
y_tr_pred_[:5]

array([63.81108808, 63.81108808, 63.81108808, 63.81108808, 63.81108808])

In [19]:
# Same training-set predictions, this time taken from the fitted
# DummyRegressor; this overwrites the hand-built array from the previous cell.
y_tr_pred_ = dumb_reg.predict(X_train)
y_tr_pred_[:5]

array([63.81108808, 63.81108808, 63.81108808, 63.81108808, 63.81108808])

In [20]:
r_squared(y_train,y_tr_pred_)

0.0

In [21]:
# Baseline on the held-out data: predict the *training* mean for every test
# row, then score those predictions against the test targets.
y_te_pred = np.full(len(y_test), train_mean)
r_squared(y_test, y_te_pred)

-0.0031235200417913944

In [22]:
#Code task 7#
#Calculate the MAE as defined above
def mae(y, ypred):
    """Mean absolute error.

    Calculate the mean of the absolute differences between the observed and
    predicted values.

    Both inputs are converted to float NumPy arrays before any arithmetic, so
    lists, tuples, arrays and pandas Series are all accepted, and observed and
    predicted values are always paired by position (never re-aligned by pandas
    index labels).

    Arguments:
    y -- the observed values (array-like of numbers)
    ypred -- the predicted values (array-like of numbers, same length as y,
             or a scalar that broadcasts against y)

    Returns:
    The mean absolute error as a NumPy float. NaN inputs propagate to the
    result.
    """
    # Coerce up front: makes plain sequences usable and strips any pandas
    # index so the subtraction below is strictly element-by-position.
    y = np.asarray(y, dtype=float)
    ypred = np.asarray(ypred, dtype=float)
    abs_error = np.abs(y - ypred)
    # Return directly; a local named `mae` would shadow this function.
    return np.mean(abs_error)

In [23]:
mae(y_train, y_tr_pred_)

17.923463717146785

In [24]:
mae(y_test, y_te_pred)

19.136142081278486