# Capstone Step 4. Pre-Processing and Training Data Development

In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Apply seaborn's default plot styling to subsequent figures.
sns.set()

In [2]:
# Load the CSV written by the exploratory-analysis step; the first CSV column
# is used as the row index.
step3_csv = '../3. Exploratory Data Analysis/step3_output_with_outliers.csv'
df = pd.read_csv(step3_csv, index_col=0)

In [3]:
# Preview the first five rows of the loaded data.
df.head(n=5)

Unnamed: 0,Name,state,summit_elev,vertical_drop,base_elev,trams,fastEight,fastSixes,fastQuads,quad,...,SkiableTerrain_ac,Snow Making_ac,daysOpenLastYear,yearsOpen,averageSnowfall,AdultWeekday,AdultWeekend,projectedDaysOpen,NightSkiing_ac,clusters
0,Alyeska Resort,Alaska,3939,2500,250,1,0.0,0,2,2,...,1610.0,113.0,150.0,60.0,669.0,65.0,85.0,150.0,550.0,1
1,Eaglecrest Ski Area,Alaska,2600,1540,1200,0,0.0,0,0,0,...,640.0,60.0,45.0,44.0,350.0,47.0,53.0,90.0,0.0,1
2,Hilltop Ski Area,Alaska,2090,294,1796,0,0.0,0,0,0,...,30.0,30.0,150.0,36.0,69.0,30.0,34.0,152.0,30.0,1
3,Arizona Snowbowl,Arizona,11500,2300,9200,0,0.0,1,0,2,...,777.0,104.0,122.0,81.0,260.0,89.0,89.0,122.0,0.0,0
4,Sunrise Park Resort,Arizona,11100,1800,9200,0,0.0,0,1,2,...,800.0,80.0,115.0,49.0,250.0,74.0,78.0,104.0,80.0,0


## Create dummy features for categorical variables

In [4]:
# One indicator column per distinct value of 'state'.
dfd = pd.get_dummies(df['state'])
# Swap the original 'state' column for its indicator columns (appended on the right).
# NOTE(review): every state gets its own indicator, so the indicators sum to 1 on
# each row and are collinear with a regression intercept; consider
# pd.get_dummies(..., drop_first=True) — TODO confirm the impact on Model 1.
df = pd.concat([df.drop(columns='state'), dfd], axis='columns')

In [5]:
# Check that 'state' has been replaced by the per-state indicator columns.
df.head(n=5)

Unnamed: 0,Name,summit_elev,vertical_drop,base_elev,trams,fastEight,fastSixes,fastQuads,quad,triple,...,Rhode Island,South Dakota,Tennessee,Utah,Vermont,Virginia,Washington,West Virginia,Wisconsin,Wyoming
0,Alyeska Resort,3939,2500,250,1,0.0,0,2,2,0,...,0,0,0,0,0,0,0,0,0,0
1,Eaglecrest Ski Area,2600,1540,1200,0,0.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Hilltop Ski Area,2090,294,1796,0,0.0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3,Arizona Snowbowl,11500,2300,9200,0,0.0,1,0,2,2,...,0,0,0,0,0,0,0,0,0,0
4,Sunrise Park Resort,11100,1800,9200,0,0.0,0,1,2,3,...,0,0,0,0,0,0,0,0,0,0


## Standardize the magnitude of numeric features

In [6]:
# StandardScaler is provided by sklearn's preprocessing module.
from sklearn import preprocessing

# Explanatory variables X: every column except the resort name, the target
# ('AdultWeekend'), and the two elevation columns 'summit_elev' and 'base_elev'.
X = df.drop(columns=['Name', 'AdultWeekend', 'summit_elev', 'base_elev'])

# Response variable y: the AdultWeekend column.
y = df['AdultWeekend']

# Fit a StandardScaler on X and keep the standardized values as X_scaled.
# NOTE(review): the scaler is fitted on all rows before the train/test split, so
# test-set statistics feed into the training features; consider fitting on the
# training split only.
scaler = preprocessing.StandardScaler()
X_scaled = scaler.fit_transform(X)

## Split into training and testing datasets

In [7]:
# Import the train_test_split function from sklearn.model_selection.
from sklearn.model_selection import train_test_split

# Flatten the response into a 1-D NumPy array. np.ravel accepts both a pandas
# Series and an ndarray, so this cell can be re-run safely, and it avoids
# Series.ravel(), which newer pandas releases deprecate.
y = np.ravel(y)

# Hold out 25% of the rows for testing; random_state=1 makes the shuffle repeatable.
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.25, random_state=1)

# Capstone Step 5. Modeling


## Fit Models with a Training Dataset

**<font color='teal'> Using sklearn, fit the model on your training dataset.</font>**

#### Model 1

In [8]:
# Model 1: ordinary least-squares regression on the scaled training features.
from sklearn import linear_model
from sklearn.metrics import explained_variance_score, mean_absolute_error

lm = linear_model.LinearRegression()
lm.fit(X_train, y_train)
# fit() returns the estimator itself, so `model` and `lm` are the same fitted object.
model = lm

In [9]:
# Predict the response for the held-out test rows with the fitted model
# (`model` is the same object as `lm`).
y_pred = model.predict(X_test)

## Review Model Outcomes — Iterate over additional models as needed

In [10]:
# Score the test-set predictions with mean absolute error and explained
# variance (both imported from sklearn.metrics), rounded to two decimals.
mae = mean_absolute_error(y_test, y_pred)
evs = explained_variance_score(y_test, y_pred)
print(f'The Mean Absolute Error is: {round(mae, 2)!s}')
print(f'The Explained Variance Score is: {round(evs, 2)!s}')

The Mean Absolute Error is: 4.9
The Explained Variance Score is: 0.94


In [11]:
# Intercept of the fitted regression.
intercept = lm.intercept_
print(intercept)

64.10902051518494


In [12]:
# Rank the features (not just the state indicators) by the absolute size of
# their fitted coefficient and show the ten largest.
# NOTE(review): magnitudes around 1e12-1e13 on standardized features suggest
# near-perfect collinearity among the inputs — TODO confirm.
coef_abs = np.abs(lm.coef_)
df_coef = pd.DataFrame({'Coefficient': coef_abs}, index=X.columns)
df_coef.sort_values('Coefficient', ascending=False).head(10)

Unnamed: 0,Coefficient
total_chairs,23340150000000.0
fastQuads,8848302000000.0
surface,8290196000000.0
double,7305628000000.0
triple,6517122000000.0
quad,5281886000000.0
fastSixes,2623084000000.0
New York,2397210000000.0
Michigan,2262315000000.0
trams,2253826000000.0


#### Model 2

In [13]:
# Model 2 feature frame: keep only the first 25 columns of df and drop the rest.
# The labels are passed explicitly as a column slice instead of handing a
# DataFrame to drop() and relying on iteration yielding its column names.
# NOTE(review): position 25 is presumably where the state indicator columns
# begin (the summary table lists 'state' as dropped for this model) — TODO
# confirm against df.columns that no non-state column falls in this slice.
dfns = df.drop(columns=df.columns[25:])

In [14]:
# Model 2 inputs: all remaining columns except the resort name and the target.
X = dfns.drop(columns=['Name', 'AdultWeekend'])
y = dfns['AdultWeekend']
# Standardize the features (fit and transform on the same frame).
scaler = preprocessing.StandardScaler()
X_scaled = scaler.fit_transform(X)

In [15]:
# Flatten the response into a 1-D NumPy array. np.ravel works on both a Series
# and an ndarray (so the cell is re-runnable) and avoids the deprecated
# Series.ravel().
y = np.ravel(y)
# Same 75/25 split and seed as Model 1 so the scores are comparable.
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.25, random_state=1)

In [16]:
# Model 2: a fresh linear regression fitted on the reduced feature set.
lm = linear_model.LinearRegression()
lm.fit(X_train, y_train)
# fit() returns the estimator itself, so `model` and `lm` are the same object.
model = lm

In [17]:
# Predict the response for the held-out test rows (`model` is the same object as `lm`).
y_pred = model.predict(X_test)

In [18]:
# Score Model 2 on the test split, rounded to two decimals.
mae = mean_absolute_error(y_test, y_pred)
evs = explained_variance_score(y_test, y_pred)
print(f'The Mean Absolute Error is: {round(mae, 2)!s}')
print(f'The Explained Variance Score is: {round(evs, 2)!s}')

The Mean Absolute Error is: 5.5
The Explained Variance Score is: 0.92


In [19]:
# Ten features with the largest absolute coefficient in Model 2.
coef_abs = np.abs(lm.coef_)
df_coef = pd.DataFrame({'Coefficient': coef_abs}, index=X.columns)
df_coef.sort_values('Coefficient', ascending=False).head(10)

Unnamed: 0,Coefficient
AdultWeekday,20.236154
summit_elev,12.589954
base_elev,9.768969
vertical_drop,4.353822
averageSnowfall,1.931216
quad,1.55659
triple,1.386754
surface,1.258475
daysOpenLastYear,1.172892
Runs,1.060986


#### Model 3

In [27]:
# Model 3 feature frame: the Model 2 frame without 'summit_elev'.
dfns_ne = dfns.drop(columns='summit_elev')

In [28]:
# Model 3 inputs: all remaining columns except the resort name and the target.
X = dfns_ne.drop(columns=['Name', 'AdultWeekend'])
y = dfns_ne['AdultWeekend']
# Standardize the features (fit and transform on the same frame).
scaler = preprocessing.StandardScaler()
X_scaled = scaler.fit_transform(X)

In [29]:
# Flatten the response into a 1-D NumPy array. np.ravel works on both a Series
# and an ndarray (so the cell is re-runnable) and avoids the deprecated
# Series.ravel().
y = np.ravel(y)
# Same 75/25 split and seed as the earlier models so the scores are comparable.
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.25, random_state=1)

In [30]:
# Model 3: a fresh linear regression fitted on the feature set without 'summit_elev'.
lm = linear_model.LinearRegression()
lm.fit(X_train, y_train)
# fit() returns the estimator itself, so `model` and `lm` are the same object.
model = lm

In [31]:
# Predict the response for the held-out test rows (`model` is the same object as `lm`).
y_pred = model.predict(X_test)

In [32]:
# Score Model 3 on the test split, rounded to two decimals.
mae = mean_absolute_error(y_test, y_pred)
evs = explained_variance_score(y_test, y_pred)
print(f'The Mean Absolute Error is: {round(mae, 2)!s}')
print(f'The Explained Variance Score is: {round(evs, 2)!s}')

The Mean Absolute Error is: 5.33
The Explained Variance Score is: 0.93


In [33]:
# Ten features with the largest absolute coefficient in Model 3.
coef_abs = np.abs(lm.coef_)
df_coef = pd.DataFrame({'Coefficient': coef_abs}, index=X.columns)
df_coef.sort_values('Coefficient', ascending=False).head(10)

Unnamed: 0,Coefficient
AdultWeekday,20.202135
averageSnowfall,1.883148
quad,1.515205
triple,1.369951
vertical_drop,1.337814
daysOpenLastYear,1.212713
surface,1.206895
Runs,1.174835
base_elev,0.823453
fastQuads,0.734223


## Identify the Final Model

| Model | Explained Variance| Mean Absolute Error|Features Dropped|
| --- | --- | --- | --- |
| Model 1. | 0.94| 4.9 |'summit_elev','base_elev'|
| Model 2. | 0.92| 5.5 |'state'|
| Model 3. | 0.93 | 5.33 |'state','summit_elev'|


Model Selection:

**Model 1**