# SVM

In [72]:
# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# import dataset
dataset = pd.read_csv('Team1/cleaned_and_normalized_training_set.csv')
dataset.head()

Unnamed: 0.1,Unnamed: 0,Store,DayOfWeek,day,month,year,Sales,Customers,Open,Promo,...,storeType_a,storeType_b,storeType_c,storeType_d,Assortment_a,Assortment_b,Assortment_c,public_holiday,easter,christmas
0,0,1,5,31,7,2015,5263,0.099708,1,1,...,0,0,1,0,1,0,0,0,0,0
1,1,2,5,31,7,2015,6064,0.112468,1,1,...,1,0,0,0,1,0,0,0,0,0
2,2,3,5,31,7,2015,8314,0.148195,1,1,...,1,0,0,0,1,0,0,0,0,0
3,3,4,5,31,7,2015,13995,0.2716,1,1,...,0,0,1,0,0,0,1,0,0,0
4,4,5,5,31,7,2015,4822,0.100437,1,1,...,1,0,0,0,1,0,0,0,0,0


In [73]:
# Labels are the values we want to predict
labels = dataset['Sales']

# Remove the labels from the features
# axis 1 refers to the columns
features= dataset.drop('Sales', axis = 1)

# Saving feature names for later use
feature_list = list(features.columns)

# Convert to numpy array
#features = np.array(features)

In [74]:
feature_list

['Unnamed: 0',
 'Store',
 'DayOfWeek',
 'day',
 'month',
 'year',
 'Customers',
 'Open',
 'Promo',
 'SchoolHoliday',
 'CompetitionDistance',
 'Competition',
 'Promo2',
 'storeType_a',
 'storeType_b',
 'storeType_c',
 'storeType_d',
 'Assortment_a',
 'Assortment_b',
 'Assortment_c',
 'public_holiday',
 'easter',
 'christmas']

In [75]:
from sklearn.model_selection import TimeSeriesSplit

tss = TimeSeriesSplit(n_splits = 3)#

for train_index, test_index in tss.split(features):
    X_train, X_test = features.iloc[train_index, :], features.iloc[test_index,:]
    y_train, y_test = labels.iloc[train_index], labels.iloc[test_index]

In [66]:
print('Training Features Shape:', X_train.shape)
print('Training Labels Shape:', y_train.shape)
print('Testing Features Shape:', X_test.shape)
print('Testing Labels Shape:', y_test.shape)

Training Features Shape: (538978, 31)
Training Labels Shape: (538978,)
Testing Features Shape: (179659, 31)
Testing Labels Shape: (179659,)


In [77]:
# Import the model we are using
from sklearn.ensemble import RandomForestRegressor

# Instantiate model 
rf = RandomForestRegressor(n_estimators= 10, random_state=42)

# Train the model on training data
rf.fit(X_train, y_train);

In [32]:
rf_new = RandomForestRegressor(n_estimators = 100, criterion = 'mse', max_depth = None, 
                               min_samples_split = 2, min_samples_leaf = 1)

In [33]:
# Use the forest's predict method on the test data
predictions = rf.predict(X_test)

# Calculate the absolute errors
errors = abs(predictions - y_test)

# Print out the mean absolute error (mae)
print('Mean Absolute Error:', round(np.mean(errors), 2), 'degrees.')

Mean Absolute Error: 457.33 degrees.


In [34]:
# Calculate mean absolute percentage error (MAPE)
mape = 100 * (errors / y_test)

# Calculate and display accuracy
accuracy = 100 - np.mean(mape)
print('Accuracy:', round(accuracy, 2), '%.')

Accuracy: 92.75 %.


ModuleNotFoundError: No module named 'pydot'

In [78]:
# Get numerical feature importances
importances = list(rf.feature_importances_)

# List of tuples with variable and importance
feature_importances = [(feature, round(importance, 2)) for feature, importance in zip(feature_list, importances)]

# Sort the feature importances by most important first
feature_importances = sorted(feature_importances, key = lambda x: x[1], reverse = True)

# Print out the feature and importances 
[print('Variable: {:20} Importance: {}'.format(*pair)) for pair in feature_importances];

Variable: Customers            Importance: 0.74
Variable: CompetitionDistance  Importance: 0.06
Variable: storeType_d          Importance: 0.05
Variable: Store                Importance: 0.04
Variable: Promo                Importance: 0.03
Variable: Assortment_b         Importance: 0.02
Variable: Unnamed: 0           Importance: 0.01
Variable: DayOfWeek            Importance: 0.01
Variable: day                  Importance: 0.01
Variable: month                Importance: 0.01
Variable: Promo2               Importance: 0.01
Variable: storeType_b          Importance: 0.01
Variable: year                 Importance: 0.0
Variable: Open                 Importance: 0.0
Variable: SchoolHoliday        Importance: 0.0
Variable: Competition          Importance: 0.0
Variable: storeType_a          Importance: 0.0
Variable: storeType_c          Importance: 0.0
Variable: Assortment_a         Importance: 0.0
Variable: Assortment_c         Importance: 0.0
Variable: public_holiday       Importance: 0.0
V

In [80]:
# New random forest with only the two most important variables
rf_most_important = RandomForestRegressor(n_estimators=80, random_state=42)

# Extract the two most important features
#important_indices = [feature_list.index('Customers'), feature_list.index('CompetitionDistance'), feature_list.index('storeType_d'), feature_list.index('Store'), feature_list.index('Promo'), feature_list.index('Assortment_b')]
train_important = X_train[['Customers', 'CompetitionDistance', 'storeType_d', 'Store', 'Promo', 'Assortment_b', 'DayOfWeek', 'day', 'month', 'Promo2', 'storeType_b']]
test_important = X_test[['Customers', 'CompetitionDistance', 'storeType_d', 'Store', 'Promo', 'Assortment_b', 'DayOfWeek', 'day', 'month', 'Promo2', 'storeType_b']]

# Train the random forest
rf_most_important.fit(train_important, y_train)

# Make predictions and determine the error
predictions = rf_most_important.predict(test_important)

In [58]:
predictions.shape
len(predictions)
len(y_test)

179659

In [61]:
errors = abs(predictions - y_test)

# Display the performance metrics
print('Mean Absolute Error:', round(np.mean(errors), 2), 'Euros.')

mape = np.sqrt(sum(100 * (errors / y_test)**2)/len(y_test))
accuracy = 100 - mape

print('Accuracy:', round(accuracy, 2), '%.')

Mean Absolute Error: 426.95 Euros.
Accuracy: 99.09 %.


In [81]:
preds = np.array(predictions)
actuals = np.array(y_test)
def metric(preds, actuals):
    preds = preds.reshape(-1)
    actuals = actuals.reshape(-1)
    assert preds.shape == actuals.shape
    return 100 * np.linalg.norm((actuals - preds) / actuals) / np.sqrt(preds.shape[0])
metric(preds, actuals)

8.771089381411905

In [38]:
# Gridsearch
from sklearn.model_selection import GridSearchCV

param_grid = [
    {'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]},
    {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]},
  ]

forest_reg = RandomForestRegressor()

grid_search = GridSearchCV(forest_reg, param_grid, cv=5,
                           scoring='neg_mean_squared_error')

grid_search.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=RandomForestRegressor(bootstrap=True, criterion='mse',
                                             max_depth=None,
                                             max_features='auto',
                                             max_leaf_nodes=None,
                                             min_impurity_decrease=0.0,
                                             min_impurity_split=None,
                                             min_samples_leaf=1,
                                             min_samples_split=2,
                                             min_weight_fraction_leaf=0.0,
                                             n_estimators='warn', n_jobs=None,
                                             oob_score=False, random_state=None,
                                             verbose=0, warm_start=False),
             iid='warn', n_jobs=None,
             param_grid=[{'max_features': [2, 4, 6, 8],


In [39]:
grid_search.best_params_

{'max_features': 8, 'n_estimators': 30}