# Stockland Project 

Visitation Predition based on weather data.

In [1]:
import pandas as pd 
import numpy as np 
import seaborn as sns
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

## Import data set 
### 1, Stockland Visitation dataset: 


In [2]:
df_1 = pd.read_csv("data/Stockland - Visitation Data.csv", encoding=None )
df_1.head()
df_1.count()

Asset               8190
Date                8190
Count visitation    7050
dtype: int64

### 2, Weather dataset (collected from BOM.gov.au)

In [3]:
df_2 = pd.read_csv("data/weather.csv", encoding=None)
df_2.head()
df_2.count()

Asset              8190
Date               8190
Min temperature    8068
Max temperature    8126
Solar exposure     8186
Rainfall           7886
Raining period     5597
dtype: int64

In [4]:
df_full = pd.merge(df_1, df_2, on = ['Asset','Date'], how = "inner")
df_full.head()


Unnamed: 0,Asset,Date,Count visitation,Min temperature,Max temperature,Solar exposure,Rainfall,Raining period
0,Baldivis,1/01/2021,11878.0,19.9,30.0,30.7,0.0,
1,Baldivis,2/01/2021,7962.0,19.5,32.6,30.2,0.0,
2,Baldivis,3/01/2021,12918.0,18.0,32.0,24.3,0.0,
3,Baldivis,4/01/2021,12796.0,17.3,32.7,30.9,0.0,
4,Baldivis,5/01/2021,13321.0,18.2,34.2,30.8,0.0,


In [5]:
df_full["Solar exposure"].dtypes

dtype('float64')

In [6]:
df_full.count()

Asset               8190
Date                8190
Count visitation    7050
Min temperature     8068
Max temperature     8126
Solar exposure      8186
Rainfall            7886
Raining period      5597
dtype: int64

## Cleaning data
### Filling missing values

( detailed explanation) 

In [7]:
# check na and missing values
df_full.isna().sum()

Asset                  0
Date                   0
Count visitation    1140
Min temperature      122
Max temperature       64
Solar exposure         4
Rainfall             304
Raining period      2593
dtype: int64

#### Impure missing data for rainfall adn rain period: 
For rainfall, if there is a missing data, we assume that the rainfall is 0 (no rain). 
By looking at the raining period columns in the dataset collected from BOM, 

In [8]:
df_full['Rainfall ']=df_full['Rainfall '].fillna(0)

In [9]:
# we use backwards fill for raining period 
df_full['Raining period']= df_full['Raining period'].bfill()


In [10]:
for i in range(len(df_full)):
    if df_full['Rainfall '].loc[i]==0:
        df_full['Raining period'].loc[i]=0 

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


In [11]:
## use forward fill to impute missing values of min temperature, max temperature and solar exposure
df_full['Max temperature']=df_full['Max temperature'].ffill()
df_full['Min temperature']=df_full['Min temperature'].ffill()
df_full['Solar exposure']=df_full['Solar exposure'].ffill()

### Create dummies variable for Assets and month 

In [12]:
df_full = pd.get_dummies(df_full,columns=['Asset'])

In [13]:
## Extract month variable
from datetime import datetime 
for i in range(len(df_full)):   
    df_full['Date'].loc[i]= datetime.strptime(df_full['Date'].loc[i],'%d/%m/%Y')


df_full['Date'].loc[1].month
df_full['Month']= ""
for i in range(len(df_full)):
    df_full['Month'].loc[i]=df_full['Date'].loc[i].month

In [14]:
## convert into categorical month

df_full['Month']= df_full['Month'].astype(str)

In [15]:
df_full = pd.get_dummies(df_full,columns=['Month'])

In [16]:
df_full.isna().sum()

Date                       0
Count visitation        1140
Min temperature            0
Max temperature            0
Solar exposure             0
Rainfall                   0
Raining period             0
Asset_Baldivis             0
Asset_Balgowlah            0
Asset_Birtinya             0
Asset_Bull Creek           0
Asset_Burleigh             0
Asset_Gladstone            0
Asset_Glendale             0
Asset_Green Hills          0
Asset_Harrisdale           0
Asset_Hervey Bay           0
Asset_Merrylands           0
Asset_Riverton             0
Asset_Rockhampton          0
Asset_Wendouree            0
Asset_Wetherill Park       0
Month_1                    0
Month_10                   0
Month_11                   0
Month_12                   0
Month_2                    0
Month_3                    0
Month_4                    0
Month_5                    0
Month_6                    0
Month_7                    0
Month_8                    0
Month_9                    0
dtype: int64

In [17]:
df_clean = df_full[df_full['Count visitation'].notna() ]

In [18]:
df_clean.count()

Date                    7050
Count visitation        7050
Min temperature         7050
Max temperature         7050
Solar exposure          7050
Rainfall                7050
Raining period          7050
Asset_Baldivis          7050
Asset_Balgowlah         7050
Asset_Birtinya          7050
Asset_Bull Creek        7050
Asset_Burleigh          7050
Asset_Gladstone         7050
Asset_Glendale          7050
Asset_Green Hills       7050
Asset_Harrisdale        7050
Asset_Hervey Bay        7050
Asset_Merrylands        7050
Asset_Riverton          7050
Asset_Rockhampton       7050
Asset_Wendouree         7050
Asset_Wetherill Park    7050
Month_1                 7050
Month_10                7050
Month_11                7050
Month_12                7050
Month_2                 7050
Month_3                 7050
Month_4                 7050
Month_5                 7050
Month_6                 7050
Month_7                 7050
Month_8                 7050
Month_9                 7050
dtype: int64

In [19]:
df_clean.columns

Index(['Date', 'Count visitation', 'Min temperature', 'Max temperature',
       'Solar exposure', 'Rainfall ', 'Raining period', 'Asset_Baldivis',
       'Asset_Balgowlah', 'Asset_Birtinya', 'Asset_Bull Creek',
       'Asset_Burleigh', 'Asset_Gladstone', 'Asset_Glendale',
       'Asset_Green Hills', 'Asset_Harrisdale', 'Asset_Hervey Bay',
       'Asset_Merrylands', 'Asset_Riverton', 'Asset_Rockhampton',
       'Asset_Wendouree', 'Asset_Wetherill Park', 'Month_1', 'Month_10',
       'Month_11', 'Month_12', 'Month_2', 'Month_3', 'Month_4', 'Month_5',
       'Month_6', 'Month_7', 'Month_8', 'Month_9'],
      dtype='object')

In [20]:
from sklearn.utils import shuffle

df_clean =shuffle(df_clean)

## Fiiting models 
### Linear regression model 

In [21]:
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler


y_train = df_clean['Count visitation']
X_train = df_clean.drop(columns = ['Count visitation', 'Date'])

#sc = StandardScaler()

#sc.fit(X_train)

# print(dir(sc))
# print(sc.mean_, sc.scale_)

print('means:', X_train.mean(axis=0), X_train_scaled.mean(axis=0))
print('sigmas', X_train.std(axis=0), X_train.std(axis=0))

NameError: name 'X_train_scaled' is not defined

In [None]:
from sklearn.linear_model import LinearRegression

lr = LinearRegression()
lr.fit(X_train, y_train)
scores_lr = cross_val_score(estimator=lr, X=X_train, y=y_train, cv=10, n_jobs=1)

# print(scores_v2)

print(f'CV accuracy scores\n {scores_lr.reshape(-1,1)}')
print(f'CV accuracy: {np.mean(scores_lr):.3f} +/- {np.std(scores_lr):.3f}')

CV accuracy scores
 [[-5.42272894e+19]
 [ 3.47430929e-01]
 [-4.27449463e+20]
 [-1.75062121e-01]
 [-2.31926676e+20]
 [ 8.40371199e-01]
 [-6.19294216e+20]
 [ 5.50454821e-01]
 [-5.64572802e+20]
 [-5.88766818e-01]]
CV accuracy: -189747044566345121792.000 +/- 241178769772818759680.000


## Decision tree

In [None]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

tree = DecisionTreeRegressor(max_depth=30)

tree.fit(X_train, y_train)

y_train_pred = tree.predict(X_train)


print(f'MSE train: {mean_squared_error(y_train, y_train_pred):.3f}')
print(f'R^2 train: {r2_score(y_train, y_train_pred):.3f}')



MSE train: 130427.133
R^2 train: 0.998


In [None]:
scores_lr = cross_val_score(estimator=tree, X=X_train, y=y_train, cv=10, n_jobs=1)

# print(scores_v2)

print(f'CV accuracy scores\n {scores_lr.reshape(-1,1)}')
print(f'CV accuracy: {np.mean(scores_lr):.3f} +/- {np.std(scores_lr):.3f}')

CV accuracy scores
 [[0.64605473]
 [0.59057213]
 [0.64914497]
 [0.67950281]
 [0.64571156]
 [0.57371892]
 [0.7077282 ]
 [0.56660276]
 [0.61669283]
 [0.63819033]]
CV accuracy: 0.631 +/- 0.043


## Random Forrest regressor

In [28]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

forest = RandomForestRegressor()

forest.fit(X_train, y_train)

y_train_pred = forest.predict(X_train)


print(f'MSE train: {mean_squared_error(y_train, y_train_pred):.3f}')
print(f'R^2 train: {r2_score(y_train, y_train_pred):.3f}')

MSE train: 1522586.243
R^2 train: 0.971


In [29]:
scores_lr = cross_val_score(estimator=forest, X=X_train, y=y_train, cv=10, n_jobs=1)

# print(scores_v2)

print(f'CV accuracy scores\n {scores_lr.reshape(-1,1)}')
print(f'CV accuracy: {np.mean(scores_lr):.3f} +/- {np.std(scores_lr):.3f}')

CV accuracy scores
 [[0.81558455]
 [0.77405033]
 [0.77795415]
 [0.78625463]
 [0.7257761 ]
 [0.80183222]
 [0.77949239]
 [0.7982009 ]
 [0.83047601]
 [0.8095366 ]]
CV accuracy: 0.790 +/- 0.027


## Hyper parameter tuning for selected model

In [30]:
from sklearn.model_selection import RandomizedSearchCV
# Number of trees in random forest
n_estimators = [int(x) for x in range(50,300,50)]
# Number of features to consider at every split
max_features = [int(x) for x in range(4,22)]
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
              'bootstrap': bootstrap}
print(random_grid)

# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf = RandomForestRegressor()
# Random search of parameters, using 10 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 10, cv = 10, verbose=2, random_state=1, n_jobs = -1)
# Fit the random search model
rf_random.fit(X_train, y_train)
# Ouput the best params
rf_random.best_params_

{'n_estimators': [50, 100, 150, 200, 250], 'max_features': [4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21], 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None], 'bootstrap': [True, False]}
Fitting 10 folds for each of 10 candidates, totalling 100 fits


{'n_estimators': 250, 'max_features': 11, 'max_depth': 20, 'bootstrap': True}

In [31]:
# final = RandomForestRegressor(n_estimators= 200, max_features= 7, max_depth= 30, bootstrap= False)
# final.fit(X_train, y_train)

final = RandomForestRegressor(n_estimators= 250, max_features= 11, max_depth= 20, bootstrap= True)
final.fit(X_train, y_train)

y_train_pred = final.predict(X_train)


print(f'MSE train: {mean_squared_error(y_train, y_train_pred):.3f}')
print(f'R^2 train: {r2_score(y_train, y_train_pred):.3f}')

MSE train: 2672874.501
R^2 train: 0.950


In [32]:
scores_lr = cross_val_score(estimator=final, X=X_train, y=y_train, cv=10, n_jobs=1)

# print(scores_v2)

print(f'CV accuracy scores\n {scores_lr.reshape(-1,1)}')
print(f'CV accuracy: {np.mean(scores_lr):.3f} +/- {np.std(scores_lr):.3f}')

CV accuracy scores
 [[0.8259003 ]
 [0.78306467]
 [0.79693074]
 [0.80520329]
 [0.75252035]
 [0.81294595]
 [0.79097298]
 [0.80060965]
 [0.84379298]
 [0.82186072]]
CV accuracy: 0.803 +/- 0.024
