In [None]:
# import library

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
import string
import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import GridSearchCV, cross_val_predict, StratifiedShuffleSplit


# 1. Data Exploration

In [None]:
# Load the three raw CSV files of the project: the training rows, the test
# rows and the store information table.

train = pd.read_csv('../input/w21proj1/W21_train.csv')
test = pd.read_csv('../input/w21proj1/W21_test.csv')
store_info = pd.read_csv('../input/w21proj1/W21_store_info.csv')

In [None]:
# Preview the first rows of the training data.
train.head()
#train.tail()

In [None]:
# List the column names of the training data.
train.columns

In [None]:
# Preview the first six rows of the test data.
test.head(6)

In [None]:
# List the column names of the test data.
test.columns

Since ID is not in the training data, delete it from the testing data.
Customers is not in the testing data either, but following the professor's advice we may predict customers first and then predict sales, so keep it for now.

In [None]:
# Remove the 'ID' column from the test data in place.
test.drop(columns=['ID'], inplace=True)

In [None]:
# Preview the first rows of the store information table.
store_info.head()

I found that the store info can be merged into the training and testing sets, since they share a common column, Store.

In [None]:
# Distinct values that occur in the PromoInterval column.
store_info['PromoInterval'].unique()

Since PromoInterval is the same for all kinds of stores, delete it.

In [None]:
# Remove the 'PromoInterval' column from the store table in place.
store_info.drop(columns=['PromoInterval'], inplace=True)

Create new features with date object 

In [None]:
# Parse the 'Date' strings once, append numeric year / month / day columns
# to the training data, then discard the original 'Date' column.
train_dates = pd.to_datetime(train['Date'])
for part in ('year', 'month', 'day'):
    train[part] = getattr(train_dates.dt, part)
del train['Date']

In [None]:
# Parse the 'Date' strings once, append numeric year / month / day columns
# to the test data, then discard the original 'Date' column.
test_dates = pd.to_datetime(test['Date'])
for part in ('year', 'month', 'day'):
    test[part] = getattr(test_dates.dt, part)
del test['Date']

In [None]:
# merge dataset without change sequence.


#for store in store_info[,0]:

In [None]:
# Combine store_info with the training and testing data.
# A single left join on 'Store' keeps every original row in its original
# order and appends the store columns. The previous version joined the result
# back onto the un-merged frame a second time (an inner join on every shared
# column), which added nothing and multiplies any rows that happen to be
# exact duplicates.
train_merge = train.merge(store_info, how='left', on='Store', sort=False)
test_merge = test.merge(store_info, how='left', on='Store', sort=False)

In [None]:
# train_merge.shape
# train_merge.columns
# Preview the first six rows of the merged test data.
test_merge.head(6)

In [None]:
# Number of missing values per column, largest first.
train_merge.isnull().sum().sort_values(ascending = False)

We have 13 columns in the training dataset. Date and StoreType are categorical; the others are numeric.

In [None]:
# Plot on a 10% sample (fixed seed) so the count plots have fewer datapoints
# to draw.

sample_data = train_merge.sample(frac = 0.1, random_state = 0)

# One count plot per column on a 4-wide grid. The number of grid rows grows
# with the column count (never below the original 4) so that a frame with
# more than 16 columns no longer overflows the grid.
n_grid_cols = 4
n_grid_rows = max(4, -(-len(sample_data.columns) // n_grid_cols))  # ceiling division

fig = plt.figure(figsize=(12,16))
for index, col in enumerate(sample_data.columns):
    ax = plt.subplot(n_grid_rows, n_grid_cols, index + 1)
    # Pass the values through the `x` keyword: newer seaborn releases treat
    # the first positional argument as `data`, not as the vector to count.
    sns.countplot(x=sample_data[col].dropna(), ax=ax)
fig.tight_layout(pad=1.0)

Sales, customers and competition distance are continuous numerical data.
Day of week, open, promo and school holiday are discrete numerical data.
The others are categorical.
Also, state holiday is sparse data, with most values equal to 0. Delete it.

In [None]:
# Remove the 'StateHoliday' column from both merged frames in place.
for frame in (train_merge, test_merge):
    frame.drop(columns=['StateHoliday'], inplace=True)

We found that most of them are 0. (about 97%)

To reduce multicollinearity, delete one column from each pair of columns whose correlation is above 0.95.

In [None]:
# Frequency of each distinct Sales value, most common first.
train_merge.Sales.value_counts()

In [None]:
# Annotated correlation heatmap. Restrict to numeric/boolean columns first:
# StoreType is only label-encoded later in the notebook, and pandas >= 2.0
# raises when .corr() meets a non-numeric column instead of silently
# skipping it.
sns.heatmap(train_merge.select_dtypes(include=['number', 'bool']).corr(), annot = True)

Find pairs of columns with correlation above 0.95 and below 0.01.

In [None]:
def corrFilter(x: pd.DataFrame, bound: float):
    """Return the column pairs of ``x`` whose correlation magnitude is >= ``bound``.

    Parameters
    ----------
    x : pd.DataFrame
        Input data; only numeric and boolean columns take part in the
        correlation (other columns are ignored).
    bound : float
        Threshold on the absolute correlation.

    Returns
    -------
    pd.Series
        Signed correlation values indexed by a two-level (column, column)
        MultiIndex, sorted ascending. Each unordered pair appears exactly once
        and the result contains no NaN entries.
    """
    xCorr = x.select_dtypes(include=['number', 'bool']).corr()
    # Strict upper triangle: removes the diagonal (self-correlation) and the
    # mirrored copy of every pair by position. Filtering on `!= 1.0` instead
    # would also discard genuinely perfectly-correlated pairs, and
    # de-duplicating on the value would merge distinct pairs that happen to
    # share the same correlation.
    upper = np.triu(np.ones(xCorr.shape, dtype=bool), k=1)
    strong = (xCorr.abs() >= bound) & upper
    return xCorr.where(strong).unstack().dropna().sort_values()

# Column pairs whose correlation magnitude is at least 0.95.
corrFilter(train_merge, .95)

In [None]:
def corrFilterLow(x: pd.DataFrame, bound: float):
    xCorr = abs(x.corr())
    xFiltered = xCorr[(xCorr <= bound) | (xCorr <= -bound) & (xCorr !=1.000)]
    xFlattened = xFiltered.unstack().sort_values().drop_duplicates()
    return xFlattened

# Column pairs whose absolute correlation is at most 0.01.
corrFilterLow(train_merge, .01)

In [None]:
# Index labels (the column pairs) of the near-zero-correlation entries.
corrFilterLow(train_merge, .01).index

No columns need to be deleted, but sales and customers have a high correlation.

Also, from the heat map, open and promo have a high correlation with sales. Let's examine the relationship. Since open and promo only take the values 0 and 1, check whether sales is 0 when open/promo is 0.

In [None]:
# Frequency table (not a scatter plot): number of rows for every
# (Open, Sales) combination.
train_merge.value_counts(subset=['Open', 'Sales'])
# Hence, when open is 0, sales = 0

In [None]:
# Take the Open == 1 slice of the (Open, Sales) frequency table and
# summarise how many rows share each Sales value.
train_merge.value_counts(subset=['Open', 'Sales'])[1].describe()

It shows when open is 0, sales is 0. The other values are evenly distributed.

In [None]:
# Number of rows for every (Promo, Sales) combination.
train_merge.value_counts(subset=['Promo', 'Sales'])

In [None]:
# Number of rows for each value of Open.
train_merge.value_counts(subset=['Open'])

In [None]:
# Number of rows for each value of Sales.
train_merge.value_counts(subset=['Sales'])

In [None]:
# Number of rows for each value of Promo.
train_merge.value_counts(subset=['Promo'])

About 25% of sales are 0 when promo is 0.

The total number of rows with open = 0 is 3103, and sales is 0 for all 3103 of them. The total number of 0s in sales is 3105, which is close to 3103. So for now, when open is 0, I predict that sales is 0.

In [None]:
# Correlation heatmap restricted to rows where the store is open, with the
# 'Open' column itself removed. Only numeric/boolean columns are correlated:
# StoreType is label-encoded later in the notebook, and pandas >= 2.0 raises
# when .corr() meets a non-numeric column.
sns.heatmap(
    train_merge[train_merge["Open"] != 0]
    .drop('Open', axis =1)
    .select_dtypes(include=['number', 'bool'])
    .corr(),
    annot = True,
)

In [None]:
# Summary statistics of the Sales column.
train_merge.Sales.describe()

In [None]:
# clustering

In [None]:

# Encode StoreType as integer codes. The encoder is fitted on the training
# data only and the same mapping is then applied to the test data; fitting a
# second time on the test set could assign different codes to the same
# category whenever the two sets do not contain exactly the same categories.
# transform() raises ValueError if the test set holds a category that was
# never seen in training.
le = LabelEncoder()
train_merge['StoreType'] = le.fit_transform(train_merge['StoreType'])
test_merge['StoreType'] = le.transform(test_merge['StoreType'])

In [None]:
# Note: a cell only renders its last expression, so this cell shows
# test_merge.head(); the train_merge.head() result is discarded.
train_merge.head()
test_merge.head()

In [None]:
# Preview the first 30 rows of the merged test data.
test_merge.head(30)

In [None]:
# Persist the engineered frames as CSV (without the index) so the prediction
# step can also be done in R.
for frame, csv_path in (
    (test_merge, "../../kaggle/working/test_engineered.csv"),
    (train_merge, "../../kaggle/working/train_engineered.csv"),
):
    frame.to_csv(csv_path, index=False)

# 2. Prediction models

2.1 simple prediction

I want to try several models: KNN regression, Logistic regression, Random Forest Regression, MLP neural network, CNN model. I'm using cross_validation to evaluate each model's performance.

In [None]:
# root mean squared percentage error

def rmspe(y_true, y_pred):
    """Root mean squared percentage error, expressed in percent.

    Observations whose true value is 0 are left out of the score: the
    percentage error is undefined there, and including them turned the whole
    metric into inf/nan (closed-store rows have zero sales).

    Parameters
    ----------
    y_true : array-like
        Ground-truth values.
    y_pred : array-like
        Predicted values, in the same order as ``y_true``.

    Returns
    -------
    float
        ``100 * sqrt(mean(((y_true - y_pred) / y_true) ** 2))`` over the
        observations with non-zero ``y_true``; ``nan`` when there are none.
    """
    # Work on plain arrays so the two inputs are compared position by
    # position, regardless of any pandas index they carry.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    nonzero = y_true != 0
    if not nonzero.any():
        return float('nan')
    pct_error = (y_true[nonzero] - y_pred[nonzero]) / y_true[nonzero]
    return np.sqrt(np.mean(np.square(pct_error))) * 100

In [None]:
# Separate the target ('Sales') from the feature columns.
train_y = train_merge['Sales']
train_X = train_merge.drop(columns=['Sales'])

In [None]:
# change storetype to numeric
# NOTE(review): train_merge['StoreType'] is already label-encoded in an
# earlier cell and train_X is derived from train_merge, so this appears to
# re-encode integer codes and rebinds `le` to an encoder fitted on them --
# confirm whether this step is still needed.

le = LabelEncoder()
train_X['StoreType'] = le.fit_transform(train_X['StoreType'])


In [None]:
# First model to evaluate: a k-nearest-neighbours regressor with default settings.
model = KNeighborsRegressor()

In [None]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import StratifiedKFold

# Manual N-fold cross-validation of `model`, scored with RMSPE on each fold.
# NOTE(review): StratifiedKFold stratifies on the raw target values; for a
# continuous target such as Sales a plain KFold may be more appropriate -- confirm.
N = 5  # number of folds
skf = StratifiedKFold(n_splits=N, random_state=5, shuffle=True)
RMSPE_score_lis = []

for train_index, test_index in skf.split(train_X, train_y):
    # split() yields positional indices, so both X and y are selected with
    # .iloc; label-based `train_y[...]` only worked while the index happened
    # to be the default 0..n-1 range.
    X_train1, X_test1 = train_X.iloc[train_index], train_X.iloc[test_index]
    y_train1, y_test1 = train_y.iloc[train_index], train_y.iloc[test_index]

    model.fit(X_train1, y_train1)
    prediction = model.predict(X_test1)

    # RMSPE score of this fold
    RMSPE_score_lis.append(rmspe(y_test1, prediction))

# Average over the folds that were actually run rather than a hard-coded 5.
print("average RMSPE score:", sum(RMSPE_score_lis) / len(RMSPE_score_lis))

In [None]:
def Stratified_model(model, train_X, train_y, N = 5, random_state=5):
    """Cross-validate ``model`` with ``StratifiedKFold`` and report the mean RMSPE.

    Parameters
    ----------
    model : estimator
        Object with ``fit(X, y)`` and ``predict(X)``; it is re-fitted on the
        training part of every fold.
    train_X : pd.DataFrame
        Feature matrix (rows are selected positionally with ``.iloc``).
    train_y : pd.Series or array-like
        Target values aligned with ``train_X``.
    N : int, default 5
        Number of folds.
    random_state : int, default 5
        Seed for the shuffled fold assignment.

    Returns
    -------
    float
        Mean RMSPE over the ``N`` folds (also printed).

    Notes
    -----
    NOTE(review): StratifiedKFold stratifies on the raw target values; for a
    continuous target a plain KFold may be more appropriate -- confirm.
    """
    skf = StratifiedKFold(n_splits=N, random_state=random_state, shuffle=True)
    RMSPE_score_lis = []

    # split() yields positional indices. Use .iloc on pandas targets so the
    # selection no longer depends on the index being 0..n-1; plain arrays are
    # indexed directly.
    y_by_position = getattr(train_y, 'iloc', train_y)

    for train_index, test_index in skf.split(train_X, train_y):
        X_train1, X_test1 = train_X.iloc[train_index], train_X.iloc[test_index]
        y_train1, y_test1 = y_by_position[train_index], y_by_position[test_index]

        model.fit(X_train1, y_train1)
        prediction = model.predict(X_test1)

        # RMSPE score of this fold
        RMSPE_score_lis.append(rmspe(y_test1, prediction))

    # Average over the folds actually run instead of a hard-coded 5, so the
    # result stays correct when N is changed.
    average_score = sum(RMSPE_score_lis) / len(RMSPE_score_lis)
    print("average RMSPE score:", average_score)
    return average_score

In [None]:
# check if cross-validation works

Stratified_model(model, train_X, train_y)

# Linear Regression
# LogisticRegression is a classifier: fitted on Sales it would treat every
# distinct sales value as its own class. Sales is a continuous target, so the
# linear baseline is an ordinary least-squares regression instead.
# (LogisticRegression stays imported so cells that still reference the name
# keep working.)

from sklearn.linear_model import LinearRegression, LogisticRegression

model = LinearRegression()
Stratified_model(model, train_X, train_y)

# Random Forest

from sklearn.ensemble import RandomForestRegressor

model = RandomForestRegressor(random_state=0)
Stratified_model(model, train_X, train_y)

In [None]:
# MLP neural network
# Build the MLP explicitly: without this the cell re-scored whatever `model`
# was left over from the previous cell (the random forest).

from sklearn.neural_network import MLPRegressor

model = MLPRegressor(random_state=0, max_iter=200)
Stratified_model(model, train_X, train_y)

In [None]:
# CNN

normal method without cross validation

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows (fixed seed) as a validation split.
split_parts = train_test_split(train_X, train_y, test_size=0.2, random_state=0)
x_train, x_test, y_train, y_test = split_parts


In [None]:
def model_func(model, x_train = x_train, y_train = y_train, x_test = x_test, y_test = y_test, random_state=5):
    """Fit ``model`` on the training split and print its RMSPE on the test split.

    The default splits are the module-level ``x_train`` / ``y_train`` /
    ``x_test`` / ``y_test`` objects as they exist when this ``def`` is
    executed (default arguments are bound at definition time).
    ``random_state`` is accepted but not used.
    """
    model.fit(x_train, y_train)
    holdout_score = rmspe(y_test, model.predict(x_test))
    print("RMSPE score: ", holdout_score)

In [None]:
# Score a default k-nearest-neighbours regressor on the hold-out split.
model = KNeighborsRegressor()
model_func(model)

Logistic Regression

In [None]:
# LogisticRegression is a classifier and would treat every distinct Sales
# value as its own class; Sales is a continuous target, so score an ordinary
# least-squares linear regression on the hold-out split instead.
from sklearn.linear_model import LinearRegression

model = LinearRegression()
model_func(model)

Random forest

In [None]:
# Score a random-forest regressor (fixed seed) on the hold-out split.
model = RandomForestRegressor(random_state=0)
model_func(model)

MLP

In [None]:
from sklearn.neural_network import MLPRegressor

# Score a multi-layer perceptron regressor (fixed seed, at most 200 training
# iterations) on the hold-out split.
model = MLPRegressor(random_state=0, max_iter = 200)
model_func(model)

CNN