# IMPORTS

## Libraries

In [1]:
import pandas as pd
import numpy as np
import random
import datetime
import pickle

import warnings
warnings.filterwarnings("ignore")

import seaborn as sns
import matplotlib.pyplot as plt
from IPython.core.display import HTML

from sklearn.preprocessing import RobustScaler, MinMaxScaler, LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.linear_model import LinearRegression, Lasso

import xgboost as xgb
from lightgbm import LGBMRegressor

## Helper Functions

In [2]:
def crossValidation(XTraining, kfold, modelName, model='default', verbose=False):
    """Time-ordered cross-validation over `kfold` consecutive 6-week validation windows.

    For each fold ``k`` (from ``kfold`` down to 1) the validation window is the
    6-week span ending ``(k-1)*6`` weeks before the most recent date in
    ``XTraining``; every row strictly before the window start is used for training.

    Parameters
    ----------
    XTraining : pandas.DataFrame
        Must contain a datetime ``'Date'`` column and the target ``'Sales'``; all
        remaining columns are used as features. Errors are computed after
        ``np.expm1``, so ``'Sales'`` is assumed to be log1p-transformed
        (TODO confirm against the training notebook).
    kfold : int
        Number of validation windows.
    modelName : str
        Label written to the result. When ``model == 'default'`` it also selects
        the preset estimator and must be one of: 'Linear Regression', 'Lasso',
        'Random Forest Regressor', 'XGBoost Regressor', 'Lightgbm Regressor'.
    model : estimator or 'default'
        Custom estimator exposing ``fit``/``predict``. It is re-fitted on every
        fold. ``'default'`` builds a fresh preset estimator for every fold.
    verbose : bool
        Print the fold number while iterating.

    Returns
    -------
    pandas.DataFrame
        One row with 'Model Name', 'MAE CV', 'MAPE CV' and 'RMSE CV', each metric
        formatted as ``'mean +/- std'`` across folds.

    Raises
    ------
    KeyError
        If ``model == 'default'`` and ``modelName`` is not a preset name.
    """
    # Factories instead of instances: only the requested estimator is ever built,
    # and each fold starts from an unfitted one.
    modelFactories = {
        'Linear Regression': lambda: LinearRegression(),
        'Lasso': lambda: Lasso(alpha=0.01),
        'Random Forest Regressor': lambda: RandomForestRegressor(n_estimators=100, n_jobs=-1, random_state=42),
        'XGBoost Regressor': lambda: xgb.XGBRegressor(objective='reg:squarederror', n_estimators=500, eta=0.01,
                                                      max_depth=10, subsample=0.7, colsample_bytree=0.9),
        'Lightgbm Regressor': lambda: LGBMRegressor(num_leaves=10, min_data_in_leaf=50, n_jobs=-1,
                                                    random_state=42, n_estimators=500),
    }
    # isinstance guard: comparing an arbitrary estimator object with a string is avoided.
    useDefault = isinstance(model, str) and model == 'default'

    def summarise(values):
        # 'mean +/- std' rounded to two decimals.
        return np.round(np.mean(values), 2).astype(str) + ' +/- ' + np.round(np.std(values), 2).astype(str)

    foldSpan = datetime.timedelta(days=6*7)
    lastDate = XTraining['Date'].max()

    maeList = []
    mapeList = []
    rmseList = []

    for k in reversed(range(1, kfold+1)):
        if verbose:
            print(f'\nKFold Number: {k}')

        # Start and End Date for Validation
        startDateValid = lastDate - k * foldSpan
        endDateValid = lastDate - (k-1) * foldSpan

        # Filtering Dataset
        # NOTE(review): both window bounds are inclusive, so the boundary date of two
        # adjacent folds is scored in both of them. Training never sees it
        # (strict '<'), so there is no leakage; kept as-is to preserve results.
        training = XTraining[XTraining['Date'] < startDateValid]
        validation = XTraining[(XTraining['Date'] >= startDateValid) & (XTraining['Date'] <= endDateValid)]

        # Training and Validation Dataset
        XKFoldTraining = training.drop(['Date', 'Sales'], axis=1)
        yKFoldTraining = training['Sales']

        XKFoldValidation = validation.drop(['Date', 'Sales'], axis=1)
        yKFoldValidation = validation['Sales']

        # Model (the `model` argument itself is never rebound)
        estimator = modelFactories[modelName]() if useDefault else model
        estimator.fit(XKFoldTraining, yKFoldTraining)

        # Prediction
        yhat = estimator.predict(XKFoldValidation)

        # Performance, labelled with the model actually evaluated
        modelResult = mlError(modelName, np.expm1(yKFoldValidation), np.expm1(yhat))

        # Store Performance of each KFold iteration
        maeList.append(modelResult['MAE'].iloc[0])
        mapeList.append(modelResult['MAPE'].iloc[0])
        rmseList.append(modelResult['RMSE'].iloc[0])

    dictResult = {
                    'Model Name': [modelName],
                    'MAE CV': [summarise(maeList)],
                    'MAPE CV': [summarise(mapeList)],
                    'RMSE CV': [summarise(rmseList)]
                }

    return pd.DataFrame(dictResult)


def mean_percentage_error( y, yhat ):
    """Return the mean of the signed relative errors ``(y - yhat) / y``.

    The sign is kept, so a positive result means the predictions are, on
    average, below the actual values.
    """
    signedRelativeError = ( y - yhat ) / y
    return np.mean( signedRelativeError )

def mean_absolute_percentage_error(y, yhat):
    """Return the mean of the absolute relative errors ``|(y - yhat) / y|``."""
    relativeError = (y - yhat) / y
    return np.mean(np.abs(relativeError))


def mlError(modelName, y, yhat):
    """Build a one-row DataFrame with the MAE, MAPE and RMSE of `yhat` against `y`.

    Parameters
    ----------
    modelName : label stored in the 'ModelName' column.
    y : actual values.
    yhat : predicted values.

    Returns
    -------
    pandas.DataFrame
        Single row (index 0) with columns 'ModelName', 'MAE', 'MAPE', 'RMSE'.
    """
    metrics = {
        'ModelName': modelName,
        'MAE': mean_absolute_error(y, yhat),
        'MAPE': mean_absolute_percentage_error(y, yhat),
        # RMSE is the square root of sklearn's mean squared error
        'RMSE': np.sqrt(mean_squared_error(y, yhat)),
    }

    return pd.DataFrame(metrics, index=[0])



def jupyter_settings():
    %matplotlib inline
    %pylab inline
    
    plt.style.use( 'bmh' )
    plt.rcParams['figure.figsize'] = [25, 16]
    plt.rcParams['font.size'] = 24
    
    display( HTML( '<style>.container { width:100% !important; }</style>') )
    pd.options.display.max_columns = None
    pd.options.display.max_rows = None
    pd.set_option( 'display.expand_frame_repr', False )
    
    sns.set()

In [3]:
# Apply the plotting / pandas display configuration defined in jupyter_settings().
jupyter_settings()

Populating the interactive namespace from numpy and matplotlib


## Loading Data

In [6]:
# Feature-engineered dataset; 'Date' is parsed to datetime while loading.
dfDataPreparation = pd.read_csv(
    '../../01-Data/Results/01-FirstRoundCRISP/dfFeatureEngineering.csv',
    parse_dates=['Date'],
    low_memory=False,
)

# Data Preparation

In [9]:
# Work on an independent (deep) copy so the loaded frame stays untouched.
dfRaw1 = dfDataPreparation.copy(deep=True)

## Rescaling

In [10]:
numAttributes = dfRaw1.select_dtypes(include=['int64', 'float64'])

rs = RobustScaler()
mms = MinMaxScaler()

# Directory where the fitted scalers are persisted for the production pipeline.
parameterDir = 'D:/01-DataScience/04-Projetos/00-Git/Rossmann-Store-Sales/02-Notebooks/01-FirstRoundCRISP/parameter'

# (column, scaler) pairs:
#   - CompetitionDistance / CompetionTimeMonth: well-defined outliers -> RobustScaler
#   - PromoTimeWeek / Year: MinMaxScaler
# A scaler instance is shared by two columns on purpose: it is pickled right
# after each fit, so every file captures the parameters of its own column.
scalerPlan = [
    ('CompetitionDistance', rs),
    ('CompetionTimeMonth', rs),
    ('PromoTimeWeek', mms),
    ('Year', mms),
]

# NOTE(review): the scaled values are written to `numAttributes` (the frame
# returned by select_dtypes), not to `dfRaw1` - confirm that this is intended.
for column, scaler in scalerPlan:
    numAttributes[column] = scaler.fit_transform(numAttributes[[column]].values)
    # `with` guarantees the pickle is flushed and the handle closed.
    with open(f'{parameterDir}/{column}Scaler.pkl', 'wb') as scalerFile:
        pickle.dump(scaler, scalerFile)

## Transformation

In [11]:
#State Holiday -> One Hot Encoding
dfRaw1 = pd.get_dummies(dfRaw1, prefix=['StateHoliday'], columns=['StateHoliday'])

#Store Type -> Label Encoding
le = LabelEncoder()
dfRaw1['StoreType'] = le.fit_transform(dfRaw1['StoreType'])
# Persist the fitted encoder; `with` guarantees the pickle is flushed and the handle closed.
# (The file keeps the historical "...Scaler.pkl" name although it stores a LabelEncoder.)
with open('D:/01-DataScience/04-Projetos/00-Git/Rossmann-Store-Sales/02-Notebooks/01-FirstRoundCRISP/parameter/StoreTypeScaler.pkl', 'wb') as encoderFile:
    pickle.dump(le, encoderFile)

#Assortment -> Ordinal Encoding
dictAssortment = {
                    'basic': 1,
                    'extra': 2,
                    'extended': 3
                    }
dfRaw1['Assortment'] = dfRaw1['Assortment'].map(dictAssortment)

# Rossmann Class

In [None]:
# Raw sales records and store metadata
salesRaw = pd.read_csv('../../01-Data/train.csv', low_memory=False)
storeRaw = pd.read_csv('../../01-Data/store.csv', low_memory=False)

# Left join on 'Store': every sales row is kept and enriched with its store attributes.
# NOTE(review): this rebinds `dfRaw1`, discarding the frame prepared in the cells above.
dfRaw1 = pd.merge(salesRaw, storeRaw, how='left', on='Store')

In [18]:
class Rossmann(object):
    """Pre-processing pipeline applied to raw Rossmann data before prediction.

    The steps are meant to be chained:
    ``dataCleaning -> featureEngineering -> dataPreparation -> getPrediction``.
    The scalers and the store-type encoder are loaded from pickles written by the
    data-preparation cells and are only *applied* here, never re-fitted.
    """

    # Directory containing the pickled scalers / encoder.
    PARAMETER_DIR = 'D:/01-DataScience/04-Projetos/00-Git/Rossmann-Store-Sales/02-Notebooks/01-FirstRoundCRISP/parameter'

    def __init__(self, parameterDir=PARAMETER_DIR):
        """Load the fitted transformers.

        Parameters
        ----------
        parameterDir : str
            Directory with the ``*Scaler.pkl`` files. Defaults to the original
            hard-coded location, so ``Rossmann()`` keeps working unchanged.
        """
        self.competitionDistanceScaler = self._loadPickle(f'{parameterDir}/CompetitionDistanceScaler.pkl')
        self.competionTimeMonthScaler = self._loadPickle(f'{parameterDir}/CompetionTimeMonthScaler.pkl')
        self.promoTimeWeekScaler = self._loadPickle(f'{parameterDir}/PromoTimeWeekScaler.pkl')
        self.yearScaler = self._loadPickle(f'{parameterDir}/YearScaler.pkl')
        self.storeTypeScaler = self._loadPickle(f'{parameterDir}/StoreTypeScaler.pkl')

    @staticmethod
    def _loadPickle(path):
        # Read one pickled object and close the file handle deterministically.
        # SECURITY: pickle.load can execute arbitrary code - only load trusted files.
        with open(path, 'rb') as handle:
            return pickle.load(handle)

    @staticmethod
    def _isoWeek(dates):
        # ISO week number as int64. `.dt.weekofyear` was removed in pandas 2.0;
        # fall back to it only on pandas versions without `.dt.isocalendar()`.
        accessor = dates.dt
        if hasattr(accessor, 'isocalendar'):
            return accessor.isocalendar().week.astype('int64')
        return accessor.weekofyear.astype('int64')

    def dataCleaning(self, dfRaw1):
        """Fix dtypes and fill missing values. Mutates and returns `dfRaw1`.

        Missing values are filled as follows:
        - CompetitionDistance -> 200000.0
        - CompetitionOpenSinceMonth / Year -> month / year of the row's Date
        - Promo2SinceWeek / Year -> week / year of the row's Date
        - PromoInterval -> 0

        Also adds 'MonthMap' (month abbreviation of Date) and 'IsPromo'
        (1 when that abbreviation is listed in the row's PromoInterval).
        """
        ## Data Types
        dfRaw1['Date'] = pd.to_datetime(dfRaw1['Date'])
        dates = dfRaw1['Date']

        ## Fillout NA
        # CompetitionDistance: distance in meters to the nearest competitor store
        dfRaw1['CompetitionDistance'] = dfRaw1['CompetitionDistance'].fillna(200000.0)

        # CompetitionOpenSinceMonth / Year: approximate opening date of the nearest competitor
        dfRaw1['CompetitionOpenSinceMonth'] = dfRaw1['CompetitionOpenSinceMonth'].fillna(dates.dt.month)
        dfRaw1['CompetitionOpenSinceYear'] = dfRaw1['CompetitionOpenSinceYear'].fillna(dates.dt.year)

        # Promo2SinceWeek / Year: calendar week / year the store joined Promo2
        dfRaw1['Promo2SinceWeek'] = dfRaw1['Promo2SinceWeek'].fillna(self._isoWeek(dates))
        dfRaw1['Promo2SinceYear'] = dfRaw1['Promo2SinceYear'].fillna(dates.dt.year)

        # PromoInterval: months in which Promo2 restarts,
        # e.g. "Feb,May,Aug,Nov" means each round starts in February, May, August and November
        monthMap = {
                        1: 'Jan', 2: 'Feb', 3: 'Mar', 4: 'Apr', 5: 'May', 6: 'Jun', 7: 'Jul', 8: 'Aug', 9: 'Sep', 10: 'Oct', 11: 'Nov', 12: 'Dec'
                    }

        # Plain assignment instead of chained `inplace=True`, which may act on a temporary copy.
        dfRaw1['PromoInterval'] = dfRaw1['PromoInterval'].fillna(0)
        dfRaw1['MonthMap'] = dates.dt.month.map(monthMap)

        dfRaw1['IsPromo'] = [
            1 if interval != 0 and month in interval.split(',') else 0
            for interval, month in zip(dfRaw1['PromoInterval'], dfRaw1['MonthMap'])
        ]

        # competition / promo2 columns were float because of the NaNs
        for column in ['CompetitionOpenSinceMonth', 'CompetitionOpenSinceYear', 'Promo2SinceWeek', 'Promo2SinceYear']:
            dfRaw1[column] = dfRaw1[column].astype(int)

        return dfRaw1

    def featureEngineering(self, df2):
        """Derive date / competition / promo features, then filter rows and columns.

        New columns are added to `df2` in place; the returned frame is a new,
        filtered object: closed stores (Open == 0) are removed and, when a 'Sales'
        column is present, rows without sales too. 'Customers', 'Open',
        'PromoInterval' and 'MonthMap' are dropped when present, so the method
        also works on inference payloads that carry no 'Sales' / 'Customers'.
        """
        dates = df2['Date']

        df2['Year'] = dates.dt.year
        df2['Month'] = dates.dt.month
        df2['Day'] = dates.dt.day
        df2['WeekOfYear'] = self._isoWeek(dates)
        df2['YearWeek'] = dates.dt.strftime('%Y-%W')

        # Competition since: first day of the competitor's opening month
        df2['CompetionSinse'] = pd.to_datetime(pd.DataFrame({
            'year': df2['CompetitionOpenSinceYear'],
            'month': df2['CompetitionOpenSinceMonth'],
            'day': 1,
        }))
        # whole days of (elapsed time / 30), i.e. approximate months
        df2['CompetionTimeMonth'] = ((dates - df2['CompetionSinse']) / 30).dt.days.astype(int)

        # Promo since: Monday of the "year-week" the store joined Promo2, minus one week
        promoYearWeek = df2['Promo2SinceYear'].astype(str) + '-' + df2['Promo2SinceWeek'].astype(str)
        df2['PromoSince'] = promoYearWeek.apply(
            lambda yearWeek: datetime.datetime.strptime(yearWeek + '-1', '%Y-%W-%w') - datetime.timedelta(days=7)
        )
        # whole days of (elapsed time / 7), i.e. weeks
        df2['PromoTimeWeek'] = ((dates - df2['PromoSince']) / 7).dt.days.astype(int)

        # Assortment (level: a = basic, b = extra, c = extended)
        level = {
            'a' : 'basic', 'b' : 'extra', 'c' : 'extended'
        }
        df2['Assortment'] = df2['Assortment'].map(level)

        # State Holiday (a = public holiday, b = Easter holiday, c = Christmas, anything else = regular day)
        holiday = {
            'a' : 'public holiday', 'b' : 'Easter holiday', 'c' : 'Christmas'
        }
        df2['StateHoliday'] = df2['StateHoliday'].map(holiday).fillna('Regular Day')

        ## Row Filtering
        keep = df2['Open'] != 0
        if 'Sales' in df2.columns:
            keep &= df2['Sales'] > 0

        ## Columns Filtering (non in-place, so no write happens on a slice)
        toDrop = ['Customers', 'Open', 'PromoInterval', 'MonthMap']
        return df2[keep].drop(columns=toDrop, errors='ignore')

    def dataPreparation(self, df3):
        """Rescale, encode and select the model features.

        The transformers loaded in `__init__` are applied with `transform`, so the
        parameters learned on the training data are reused instead of being
        re-fitted on the incoming rows.

        Returns the frame restricted to the selected model columns.
        """
        ### Rescaling
        df3['CompetitionDistance'] = self.competitionDistanceScaler.transform(df3[['CompetitionDistance']].values).ravel()
        df3['CompetionTimeMonth'] = self.competionTimeMonthScaler.transform(df3[['CompetionTimeMonth']].values).ravel()
        df3['PromoTimeWeek'] = self.promoTimeWeekScaler.transform(df3[['PromoTimeWeek']].values).ravel()
        df3['Year'] = self.yearScaler.transform(df3[['Year']].values).ravel()

        ### Encoding
        #State Holiday -> One Hot Encoding
        df3 = pd.get_dummies(df3, prefix=['StateHoliday'], columns=['StateHoliday'])

        #Store Type -> Label Encoding
        df3['StoreType'] = self.storeTypeScaler.transform(df3['StoreType'])

        #Assortment -> Ordinal Encoding
        dictAssortment = {
                            'basic': 1,
                            'extra': 2,
                            'extended': 3
                            }
        df3['Assortment'] = df3['Assortment'].map(dictAssortment)

        ### Nature Transformation: cyclical sin/cos encoding with the period of each feature
        periods = {'Month': 12, 'Day': 30, 'WeekOfYear': 52, 'DayOfWeek': 7}
        for column, period in periods.items():
            angle = df3[column] * (2 * np.pi / period)
            df3[f'{column}Sin'] = np.sin(angle)
            df3[f'{column}Cos'] = np.cos(angle)

        colsSelected = ['Store','Promo','StoreType','Assortment','CompetitionDistance','CompetitionOpenSinceMonth',
                                'CompetitionOpenSinceYear','Promo2','Promo2SinceWeek','Promo2SinceYear','CompetionTimeMonth',
                                'PromoTimeWeek','MonthSin','MonthCos','DaySin','DayCos','WeekOfYearSin','WeekOfYearCos','DayOfWeekSin',
                                'DayOfWeekCos']

        return df3[colsSelected]

    def getPrediction(self, model, originalData, testData):
        """Score `testData` and return the predictions joined to the original rows as JSON.

        Parameters
        ----------
        model : fitted estimator exposing ``predict``.
        originalData : pandas.DataFrame
            Frame the pipeline started from; rows are matched to `testData` by index,
            so rows removed during feature engineering are left out.
        testData : pandas.DataFrame
            Output of `dataPreparation`.

        Returns
        -------
        str
            JSON records with an added 'Prediction' column.
        """
        predictions = model.predict(testData)

        response = originalData.loc[testData.index].copy()
        # NOTE(review): assumes the model was trained on log1p(Sales), hence expm1 - confirm.
        response['Prediction'] = np.expm1(predictions)

        return response.to_json(orient='records', date_format='iso')

# API Handler

In [21]:
from flask import Flask, request, Response
import pandas as pd
import pickle
from rossmann.Rossmann import Rossmann

# Loading Model
# `with` closes the file handle as soon as the model is deserialised.
# SECURITY: pickle.load can execute arbitrary code - only load a trusted model file.
with open('D:/01-DataScience/04-Projetos/00-Git/Rossmann-Store-Sales/02-Notebooks/01-FirstRoundCRISP/model/modelRossmann.pkl', 'rb') as modelFile:
    model = pickle.load(modelFile)

# Initialize API
app = Flask(__name__)

@app.route('/rossmann/predict', methods=['POST'])
def rossmanPredict():
    """POST endpoint: run the Rossmann pipeline on the JSON payload and return its prediction.

    The body is either a single JSON object (one example) or a list of objects
    (several examples). An empty payload is answered with ``{}`` and status 200.
    """
    payload = request.get_json()

    # No data: nothing to predict
    if not payload:
        return Response('{}', status=200, mimetype='application/json')

    if isinstance(payload, dict):
        # single example
        testeRaw = pd.DataFrame(payload, index=[0])
    else:
        # multiple examples: the column order follows the first record
        testeRaw = pd.DataFrame(payload, columns=payload[0].keys())

    pipeline = Rossmann()

    # Data Cleaning -> Feature Engineering -> Data Preparation
    cleaned = pipeline.dataCleaning(testeRaw)
    engineered = pipeline.featureEngineering(cleaned)
    prepared = pipeline.dataPreparation(engineered)

    # Prediction
    # NOTE(review): relies on Rossmann.getPrediction - confirm the imported class provides it.
    return pipeline.getPrediction(model, testeRaw, prepared)

if __name__ == '__main__':
    # Start the Flask server listening on all network interfaces.
    app.run(host='0.0.0.0')