# IMPORTS

## Libraries

In [1]:
import pandas as pd
import numpy as np
import random
import datetime
import pickle
import json
import requests

import warnings
warnings.filterwarnings("ignore")

import seaborn as sns
import matplotlib.pyplot as plt
from IPython.core.display import HTML

from sklearn.preprocessing import RobustScaler, MinMaxScaler, LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.linear_model import LinearRegression, Lasso

import xgboost as xgb
from lightgbm import LGBMRegressor

## Loading Data

In [6]:
# Load the feature-engineered CSV, parsing `Date` as datetime while reading.
dfDataPreparation = pd.read_csv(
    '../../01-Data/Results/01-FirstRoundCRISP/dfFeatureEngineering.csv',
    parse_dates=['Date'],
    low_memory=False,
)

# Data Preparation

In [9]:
# Work on an independent (deep) copy so the loaded frame is not modified by the steps below.
dfRaw1 = dfDataPreparation.copy(deep=True)

## Rescaling

In [10]:
# Folder that receives the fitted scalers. The file names are the ones the
# (commented-out) Rossmann class further down loads from its `parameter/` folder.
PARAMETER_DIR = 'D:/01-DataScience/04-Projetos/00-Git/Rossmann-Store-Sales/02-Notebooks/01-FirstRoundCRISP/parameter/'

numAttributes = dfRaw1.select_dtypes(include=['int64', 'float64'])

rs = RobustScaler()
mms = MinMaxScaler()

# (column, scaler, pickle file name)
# RobustScaler is used where the column has well defined outliers, MinMaxScaler otherwise.
scalingPlan = [
    ('CompetitionDistance', rs, 'CompetitionDistanceScaler.pkl'),
    ('CompetionTimeMonth', rs, 'CompetionTimeMonthScaler.pkl'),
    ('PromoTimeWeek', mms, 'PromoTimeWeekScaler.pkl'),
    ('Year', mms, 'YearScaler.pkl'),
]

for column, scaler, fileName in scalingPlan:
    # Fit on the untouched source frame: dfRaw1 is overwritten below, so fitting on it
    # would pickle scalers learned from already-scaled values if this cell is re-run.
    scaledValues = scaler.fit_transform(dfDataPreparation[[column]].values)

    numAttributes[column] = scaledValues
    # numAttributes is a separate frame returned by select_dtypes; without this
    # write-back the working frame dfRaw1 would keep the unscaled values.
    dfRaw1[column] = scaledValues

    # The same scaler instance is refit on the next column, so it is pickled right
    # after each fit. The context manager guarantees the file handle is closed.
    with open(PARAMETER_DIR + fileName, 'wb') as scalerFile:
        pickle.dump(scaler, scalerFile)

## Transformation

In [11]:
# State Holiday -> One Hot Encoding
dfRaw1 = pd.get_dummies(dfRaw1, prefix=['StateHoliday'], columns=['StateHoliday'])

# Store Type -> Label Encoding
le = LabelEncoder()
dfRaw1['StoreType'] = le.fit_transform(dfRaw1['StoreType'])

# Persist the fitted encoder. The context manager closes the file handle, which the
# previous bare open(...) call never did.
with open('D:/01-DataScience/04-Projetos/00-Git/Rossmann-Store-Sales/02-Notebooks/01-FirstRoundCRISP/parameter/StoreTypeScaler.pkl', 'wb') as encoderFile:
    pickle.dump(le, encoderFile)

# Assortment -> Ordinal Encoding (labels missing from the mapping become NaN)
dictAssortment = {
                    'basic': 1,
                    'extra': 2,
                    'extended': 3
                    }
dfRaw1['Assortment'] = dfRaw1['Assortment'].map(dictAssortment)

# Rossmann Class

In [26]:
# import pickle
# import pandas as pd
# import numpy as np
# import math
# import datetime



# class Rossmann(object):
#     def __init__(self):
#         self.home_path = 'D:/01-DataScience/04-Projetos/00-Git/Rossmann-Store-Sales/02-Notebooks/01-FirstRoundCRISP/'
#         self.competitionDistanceScaler = pickle.load(open(self.home_path + 'parameter/CompetitionDistanceScaler.pkl', 'rb'))
#         self.competionTimeMonthScaler =  pickle.load(open(self.home_path + 'parameter/CompetionTimeMonthScaler.pkl', 'rb'))
#         self.promoTimeWeekScaler =       pickle.load(open(self.home_path + 'parameter/PromoTimeWeekScaler.pkl', 'rb'))
#         self.yearScaler =                pickle.load(open(self.home_path + 'parameter/YearScaler.pkl', 'rb'))
#         self.storeTypeScaler =           pickle.load(open(self.home_path + 'parameter/StoreTypeScaler.pkl', 'rb'))


        
#     def dataCleaning(self, dfRaw1):

#         ## Data Types
#         dfRaw1['Date'] = pd.to_datetime(dfRaw1['Date'])

#         ## Fillout NA
#         maxValueCompetitionDistance = dfRaw1['CompetitionDistance'].max()

#         # CompetitionDistance
#             #distance in meters to the nearest competitor store
#         dfRaw1['CompetitionDistance'] = dfRaw1['CompetitionDistance'].apply(lambda row: 200000.0 if math.isnan(row) else row)


#         # CompetitionOpenSinceMonth
#             #gives the approximate month of the time the nearest competitor was opened
#         dfRaw1['CompetitionOpenSinceMonth'] = dfRaw1.apply(lambda row: row['Date'].month if math.isnan(row['CompetitionOpenSinceMonth']) else row['CompetitionOpenSinceMonth'], axis=1)


#         # CompetitionOpenSinceYear
#             # gives the approximate year of the time the nearest competitor was opened
#         dfRaw1['CompetitionOpenSinceYear'] = dfRaw1.apply(lambda row: row['Date'].year if math.isnan(row['CompetitionOpenSinceYear']) else row['CompetitionOpenSinceYear'], axis=1)


#         # Promo2SinceWeek
#             #describes the calendar week when the store started participating in Promo2
#         dfRaw1['Promo2SinceWeek'] = dfRaw1.apply(lambda row: row['Date'].week if math.isnan(row['Promo2SinceWeek']) else row['Promo2SinceWeek'], axis=1)


#         # Promo2SinceYear
#             #describes the year when the store started participating in Promo2
#         dfRaw1['Promo2SinceYear'] = dfRaw1.apply(lambda row: row['Date'].year if math.isnan(row['Promo2SinceYear']) else row['Promo2SinceYear'], axis=1)


#         # PromoInterval
#             #describes the consecutive intervals Promo2 is started, naming the months the promotion is started anew.\
#             #E.g. "Feb,May,Aug,Nov" means each round starts in February, May, August, November of any given year for that store
#         monthMap = {
#                         1: 'Jan', 2: 'Feb', 3: 'Mar', 4: 'Apr', 5: 'May', 6: 'Jun', 7: 'Jul', 8: 'Aug', 9: 'Sep', 10: 'Oct', 11: 'Nov', 12: 'Dec'
#                     }

#         dfRaw1['PromoInterval'].fillna(0, inplace=True)
#         dfRaw1['MonthMap'] = dfRaw1['Date'].dt.month.map(monthMap)

#         dfRaw1['IsPromo'] = dfRaw1[['PromoInterval', 'MonthMap']].apply(lambda row: 0 if row['PromoInterval'] == 0 else 1 if row['MonthMap'] in row['PromoInterval'].split(',') else 0, axis=1)

#         # competiton
#         dfRaw1['CompetitionOpenSinceMonth'] = dfRaw1['CompetitionOpenSinceMonth'].astype(int)
#         dfRaw1['CompetitionOpenSinceYear'] = dfRaw1['CompetitionOpenSinceYear'].astype(int)

#         # promo2
#         dfRaw1['Promo2SinceWeek'] = dfRaw1['Promo2SinceWeek'].astype(int)
#         dfRaw1['Promo2SinceYear'] = dfRaw1['Promo2SinceYear'].astype(int)
        
#         return dfRaw1

    
    
    
#     def featureEngineering(self, df2):
        
#         #year
#         df2['Year'] = df2['Date'].dt.year

#         #month
#         df2['Month'] = df2['Date'].dt.month

#         #day
#         df2['Day'] = df2['Date'].dt.day

#         #week of year
#         df2['WeekOfYear'] = df2['Date'].dt.weekofyear

#         #year week
#         df2['YearWeek'] = df2['Date'].dt.strftime('%Y-%W')

#         #Competion Sinse
#         df2['CompetionSinse'] = df2.apply(lambda row: datetime.datetime(year=row['CompetitionOpenSinceYear'], month=row['CompetitionOpenSinceMonth'], day=1), axis=1)
#         df2['CompetionTimeMonth'] = ((df2['Date'] - df2['CompetionSinse'])/30).apply(lambda row: row.days).astype(int)

#         #Promo Since
#         df2['PromoSince'] = df2['Promo2SinceYear'].astype(str) + '-' + df2['Promo2SinceWeek'].astype(str)
#         df2['PromoSince'] = df2['PromoSince'].apply(lambda row: datetime.datetime.strptime(row + '-1',  '%Y-%W-%w') - datetime.timedelta(days=7))
#         df2['PromoTimeWeek'] = ((df2['Date'] - df2['PromoSince'])/7).apply(lambda row: row.days).astype(int)

#         #Assortment (level: a = basic, b = extra, c = extended)
#         level = {
#             'a' : 'basic', 'b' : 'extra', 'c' : 'extended'
#         }
#         df2['Assortment'] = df2['Assortment'].map(level)

#         # State Holiday (a = public holiday, b = Easter holiday, c = Christmas, 0 = None)
#         holiday = {
#             'a' : 'public holiday', 'b' : 'Easter holiday', 'c' : 'Christmas'
#         }
#         df2['StateHoliday'] = df2['StateHoliday'].map(holiday)
#         df2['StateHoliday'].fillna('Regular Day', inplace=True)

#         ## Row Fitering
#         df2 = df2[df2['Open'] != 0]

#         ## Columns Filtering
#         toDrop = ['Open', 'PromoInterval', 'MonthMap']
#         df2.drop(toDrop, axis=1, inplace=True)
        
#         return df2

    
#     def dataPreparation(self, df3):
        
#         # NOTE: the scalers/encoder were fitted and pickled at training time; apply them with
#         # transform() so the request data is scaled with the training parameters (fit_transform()
#         # would refit them on the incoming rows and discard what was loaded in __init__).
#         #Competition Distance >> Presence of well defined outliers
#         df3['CompetitionDistance'] = self.competitionDistanceScaler.transform(df3[['CompetitionDistance']].values)

#         #Competition Time Month >> Presence of well defined outliers
#         df3['CompetionTimeMonth'] = self.competionTimeMonthScaler.transform(df3[['CompetionTimeMonth']].values)

#         #Promo Time Week
#         df3['PromoTimeWeek'] = self.promoTimeWeekScaler.transform(df3[['PromoTimeWeek']].values)

#         #Year
#         df3['Year'] = self.yearScaler.transform(df3[['Year']].values)

#         ### Encoding
#         #State Holiday -> One Hot Encoding
#         df3 = pd.get_dummies(df3, prefix=['StateHoliday'], columns=['StateHoliday'])

#         #Store Type -> Label Encoding
#         df3['StoreType'] = self.storeTypeScaler.transform(df3['StoreType'])

#         #Assortment -> Ordinal Encoding
#         dictAssortment = {
#                             'basic': 1,
#                             'extra': 2,
#                             'extended': 3
#                             }
#         df3['Assortment'] = df3['Assortment'].map(dictAssortment)

#         ### Nature Transformation
#         #Month
#         df3['MonthSin'] = df3['Month'].apply(lambda row: np.sin(row * (2 * np.pi/12)))
#         df3['MonthCos'] = df3['Month'].apply(lambda row: np.cos(row * (2 * np.pi/12)))
#         #Day
#         df3['DaySin'] = df3['Day'].apply(lambda row: np.sin(row * (2 * np.pi/30)))
#         df3['DayCos'] = df3['Day'].apply(lambda row: np.cos(row * (2 * np.pi/30)))
#         #Week of Year
#         df3['WeekOfYearSin'] = df3['WeekOfYear'].apply(lambda row: np.sin(row * (2 * np.pi/52)))
#         df3['WeekOfYearCos'] = df3['WeekOfYear'].apply(lambda row: np.cos(row * (2 * np.pi/52)))
#         #Day of Week
#         df3['DayOfWeekSin'] = df3['DayOfWeek'].apply(lambda row: np.sin(row * (2 * np.pi/7)))
#         df3['DayOfWeekCos'] = df3['DayOfWeek'].apply(lambda row: np.cos(row * (2 * np.pi/7)))
        
#         colsSelected = ['Store','Promo','StoreType','Assortment','CompetitionDistance','CompetitionOpenSinceMonth',
#                                 'CompetitionOpenSinceYear','Promo2','Promo2SinceWeek','Promo2SinceYear','CompetionTimeMonth',
#                                 'PromoTimeWeek','MonthSin','MonthCos','DaySin','DayCos','WeekOfYearSin','WeekOfYearCos','DayOfWeekSin',
#                                 'DayOfWeekCos']
        
#         return df3[colsSelected]


#     def getPrediction(self, model, originalData, testData):
#         # Prediction
#         pred = model.predict(testData)

#         # Join pred into original Data
#         originalData['Prediction'] = np.expm1(pred)

#         return originalData.to_json(orient='records', date_format='iso')

# API Handler

In [33]:
# from flask import Flask, request, Response
# import pandas as pd
# import pickle
# #from rossmann.Rossmann import Rossmann

# # Loading Model
# model = pickle.load(open('D:/01-DataScience/04-Projetos/00-Git/Rossmann-Store-Sales/02-Notebooks/01-FirstRoundCRISP/model/modelRossmann.pkl', 'rb' ))

# # Initialize API
# app = Flask(__name__)

# @app.route('/rossmann/predict', methods=['POST'])
# def rossmanPredict():
#     testJSON = request.get_json()
    
#     if testJSON: #there is data
#         if isinstance(testJSON, dict):
#             testeRaw = pd.DataFrame(testJSON, index=[0]) #unique example
#         else:
#             testeRaw = pd.DataFrame(testJSON, columns=testJSON[0].keys()) #multiple examples
    
#         # Instantiate
#         pipeline = Rossmann()
        
#         # Data Cleaning
#         df1 = pipeline.dataCleaning(testeRaw)
#         # Feature Engineering
#         df2 = pipeline.featureEngineering(df1)
#         # Data Preparation
#         df3 = pipeline.dataPreparation(df2)
#         # Prediction
#         dfResponse = pipeline.getPrediction(model, testeRaw, df3)
        
#         return dfResponse
    
#     else:
#         return Response('{}', status=200, mimetype='application/json')

# if __name__ == '__main__':
#     app.run('0.0.0.0')

# API Test

In [15]:
#salesRaw = pd.read_csv('../../01-Data/train.csv', low_memory=False)
# Read the store table and the test table with the same reader options.
storeRaw, testRaw = (
    pd.read_csv(csvPath, low_memory=False)
    for csvPath in ('../../01-Data/store.csv', '../../01-Data/test.csv')
)

In [64]:
# Stores to request predictions for.
storeList = [22, 24, 12]

# Build the request frame in one pass:
#   1. left-join the store attributes onto the test rows,
#   2. keep only the chosen stores,
#   3. keep only rows whose `Open` flag is neither 0 nor missing,
#   4. drop the `Id` column.
dfTest = (
    testRaw
    .merge(storeRaw, how='left', on='Store')
    .loc[lambda frame: frame['Store'].isin(storeList)]
    .loc[lambda frame: frame['Open'].ne(0) & frame['Open'].notnull()]
    .drop(columns='Id')
)

In [65]:
# Convert DataFrame to JSON
# Serialise the selected rows as a JSON array holding one object per row.
testRecords = dfTest.to_dict(orient='records')
data = json.dumps(testRecords)

In [66]:
## API Call
url = 'http://127.0.0.1:5000/rossmann/predict'
header = {'Content-Type': 'application/json'}

# POST the JSON payload held in `data` and report the HTTP status of the reply.
r = requests.post(url, headers=header, data=data)
statusLine = 'Status Code {}'.format(r.status_code)
print(statusLine)

Status Code 200


In [67]:
# Decode the response body once instead of parsing it twice.
responseRecords = r.json()

# Column order follows the keys of the first record, as before. An empty payload
# now yields an empty frame instead of failing on the `[0]` lookup.
dfResponse = pd.DataFrame(
    responseRecords,
    columns=responseRecords[0].keys() if responseRecords else None,
)

In [68]:
# Preview the first rows of the frame built from the API response.
dfResponse.head()

Unnamed: 0,Store,DayOfWeek,Date,Open,Promo,StateHoliday,SchoolHoliday,StoreType,Assortment,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,PromoInterval,MonthMap,IsPromo,Year,Month,Day,WeekOfYear,YearWeek,CompetionSinse,CompetionTimeMonth,PromoSince,PromoTimeWeek,Prediction
0,12,4,2015-09-17T00:00:00.000Z,1.0,1,Regular Day,0,a,extended,1070.0,9,2015,1,13,2010,"Jan,Apr,Jul,Oct",Sep,0,2015,9,17,38,2015-37,2015-09-01T00:00:00.000Z,0,2010-03-22T00:00:00.000Z,286,6458.01709
1,22,4,2015-09-17T00:00:00.000Z,1.0,1,Regular Day,0,a,basic,1040.0,9,2015,1,22,2012,"Jan,Apr,Jul,Oct",Sep,0,2015,9,17,38,2015-37,2015-09-01T00:00:00.000Z,0,2012-05-21T00:00:00.000Z,173,4586.236816
2,24,4,2015-09-17T00:00:00.000Z,1.0,1,Regular Day,0,a,extended,4590.0,3,2000,1,40,2011,"Jan,Apr,Jul,Oct",Sep,0,2015,9,17,38,2015-37,2000-03-01T00:00:00.000Z,189,2011-09-26T00:00:00.000Z,207,8445.188477
3,12,3,2015-09-16T00:00:00.000Z,1.0,1,Regular Day,0,a,extended,1070.0,9,2015,1,13,2010,"Jan,Apr,Jul,Oct",Sep,0,2015,9,16,38,2015-37,2015-09-01T00:00:00.000Z,0,2010-03-22T00:00:00.000Z,286,6519.020508
4,22,3,2015-09-16T00:00:00.000Z,1.0,1,Regular Day,0,a,basic,1040.0,9,2015,1,22,2012,"Jan,Apr,Jul,Oct",Sep,0,2015,9,16,38,2015-37,2015-09-01T00:00:00.000Z,0,2012-05-21T00:00:00.000Z,173,4656.111816


In [69]:
# Sum the predictions per store; `Store` stays a regular column.
dfResponse2 = dfResponse.groupby('Store', as_index=False)['Prediction'].sum()
dfResponse2

Unnamed: 0,Store,Prediction
0,12,246351.264648
1,22,181228.682861
2,24,331959.9375


In [71]:
# Sum the predictions per store, then print one summary line per store.
dfResponse2 = dfResponse.groupby('Store', as_index=False)['Prediction'].sum()

messageTemplate = 'Store Number {} will sell R${:,.2f} in the next 6 weeks'
for store, prediction in zip(dfResponse2['Store'], dfResponse2['Prediction']):
    print(messageTemplate.format(store, prediction))

Store Number 12 will sell R$246,351.26 in the next 6 weeks
Store Number 22 will sell R$181,228.68 in the next 6 weeks
Store Number 24 will sell R$331,959.94 in the next 6 weeks
