<a href="https://colab.research.google.com/github/Charliebond125/CE889_Group_Project/blob/main/Preprocessing_Rossman_Sale_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [8]:
# importing required libraries

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

%matplotlib inline

import seaborn as sns
import warnings

from math import sqrt

from tqdm import tqdm

np.random.seed(42)  # for reproducibility

sns.set(style="whitegrid", color_codes=True)
sns.set(font_scale=1)

pd.set_option('display.max_columns', 60)

%matplotlib inline
warnings.filterwarnings('ignore')

In [9]:
from sklearn.model_selection import train_test_split

In [10]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [11]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

In [12]:
df_train = pd.read_csv('/content/drive/MyDrive/train.csv')
df_test = pd.read_csv('/content/drive/MyDrive/test.csv')
df_store = pd.read_csv('/content/drive/MyDrive/store.csv')

In [13]:
df_train = df_train.merge(df_store, on=['Store'], how = 'inner')
df_test = df_test.merge(df_store, on=['Store'], how = 'inner')

In [14]:
df_train.head()

Unnamed: 0,Store,DayOfWeek,Date,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday,StoreType,Assortment,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,PromoInterval
0,1,5,2015-07-31,5263,555,1,1,0,1,c,a,1270.0,9.0,2008.0,0,,,
1,1,4,2015-07-30,5020,546,1,1,0,1,c,a,1270.0,9.0,2008.0,0,,,
2,1,3,2015-07-29,4782,523,1,1,0,1,c,a,1270.0,9.0,2008.0,0,,,
3,1,2,2015-07-28,5011,560,1,1,0,1,c,a,1270.0,9.0,2008.0,0,,,
4,1,1,2015-07-27,6102,612,1,1,0,1,c,a,1270.0,9.0,2008.0,0,,,


Next thing to do, is to deal with the dates. For the ease of use, this will be converted into three seperate columns according to day, week, and year.

The season will also be given another row as another explicit feature.

The goal will also be to create a pipeline to pipe in the data and normalize it using several functions.

In [None]:
target = ["Sales"]
    
numeric_columns = ["Customers","Open","Promo","StateHoliday","SchoolHoliday"]
categorical_columns = ["DayOfWeek","Quarter","Month","Year",
                        "Store","Season"]

In [30]:
!pip install pandasql
from pandasql import sqldf

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pandasql
  Downloading pandasql-0.7.3.tar.gz (26 kB)
Building wheels for collected packages: pandasql
  Building wheel for pandasql (setup.py) ... [?25l[?25hdone
  Created wheel for pandasql: filename=pandasql-0.7.3-py3-none-any.whl size=26787 sha256=dd2aa4c74b0aef2674aa806b83a8f2f6218d44496041efa7cc494d917b7e09c8
  Stored in directory: /root/.cache/pip/wheels/ed/8f/46/a383923333728744f01ba24adbd8e364f2cb9470a8b8e5b9ff
Successfully built pandasql
Installing collected packages: pandasql
Successfully installed pandasql-0.7.3


In [31]:
def concat_data():
    df_train = pd.read_csv('/content/drive/MyDrive/train.csv')
    df_test = pd.read_csv('/content/drive/MyDrive/test.csv')
    df_extra = pd.read_csv('/content/drive/MyDrive/store.csv')
    df_test['Sales'] = -1
    df_full = pd.concat([df_train, df_test]).reset_index(drop=True)

    #Merge extra information about stores
    df_full = df_full.merge(df_extra, left_on=['Store'], right_on=['Store'], how='left')
    
    df_full['Year'] = pd.DatetimeIndex(df_full['Date']).year
    df_full['Month'] = pd.DatetimeIndex(df_full['Date']).month
    df_full['Day'] = pd.DatetimeIndex(df_full['Date']).day
    df_full['WeekOfYear'] = pd.DatetimeIndex(df_full['Date']).weekofyear
    
    # Calculate competition open in months
    df_full['CompetitionOpen'] = 12 * (df_full.Year - df_full.CompetitionOpenSinceYear) + \
        (df_full.Month - df_full.CompetitionOpenSinceMonth)

    # Calculate promo open time in months
    df_full['PromoOpen'] = 12 * (df_full.Year - df_full.Promo2SinceYear) + \
        (df_full.WeekOfYear - df_full.Promo2SinceWeek) / 4.0
    df_full['PromoOpen'] = df_full.PromoOpen.apply(lambda x: x if x > 0 else 0)
    df_full.loc[df_full.Promo2SinceYear == 0, 'PromoOpen'] = 0

    # Transform month interval in a boolean column 
    month2str = {1:'Jan', 2:'Feb', 3:'Mar', 4:'Apr', 5:'May', 6:'Jun',
                 7:'Jul', 8:'Aug', 9:'Sept', 10:'Oct', 11:'Nov', 12:'Dec'}
    df_full['monthStr'] = df_full.Month.map(month2str)
    df_full.loc[df_full.PromoInterval == 0, 'PromoInterval'] = ''
    df_full['IsPromoMonth'] = 0
    for interval in df_full.PromoInterval.unique():
        interval = str(interval)
        if interval != '':
            for month in interval.split(','):
                df_full.loc[(df_full.monthStr == month) & (df_full.PromoInterval == interval), 'IsPromoMonth'] = 1


    return df_full

df_full = concat_data()

In [32]:
def extract_test_data(df_full):
    df_train = df_full.loc[df_full['Sales'] != -1]
    df_test = df_full.loc[df_full['Sales'] == -1]

    return df_train, df_test

df_train, df_test = extract_test_data(df_full)

In [33]:
# Function to calculate missing values by column (By DSA)
def missing_values_table(df):
    # Total missing values
    mis_val = df.isnull().sum()

    # Percentage of missing values
    mis_val_percent = 100 * df.isnull().sum() / len(df)

    # Make a table with the results
    mis_val_table = pd.concat([mis_val, mis_val_percent], axis=1)

    # Rename the columns
    mis_val_table_ren_columns = mis_val_table.rename(
    columns = {0 : 'Missing Values', 1 : '% of Total Values'})

    # Sort the table by percentage of missing descending
    mis_val_table_ren_columns = mis_val_table_ren_columns[
        mis_val_table_ren_columns.iloc[:,1] != 0].sort_values(
    '% of Total Values', ascending=False).round(1)

    # Print some summary information
    print ("Your selected dataframe has " + str(df.shape[1]) + " columns.\n"      
        "There are " + str(mis_val_table_ren_columns.shape[0]) +
          " columns that have missing values.")

    # Return the dataframe with missing information
    return mis_val_table_ren_columns

missing_values_table(df_full)

Your selected dataframe has 27 columns.
There are 10 columns that have missing values.


Unnamed: 0,Missing Values,% of Total Values
Id,1017209,96.1
Promo2SinceWeek,525263,49.6
Promo2SinceYear,525263,49.6
PromoInterval,525263,49.6
CompetitionOpenSinceMonth,338564,32.0
CompetitionOpenSinceYear,338564,32.0
CompetitionOpen,338564,32.0
Customers,41088,3.9
CompetitionDistance,2738,0.3
Open,11,0.0


In [34]:
def clean_data(use_text_columns = True):
    '''
    Function that clean data and create a new features to enrich the model
    '''
    cols_num = ["Sales", "DayOfWeek", "Open", "Promo", "SchoolHoliday", "CompetitionDistance",
                "CompetitionOpenSinceMonth", "CompetitionOpenSinceYear", "Promo2",
                "Promo2SinceWeek", "Promo2SinceYear", "Wapp", "Avg_Customers", "Year", "Month", "Day",
                "CompetitionOpen", "PromoOpen", "IsPromoMonth", "Store"]

    cols_text = ["StateHoliday", "StoreType", "Assortment"]
    df_train = pd.read_csv('/content/drive/MyDrive/train.csv')
    
    df_extra = pd.read_csv('/content/drive/MyDrive/store.csv')
        
    len_train_data = len(df_train)

    df_test = pd.read_csv('/content/drive/MyDrive/test.csv')

    # Setting null values of column Open in test dataset
    df_test.loc[df_test['DayOfWeek'] != 7, 'Open'] = 1
    df_test.loc[df_test['DayOfWeek'] == 7, 'Open'] = 0

    avg_customer = sqldf(
      """
      SELECT
      Store,
      DayOfWeek,
      sum(case when Customers is not null then Sales/Customers else 0 end) as Wapp,
      round(avg(Customers)) Avg_Customers
      from df_train
      group by Store,DayOfWeek
      """
    )
    
    df_test = sqldf(
      """
      SELECT
      t.*,
      ac.Wapp,
      ac.Avg_Customers
      from df_test t
      left join avg_customer ac on t.Store = ac.Store and t.DayOfWeek = ac.DayOfWeek
      """
    )
    
    df_train = sqldf(
      """
      SELECT
      t.*,
      ac.Wapp,
      ac.Avg_Customers
      from df_train t
      left join avg_customer ac on t.Store = ac.Store and t.DayOfWeek = ac.DayOfWeek
      """
    )

    # Merge train and test dataset
    all_data = pd.concat([df_train, df_test], ignore_index=True)

    df_extra = pd.read_csv('/content/drive/MyDrive/store.csv')
    df_full = pd.concat([df_train, df_test]).reset_index(drop=True)

    # Merge extra information about stores
    all_data = df_full.merge(df_extra, left_on=['Store'], right_on=['Store'], how='left')

    # Separate date in Year, Month and Day
    all_data.loc[all_data['StateHoliday'] == 0, 'StateHoliday'] = 'd'
    all_data['Year'] = pd.DatetimeIndex(all_data['Date']).year
    all_data['Month'] = pd.DatetimeIndex(all_data['Date']).month
    all_data['Day'] = pd.DatetimeIndex(all_data['Date']).day
    all_data['WeekOfYear'] = pd.DatetimeIndex(all_data['Date']).weekofyear

    # Calculate competition open in months
    all_data['CompetitionOpen'] = 12 * (all_data.Year - all_data.CompetitionOpenSinceYear) + \
        (all_data.Month - all_data.CompetitionOpenSinceMonth)

    # Calculate promo open time in months
    all_data['PromoOpen'] = 12 * (all_data.Year - all_data.Promo2SinceYear) + \
        (all_data.WeekOfYear - all_data.Promo2SinceWeek) / 4.0
    all_data['PromoOpen'] = all_data.PromoOpen.apply(lambda x: x if x > 0 else 0)
    all_data.loc[all_data.Promo2SinceYear == 0, 'PromoOpen'] = 0
    
    # Transform month interval in a boolean column 
    month2str = {1:'Jan', 2:'Feb', 3:'Mar', 4:'Apr', 5:'May', 6:'Jun',
                 7:'Jul', 8:'Aug', 9:'Sept', 10:'Oct', 11:'Nov', 12:'Dec'}
    all_data['monthStr'] = all_data.Month.map(month2str)
    all_data.loc[all_data.PromoInterval == 0, 'PromoInterval'] = ''
    all_data['IsPromoMonth'] = 0
    for interval in all_data.PromoInterval.unique():
        interval = str(interval)
        if interval != '':
            for month in interval.split(','):
                all_data.loc[(all_data.monthStr == month) & (all_data.PromoInterval == interval), 'IsPromoMonth'] = 1

    data_numeric = all_data[cols_num]
    
    # Fill NAN values
    # Only column CompetitionDistance is fill NaN with a median value
    data_numeric['CompetitionDistance'].fillna(data_numeric['CompetitionDistance'].median(), inplace = True)

    # Other values is fill with zero
    data_numeric.fillna(0, inplace = True)

    if (use_text_columns):
        data_text = all_data[cols_text]
        data_text = pd.get_dummies(data_text, dummy_na=False)

        complete_data = pd.concat([data_numeric, data_text], axis = 1)

        df_train = complete_data.iloc[:len_train_data,:]
        df_test = complete_data.iloc[len_train_data:,:]
    else:
        df_train = data_numeric.iloc[:len_train_data,:]
        df_test = data_numeric.iloc[len_train_data:,:]

    return df_train, df_test

In [35]:
def load_train_data(scaler_x, scaler_y):
    '''
    Transform train data set and separate a test dataset to validate the model in the end of training and normalize data
    '''
    X_train = train.drop(["Sales"], axis=1) # Features
    y_train = np.array(train["Sales"]).reshape((len(X_train), 1)) # Targets
    X_train = scaler_x.fit_transform(X_train)
    y_train = scaler_y.fit_transform(y_train)

    X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=0.20, random_state=42)

    return (X_train, y_train), (X_test, y_test)

In [36]:
def load_test_data():
    '''
    Remove column of predictions and normalize data of submission test data set.
    '''
    X_test = test.drop(["Sales"], axis=1) # Features
    X_test = StandardScaler().fit_transform(X_test)
    X_val = test.drop(["sales"])
    return X_test

In [37]:
def calculate_outlier(df, column): ## function for calculating outliers
    Q3 = df[column].quantile(0.75)
    Q1 = df[column].quantile(0.25)
    IQR = Q3 - Q1
    lower = Q1 - 1.5 * IQR
    upper = Q3 + 1.5 * IQR
    percent_outliers = round(((df[df[column] > upper].shape[0]) + (df[df[column] < lower].shape[0])) / df.shape[0] * 100, 2)
    
    return lower, upper, percent_outliers

In [38]:
train = df_train
test = df_test

In [51]:
from sklearn.preprocessing import StandardScaler

scaler_x = StandardScaler()
scaler_y = StandardScaler()

X_train = train.drop(["Sales"], axis=1) # Features
y_train = np.array(train["Sales"]).reshape((len(X_train), 1)) # Targets
X_train = scaler_x.fit_transform(X_train)
y_train = scaler_y.fit_transform(y_train)

X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=0.20, random_state=42)

In [40]:
col_sales = 'Sales'
lower_sales, upper_sales, percent_outliers_sales = calculate_outlier(train, col_sales)

print("lower band = " + str(lower_sales))
print("upper band = " + str(upper_sales))
print("percentage of sales that are outliers = " + str(percent_outliers_sales) + "%")
train[train[col_sales] > upper_sales]
train[train[col_sales] < lower_sales]

train.loc[train[col_sales] > upper_sales, 'Sales'] = 14650
train.loc[train[col_sales] < lower_sales, 'Sales'] = -2366

print("Removing identified outliers for sales..." )

print("Completed: See table for confirmation. ")

train[train['Sales'] > 14650]
train[train['Sales'] < -2366]

lower band = -2466.5
upper band = 14049.5
percentage of sales that are outliers = 2.62%
Removing identified outliers for sales...
Completed: See table for confirmation. 


Unnamed: 0,Store,DayOfWeek,Date,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday,Id,StoreType,Assortment,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,PromoInterval,Year,Month,Day,WeekOfYear,CompetitionOpen,PromoOpen,monthStr,IsPromoMonth


In [None]:
train.head()

Unnamed: 0,Store,DayOfWeek,Date,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday,Id,StoreType,Assortment,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,PromoInterval,Year,Month,Day,WeekOfYear,CompetitionOpen,PromoOpen,monthStr,IsPromoMonth
0,1,5,2015-07-31,5263,555.0,1.0,1,0,1,,c,a,1270.0,9.0,2008.0,0,,,,2015,7,31,31,82.0,0.0,Jul,0
1,2,5,2015-07-31,6064,625.0,1.0,1,0,1,,a,a,570.0,11.0,2007.0,1,13.0,2010.0,"Jan,Apr,Jul,Oct",2015,7,31,31,92.0,64.5,Jul,1
2,3,5,2015-07-31,8314,821.0,1.0,1,0,1,,a,a,14130.0,12.0,2006.0,1,14.0,2011.0,"Jan,Apr,Jul,Oct",2015,7,31,31,103.0,52.25,Jul,1
3,4,5,2015-07-31,13995,1485.0,1.0,1,0,1,,c,c,620.0,9.0,2009.0,0,,,,2015,7,31,31,70.0,0.0,Jul,0
4,5,5,2015-07-31,4822,559.0,1.0,1,0,1,,a,a,29910.0,4.0,2015.0,0,,,,2015,7,31,31,3.0,0.0,Jul,0


In [41]:
col_customers = 'Customers'
lower_customers, upper_customers, percent_outliers_customers = calculate_outlier(train, col_customers)


print("higher band = " + str(upper_customers))

print("percentage of customers that are outliers = " + str(percent_outliers_customers) + "%")

train[train[col_customers] > upper_customers]
train.loc[train[col_customers] > upper_customers, 'Customers'] = 1485


print("Removing identified outliers for customers..." )
print("See table for confirmation. ")

train[train['Customers'] > 1485]

higher band = 1485.0
percentage of customers that are outliers = 3.75%
Removing identified outliers for customers...
See table for confirmation. 


Unnamed: 0,Store,DayOfWeek,Date,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday,Id,StoreType,Assortment,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,PromoInterval,Year,Month,Day,WeekOfYear,CompetitionOpen,PromoOpen,monthStr,IsPromoMonth


In [42]:
col_sales = 'Sales'
lower_sales, upper_sales, percent_outliers_sales = calculate_outlier(test, col_sales)

print("lower band = " + str(lower_sales))
print("upper band = " + str(upper_sales))
print("percentage of sales that are outliers = " + str(percent_outliers_sales) + "%")
test[test[col_sales] > upper_sales]
test[test[col_sales] < lower_sales]

test.loc[test[col_sales] > upper_sales, 'Sales'] = 14650
test.loc[test[col_sales] < lower_sales, 'Sales'] = -2366

print("Removing identified outliers for sales..." )

print("Completed: See table for confirmation. ")

test[test['Sales'] > 14650]
test[test['Sales'] < -2366]

lower band = -1.0
upper band = -1.0
percentage of sales that are outliers = 0.0%
Removing identified outliers for sales...
Completed: See table for confirmation. 


Unnamed: 0,Store,DayOfWeek,Date,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday,Id,StoreType,Assortment,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,PromoInterval,Year,Month,Day,WeekOfYear,CompetitionOpen,PromoOpen,monthStr,IsPromoMonth


In [43]:
col_customers = 'Customers'
lower_customers, upper_customers, percent_outliers_customers = calculate_outlier(test, col_customers)


print("higher band = " + str(upper_customers))

print("percentage of customers that are outliers = " + str(percent_outliers_customers) + "%")

test[test[col_customers] > upper_customers]
test.loc[test[col_customers] > upper_customers, 'Customers'] = 1485


print("Removing identified outliers for customers..." )
print("See table for confirmation. ")

test[test['Customers'] > 1485]

higher band = nan
percentage of customers that are outliers = 0.0%
Removing identified outliers for customers...
See table for confirmation. 


Unnamed: 0,Store,DayOfWeek,Date,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday,Id,StoreType,Assortment,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,PromoInterval,Year,Month,Day,WeekOfYear,CompetitionOpen,PromoOpen,monthStr,IsPromoMonth


In [None]:
test.head()

Unnamed: 0,Store,DayOfWeek,Date,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday,Id,StoreType,Assortment,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,PromoInterval,Year,Month,Day,WeekOfYear,CompetitionOpen,PromoOpen,monthStr,IsPromoMonth
1017209,1,4,2015-09-17,-1,,1.0,1,0,0,1.0,c,a,1270.0,9.0,2008.0,0,,,,2015,9,17,38,84.0,0.0,Sept,0
1017210,3,4,2015-09-17,-1,,1.0,1,0,0,2.0,a,a,14130.0,12.0,2006.0,1,14.0,2011.0,"Jan,Apr,Jul,Oct",2015,9,17,38,105.0,54.0,Sept,0
1017211,7,4,2015-09-17,-1,,1.0,1,0,0,3.0,a,c,24000.0,4.0,2013.0,0,,,,2015,9,17,38,29.0,0.0,Sept,0
1017212,8,4,2015-09-17,-1,,1.0,1,0,0,4.0,a,a,7520.0,10.0,2014.0,0,,,,2015,9,17,38,11.0,0.0,Sept,0
1017213,9,4,2015-09-17,-1,,1.0,1,0,0,5.0,a,c,2030.0,8.0,2000.0,0,,,,2015,9,17,38,181.0,0.0,Sept,0


In [None]:
train.isnull().sum()

Store                              0
DayOfWeek                          0
Date                               0
Sales                              0
Customers                          0
Open                               0
Promo                              0
StateHoliday                       0
SchoolHoliday                      0
Id                           1017209
StoreType                          0
Assortment                         0
CompetitionDistance             2642
CompetitionOpenSinceMonth     323348
CompetitionOpenSinceYear      323348
Promo2                             0
Promo2SinceWeek               508031
Promo2SinceYear               508031
PromoInterval                 508031
Year                               0
Month                              0
Day                                0
WeekOfYear                         0
CompetitionOpen               323348
PromoOpen                          0
monthStr                           0
IsPromoMonth                       0
d

In [None]:
test.isnull().sum()

Store                            0
DayOfWeek                        0
Date                             0
Sales                            0
Customers                    41088
Open                            11
Promo                            0
StateHoliday                     0
SchoolHoliday                    0
Id                               0
StoreType                        0
Assortment                       0
CompetitionDistance             96
CompetitionOpenSinceMonth    15216
CompetitionOpenSinceYear     15216
Promo2                           0
Promo2SinceWeek              17232
Promo2SinceYear              17232
PromoInterval                17232
Year                             0
Month                            0
Day                              0
WeekOfYear                       0
CompetitionOpen              15216
PromoOpen                        0
monthStr                         0
IsPromoMonth                     0
dtype: int64

In [44]:
def to_category(df):
    cols = df.select_dtypes(include='object').columns
    for col in cols:
        ratio = len(df[col].value_counts()) / len(df)
        if ratio < 0.05:
            df[col] = df[col].astype('category')
    return df

def one_hot(df):
  return pd.get_dummies(df)

def load_train_data(scaler_x, scaler_y):
    '''
    Transform train data set and separate a test dataset to validate the model in the end of training and normalize data
    '''
    X_train = train.drop(["Sales"], axis=1) # Features
    y_train = np.array(train["Sales"]).reshape((len(X_train), 1)) # Targets
    X_train = scaler_x.fit_transform(X_train)
    y_train = scaler_y.fit_transform(y_train)

    X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=0.20, random_state=42)

    return (X_train, y_train), (X_test, y_test)
def load_test_data():
    '''
    Remove column of predictions and normalize data of submission test data set.
    '''
    X_test = test.drop(["Sales"], axis=1) # Features
    X_test = StandardScaler().fit_transform(X_test)
    X_val = test.drop(["sales"])
    return X_test

def replace_nan(df):
  return df.fillna(df.mode())

In [27]:
def load_test_data():
    '''
    Remove column of predictions and normalize data of submission test data set.
    '''
    X_test = test.drop(["Sales"], axis=1) # Features
    X_test = StandardScaler().fit_transform(X_test)

    return X_test

In [45]:
train, test = clean_data(use_text_columns=True)

In [46]:
scaler = StandardScaler()

scaler.fit_transform(train)

array([[-0.13268286,  0.50148416,  0.45239852, ...,  0.94481508,
        -0.09066814, -0.9294682 ],
       [ 0.07537318,  0.50148416,  0.45239852, ...,  0.94481508,
        -0.09066814, -0.9294682 ],
       [ 0.65980026,  0.50148416,  0.45239852, ...,  0.94481508,
        -0.09066814, -0.9294682 ],
       ...,
       [-1.49972273, -1.00047591, -2.21044047, ..., -1.05840817,
        -0.09066814,  1.07588404],
       [-1.49972273, -1.00047591, -2.21044047, ..., -1.05840817,
        -0.09066814,  1.07588404],
       [-1.49972273, -1.00047591, -2.21044047, ..., -1.05840817,
        -0.09066814,  1.07588404]])

In [47]:
scaler.fit_transform(test)

array([[ 0.        ,  0.01033678,  0.41319694, ...,  1.01175128,
        -0.10308122, -0.99069747],
       [ 0.        ,  0.01033678,  0.41319694, ...,  1.01175128,
        -0.10308122, -0.99069747],
       [ 0.        ,  0.01033678,  0.41319694, ..., -0.9883852 ,
        -0.10308122,  1.00938988],
       ...,
       [ 0.        ,  1.00266767,  0.41319694, ..., -0.9883852 ,
        -0.10308122,  1.00938988],
       [ 0.        ,  1.00266767,  0.41319694, ..., -0.9883852 ,
        -0.10308122,  1.00938988],
       [ 0.        ,  1.00266767,  0.41319694, ..., -0.9883852 ,
        -0.10308122,  1.00938988]])

In [48]:
X_train = train.drop(["Sales"], axis=1) # Features
y_train = np.array(train["Sales"]).reshape((len(X_train), 1)) # Targets

In [49]:
# processed_train_data = (train.
# pipe(date_time).
# pipe(to_category).
# pipe(one_hot))


train = (train.pipe(to_category).pipe(one_hot).pipe(replace_nan))

In [50]:
test = (test.pipe(to_category).pipe(one_hot).pipe(replace_nan))

In [None]:
train.to_csv('train_scaled.csv')

In [None]:
test.to_csv('test_scaled.csv')

In [None]:
train.shape

(1017209, 31)

In [None]:
from sklearn.preprocessing import RobustScaler

scaler_robust = RobustScaler()

In [None]:
robust_train = scaler_robust.fit_transform(train)
robust_test = scaler_robust.fit_transform(test)

In [None]:
test.shape

(41088, 31)

In [None]:
from sklearn.preprocessing import StandardScaler

scaler_x = StandardScaler()
scaler_y = StandardScaler()

(X_train, y_train), (X_test, y_test) = load_train_data(scaler_x, scaler_y)
(X_train, y_train), (X_val, y_val) = load_train_data(scaler_x, scaler_y)

In [None]:
from sklearn.preprocessing import StandardScaler
scaler_x = StandardScaler()
scaler_y = StandardScaler()

In [None]:
X_test = test.drop(["Sales"], axis=1) # Features
X_test = StandardScaler().fit_transform(X_test)

In [None]:
X_train.shape

(813767, 30)

In [None]:
test_data = load_test_data()

In [None]:
mean_sales = y_train.mean()
print("Average Sales :", mean_sales)

Average Sales : 0.00022326985706113255


In [None]:
df_1 = pd.read_csv('https://raw.githubusercontent.com/Charliebond125/CE889_Group_Project/main/train_out1.csv')
df_2 = pd.read_csv('https://raw.githubusercontent.com/Charliebond125/CE889_Group_Project/main/train_out2.csv')
train = pd.DataFrame(np.concatenate([df_1.values, df_2.values], axis=0), columns=df_1.columns)

In [None]:
train.shape

(1017209, 31)

In [None]:
train.head()

Unnamed: 0,Sales,DayOfWeek,Open,Promo,SchoolHoliday,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,Wapp,Avg_Customers,Year,Month,Day,CompetitionOpen,PromoOpen,IsPromoMonth,Store,StateHoliday_0,StateHoliday_a,StateHoliday_b,StateHoliday_c,StoreType_a,StoreType_b,StoreType_c,StoreType_d,Assortment_a,Assortment_b,Assortment_c
0,5263.0,5,1.0,1,1,1270.0,9.0,2008.0,0,0.0,0.0,1012.0,537.0,2015,7,31,82.0,0.0,0,1,1,0,0,0,0,0,1,0,1,0,0
1,6064.0,5,1.0,1,1,570.0,11.0,2007.0,1,13.0,2010.0,1013.0,537.0,2015,7,31,92.0,64.5,1,2,1,0,0,0,1,0,0,0,1,0,0
2,8314.0,5,1.0,1,1,14130.0,12.0,2006.0,1,14.0,2011.0,1098.0,747.0,2015,7,31,103.0,52.25,1,3,1,0,0,0,1,0,0,0,1,0,0
3,13995.0,5,1.0,1,1,620.0,9.0,2009.0,0,0.0,0.0,870.0,1245.0,2015,7,31,70.0,0.0,0,4,1,0,0,0,0,0,1,0,0,0,1
4,4822.0,5,1.0,1,1,29910.0,4.0,2015.0,0,0.0,0.0,1022.0,540.0,2015,7,31,3.0,0.0,0,5,1,0,0,0,1,0,0,0,1,0,0


In [None]:
train.shape

(1017208, 31)

In [None]:
test.shape

(41088, 32)

In [52]:
X_train.shape

(813767, 30)

In [None]:
train.head()

In [None]:
train = train.drop(['Assortment_a'], axis=1)

In [None]:
train = train.drop(['Assortment_b'], axis=1)

In [None]:
train = train.drop(['Assortment_c'], axis=1)

In [None]:
test = test.drop(['Assortment_a'], axis=1)

In [None]:
test = test.drop(['Assortment_b'], axis=1)

In [None]:
test = test.drop(['Assortment_c'], axis=1)

In [None]:
X_val.shape

(162754, 31)

In [None]:
print("Train size : {}".format(len(train)))
print("Test size : {}".format(len(test)))

Train size : 1017209
Test size : 41088


In [54]:
from sklearn.metrics import mean_squared_error

def show_info(model, X, y, log, weights = None):
    '''
    Show metrics about the evaluation model and plots about loss, rmse and rmspe
    '''
    if (log != None):
        # summarize history for loss
        plt.figure(figsize=(14,10))
        plt.plot(log.history['loss'])
        plt.plot(log.history['val_loss'])
        plt.title('Model Loss')
        plt.ylabel('loss')
        plt.xlabel('epoch')
        plt.legend(['train', 'test'], loc='upper left')
        plt.show()
        print('\n')
        
        # summarize history for rmse
        plt.figure(figsize=(14,10))
        plt.plot(log.history['rmse'])
        plt.plot(log.history['val_rmse'])
        plt.title('Model RMSE')
        plt.ylabel('rmse')
        plt.xlabel('epoch')
        plt.legend(['train', 'test'], loc='upper left')
        plt.show()
        print('\n')
        
        # summarize history for rmspe
        plt.figure(figsize=(14,10))
        plt.plot(log.history['rmspe'])
        plt.plot(log.history['val_rmspe'])
        plt.title('Model RMSPE')
        plt.ylabel('rmspe')
        plt.xlabel('epoch')
        plt.legend(['train', 'test'], loc='upper left')
        plt.show()

    if (weights != None):
        model.load_weights(weights)

    predictions = model.predict(X, verbose=1)

    mse = mean_squared_error(y, predictions)
    rmse = sqrt(mse)
    #rmspe = rmspe_val(y, predictions)

    print('MSE: %.3f' % mse)
    print('RMSE: %.3f' % rmse)
    #print('RMSPE: %.3f' % rmspe)

In [55]:
def rmspe_val(y_true, y_pred):
    '''
    RMSPE calculus to validate evaluation metric about the model
    '''
    return np.sqrt(np.mean(np.square((y_true - y_pred) / y_true), axis=0))[0]

In [56]:
def rmspe(y_true, y_pred):
    '''
    RMSPE calculus to use during training phase
    '''
    return K.sqrt(K.mean(K.square((y_true - y_pred) / y_true), axis=-1))

In [57]:
def rmse(y_true, y_pred):
  from keras import backend as K
  '''
  RMSE calculus to use during training phase
  '''
  return K.sqrt(K.mean(K.square(y_pred - y_true)))

In [58]:
from keras import backend as K


def rmse(y_true, y_pred):
   return K.abs(K.sqrt(K.mean(K.square(y_pred - y_true), axis=-1)))

In [None]:
# def rmspe_val(y_true, y_pred):
#     '''
#     RMSPE calculus to validate evaluation metric about the model
#     '''
#     return np.sqrt(np.mean(np.square((y_true - y_pred) / y_true), axis=0))[0]

In [66]:
from tensorflow.python.ops.init_ops import RandomNormal
from tensorflow import keras
from keras.models import Sequential
from keras import layers
from keras.layers import SimpleRNN, Flatten, LSTM, Dropout, Dense
from keras.optimizers import SGD, Adam, RMSprop

from keras.layers import LeakyReLU


def create_model():
  ''' Create NN '''
  initializer = RandomNormal(mean=0.0, stddev=0.05, seed=None)
  model = Sequential()

  # Initialize first layer
  model.add(LSTM(units=50, activation='tanh', return_sequences=True, input_shape=(X_train.shape[1],1)))
  model.add(Dropout(0.2))
  # initialise second layer
  model.add(LSTM(units=50, activation='relu', return_sequences=True, input_shape=(X_train.shape[1],1)))
  #model.add(LeakyReLU(alpha=0.05))
  model.add(Dropout(0.2))
  # initialise third layer
  model.add(LSTM(units=50, activation='relu', return_sequences=True, input_shape=(X_train.shape[1],1)))
  #model.add(LeakyReLU(alpha=0.05))
  model.add(Dropout(0.2))
  # initialise fourth layer
  model.add(LSTM(units=50, activation='relu', return_sequences=True, input_shape=(X_train.shape[1],1)))
  model.add(Dropout(0.2))
  # Output layer
  model.add(Dense(units=25))

  
  # model.add(Dense(512, input_dim=X_train.shape[1], kernel_initializer=initializer))
  # model.add(Dropout(0.4))
  # model.add(Dense(512, input_dim=X_train.shape[1], activation='tanh', kernel_initializer=initializer))
  # model.add(Dropout(0.4))
  # model.add(Dense(512, input_dim=X_train.shape[1], activation='tanh', kernel_initializer=initializer))
  # model.add(Dropout(0.4))
  # model.add(Dense(512, activation="sigmoid", kernel_initializer=initializer))

  adam = Adam(lr=0.9)
  model.compile(loss="mean_squared_error", optimizer="adam", metrics=["mean_absolute_error"])
  return model

batch_size = 64 # shttps://www.sciencedirect.com/science/article/pii/S2405959519303455
epochs = 10

print("Building model...")
model = create_model()
model.summary()

# regressor.add(SimpleRNN(units = 50, activation = "tanh", return_sequences = True, input_shape = (X_train.shape[1], 1)))
# regressor.add(Dropout(0.2))

# #second layer
# regressor.add(SimpleRNN(units = 50, activation = "tanh", return_sequences = True))
# regressor.add(Dropout(0.2))                      

# # Third RNN layer and some Dropout regularisation
# regressor.add(SimpleRNN(units = 50, activation = "tanh", return_sequences = True))
# regressor.add(Dropout(0.2))

# # Fourth RNN layer and some Dropout regularisation
# regressor.add(SimpleRNN(units = 50))
# regressor.add(Dropout(0.2))

# # Add the output layer
# regressor.add(Dense(units = 1))

# # Compiling the RNN
# regressor.compile(optimizer = "adam", loss = "mean_squared_error")

# # Fitting the RNN to the training set
# regressor.fit(X_train, y_train, epochs = 100, batch_size = 32)


# add embedding layer expecting input and output

# model.add(layers.Embedding(input_dim=1000, output_dim=64))
# model.add(layers.LSTM(128, input_shape = (5, 10)))
# model.add(layers.Dense(10))
# model.build()
# model.summary()


Building model...




Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_4 (LSTM)               (None, 30, 50)            10400     
                                                                 
 dropout_4 (Dropout)         (None, 30, 50)            0         
                                                                 
 lstm_5 (LSTM)               (None, 30, 50)            20200     
                                                                 
 dropout_5 (Dropout)         (None, 30, 50)            0         
                                                                 
 lstm_6 (LSTM)               (None, 30, 50)            20200     
                                                                 
 dropout_6 (Dropout)         (None, 30, 50)            0         
                                                                 
 lstm_7 (LSTM)               (None, 30, 50)           

In [68]:
X_train.shape

(813767, 30)

In [69]:
y_train.shape

(813767, 1)

In [72]:
from keras.callbacks import ModelCheckpoint
from keras.callbacks import EarlyStopping
print("Fitting model...")

epochs = 300
batch_size=128

file_model = "best_weights_rossman_team.hdf5"
checkpoint = ModelCheckpoint(file_model, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
early_stopping = EarlyStopping(monitor='val_loss', patience=50)
callbacks_list = [checkpoint, early_stopping]
log = create_model.fit(X_train, y_train,
                validation_split=0.20, batch_size=batch_size, epochs = epochs, shuffle=True, callbacks=callbacks_list)

Fitting model...
Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_2 (Dense)             (None, 512)               15872     
                                                                 
 dropout_8 (Dropout)         (None, 512)               0         
                                                                 
 dense_3 (Dense)             (None, 512)               262656    
                                                                 
 dropout_9 (Dropout)         (None, 512)               0         
                                                                 
 dense_4 (Dense)             (None, 512)               262656    
                                                                 
 dropout_10 (Dropout)        (None, 512)               0         
                                                                 
 dense_5 (Dense)             (None, 1

AttributeError: ignored

In [None]:
train, test = clean_data(use_text_columns = True)

In [70]:
def create_model():
    '''
    Create a neural network
    '''
    initializer = RandomNormal(mean=0.0, stddev=0.05, seed=None)

    model = Sequential()
    model.add(Dense(512, input_dim=X_train.shape[1], activation="relu", kernel_initializer=initializer))
    model.add(Dropout(0.4))
    model.add(Dense(512, input_dim=X_train.shape[1], activation="relu", kernel_initializer=initializer))
    model.add(Dropout(0.4))
    model.add(Dense(512, input_dim=X_train.shape[1], activation="relu", kernel_initializer=initializer))
    model.add(Dropout(0.4))
    model.add(Dense(1, activation="linear", kernel_initializer=initializer))
    adam = Adam(lr=1e-3, decay=1e-3)

    # Compile model
    model.compile(loss="mean_squared_error", optimizer=adam, metrics=[rmse, rmspe])

    return model