<h1> Imports </h1>

In [1]:
import pickle
from datetime import datetime
from sklearn import preprocessing
import numpy as np
import random
random.seed(42)

<h1> Load data </h1>

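Load two lists and derive one count: <br>
<ol>
<li> train_data (list of training records) </li>
<li> num_records (number of records in train_data) </li>
<li> store_data (list of store records) </li>
</ol>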

In [2]:

# Read the pickled training records from disk.
# NOTE: pickle.load can execute arbitrary code; only open files you trust.
with open('train_data.pickle', 'rb') as train_file:
    train_data = pickle.load(train_file)
num_records = len(train_data)

# Read the pickled store metadata from disk.
with open('store_data.pickle', 'rb') as store_file:
    store_data = pickle.load(store_file)

# Preview the first three entries of each list, then the training record count.
print(store_data[:3])
print(train_data[:3])
print(num_records)

[{'Store': '1', 'StoreType': 'c', 'Assortment': 'a', 'CompetitionDistance': '1270', 'CompetitionOpenSinceMonth': '9', 'CompetitionOpenSinceYear': '2008', 'Promo2': '0', 'Promo2SinceWeek': '0', 'Promo2SinceYear': '0', 'PromoInterval': '0', 'State': 'HE'}, {'Store': '2', 'StoreType': 'a', 'Assortment': 'a', 'CompetitionDistance': '570', 'CompetitionOpenSinceMonth': '11', 'CompetitionOpenSinceYear': '2007', 'Promo2': '1', 'Promo2SinceWeek': '13', 'Promo2SinceYear': '2010', 'PromoInterval': 'Jan,Apr,Jul,Oct', 'State': 'TH'}, {'Store': '3', 'StoreType': 'a', 'Assortment': 'a', 'CompetitionDistance': '14130', 'CompetitionOpenSinceMonth': '12', 'CompetitionOpenSinceYear': '2006', 'Promo2': '1', 'Promo2SinceWeek': '14', 'Promo2SinceYear': '2011', 'PromoInterval': 'Jan,Apr,Jul,Oct', 'State': 'NW'}]
[{'Store': '1115', 'DayOfWeek': '2', 'Date': '2013-01-01', 'Sales': '0', 'Customers': '0', 'Open': '0', 'Promo': '0', 'StateHoliday': 'a', 'SchoolHoliday': '1'}, {'Store': '1114', 'DayOfWeek': '2', '

<h1> Define Feature List Function </h1>

Extract the features from each record in train_data and get the corresponding sales value (y).



In [3]:

def feature_list(record, stores=None):
    """Build the feature list for a single training record.

    Parameters
    ----------
    record : dict
        One training row. The keys 'Date' (formatted '%Y-%m-%d'), 'Store',
        'DayOfWeek', 'Open' and 'Promo' are read; all but 'Date' are parsed
        with int().
    stores : sequence of dict, optional
        Store metadata used to look up the 'State' of the record's store.
        Defaults to the notebook-level ``store_data`` when omitted.

    Returns
    -------
    list
        [store_open, store_index, day_of_week, promo, year, month, day, state]

    Raises
    ------
    ValueError
        If 'Date', 'Store', 'DayOfWeek' or 'Promo' cannot be parsed.
    KeyError
        If any key other than 'Open' is missing from the record.
    """
    if stores is None:
        # Fall back to the store metadata loaded earlier in the notebook.
        stores = store_data

    # Parse the date once; year, month and day become separate features.
    dt = datetime.strptime(record['Date'], '%Y-%m-%d')
    store_index = int(record['Store'])
    day_of_week = int(record['DayOfWeek'])

    # A missing or unparsable 'Open' value (e.g. an empty string) is treated
    # as "open". Only the errors int()/lookup can raise are caught, so
    # KeyboardInterrupt and other unrelated failures are no longer swallowed.
    try:
        store_open = int(record['Open'])
    except (KeyError, TypeError, ValueError):
        store_open = 1

    promo = int(record['Promo'])

    # NOTE(review): the lookup assumes `stores` is ordered by store id
    # starting at 1, so that position store_index - 1 is this store - confirm.
    state = stores[store_index - 1]['State']

    return [store_open,
            store_index,
            day_of_week,
            promo,
            dt.year,
            dt.month,
            dt.day,
            state
            ]

In [4]:
# Keep only the records that have non-zero sales and a non-empty 'Open' field.
usable_records = [
    rec for rec in train_data
    if rec['Sales'] != '0' and rec['Open'] != ''
]

# Features (X) and sales targets (y), both in the order of the kept records.
train_data_X = [feature_list(rec) for rec in usable_records]
train_data_y = [int(rec['Sales']) for rec in usable_records]

print("Number of train datapoints: ", len(train_data_y))

# Range of the sales target.
print(min(train_data_y), max(train_data_y))

Number of train datapoints:  844338
46 41551


<h1> Represent data efficiently </h1>

Using LabelEncoder for this task

In [5]:
# Build two separate arrays from the feature rows: `full_X` is what the
# encoders are fitted on, `train_data_X` is overwritten with encoded labels.
full_X = np.array(train_data_X)
train_data_X = np.array(train_data_X)

# One LabelEncoder per feature column, kept in column order.
les = []
for col in range(train_data_X.shape[1]):
    encoder = preprocessing.LabelEncoder().fit(full_X[:, col])
    les.append(encoder)
    # Replace the raw column values with their encoded labels.
    train_data_X[:, col] = encoder.transform(train_data_X[:, col])

# Persist the fitted encoders so the same mapping can be reused later.
with open('les.pickle', 'wb') as encoders_file:
    pickle.dump(les, encoders_file, protocol=pickle.HIGHEST_PROTOCOL)

# Cast the encoded features to integers and turn the targets into an array.
train_data_X = train_data_X.astype(int)
train_data_y = np.array(train_data_y)

# Persist the processed (features, targets) pair for later use.
with open('feature_train_data.pickle', 'wb') as features_file:
    pickle.dump((train_data_X, train_data_y), features_file,
                protocol=pickle.HIGHEST_PROTOCOL)

# Show the first encoded sample and its target as a sanity check.
print(train_data_X[0], train_data_y[0])


[  0 109   1   0   0   0   0   7] 5961


In [6]:
# Report the dimensions of the feature matrix and the target vector, one per line.
print(train_data_X.shape, train_data_y.shape, sep='\n')

(844338, 8)
(844338,)
