# Imports

In [1]:
import pickle
import csv

# Define Functions

In [2]:
def csv2dicts(csvfile):
    """Turn an iterable of rows into a list of dictionaries keyed by the header.

    The first row yielded by ``csvfile`` is taken as the header; every later
    row becomes one dictionary pairing header names with that row's values.
    Pairing stops at the shorter of the header and the row, so surplus cells
    (or surplus header names) are dropped.

    Args:
        csvfile: Iterable of rows, e.g. a ``csv.reader`` object.

    Returns:
        A list with one dictionary per non-header row. Empty when there is no
        row at all, or only a header row.
    """
    rows = iter(csvfile)
    header = next(rows, None)  # None means the input had no rows
    if header is None:
        return []
    return [dict(zip(header, record)) for record in rows]

def set_nan_as_string(data, replace_str='0'):
    """Replace every empty-string value in each dictionary of ``data``, in place.

    The dictionaries themselves are modified, so ``data`` may be any iterable
    of dictionaries (list, tuple, ...); the container is never written to.

    Args:
        data: Iterable of dictionaries to clean up.
        replace_str: Value stored wherever a value equals ``''``
            (default ``'0'``).

    Returns:
        None. The change is visible through the dictionaries passed in.
    """
    for record in data:
        # Only values are reassigned, never keys added or removed, so updating
        # the dictionary while iterating over it is safe.
        for key, value in record.items():
            if value == '':
                record[key] = replace_str

# Declare Paths

In [3]:
# Input CSV files, named relative to the current working directory.
train_data = 'train.csv'  # first input file
store_data = 'store.csv'  # second input file
store_states = 'store_states.csv'  # third input file

# Data Extraction

In [4]:
# Read train.csv into a list of row dictionaries, reverse the row order so the
# earliest dates come first, and save the result as train_data.pickle.
# newline='' is what the csv module asks for when opening files for csv.reader,
# so that line breaks embedded in quoted fields are parsed correctly.
with open(train_data, newline='') as csvfile:
    data = csv2dicts(csv.reader(csvfile, delimiter=','))  # one dict per data row

data = data[::-1]  # reverse the file order

# The output file is opened only after parsing has succeeded, so a failure
# while reading the CSV cannot leave an empty or truncated pickle behind.
with open('train_data.pickle', 'wb') as f:
    pickle.dump(data, f, pickle.HIGHEST_PROTOCOL)  # same protocol as passing -1

print(data[:3])  # show the first 3 rows as a sanity check

# Read store.csv and store_states.csv into lists of row dictionaries, replace
# empty strings in the store rows with '0', copy the 'State' value of each
# store_states.csv row onto the store row at the same position, and save the
# result as store_data.pickle.
# newline='' is what the csv module asks for when opening files for csv.reader.
with open(store_data, newline='') as csvfile, open(store_states, newline='') as csvfile2:
    data = csv2dicts(csv.reader(csvfile, delimiter=','))  # rows of store.csv
    state_data = csv2dicts(csv.reader(csvfile2, delimiter=','))  # rows of store_states.csv

set_nan_as_string(data)  # empty strings in store.csv become '0'

# NOTE(review): rows are paired purely by position, which assumes both files
# list the stores in the same order - confirm against the data. A shorter
# store_states.csv raises IndexError here; extra rows in it are ignored.
for index, val in enumerate(data):
    val['State'] = state_data[index]['State']  # dict is updated in place

# The output file is opened only after all processing has succeeded, so a
# failure above cannot leave an empty or truncated pickle behind.
with open('store_data.pickle', 'wb') as f:
    pickle.dump(data, f, pickle.HIGHEST_PROTOCOL)  # same protocol as passing -1

print(data[:2])  # show the first 2 rows as a sanity check

[{'Store': '1115', 'DayOfWeek': '2', 'Date': '2013-01-01', 'Sales': '0', 'Customers': '0', 'Open': '0', 'Promo': '0', 'StateHoliday': 'a', 'SchoolHoliday': '1'}, {'Store': '1114', 'DayOfWeek': '2', 'Date': '2013-01-01', 'Sales': '0', 'Customers': '0', 'Open': '0', 'Promo': '0', 'StateHoliday': 'a', 'SchoolHoliday': '1'}, {'Store': '1113', 'DayOfWeek': '2', 'Date': '2013-01-01', 'Sales': '0', 'Customers': '0', 'Open': '0', 'Promo': '0', 'StateHoliday': 'a', 'SchoolHoliday': '1'}]
[{'Store': '1', 'StoreType': 'c', 'Assortment': 'a', 'CompetitionDistance': '1270', 'CompetitionOpenSinceMonth': '9', 'CompetitionOpenSinceYear': '2008', 'Promo2': '0', 'Promo2SinceWeek': '0', 'Promo2SinceYear': '0', 'PromoInterval': '0', 'State': 'HE'}, {'Store': '2', 'StoreType': 'a', 'Assortment': 'a', 'CompetitionDistance': '570', 'CompetitionOpenSinceMonth': '11', 'CompetitionOpenSinceYear': '2007', 'Promo2': '1', 'Promo2SinceWeek': '13', 'Promo2SinceYear': '2010', 'PromoInterval': 'Jan,Apr,Jul,Oct', 'Stat