In [8]:
import os

import numpy as np
import pandas as pd

In [9]:
# Load the anonymized sales export into a DataFrame.
sales_csv = 'data/csv/anonymized_sales_data.csv'
df = pd.read_csv(sales_csv)

## Exploration

In [10]:
# Summary statistics for every column; include='all' covers the non-numeric
# columns (unique/top/freq) as well as the numeric ones (mean/std/quantiles).
df.describe(include='all')

Unnamed: 0,invoice_date,customer_code,location_code,channel_text,customer_status,conventional_synthetic,variety,size,sale_value
count,541796,541796.0,541796,541796,541796,541796,541796,541796,541796.0
unique,1414,,7,6,1,2,4,7,
top,2021-09-07,,TX_IAH,Manufacturer/Distributor,Active,CONVENTIONAL,5W30,Case_12X2_Gallon,
freq,646,,233794,368473,541796,531016,298737,139854,
mean,,203148.639268,,,,,,,1372.337826
std,,192759.723394,,,,,,,2694.498508
min,,3409.0,,,,,,,-12924.91
25%,,14793.0,,,,,,,563.49
50%,,132281.0,,,,,,,696.35
75%,,403912.0,,,,,,,1145.83


## Changes

### customer_status

In [11]:
# customer_status has a single unique value ('Active') in the summary above,
# so the column carries no information and is removed.
df = df.drop('customer_status', axis=1)

### invoice_date

In [12]:
# Parse the YYYY-MM-DD strings into datetime64 values, then store them back.
parsed_dates = pd.to_datetime(df['invoice_date'], format='%Y-%m-%d')
df['invoice_date'] = parsed_dates

### customer_code

In [13]:
# Store customer codes as unsigned 32-bit integers.
df = df.astype({'customer_code': np.uint32})

### sale_value

In [14]:
# sale_value holds fractional amounts (e.g. 563.49 in the summary above).
# Round to the nearest whole unit before the integer cast: a bare astype
# truncates toward zero, which systematically shrinks every amount.
df['sale_value'] = df['sale_value'].round().astype(np.int32)

### multiple categorical conversion

In [15]:
# Every column that has not been given an explicit dtype above becomes a
# pandas categorical, converted in a single astype call.
converted = {'customer_code', 'invoice_date', 'sale_value'}
columns = set(df.columns) - converted
df = df.astype({column: 'category' for column in columns})

In [16]:
# Check the resulting dtype of each column after the conversions above.
df.dtypes

invoice_date              datetime64[ns]
customer_code                     uint32
location_code                   category
channel_text                    category
conventional_synthetic          category
variety                         category
size                            category
sale_value                         int32
dtype: object

### drop duplicates

In [17]:
# (rows, columns) before any rows are removed; compare with the shapes below.
df.shape

(541796, 8)

### drop duplicates and non-positive sales

In [18]:
# Keep only the first occurrence of each fully identical row.
is_repeat = df.duplicated()
df = df[~is_repeat]
df.shape

(541487, 8)

In [23]:
# Keep strictly positive sales only; zero-valued rows are removed as well
# as negative ones.
positive_sales = df['sale_value'].gt(0)
df = df[positive_sales]
df.shape

(541480, 8)

### split by location

In [None]:
# Distinct location codes that occur in the data, as a plain Python list.
locs = df['location_code'].unique().tolist()
locs

['TX_AUS', 'TX_IAH', 'TX_SAT', 'NC_CLT', 'TX_DFW', 'OK_TUL', 'CA_SMF']

In [None]:
%%timeit

if not os.path.isdir('data'):
    os.mkdir('data')
    os.mkdir('data/csv')

for loc in locs:
    path = os.path.join('data', loc) + '.parquet'
    csv_path = os.path.join('data', 'csv', loc + '.csv')
    write_df = df.query(f'location_code == \'{loc}\'')

    write_df.to_parquet(path)
    write_df.to_csv(csv_path)    

1.91 s ± 46.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


### benchmarking

In [None]:
%%timeit
# Slower and ...
for loc in ['TX_AUS', 'TX_IAH', 'TX_SAT', 'NC_CLT', 'TX_DFW', 'OK_TUL', 'CA_SMF']:
    _ = pd.read_csv(os.path.join('data', 'csv', loc + '.csv'))

283 ms ± 1.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [None]:
# ...does not preserve data column types: every column comes back as int64 or
# object, and the index written by to_csv reappears as an 'Unnamed: 0' column.
pd.read_csv('data/csv/TX_AUS.csv').dtypes

Unnamed: 0                 int64
invoice_date              object
customer_code              int64
location_code             object
channel_text              object
conventional_synthetic    object
variety                   object
size                      object
sale_value                 int64
dtype: object

In [None]:
%%timeit
for loc in ['TX_AUS', 'TX_IAH', 'TX_SAT', 'NC_CLT', 'TX_DFW', 'OK_TUL', 'CA_SMF']:
    _ = pd.read_parquet(os.path.join('data', loc + '.parquet'))

35.8 ms ± 753 μs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [None]:
# Parquet keeps the dtypes assigned earlier in the notebook
# (datetime64[ns], uint32, category, int32), unlike the CSV round-trip.
pd.read_parquet('data/TX_AUS.parquet').dtypes

invoice_date              datetime64[ns]
customer_code                     uint32
location_code                   category
channel_text                    category
conventional_synthetic          category
variety                         category
size                            category
sale_value                         int32
dtype: object