# Project 4 - Background Work

## Setting Up

### imports

In [50]:
import openpyxl
import os
import pandas as pd
import petl as etl

### build raw data file list

In [51]:
# Folder that holds the raw branch exports.
raw_data_path = "data/raw"

# Full path of every regular file directly inside the raw data folder;
# anything that is not a file (e.g. a sub-directory) is left out.
raw_data_directory = [
    path
    for path in (
        os.path.join(raw_data_path, entry)
        for entry in os.listdir(raw_data_path)
    )
    if os.path.isfile(path)
]

In [52]:
#raw_data_directory

### remove csv duplicates

In [53]:
len(raw_data_directory)

94

In [54]:
# Some branches are present twice, once as JSON and once as CSV. Keep the
# JSON copy and drop its CSV twin.
#
# The twins are collected first and the list is filtered afterwards: calling
# list.remove() while looping over the same list shifts the remaining items,
# which can make the loop skip entries and leave duplicates behind.
# Only the trailing 'json' is swapped for 'csv', so a 'json' appearing
# elsewhere in the path is left alone.
csv_twins = {
    str(file)[:-len('json')] + 'csv'
    for file in raw_data_directory
    if str(file).endswith('json')
}

# Slice assignment updates the existing list object in place.
raw_data_directory[:] = [
    file for file in raw_data_directory if file not in csv_twins
]

len(raw_data_directory)

86

In [55]:
#raw_data_directory

### acquire product list

In [56]:
# Load the product catalogue and keep it as a list of
# (product, category) tuples, used later to look up a product's category.
products_table = etl.fromcsv('data/other/products_list.csv')
product_list = list(etl.values(products_table, 'product', 'category'))

# display the list as a sanity check
product_list


[('Fairtrade Bananas Loose', 'fruits & vegetables'),
 ('British carrots loose', 'fruits & vegetables'),
 ('Onions Loose', 'fruits & vegetables'),
 ('British baking potatoes loose', 'fruits & vegetables'),
 ('Red pepper', 'fruits & vegetables'),
 ('Mixed pepper', 'fruits & vegetables'),
 ('Brocolli loose', 'fruits & vegetables'),
 ('Lemon', 'fruits & vegetables'),
 ('Spring onions bunch', 'fruits & vegetables'),
 ('Sweet potatoes loose', 'fruits & vegetables'),
 ('courgette loose', 'fruits & vegetables'),
 ('baby potatoes', 'fruits & vegetables'),
 ('british parsnips loose', 'fruits & vegetables'),
 ('fine beans', 'fruits & vegetables'),
 ('garlic', 'fruits & vegetables'),
 ('celery', 'fruits & vegetables'),
 ('aubergine', 'fruits & vegetables'),
 ('raspberries', 'fruits & vegetables'),
 ('british bramley cooking apples', 'fruits & vegetables'),
 ('easy peeler loose', 'fruits & vegetables'),
 ('scottish salmon', 'meat & fish'),
 ('beef mince', 'meat & fish'),
 ('british fresh chicken br

### acquire branch list

In [57]:
# Load the branch workbook and keep it as a list of
# (region, county, branch_name) tuples, used later to tag each file's rows.
branches_table = etl.fromxlsx('data/other/branch_list.xlsx')
branch_list = list(etl.values(branches_table, 'region', 'county', 'branch_name'))

# display the list as a sanity check
branch_list

[('East of England', 'Bedfordshire', 'Bedfordshire store'),
 ('East of England', 'East Cambridgeshire', 'East Cambridgeshire outlet'),
 ('West Midlands', 'Warwickshire', 'Warwickshire branch'),
 ('East Midlands', 'Lincolnshire', 'Lincolnshire store'),
 ('London', 'Islington', 'Islington branch'),
 ('North East England', 'Stockton-on-Tees', 'Stockton-on-Tees store'),
 ('Northern Ireland', 'Belfast', 'Belfast branch'),
 ('North West England', 'Wyre', 'Wyre branch'),
 ('South East England', 'Mole Valley', 'Mole Valley store'),
 ('Scotland', 'Orkney', 'Orkney store'),
 ('South West England', 'Dorset', 'Dorset outlet'),
 ('East Midlands', 'Nottinghamshire', 'Nottinghamshire store'),
 ('Yorkshire and the Humber', 'North Yorkshire', 'North Yorkshire outlet'),
 ('South East England', 'Reigate and Banstead', 'Reigate and Banstead branch'),
 ('South East England', 'Epsom and Ewell', 'Epsom and Ewell store'),
 ('Wales', 'Wrexham', 'Wrexham store'),
 ('Wales', 'Isle of Anglesey', 'Isle of Anglesey

### creation of blank dfs

In [58]:
# Start from an empty frame; the file loop further down adds each
# branch's converted rows to it.
all_branches_df = pd.DataFrame()

## Cleanup & Transform Functions

### adding product category

In [59]:
# product category lookup, used as a petl context callback
def add_product_category(prv, cur, nxt):
    """Return the category of the current row's product.

    Written for ``etl.addfieldusingcontext``, which passes the previous,
    current and next row. Only the current row is used.

    Args:
        prv: previous row (unused).
        cur: current row; its ``product`` attribute is looked up.
        nxt: next row (unused).

    Returns:
        The category of the first ``product_list`` entry whose product
        equals ``cur.product``, or ``None`` when nothing matches.
    """
    matching_categories = (
        entry[1] for entry in product_list if cur.product == entry[0]
    )
    return next(matching_categories, None)

### adding region & county

In [60]:
# add region & county function
def add_region_city(table, file):
    """Add ``county`` and ``region`` fields for the branch a file belongs to.

    The branch name is derived from the file name: everything after the
    first underscore, minus the extension, with the remaining underscores
    turned into spaces (``2012-2020_Glasgow_City_outlet.csv`` becomes
    ``Glasgow City outlet``). It is then compared with the third element
    (the branch name) of each ``branch_list`` entry.

    Args:
        table: petl table holding the rows read from ``file``.
        file: path or name of the file the table was read from.

    Returns:
        The table with a ``county`` and a ``region`` field added, both set
        to the values of the first matching ``branch_list`` entry.

    Raises:
        ValueError: if no ``branch_list`` entry matches the derived branch
            name. Previously the function returned ``None`` in this case,
            which only surfaced later as an unrelated-looking failure.
    """
    # Work on the file name only, so an underscore in a directory name
    # cannot be mistaken for the separator in front of the branch name.
    file_name = os.path.basename(str(file))

    # partition() yields an empty remainder when there is no underscore,
    # which then falls through to the ValueError below.
    _, _, remainder = file_name.partition('_')
    branch_name = remainder.rsplit('.', 1)[0].replace('_', ' ')

    # check through branch list for a match
    for row in branch_list:
        if branch_name == row[2]:
            return etl.addfields(table, [('county', row[1]), ('region', row[0])])

    raise ValueError(
        "no branch in branch_list matches {!r} (derived from file {!r})"
        .format(branch_name, file)
    )

### cleanup

#### fix headers

In [61]:
def fix_headers(table):
    """Rename alternative column names to the ones used downstream.

    Two groups of aliases are handled, one after the other:

    * ``sku`` or ``item``                                  -> ``product``
    * ``quantity``, ``total_quantity`` or
      ``quantity_purchased``                               -> ``total_quantity_purchased``

    Within a group the aliases are tried in the order listed and only the
    first one present in the header is renamed.

    Args:
        table: petl table whose header may use any of the aliases.

    Returns:
        The table, with at most one rename applied per group.
    """
    alias_groups = (
        ('product', ('sku', 'item')),
        ('total_quantity_purchased',
         ('quantity', 'total_quantity', 'quantity_purchased')),
    )

    for standard_name, aliases in alias_groups:
        # re-read the header so the second group sees the first group's rename
        header_now = etl.header(table)
        for alias in aliases:
            if alias in header_now:
                table = etl.rename(table, alias, standard_name)
                break

    return table

#### dtype conversions

In [62]:
def str_conversion(table):
    """Convert the quantity and amount columns to numbers.

    ``total_quantity_purchased`` is passed through ``int``;
    ``amount_in_gbp`` is passed through ``float`` and rounded to two
    decimal places with ``round`` (nearest, not always upwards).

    Args:
        table: petl table containing both columns.

    Returns:
        The table with both conversions applied.
    """
    def to_two_decimals(cell):
        # parse as float, then round to 2 decimal places
        return round(float(cell), 2)

    with_int_quantity = etl.convert(table, ['total_quantity_purchased'], int)
    return etl.convert(with_int_quantity, 'amount_in_gbp', to_two_decimals)

#### parent function

In [63]:
# full cleanup pipeline for one file
def table_transform(table, file):
    """Run every cleanup step on one file's table and return a DataFrame.

    Steps, in order: fix mislabelled headers, convert the numeric columns,
    add ``product_category``, add ``county`` and ``region``, convert to a
    pandas DataFrame.

    Args:
        table: petl table read from ``file``.
        file: path of the source file; passed to ``add_region_city``.

    Returns:
        The DataFrame produced by ``etl.todataframe`` for the transformed table.
    """
    # headers first, because the conversions rely on the standard names
    typed_table = str_conversion(fix_headers(table))

    categorised_table = etl.addfieldusingcontext(
        typed_table, 'product_category', add_product_category)

    located_table = add_region_city(categorised_table, file)

    return etl.todataframe(located_table)


## Iterating Through Files

In [64]:
# Only files whose leading part (the text before the first '-') ends with
# this value are loaded.
file_range = "2012"

# Converted frames are collected here and concatenated once after the loop.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, and
# growing a frame file by file copies every earlier row on each step.
new_branch_frames = []

for file in raw_data_directory:
    if not str(file).split('-').pop(0).endswith(file_range):
        continue
    print(file)

    # CSV files are read as CSV; any other file is read as JSON.
    if str(file).endswith('csv'):
        current_table = etl.fromcsv(file)
    else:
        current_table = etl.fromjson(file)

    # table_transform applies the shared cleanup: header fixes, type
    # conversion, product category, county/region, DataFrame conversion.
    current_df = table_transform(current_table, file)
    new_branch_frames.append(current_df)

if new_branch_frames:
    # Rows already held in all_branches_df stay first, as they did with append.
    earlier_frames = [] if all_branches_df.empty else [all_branches_df]
    all_branches_df = pd.concat(
        earlier_frames + new_branch_frames, ignore_index=True)


data/raw\2012-2020_Armagh_outlet.csv
data/raw\2012-2020_Ballymoney_store.csv
data/raw\2012-2020_Bargoed_outlet.json
data/raw\2012-2020_Bedfordshire_store.json
data/raw\2012-2020_Colchester_outlet.json
data/raw\2012-2020_Darlington_store.csv
data/raw\2012-2020_East_Dunbartonshire_branch.json
data/raw\2012-2020_East_Hertfordshire_branch.json
data/raw\2012-2020_Edinburgh_City_branch.json
data/raw\2012-2020_Glasgow_City_outlet.csv
data/raw\2012-2020_Hackney_store.csv
data/raw\2012-2020_Isle_of_Anglesey_outlet.json
data/raw\2012-2020_Lancashire_store.json
data/raw\2012-2020_Lincolnshire_store.json
data/raw\2012-2020_Neath_Port_Talbot_outlet.json
data/raw\2012-2020_Newark_and_Sherwood_store.json
data/raw\2012-2020_Reigate_and_Banstead_branch.csv
data/raw\2012-2020_Rugby_branch.json
data/raw\2012-2020_Rushcliffe_branch.csv
data/raw\2012-2020_Selby_branch.json
data/raw\2012-2020_Sevenoaks_branch.csv
data/raw\2012-2020_Shepway_store.csv
data/raw\2012-2020_Stockton-on-Tees_store.csv
data/raw\201

In [65]:
all_branches_df

Unnamed: 0,year,month,day,hour,product,total_quantity_purchased,amount_in_gbp,product_category,county,region
0,2012,1,10,0,brenton t-shirt,202.0,9502.08,clothing,Armagh,Northern Ireland
1,2012,1,10,0,salmon fillets,222.0,428.90,meat & fish,Armagh,Northern Ireland
2,2012,1,10,0,apple macbook air mid 2009,11.0,17556.00,computing,Armagh,Northern Ireland
3,2012,1,10,0,vanilla yoghurt,17.0,14.28,dairy,Armagh,Northern Ireland
4,2012,1,10,0,dual shock controller,220.0,9240.00,gaming,Armagh,Northern Ireland
...,...,...,...,...,...,...,...,...,...,...
25549408,2020,12,25,23,fresh whole chicken,45.0,63.00,meat & fish,York,Yorkshire and the Humber
25549409,2020,12,25,23,oxford shirt,35.0,455.00,clothing,York,Yorkshire and the Humber
25549410,2020,12,25,23,grey blazers,143.0,4862.00,clothing,York,Yorkshire and the Humber
25549411,2020,12,25,23,gaming mouse,194.0,14550.00,gaming,York,Yorkshire and the Humber


## Saving Output

In [66]:
#temp_all_branches_file = pd.read_csv('data/refined/all_branches.csv')

#temp_all_branches_file = temp_all_branches_file.append(all_branches_df)

#temp_all_branches_file.to_csv('data/refined/all_branches.csv')

#temp_all_branches_file

#all_branches_df = all_branches_df.set_index(['region', 'county', 'year'])

# Name the output after the filter used when loading the files, so the
# file name cannot drift from its contents if file_range is changed.
# With file_range = "2012" this is the same path as before.
all_branches_df.to_csv(
    'data/refined/branches_established_in_{}.csv'.format(file_range),
    index=False)