# preprocessing.ipynb

## Overview
This notebook is for cleaning and other data preprocessing of all the datasets in `/home/jovyan/ODBiz/1-PreProcessing/raw`

This notebook includes cells that do the following:
- Converts .shp files to .csv files
- Extracts lat/lon coordinates from cells that recorded them as JSON strings. Seems like the only dataset this applies to is `BC_Vancouver_Business_Licences.csv`
- Fixes `Indigenous_Business_Directory.csv`, which contained commas inside its cells
- Removes leading and trailing whitespace from `NT_Yellowknife_Business_Directory.csv` and also fixes its weirdly formatted phone numbers
- Standardizes dates for the `date_established` variables
- Moves processed datasets into the `2-OpenTabulate` folder

## External custom scripts
A few cells run external scripts that have been written as .py files. Here's a list of links to those scripts:
- [process_shp_files.py](https://kubeflow.aaw.cloud.statcan.ca/notebook/deil-lode/odbiz-processing/doc/tree/ODBiz/1-PreProcessing/process_shp_files.py): Converts .shp files to .csv files
- [standardize_dates.py](https://kubeflow.aaw.cloud.statcan.ca/notebook/deil-lode/odbiz-processing/doc/tree/ODBiz/1-PreProcessing/standardize_dates.py): Standardizes dates for the `date_established` variables


In [2]:
# Import required packages
import os
from pathlib import Path
import shutil
import pandas as pd
import glob
import geopandas as gpd
import numpy as np

In [None]:
# First bring all raw files into processed.
# We do this because most files won't need any cleaning work, so this is quicker than moving them manually.
# Any files that require processing work will simply replace the raw files in the processed folder later

src = '../1-PreProcessing/raw'
dst = '../1-PreProcessing/processed'

# Start from a clean slate: drop any previous processed folder, then ALWAYS copy.
# The copy must not live inside the `if`: on a first run (no processed folder yet)
# nothing would be copied and the listdir below would fail on the missing folder.
if os.path.exists(dst):
    shutil.rmtree(dst)
shutil.copytree(src, dst)

# Keep only the .csv entries at the top level of processed; every other entry
# that came along with the copy is removed.
files_in_directory = os.listdir(dst)
filtered_files = [file for file in files_in_directory if not file.endswith(".csv")]
for file in filtered_files:
    path_to_file = os.path.join(dst, file)
    try:
        os.remove(path_to_file)
    except OSError as err:
        # os.remove cannot delete directories, so sub-folders copied from raw end up here.
        # Stay best-effort (keep going), but report the entry instead of hiding it.
        print(f'Could not remove {path_to_file}: {err}')

In [None]:
# Shapefile processing has been replaced with process_shp_files.py
# (a project-local script that sits next to this notebook; its main() is not shown here).
# This script includes Port Moody!
# The commented-out code below this call is the older inline version, kept for reference only.
import process_shp_files
process_shp_files.main()

# # All shapefiles (except Port Moody)

# import os
# for root, dirs, files in os.walk("../1-PreProcessing/raw/shapefiles"):
#     for file in files:
#         if file.endswith(".shp"):
#             try:
#                 head, tail = os.path.split(os.path.join(root, file))
#                 head = head.replace('shapefile', '')
#                 head = head.replace("/home/jovyan/ODBiz/1-PreProcessing/raw/shapefiles/", '')
#                 head = head.replace("/", '')          
#                 tail = tail.replace('.shp', '')
#                 name = head + tail
#                 print(name)

#                 fp = (os.path.join(root, file))
#                 city = gpd.read_file(fp)
#                 print(city.crs)
#                 city = city.to_crs(epsg=4326)
#                 print(city.crs)
#                 sub_city = city.head(500)
#                 city['lon'] = city.geometry.x
#                 city['lat'] = city.geometry.y

#                 city.to_csv("../1-PreProcessing/raw/"+name+".csv")
#                 city.to_csv("../1-PreProcessing/processed/"+name+".csv")

#             except:
#                 print('error with file above')
#                 pass

In [None]:
# # BC Port Moody Shapefile

# fp = "../1-PreProcessing/raw/shapefiles/BC_Port_Moody_shapefile/Business_Directory.shp"
# name = "port moody"

# city = gpd.read_file(fp)

# print(city.crs)
# city = city.to_crs(epsg=4326)
# print(city.crs)

# sub_city = city.head(500)

# city['lon'] = city.centroid.x
# city['lat'] = city.centroid.y

# #print(city.head)

# city.to_csv("../1-PreProcessing/raw/BC_Port_Moody_Business_Directory.csv")
# city.to_csv("../1-PreProcessing/processed/BC_Port_Moody_Business_Directory.csv")

In [None]:
# BC Vancouver lat/long
# Load the raw Vancouver licences file; its "Geom" column is parsed further down.
vancouver_raw_csv = '../1-PreProcessing/raw/BC_Vancouver_Business_Licences.csv'
df = pd.read_csv(vancouver_raw_csv)

def strip_point(x):
    """Pull the coordinate tokens out of a JSON-style point string.

    Parameters
    ----------
    x : str
        Cell value shaped like
        ``{"coordinates": [<first>, <second>], "type": "Point"}``.
        The calling loop in this notebook stores the first token as the
        longitude and the second as the latitude.

    Returns
    -------
    list of str or float
        The whitespace-separated tokens left once the wrapper text and the
        commas are removed (the numbers stay strings), or ``np.nan`` when
        ``x`` is not a string (for example a missing cell).
    """
    try:
        # str.strip / str.rstrip take a SET of characters, not a literal
        # prefix or suffix. This works here because the wrapper text contains
        # no digits, '-' or '.', so trimming stops at the first and last number.
        t = x.strip(r'{""coordinates"": [')
        t = t.rstrip('], ""type"": ""Point""}')
    except (AttributeError, TypeError):
        # Non-string cell (NaN, None, ...): nothing to parse.
        # Anything else is unexpected and is allowed to propagate.
        return np.nan
    return t.replace(',', '').split()

# Build the longitude / latitude columns from the "Geom" column.
# Each row is parsed once, and BOTH lists always receive exactly one entry per row.
# Previously a row that yielded a single token appended to LONGS, then hit the
# except branch and appended NaN to both lists, leaving LONGS one entry longer
# than LATS and breaking the column assignment below.
LONGS=[]
LATS=[]
for geom in df["Geom"]:
    coords = strip_point(geom)
    if isinstance(coords, list) and len(coords) >= 2:
        # first token -> longitude, second token -> latitude
        lon, lat = coords[0], coords[1]
    else:
        # missing cell (strip_point returned NaN) or fewer than two tokens
        lon, lat = np.nan, np.nan
    LONGS.append(lon)
    LATS.append(lat)

df["long"]=LONGS
df["lat"]=LATS

# Overwrites the untouched copy of this file in the processed folder
df.to_csv('../1-PreProcessing/processed/BC_Vancouver_Business_Licences.csv')

In [None]:
# NT Yellowknife: trim whitespace in the text columns and normalise the phone numbers

df = pd.read_csv('../1-PreProcessing/raw/NT_Yellowknife_Business_Directory.csv')

# Columns whose values get leading/trailing whitespace removed
cols = ['BUSINESSNAME', 'MUNICIPAL ADDRESS3', 'BUSINESSTYPE', 'PHONE', 'EMAILADDRESS']
trimmed = df[cols].apply(lambda column: column.str.strip())
df[cols] = trimmed

# Keep only the first phone-number-looking substring of each PHONE cell.
# The alternatives are tried in this order:
#   1. 3-3-4 digits with an optional '-', '.' or whitespace between the groups
#   2. "(xxx)" followed by 3-4 digits
#   3. 3-4 digits without an area code
phone_pattern = r'(\d{3}[-\.\s]??\d{3}[-\.\s]??\d{4}|\(\d{3}\)\s*\d{3}[-\.\s]??\d{4}|\d{3}[-\.\s]??\d{4})'
df['PHONE'] = df['PHONE'].str.extract(phone_pattern)

# Overwrites the untouched copy of this file in the processed folder
df.to_csv('../1-PreProcessing/processed/NT_Yellowknife_Business_Directory.csv')

In [None]:
# Fix the cells with commas in them in the Indigenous_Business_Directory.csv file

import csv

# Input (raw) and output (processed) locations of the csv
in_file = '../1-PreProcessing/raw/Indigenous_Business_Directory.csv'
out_file = '../1-PreProcessing/processed/Indigenous_Business_Directory.csv'

# Rows of the corrected csv, collected in reading order
newcsv = []

# Read the raw csv row by row (read-only)
with open(in_file, mode='r', newline='', encoding='utf8') as csvfile:
    for i, row in enumerate(csv.reader(csvfile)):

        # Walk the cells by position. len(row) is re-checked on every step
        # because the row gets shorter whenever a split sentence is merged.
        j = 0
        while j < len(row):
            if '$25' in row[j]:
                # Known anomaly: the sentence containing '$25' was split over
                # 3 cells; glue them back together into the single cell at j
                row[j:j+3] = [','.join(row[j:j+3])]
            j += 1

        # newcsv holds this very list object, so the in-place adjustments
        # made to `row` below are reflected in newcsv as well
        newcsv.append(row)

        # Drop the unnecessary 19th column
        if len(row) == 19:
            # Warn when the dropped cell actually held something
            if row[-1] != '':
                print('WARNING, DELETED VALUE:', row[-1])
            row.pop()

        # Report the row width, then pad short rows with empty cells up to 18 columns
        print(i,':', len(row))
        row.extend([''] * (18 - len(row)))

# Drop the first row
newcsv.pop(0)

# Drop that one empty row after the header row
# (the header is now at index 0, so the empty row sits at index 1)
newcsv.pop(1)

# Write the corrected rows out as a .csv file
with open(out_file, mode='w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    print(f'Saving the newcsv to {out_file} ...')
    writer.writerows(newcsv)
print(f'Saved newcsv to {out_file}')


In [None]:
# Standardize the dates of the csvs with non-empty date_established fields using standardize_dates.py
# (a project-local script that sits next to this notebook; its main() is not shown here).
import standardize_dates
standardize_dates.main()

In [3]:
# transfer files directly from PreProcessing/processed to opentabulate/data/input
src = '../1-PreProcessing/processed'
dst = '../2-OpenTabulate/data/input'

# Replace the input folder wholesale: remove a previous copy if there is one,
# then ALWAYS copy. With the copy inside the `if`, a missing input folder meant
# that nothing was transferred at all.
if os.path.exists(dst):
    shutil.rmtree(dst)
shutil.copytree(src, dst)

In [None]:
# Print how many entries (files and sub-folders alike) each pipeline directory holds,
# in the order: raw, processed, OpenTabulate input, OpenTabulate output
raw = '../1-PreProcessing/raw'
pro = '../1-PreProcessing/processed'
input_ = '../2-OpenTabulate/data/input'
output_ = '../2-OpenTabulate/data/output'
for folder in (raw, pro, input_, output_):
    entry_count = len(os.listdir(folder))
    print(entry_count)