### This code downloads the data from Google Sheets to the local device
The data is stored in a local folder ./Data.

The code checks whether the ./Data folder already contains the data for every available date, then downloads all missing data up to the most recent date.

In [1]:
#download data stored in google sheets to local device.
#all spreadsheet keys stored at https://docs.google.com/spreadsheets/d/1bms8J3Hiv_F3Mycsr14gwVZiJi1-ngLj0fNhdRkIAwQ/edit#gid=0
def download_from_google_sheets(data_dir="./Data/", creds_path="./creds.json"):
    """Download every daily carpark spreadsheet that is missing from ``data_dir``.

    An index spreadsheet lists one row per day: column 1 holds a ``day/month/...``
    date and column 2 the Google Sheets key of that day's data. For each date
    that has no matching ``<day>_<month>_...`` entry in ``data_dir`` the day's
    ``Sheet1`` worksheet is downloaded, reduced to lot type ``"C"``, and written
    as ``<spreadsheet title>.csv`` (with ``/`` replaced by ``_``).

    Two layouts are handled:

    * 4 columns (data before 26/3/2021): Time, Date, CarParkID, AvailableLots.
      Only the row with the largest AvailableLots is kept per (CarParkID, Time),
      then the static carpark details are merged in from a lookup spreadsheet.
    * 9 columns (data since 26/3/2021): the details are already present, so the
      rows are only filtered to lot type ``"C"``.

    Any other column count is written out unchanged.

    Parameters
    ----------
    data_dir : str, optional
        Folder that holds the downloaded CSV files; created if it is missing.
    creds_path : str, optional
        Path to the service-account JSON key file.

    Returns
    -------
    None
        The function works through side effects: it writes CSV files and
        prints one ``"<file> downloaded"`` line per file.
    """
    # Import libraries
    import os
    import pandas as pd
    import gspread
    from oauth2client.service_account import ServiceAccountCredentials

    # Credential stuff
    scope = ["https://spreadsheets.google.com/feeds",
             "https://www.googleapis.com/auth/drive"]
    credentials = ServiceAccountCredentials.from_json_keyfile_name(creds_path, scope)
    gc = gspread.authorize(credentials)

    ID_of_spreadsheet_keys = "1bms8J3Hiv_F3Mycsr14gwVZiJi1-ngLj0fNhdRkIAwQ"
    ID_of_one_set_data = "1J6c50eUbTeOLbMQ88dTpB5lV_fEg2l9Ke2rdhVGW9Mw"
    worksheet_name = 'Sheet1'

    # Open the sheet that stores all spreadsheet keys.
    index_worksheet = gc.open_by_key(ID_of_spreadsheet_keys).worksheet(worksheet_name)
    dates_raw = index_worksheet.col_values(1)
    IDs = index_worksheet.col_values(2)

    # NOTE(review): dates are keyed on (day, month) only, so the same day and
    # month in a different year would be treated as already downloaded.
    spreadsheet_keys = {}
    # zip() tolerates columns of different length; rows without a parsable
    # date (e.g. a header) or without a key are skipped instead of crashing.
    for raw_date, spreadsheet_id in zip(dates_raw, IDs):
        date = _parse_day_month(raw_date, "/")
        if date is None or not spreadsheet_id.strip():
            continue
        spreadsheet_keys[date] = spreadsheet_id

    # Check existing data, ignoring entries that are not "<day>_<month>_..."
    # files (e.g. .DS_Store or .ipynb_checkpoints).
    os.makedirs(data_dir, exist_ok=True)
    existing_dates = set()
    for entry in os.listdir(data_dir):
        date = _parse_day_month(entry, "_")
        if date is not None:
            existing_dates.add(date)

    lot_details = None  # lookup table, fetched at most once and only if needed
    for date, spreadsheet_id in spreadsheet_keys.items():
        if date in existing_dates:
            continue

        # Open the day's spreadsheet. Its title is captured immediately so the
        # output file is always named after the dated spreadsheet.
        data_book = gc.open_by_key(spreadsheet_id)
        csv_name = (data_book.title + ".csv").replace('/', '_')

        # Download values into a dataframe (every cell arrives as a string).
        df = pd.DataFrame(data_book.worksheet(worksheet_name).get_all_values())

        # Data before 26/3/2021 only has 4 columns.
        # Assume the "C" lot type always has more available lots than the other
        # lot types of the same carpark, and keep only that row.
        if len(df.columns) == 4:
            df.columns = ["Time", "Date", "CarParkID", "AvailableLots"]
            df = _keep_largest_lots_per_reading(df)
            if lot_details is None:
                lot_details = _load_lot_c_details(gc, ID_of_one_set_data, worksheet_name)
            df = df.merge(lot_details, on="CarParkID")

        # Data since 26/3/2021: leave only "C" lot type carparks.
        elif len(df.columns) == 9:
            df.columns = ["Time", "Date", "CarParkID", "Area", "Development",
                          "Location", "AvailableLots", "LotType", "Agency"]
            df = df[df["LotType"] == "C"]

        # Export as csv
        df.to_csv(os.path.join(data_dir, csv_name), index=False)

        print(csv_name, "downloaded")


def _parse_day_month(text, sep):
    """Return (day, month) from the first two ``sep``-separated fields, else None."""
    fields = text.split(sep)
    try:
        return int(fields[0]), int(fields[1])
    except (IndexError, ValueError):
        return None


def _keep_largest_lots_per_reading(df):
    """Keep, per (CarParkID, Time), the row with the numerically largest AvailableLots."""
    import pandas as pd

    helper = "_AvailableLotsNumeric"
    # Sort on a numeric copy: the raw cells are strings, and a string sort
    # would rank "9" above "100". Unparsable values sort first so that a
    # valid count wins whenever one exists.
    ranked = df.assign(**{helper: pd.to_numeric(df["AvailableLots"], errors="coerce")})
    ranked = ranked.sort_values(by=["CarParkID", helper], na_position="first")
    ranked = ranked.drop_duplicates(subset=["CarParkID", "Time"], keep="last")
    return ranked.drop(columns=helper)


def _load_lot_c_details(gc, spreadsheet_id, worksheet_name):
    """Fetch the static carpark details and keep only the rows with LotType "C"."""
    import pandas as pd

    details_worksheet = gc.open_by_key(spreadsheet_id).worksheet(worksheet_name)
    details = pd.DataFrame(details_worksheet.get_all_values())
    details.columns = ["CarParkID", "Area", "Development", "Location", "LotType", "Agency"]
    return details[details["LotType"] == "C"]

In [2]:
# Fetch every spreadsheet whose date is not yet present in ./Data/;
# one "<file> downloaded" line is printed for each CSV written.
download_from_google_sheets()

9_4_2021_carpark_availability.csv downloaded
10_4_2021_carpark_availability.csv downloaded
11_4_2021_carpark_availability.csv downloaded
12_4_2021_carpark_availability.csv downloaded
13_4_2021_carpark_availability.csv downloaded
