# MINI PROJECT
## Project Name: SG CARPARKS

### By: Group 5
- Wang Sunmeng
- Wong Ting Wen Adelina
- Yew Fu Yen

### Problem Definition:
> How many available carpark lots are there near a certain location at a certain time ?

#### First, we import all the libraries we need for this project:

1. pandas, json, numpy: for data representation
2. datetime: to handle time data
3. os: for file manipulation on local device
4. gspread, df2gspread, googleapiclient.discovery: to integrate data collection with google sheets and google drive
5. ServiceAccountCredentials: for google api credentials
6. time: to implement sleep function
7. geopy: for geospatial search and distance calculation functions
8. geocoder: to get user's IP location
9. seaborn, matplotlib.pyplot: for data visualization
10. sklearn modules: for machine learning

#### (please download these libraries on your local device before running the code):

In [1]:
## import pandas as pd
import pandas as pd

#For API request and file/data manipulation
import requests
import json
from datetime import datetime
import os

#GSheets Stuff
import gspread
from df2gspread import df2gspread as d2g
from df2gspread import gspread2df as g2d
from oauth2client.service_account import ServiceAccountCredentials
from googleapiclient import discovery

#For Sleep
import time

#For Geocoding
from geopy.geocoders import Nominatim
# import geocoder

#For Graphing
import seaborn as sb
import matplotlib.pyplot as plt 
import numpy as np
sb.set()

#Linreg
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import mean_squared_error, r2_score

### Data collection:
1. download_from_google_sheets()
    - a function that downloads all of our collected data from google drive to the local device.
    - The data will be stored in .csv format and in a folder called "Data"
    - The "Data" folder should be created in the current directory before running the code
    - After the data is downloaded to the local device, it would take a much shorter time for pandas to read the data and convert them into dataframes 

### Short list of what we did in the code:
1. Check if the data for a certain date is already present on the local device
2. If not, fetch the data from the spreadsheet of that certain date
3. Add headers to the data
4. Concatenate different spreadsheets depending on how the data is collected
5. Download the data in .csv format into the "./Data" folder

In [2]:
#download data stored in google sheets to local device.
#all spreadsheet keys stored at https://docs.google.com/spreadsheets/d/1bms8J3Hiv_F3Mycsr14gwVZiJi1-ngLj0fNhdRkIAwQ/edit#gid=0
#downloaded data has header
def download_from_google_sheets():
    """Download every day of carpark data that is not yet in ./Data/ as a CSV.

    An index spreadsheet maps dates ("d/m/...") to the key of the spreadsheet
    holding that day's readings. For each date with no matching file in
    ./Data/, the day's "Sheet1" is fetched, given headers, reduced to lot type
    "C", and written to ./Data/<sheet title with "/" replaced by "_">.csv.

    Two sheet layouts are handled:
      * 4 columns (data before 26/3/2021): Time, Date, CarParkID,
        AvailableLots. Duplicate (CarParkID, Time) rows are collapsed to the
        one with the most available lots and then joined with a static sheet
        that supplies the remaining carpark attributes.
      * 9 columns (data since 26/3/2021): already contains every attribute.

    Requires ./creds.json (service-account credentials) and an existing
    ./Data/ directory. Returns nothing; prints one line per downloaded file.
    """
    # Credential stuff
    scope = ["https://spreadsheets.google.com/feeds",
             "https://www.googleapis.com/auth/drive"]
    credentials = ServiceAccountCredentials.from_json_keyfile_name("./creds.json", scope)
    gc = gspread.authorize(credentials)

    ID_of_spreadsheet_keys = "1bms8J3Hiv_F3Mycsr14gwVZiJi1-ngLj0fNhdRkIAwQ"
    ID_of_one_set_data = "1J6c50eUbTeOLbMQ88dTpB5lV_fEg2l9Ke2rdhVGW9Mw"
    worksheet_name = 'Sheet1'

    # Index sheet: column 1 holds the date, column 2 the spreadsheet key.
    index_worksheet = gc.open_by_key(ID_of_spreadsheet_keys).worksheet(worksheet_name)
    dates_raw = index_worksheet.col_values(1)
    IDs = index_worksheet.col_values(2)
    spreadsheet_keys = {}
    for date_raw, key in zip(dates_raw, IDs):
        day, month = date_raw.split("/")[:2]
        spreadsheet_keys[(int(day), int(month))] = key

    # Dates already on disk. Names that do not start with "<day>_<month>"
    # (e.g. hidden or checkpoint files) are ignored instead of crashing int().
    existing_dates = set()
    for name in os.listdir("./Data/"):
        parts = name.split("_")
        try:
            existing_dates.add((int(parts[0]), int(parts[1])))
        except (IndexError, ValueError):
            continue

    # The static attribute sheet is identical for every legacy day, so it is
    # fetched at most once and only if a 4-column sheet is actually met.
    one_set_data = None

    for date, key in spreadsheet_keys.items():
        if date in existing_dates:
            continue

        # open sheet, select worksheet, download values into a dataframe
        sheet = gc.open_by_key(key)
        title = sheet.title
        df = pd.DataFrame(sheet.worksheet(worksheet_name).get_all_values())

        # for data before 26/3/2021: only have 4 columns.
        # Assume the "C" lot type always has the most available lots, so for
        # duplicated (CarParkID, Time) rows keep the largest AvailableLots.
        if len(df.columns) == 4:
            df.columns = ["Time", "Date", "CarParkID", "AvailableLots"]
            # Sheet values are strings; sort on a numeric copy so that "10"
            # ranks above "9". Unparsable values sort first and are never
            # preferred over a real number.
            df = (
                df.assign(_lots=pd.to_numeric(df["AvailableLots"], errors="coerce"))
                .sort_values(by=["CarParkID", "_lots"], na_position="first")
                .drop_duplicates(subset=["CarParkID", "Time"], keep="last")
                .drop(columns="_lots")
            )

            if one_set_data is None:
                # sheet that stores one set of data (without time, date and availableLots)
                static_worksheet = gc.open_by_key(ID_of_one_set_data).worksheet(worksheet_name)
                one_set_data = pd.DataFrame(static_worksheet.get_all_values())
                one_set_data.columns = ["CarParkID", "Area", "Development", "Location", "LotType", "Agency"]
                one_set_data = one_set_data[one_set_data["LotType"] == "C"]

            df = df.merge(one_set_data, on="CarParkID")

        # for data since 26/3/2021: leave only "C" LotType carpark
        elif len(df.columns) == 9:
            df.columns = ["Time", "Date", "CarParkID", "Area", "Development", "Location", "AvailableLots", "LotType", "Agency"]
            df = df[df["LotType"] == "C"]

        # export as csv
        csv_name = (title + ".csv").replace('/', '_')
        df.to_csv(os.path.join("./Data/", csv_name), index=False)
        print(csv_name, "downloaded")

        # Pause only after a real download (presumably to stay within the
        # Google API quota); dates that are already on disk cost nothing.
        time.sleep(30)

### Data Curation and Preparation:
> To solve our problem, we have to clean our data so that it contains only several carparks that are within a certain range from a desired location.

#### To achieve this, we have to:
   1. get the user's input location
   2. create a new dataframe that contains only the data of nearby carparks

### Functions:
#### 1. Get Search Location:
   - uses geopy library
   - gives several possible locations through a keyword search
   - asks the user to choose a location, then returns the coordinates of that location
    
    
#### 2. Get User Location:
   - uses geocoder library
   - returns the coordinates of the user's device's IP location
    
    
#### 3. Carparks Nearby:
   - using coordinates returned from get_search/user_location, calculate the bbox coordinates of a certain radius
   - strip out data from the input dataframe so that it contains only the data from relevant carparks within the bbox range
   - add columns to store latitude and longitude of each carpark in the dataframe


In [3]:
#returns coordinates of desired search result chosen by user
def get_search_location():
    """Ask the user for a place name and return the coordinates they pick.

    The search term is geocoded through Nominatim (via geopy) with
    ", Singapore" appended. All matches are listed as a numbered menu followed
    by a "Try again" entry. The prompt repeats until the user selects one of
    the matches.

    Returns:
        A (latitude, longitude) tuple for the chosen match.
    """
    geolocator = Nominatim(user_agent="user-636@project-306014.iam.gserviceaccount.com")

    while True:
        search_term = input("Please enter a location: ")

        # exactly_one=False returns every match; geocode yields None when
        # there is no match at all.
        results = geolocator.geocode(search_term + ", Singapore", exactly_one=False)
        candidates = [(r.address, (r.latitude, r.longitude)) for r in results or []]
        if not candidates:
            print("No relevant search result.\n")
            continue

        retry_option = len(candidates) + 1
        print("\nDo you mean?")
        for number, (address, _) in enumerate(candidates, start=1):
            print(str(number) + ". " + address)
        print(str(retry_option) + ". Try again")

        while True:
            try:
                choice = int(input("Your choice: "))
            except ValueError:
                print("Invalid response!")
                continue
            # Reject 0 and negatives as well as numbers past the menu; they
            # would otherwise index outside the list of matches.
            if 1 <= choice <= retry_option:
                break
            print("Invalid response!")

        print()

        if choice != retry_option:
            return candidates[choice - 1][1]

#returns coordinates of user's ip location
def get_user_location():
    """Return the (lat, lng) of the device's IP-based location as two scalars."""
    # Imported here because no module-level import of geocoder is active in
    # this file; without it the call below raises NameError.
    import geocoder

    g = geocoder.ip("me")
    location = pd.json_normalize(g.json)
    # Take the single row's values: returning the Series themselves would make
    # later arithmetic such as `df["lat"] - coordinates[0]` align on index
    # labels instead of subtracting one number from every row.
    return (location["lat"].iloc[0], location["lng"].iloc[0])

#return a dataset of all carpark within search radius. Include data for every time
def carparks_nearby(df, coordinates, radius):
    """Return the rows of *df* whose carpark lies within *radius* km of *coordinates*.

    Args:
        df: readings with the columns Time, Date, Day, CarParkID, Area,
            Development, Location, AvailableLots, LotType and Agency, where
            Location is a "<lat> <lng>" string. The frame is not modified.
        coordinates: (latitude, longitude) of the search centre.
        radius: half-width of the search box in kilometres.

    Returns:
        A new DataFrame with the ten columns above plus numeric ``lat`` and
        ``lng``. ``Location`` holds the split ``[lat, lng]`` string pair. The
        rows keep their original index labels. Rows whose Location is missing
        or not numeric are dropped because they cannot be placed.
    """
    columns = ["Time", "Date", "Day", "CarParkID", "Area", "Development",
               "Location", "AvailableLots", "LotType", "Agency"]

    split_location = df["Location"].str.split(" ", n=1, expand=False)
    # Normalise every entry to exactly two items so that missing values and
    # one-token strings become NaN coordinates instead of raising.
    pairs = [
        (list(parts) + [None, None])[:2] if isinstance(parts, list) else [None, None]
        for parts in split_location
    ]

    result = df[columns].copy()
    result["Location"] = split_location
    # Assign positionally (plain arrays) so the coordinates stay attached to
    # their own row whatever index the caller's frame carries.
    result["lat"] = pd.to_numeric(
        pd.Series([pair[0] for pair in pairs], dtype=object), errors="coerce"
    ).to_numpy()
    result["lng"] = pd.to_numeric(
        pd.Series([pair[1] for pair in pairs], dtype=object), errors="coerce"
    ).to_numpy()

    # Square bounding box: the code converts km to degrees by dividing by
    # 111.2 and applies the same span to latitude and longitude.
    radius_in_degree = radius / 111.2
    within_lat = (result["lat"] - coordinates[0]).abs() < radius_in_degree
    within_lng = (result["lng"] - coordinates[1]).abs() < radius_in_degree
    return result[within_lat & within_lng].copy()

### Data Curation and Preparation:
> Prepare data for further analysis

#### Get Data:
- Further cleans the data
- Depending on whether the input date is a weekday or weekend, pick only data that corresponds with the date
- Add in a column in the dataframe to indicate which day of a week the data is from
- Strip out data from public holidays (public holiday's data is considered as an outlier)
- Convert data to suitable data types (Time -> minutes since midnight as int, Date -> datetime, Lots -> numeric, others -> str)
- Output the cleaned dataframe for machine learning

In [13]:
#coordinates is a tuple of (latitude, longitude) representing the coordinates of the location requested
#predDate is a string in the format dd/mm/yyyy
#byDay is a bool, if True: if predDate is on Monday, display only Monday data; if False: display all weekday/weekend data
def getData(coordinates, predDate, byDay, radius=1):
    """Collect the historical readings relevant to a prediction request.

    Every CSV in ./Data/ whose name starts with "d_m_yyyy" is considered.
    Files dated on a listed public holiday are always skipped. Of the rest:
      * byDay=True keeps only files from the same day of the week as predDate;
      * byDay=False keeps all weekday files when predDate is a weekday, and
        all weekend files when it falls on a weekend.
    Each kept file is reduced to the carparks near *coordinates*.

    Args:
        coordinates: (latitude, longitude) of the requested location.
        predDate: date to predict for, as a "dd/mm/yyyy" string.
        byDay: see above.
        radius: search radius in km handed to carparks_nearby (default 1).

    Returns:
        A DataFrame with Time as integer minutes, AvailableLots numeric, Date
        as datetime, Day as the weekday number in a string ("0" = Monday),
        plus the remaining carpark columns and lat/lng.

    Raises:
        ValueError: if predDate is not in dd/mm/yyyy format.
    """
    columns = ["Time", "Date", "Day", "CarParkID", "Area", "Development", "Location",
               "AvailableLots", "LotType", "Agency", "lat", "lng"]
    date_format = '%d/%m/%Y'
    predDay = datetime.strptime(predDate, date_format).weekday()

    # single out public holiday (treated as outliers); compared as dates so
    # that zero-padded file names still match
    public_holiday = {datetime.strptime(d, date_format).date() for d in ["2/4/2021"]}

    frames = []
    for filename in os.listdir("./Data/"):
        if not filename.endswith(".csv"):
            continue

        # Drop the extension first: for a plain "d_m_yyyy.csv" the year part
        # would otherwise still carry ".csv" and fail to parse.
        stem = filename[:-len(".csv")]
        try:
            file_date = datetime.strptime("/".join(stem.split("_")[:3]), date_format)
        except ValueError:
            print("Skipping " + filename + ": name does not start with d_m_yyyy")
            continue

        if file_date.date() in public_holiday:
            continue

        day = file_date.weekday()
        if byDay:
            wanted = day == predDay
        else:
            # both weekdays (Mon-Fri) or both weekend days
            wanted = (day < 5) == (predDay < 5)
        if not wanted:
            continue

        df_full = pd.read_csv(os.path.join("./Data/", filename), dtype="string")
        df_full["Day"] = str(day)
        frames.append(carparks_nearby(df_full, coordinates, radius))

    # One concat at the end instead of DataFrame.append in the loop, which
    # was removed in pandas 2.0 and copied the growing frame on every file.
    df = pd.concat(frames) if frames else pd.DataFrame(columns=columns)

    df['Time'] = pd.to_timedelta(df['Time'])
    df['Time'] = df['Time'].dt.total_seconds().div(60).astype(int)
    df['AvailableLots'] = pd.to_numeric(df['AvailableLots'])
    df['Date'] = pd.to_datetime(df['Date'], format="%d/%m/%Y")

    return df

### Machine Learning
> In order to predict the number of available carpark lots in the future, we have to approximate the relationship between the number of available carpark lots and time. Since both of these can be represented in numeric form, we figured that regression would be a good way to find this relationship.

> However, from exploratory data analysis, we see that the number of carpark lots do not vary linearly with time. We noticed that the number of available lots varies mostly in a curve-like manner. Most of them have a parabolic curve pattern, while some exhibit other patterns like w-shaped curves and other irregular curves. With that in mind, we conclude that the linear regression method that we have learnt in class would not be a good way for us to solve the problem.

> To account for the many types of curve patterns found in our data, we decided to explore another regression method, namely polynomial regression. In polynomial regression, instead of fitting the data to a best-fit linear equation, the data is fitted to a specified nth-degree polynomial equation. The regression line of higher-degree polynomials will exhibit different curve patterns with multiple turning points, hence we think that polynomial regression would be a good method for us to solve our problem.

> To find out which degree of polynomial fits our data the best, we tried to fit our data with iteratively increasing degree of polynomial, then find out the r^2 value of the model for each degree. Then, we take the model that has the highest r^2 value, which means that the model at that degree of polynomial fits our data the best (least error). The model will then be used to predict the number of carpark lots for a given time.


### Functions:
#### 1. Polynomial Transform:
   - transform the input data into a matrix form that represents nth degree polynomial
   - returns the data matrix
   
#### 2. Fit Model:
   - fit the data matrix that represents some nth degree polynomial into the regression model
   - returns fitted model

#### 3. Make Prediction:
   - loops through each carparks nearby
   - iteratively increase the degree of polynomial and fit the data for each degree
   - calculate r^2 value of each model and select the best model (highest r^2)
   - use the model selected to predict the number of available lots for each carpark
   - every carpark will have a different model so that the model best fits the carpark's data
   - print out the number of available carpark lots and other relevant information (e.g. carpark location, carpark rates, degree of polynomial, r^2 values)
   - visualize the results by plotting out the data by scatterplot, regression line by lineplot, and predicted results by barplot

In [5]:
#df is a dataframe containing all data of all carparks to be predicted
#predhr is an int of hour in 24h format
def makePrediction(df, predhr, predDate="16/4/2021"):
    """Fit a polynomial model per carpark, then print and plot its predictions.

    For every CarParkID in *df* the readings are split by date: the most
    recent quarter of the dates is held out as test data and the rest is used
    for training. Polynomial degrees 1 to 10 are fitted on the training data
    and the degree with the highest training r^2 is kept. The chosen model
    predicts the lots available at *predhr*, clamped at zero.

    Output per carpark: one printed summary (prediction, rates, best degree,
    training r^2, test r^2) and one row of two plots - the training scatter
    with the fitted curve, and a bar chart of predictions for hours 6 to 24.

    Args:
        df: readings as returned by getData (Time in minutes, Date as
            datetime, Day as a weekday-number string).
        predhr: hour to predict for, as an int in 24h format.
        predDate: "d/m/yyyy" date passed to getRates. It defaults to the date
            that was previously hard-coded, so existing callers are unaffected.

    Returns:
        None.
    """
    CarParkID = df["CarParkID"].unique()
    if len(CarParkID) == 0:
        # plt.subplots cannot create a grid with zero rows.
        print("No carpark data found for this location and date.")
        return

    print("It is predicted that there are:")

    # squeeze=False keeps `axes` two-dimensional even with a single carpark,
    # so axes[row, col] indexing always works.
    _, axes = plt.subplots(len(CarParkID), 2, figsize=(30, 12*len(CarParkID)), squeeze=False)
    legend_map = {'0': 'Monday',
                  '1': 'Tuesday',
                  '2': 'Wednesday',
                  '3': 'Thursday',
                  '4': 'Friday',
                  '5': 'Saturday',
                  '6': 'Sunday'}
    max_degree = 10

    for counter, ID in enumerate(CarParkID, start=1):
        lotinfo = df[df["CarParkID"] == ID].sort_values(by=['Date']).reset_index(drop=True)

        # Hold out the latest quarter of the dates. With fewer than four
        # dates the quarter is zero; everything is then used for training
        # (slicing with -0 would leave the training set empty instead).
        dates = lotinfo["Date"].unique()
        test_data_portion = len(dates) // 4
        if test_data_portion > 0:
            is_test = lotinfo["Date"].isin(dates[-test_data_portion:])
        else:
            is_test = pd.Series(False, index=lotinfo.index)
        train_data = lotinfo[~is_test]
        test_data = lotinfo[is_test]

        train_x = train_data[['Time']]
        train_y = train_data[['AvailableLots']]

        # NOTE(review): the degree is selected on training r^2, which tends to
        # favour high degrees; selecting on held-out data may generalise better.
        best_degree = 1
        best_model = None
        r2_train = 0
        for degree in range(1, max_degree+1):
            candidate = fitModel(degree, train_x, train_y)
            r2 = r2_score(train_y, candidate.predict(polynomialTransform(degree, train_x)))
            if r2 > r2_train:
                r2_train = r2
                best_degree = degree
                best_model = candidate
        if best_model is None:
            # no degree scored above 0: fall back to the degree-1 fit
            best_model = fitModel(best_degree, train_x, train_y)

        train_y_poly_pred = best_model.predict(polynomialTransform(best_degree, train_x))
        hours = train_data["Time"]/60
        sb.scatterplot(x=hours, y=train_data["AvailableLots"], hue=train_data['Day'].map(legend_map), ax=axes[counter-1, 0])
        # plain arrays keep x and y paired by position rather than by index label
        sb.lineplot(x=hours.to_numpy(), y=train_y_poly_pred[:, 0], ax=axes[counter-1, 0], linewidth = 5)

        # Calculate test r2 score (undefined when nothing was held out)
        if test_data.empty:
            r2_test = float("nan")
        else:
            test_y_poly_pred = best_model.predict(polynomialTransform(best_degree, test_data[['Time']]))
            r2_test = r2_score(test_data[['AvailableLots']], test_y_poly_pred)

        # barplot
        rows = [[a, _predictSlots(best_model, best_degree, a)] for a in range(6, 25)]
        carparkslots = pd.DataFrame(rows, columns = ['Time','Predicted carpark slots'])
        carparkslots.plot.bar(x="Time", y="Predicted carpark slots", rot=0, ax=axes[counter-1, 1])

        # predict
        slots = _predictSlots(best_model, best_degree, predhr)

        development = lotinfo["Development"][0]
        rates = getRates(lotinfo["Agency"][0], ID, predDate)

        print(str(counter)+ ". " + str(slots) + " carpark slots in " + development +" at "+ str(predhr) +" 00 HRS. Rates: " + rates)
        print("Best degree:", best_degree)
        print("Degree of fitness:", r2_train)
        print("Degree of fitness to test data:", r2_test)
        print()


def _predictSlots(model, degree, hour):
    """Return the model's prediction for *hour* (24h), rounded and never below 0."""
    query = pd.DataFrame([[hour*60]], columns = ['Time'])
    predicted = model.predict(polynomialTransform(degree, query))[0][0]
    return max(int(round(predicted)), 0)
        
def fitModel(degree, x, y):
    """Fit an ordinary least-squares model on the degree-*degree* polynomial
    expansion of *x* against *y* and return the fitted estimator."""
    features = polynomialTransform(degree, x)
    # LinearRegression.fit returns the estimator itself
    return LinearRegression().fit(features, y)

def polynomialTransform(degree, x):
    """Expand *x* into its polynomial feature matrix of the given degree."""
    return PolynomialFeatures(degree=degree).fit_transform(x)

#### Get Rates:
   - utilises other static datasets to obtain information about each carpark's rates

In [10]:
def getRates(agency, ID, predDate, predhr=None):
    """Return a human-readable parking rate for one carpark.

    Args:
        agency: "HDB", "LTA", or anything else (treated as URA).
        ID: the carpark identifier (CarParkID / URA ppCode).
        predDate: the date of interest as "d/m/yyyy"; used for LTA and URA.
        predhr: optional hour of interest, either an int (0-23) or an object
            with an ``hour`` attribute. Only used for HDB carparks in the
            central area, whose rate depends on the time of day.

    Returns:
        The rate text, or "unknown" when it cannot be determined (missing
        rate file, carpark not listed, bad date, or a central-area HDB
        carpark queried without *predhr*).
    """
    try:
        if agency == "HDB":
            within_central_area = ["ACB", "BBB", "BRB1", "CY", "DUXM", "HLM", "KAB", "KAM", "KAS", "PRM", "SLS", "SR1", "SR2", "TPM", "UCS", "WCB"]
            if ID in within_central_area:
                if predhr is None:
                    # The rate is time-dependent and no time was given.
                    return "unknown"
                hour = predhr.hour if hasattr(predhr, "hour") else int(predhr)
                # NOTE(review): threshold kept from the original condition
                # (hour > 17 -> $1.20); confirm against HDB's published
                # central-area schedule.
                if hour > 17:
                    rates = "$1.20 per half an hr"
                else:
                    rates = "$0.60 per half an hr"
            else:
                rates = "$0.60 per half an hr"
        elif agency == "LTA":
            LTAdf = pd.read_csv("./CarParkRates.csv")
            # weekday(): Monday=0 ... Saturday=5, Sunday=6. Saturday now uses
            # the Saturday column; Sunday keeps using it as before.
            if datetime.strptime(predDate, '%d/%m/%Y').weekday() < 5:
                column = "WeekDays_Rate_1"
            else:
                column = "Saturday_Rate"
            rates = LTAdf.loc[LTAdf['CarParkID'] == int(ID), column].values[0]
        else:
            # URA: Monday-Saturday use weekdayRate, Sunday uses sunPHRate
            URAdf = pd.read_csv("./URA.csv")
            if datetime.strptime(predDate, '%d/%m/%Y').weekday() < 6:
                column = "weekdayRate"
            else:
                column = "sunPHRate"
            rates = str(URAdf.loc[URAdf['ppCode'] == ID, column].values[0]) + " per half an hr"
    except Exception:
        # Deliberate best effort: any lookup failure degrades to "unknown"
        # without aborting the prediction; KeyboardInterrupt still propagates.
        rates = "unknown"
    return rates

#### Data Handling:
   - This is our main program that compiles everything that we have done.

In [11]:
def datahandling():
    """Run the interactive main menu.

    Option 1 uploads today's data, option 2 downloads any data missing from
    the local ./Data/ folder, and option 3 asks for a location, a date and an
    hour and then prints and plots the predicted available lots nearby.
    Each prompt repeats until it receives a valid value.
    """
    mainpage = '''Options:
    1. Upload today's data
    2. Update to latest data
    3. Search'''
    print(mainpage)

    action = _prompt_int("", 1, 3)

    if action == 1:
        # NOTE(review): upload_to_sheets is not defined in the visible part of
        # this file; confirm it exists before relying on this option.
        upload_to_sheets()

    elif action == 2:
        download_from_google_sheets()

    elif action == 3:
        coordinates = get_search_location()

        # Validate the date here so that a typo re-prompts, not crashes later
        # when getData parses it.
        while True:
            date = input("Enter the date in d/m/yyyy format: ").strip()
            try:
                datetime.strptime(date, '%d/%m/%Y')
            except ValueError:
                print("Invalid input!")
            else:
                break

        hour = _prompt_int("Enter the hour in 24h format (0 ~ 23): ", 0, 23)
        df = getData(coordinates, date, False)
        makePrediction(df, hour)


def _prompt_int(prompt, low, high):
    """Prompt until the user enters an integer between *low* and *high* inclusive."""
    while True:
        try:
            value = int(input(prompt))
        except ValueError:
            # Only a non-numeric entry is retried; Ctrl-C / EOF still abort.
            print("Invalid input!")
            continue
        if low <= value <= high:
            return value
        print("Invalid input!")