## Imports

In [None]:
#pip install geopandas

In [None]:
#pip install fuzzywuzzy

In [None]:
#pip install python-dotenv

In [1]:
import pandas as pd
import requests
import geopandas as gpd
from shapely.geometry import Point
import argparse
from fuzzywuzzy import process
import os
from dotenv import dotenv_values



In [None]:
# Function argparser
def argument_parser():
    # Create ArgumentParser with the app description
    parser = argparse.ArgumentParser(description = 'This app find the BiciMAD/BiciPARK station closest to a set of public\
    schools')
    # Create message to help to the users
    help_message = 'You have two options:\
    \n(1) str="All": to get the table for every "Place of interest" included in the dataset (or a set of them).\
    \n(2) str=school_name: to get the table for a specific "public school" imputed by the user.'  
    # Use '-p' as a flag to select opcion 1 or 2
    parser.add_argument('-p', '--parameter', help=help_message, type=str)
    # Obtain argument
    args = parser.parse_args()
    return args

## Functions

#### Cleaning functions

In [None]:
# FUNCTION 'transform_df'
def transform_df(df):
    # Function to transform and clean the dataframe import from csv.
    
    # Extract the column, delete '[' and ']'. Split the string using ',' and convert to float. Store this data in two
    # columns: longitude and latitude. Add this two columns to the original dataframe
    temp_df = df['geometry.coordinates'].str.strip('[]').str.split(',', expand=True).astype('float64')
    temp_df.columns = ['longitude', 'latitude']
    df= pd.concat([df,temp_df],axis=1)
    
    # Delete column 'geometry.coordinates' and 'Unnamed: 0' columns
    df = df.drop(['Unnamed: 0', 'geometry.coordinates'], axis=1)
    
    # Change the name of 'geometry.type' column becase include '.' in the name, and it could be a potential error
    df = df.rename(columns={'geometry.type':'geometry_type'})
    
    # In case the column names were e.g. 'stationId', extract each column name, if includes 'station', repleace that for 
    # ' '. And change the string to lowercase
    columns = df.columns.tolist()
    new_column_names = [column_name.replace('station', '').lower() for column_name in columns]
    df.columns = new_column_names
    
    return df


# FUNCTION 'extract_dict2df'
def extract_dict2df(df):
    # Function to extract the dictionaries that are included in the cells of some of the columns. Create a new column for each 
    # key and store in it the corresponding values. Finally delete the original columns where the dictionaries are located.
    # With this function it is possible to extract the dictionaries that are in different columns, regardless of the number of 
    # columns that have dictionaries or the number of items in each of them.
    
    column_names = df.columns.values   # Store the column names in a list called 'column_names'

    # In this loop, iterate over the columns of the DataFrame
    for col_name in column_names:
        # Check if the first cell type is a dictionary and, in this case, check if it includes more than 1 items
        if isinstance(df.at[0, col_name], dict) and len(df.at[0, col_name])>1:
            # Extract the keys from the first dictionary found. Use '.at' to get a single value from the DataFrame.
            keys = list(df.at[0, col_name].keys())
 
            # Iterar sobre las claves y agregar nuevas columnas al DataFrame
            for key in keys:
                new_col_name = f"{col_name}_{key}"  # Nombre de la nueva columna
                df[new_col_name] = df[col_name].apply(lambda x: x.get(key))

            # Delete the previous column with the dictionaries inside each cell
            df = df.drop(columns=[col_name])
        
    return df

#### Geo-calculation functions

In [None]:
def to_mercator(lat, long):
    # transform latitude/longitude data in degrees to pseudo-mercator coordinates in metres
    c = gpd.GeoSeries([Point(lat, long)], crs=4326)
    c = c.to_crs(3857)
    return c

def distance_meters(lat_start, long_start, lat_finish, long_finish):
    # return the distance in metres between to latitude/longitude pair points in degrees 
    # (e.g.: Start Point -> 40.4400607 / -3.6425358 End Point -> 40.4234825 / -3.6292625)
    start = to_mercator(lat_start, long_start)
    finish = to_mercator(lat_finish, long_finish)
    return start.distance(finish)

#### API EMT

In [2]:
def login_emt(BASE_URL):
    """Summary: function to do login in emt mobility web

    Args:
        BASE_URL (string): string with the base url of the emt web

    Returns:
        accessToken (string): key necessary to extract the updated bicimad data
    """
    # Extract the user data necessary to login
    config = dotenv_values('../.env')
    email_user = config.get('CLIENT_ID')   # Extract CLIENT ID from .env file.
    password = config['CLIENT_SECRET']   # Extract CLIENT SECRET from .env file.

    # Built the endpoint and header, make the get operation and extract the token.
    ENDPOINT_LOGIN = "v1/mobilitylabs/user/login/"   # Part of the web adress to login.
    url_login = BASE_URL + ENDPOINT_LOGIN   # Build the endpoint to login.
    headers_longin = {"email": email_user, "password": password}   # Create the headerns needed to include in the get operation.
    kwargs = {"url": url_login, "headers": headers_longin, "timeout": 10}   # Create the arguments to do the get.
    response_emt_login = requests.get(**kwargs)   # Operation get.
    response_emt_login = response_emt_login.json()   # Transform the data to json.
    print(response_emt_login['code'])
    print('Token dentro de la función: ', accessToken)
    
    # If the response code is '00' means that the login operation is correct.
    if (response_emt_login['code'] == '00') or (response_emt_login['code'] == '01'):
        accessToken = response_emt_login['data'][0]['accessToken']   # Extract the accessToken from the json.
        os.putenv("ACCESS_TOKEN", accessToken)   # Store the token in '.env' file
        return accessToken
    else:
        print('Error in the comunication with the emt web')  


def extract_bicimad_data_emt(BASE_URL, accessToken):
    """Summary: function to extract updated bicimad data

    Args:
        BASE_URL (string): url base of the emt web
        accessToken (string): key necessary to extract the updated bicimad data

    Returns:
        bicimad_data (dictionary): bicimad data
    """
    # Extract the token necessary to extract data.
    config = dotenv_values('../.env')
    accessToken = config.get('ACCESS_TOKEN')   # Extract TOKEN from .env file.

    # Built the endpoint and header, make the get operation and extract the information related to bicimad station.
    ENDPOINT_STATIONS = "v1/transport/bicimad/stations/"   # Part of the web adress to extract bicimad data.
    url_stations = BASE_URL + ENDPOINT_STATIONS   # Build the endpoint to login.
    headers = {"accessToken": accessToken}   # Create the headerns needed to include in the get operation.
    kwargs = {"url": url_stations, "headers": headers, "timeout": 10}   # Create the arguments to do the get.
    response_emt_station = requests.get(**kwargs)   # Operation get.
    response_emt_station = response_emt_station.json()   # Transform the data to json.

    return response_emt_station


def process_json(json_data):
    # The dictionary has two keys: '@context' and '@graph'. And the interesting data are in the value of the second key where
    # other dictionaries are included. Extract both keys in a list called 'keys' -> json_data["@graph"] = json_data[keys[1]].
    keys=list(json_data.keys())
    # Create the dataframe with the data stored in '@graph'. This way, if the name of the dictionary change, it will still work.
    df = pd.DataFrame(json_data[keys[1]])
    return df


def import_update_json():
    """Summary: function to import the data from web. This funcion uses 

    Returns:
        _type_: _description_
    """
    # Extract the token necessary to extract data.
    config = dotenv_values('.env')
    accessToken = config.get('ACCESS_TOKEN')   # Extract TOKEN from .env file.
    print('Token al principio del todo: ', accessToken)

    BASE_URL = "https://openapi.emtmadrid.es/"   # Base url of the web.
    json__response = extract_bicimad_data_emt(BASE_URL, accessToken)
    
    # Check if the access token is stil valid. If the token is expired, excute the login function again and create new acess token. With that, 
    # the login operation only is executed when the token is expired. 
    if (json__response['code'] != '00') or  (json__response['description'] == 'Error, token not found in cache'):
        print('The token stored is expired. The program will be login again and create new access token')
        accessToken = login_emt(BASE_URL)
        print('Token después de generarlo de nuevo: ', accessToken)
        # Execute the function to extract the updated bicimad data again with the new token
        json__response_data =  extract_bicimad_data_emt(BASE_URL, accessToken)
        print(json__response_data)
    
    json_data = json__response['data'][0]   # Extract the bicimad data from the json.
    df = process_json(json_data)
    return df

In [3]:
import_update_json()

Token al principio del todo:  None
The token stored is expired. The program will be login again and create new access token
01


UnboundLocalError: local variable 'accessToken' referenced before assignment

## Acquisition and wrangling: import, clean and prepare data

#### Import and clean bicimap.csv

In [None]:
# Read the csv and store the data in a dataframe
bicimad_df = pd.read_csv("../data/raw/bicimad_stations.csv", sep='\t')
# Clean and transform the dataframe
bicimad_df = transform_df(bicimad_df)
# Remove the number (e.g. '1a - , 1b - ....') from the name of each bicimap station
bicimad_df['name'] = bicimad_df['name'].apply(lambda row: row.split(' - ')[1])
bicimad_df.head()

In [None]:
bicimad_df.to_csv(f"../data/processed/bicimad.csv", index=False)

#### Import and clean bicipark.csv

In [None]:
# Read the csv and store the data in a dataframe
bicipark_df = pd.read_csv("../data/raw/bicipark_stations.csv", sep=';')
# Clean and transform the dataframe
bicipark_df = transform_df(bicipark_df)
# Remove the string 'bicipark ' from the name of each bicipark station
bicipark_df['name'] = bicipark_df['name'].apply(lambda row: row.split('Bicipark ')[1])
bicipark_df.head()

In [None]:
bicipark_df.to_csv(f"../data/processed/bicipark.csv", index=False)

#### Fix and prepare biciMAD and BiciPark dataframes

In [None]:
# Create a new dataframe with columns that these will be use in the final app. 
# Extract some columns from bicimad.df in a new dataframe
new_column_names = {'name': 'station_name', 'address': 'station_location', 'latitude': 'latitude', 'longitude': 'longitude'}
bicimad_stations_df = bicimad_df[list(new_column_names.keys())].rename(columns=new_column_names)
bicimad_stations_df['station_type'] = 'BiciMAD'
bicimad_stations_df.head()

In [None]:
# Extract some columns from bicipark.df in a new dataframe
new_column_names = {'name': 'station_name', 'address': 'station_location', 'latitude': 'latitude', 'longitude': 'longitude'}
bicipark_stations_df = bicipark_df[list(new_column_names.keys())].rename(columns=new_column_names)
bicipark_stations_df['station_type'] = 'BiciPARK'
bicipark_stations_df.head()

In [None]:
stations_df = pd.concat([bicimad_stations_df, bicipark_stations_df])
stations_df

#### Import and clean json

In [None]:
url = 'https://datos.madrid.es/egob/catalogo/202311-0-colegios-publicos.json'
response = requests.get(url)

In [None]:
# Obtain json data
json_data = response.json()

# The dictionary has two keys: '@context' and '@graph'. And the interesting data are in the value of the second key where
# other dictionaries are included. Extract both keys in a list called 'keys' -> json_data["@graph"] = json_data[keys[1]]
keys=list(json_data.keys())

# Create the dataframe with the data stored in '@graph'. This way, if the name of the dictionary change, it will still work.
public_schools_df = pd.DataFrame(json_data[keys[1]])

# Use the 'extract_dict2df' function to extract the diccionaries included in some columns and create new columns with them.
public_schools_df = extract_dict2df(public_schools_df) 
public_schools_df['organization_organization-desc'][3]   
        

In [None]:
# Extract some columns from bicipark.df in a new dataframe
new_column_names = {'title': 'school_name', 'address_street-address': 'school_location', 
                    'location_latitude': 'latitude', 'location_longitude': 'longitude'}
schools_df = public_schools_df[list(new_column_names.keys())].rename(columns=new_column_names)
schools_df['place_type'] = 'Colegios públicos'
schools_df.head()

## Analysis: calculations

#### Geo-calculation

In [None]:
%%time

# Use a dataframe fragment to test the code
schools_test = schools_df#.iloc[0:2]
bicimad_test = stations_df#.iloc[0:10]
# Filter the dataframe and extract only the rows related to bicimad
#bicimad_test = stations_df.loc[stations_df['station_type'] == 'BiciMAD'].iloc[0:10]

# Merge public schools and bicimad/bicipark dataframe. Before the merge, create a new column called 'key' which value will
# be '1'. Merge using this column and use drop to remove this column in the merged dataframe. This new dataframe will be 
# a dataframe in which 
merge_df = pd.merge(schools_test.assign(key=1), bicimad_test.assign(key=1), on='key').drop('key', axis=1)
# Obtain the distance
merge_df['distance'] = merge_df.apply(lambda row: distance_meters(row['latitude_x'], row['longitude_x'],
                                                             row['latitude_y'], row['longitude_y']), axis=1)
merge_df.head()

In [None]:
# Store the dataframe in '.csv' and this way I won't have to wait for the distance calculations to be performed if I want 
# to work with the dataframe
merge_df.to_csv("../data/processed/distance_calculated.csv", index=False)


In [None]:
# Import the data to work with them
merge_df = pd.read_csv("../data/processed/distance_calculated.csv", sep=',')

In [None]:
# Short the dataframe for each school from minimum to maximum of the distance from each station to the school that 
# corresponds to it. Reset index and remove the new column index created
merge_short_df = merge_df.sort_values(by=['school_name', 'distance']).reset_index().drop('index', axis=1)
# To extract only the biciMAD items, it's neccesary to apply a filter
bicimad_filter = merge_short_df['station_type'] == 'BiciMAD'
# Obtain the resulting dataframe with the school and bicimad station with minimum distance. As the dataframe is already 
# sorted, with the distance values from smallest to largest, only the first value for each school needs to be extracted. 
# To do this, it's neccesary to apply the filter calculated above.
minimum_df = merge_short_df[bicimad_filter].groupby('school_name').head(1)
minimum_df

In [None]:
# Change column names and select the desired columns to adapt the result to the objective
# Create a dictionary with the old and new column names
new_columns_names = {'school_name': 'Place of interest',
                     'place_type': 'Type of place',
                     'school_location': 'Place address',
                     'station_name': 'BiciMAD station',
                     'station_location': 'Station location'}
# Extract the interested columns and rename them.
result_df = minimum_df[list(new_columns_names.keys())].rename(columns=new_columns_names).reset_index(drop=True)
result_df

## Store the data

In [None]:
# Store the results in a new '.csv' file
result_df.to_csv("../data/result/result.csv", index=False)

In [None]:
def show_one_school(df, station_type, school_name):
    # Short dataframe and extract the columns interesting to the goal
    #df = short_store_data(df, station_type)

    # Create a filter with the rows that includes the specific lab
    filter_df = df['Place of interest'] == school_name
    # Evaluate if at least one element in condition is True. If True, it means that there is at least one row that meets the condition. If not
    # the return is a error message
    if filter_df.any():
        return df[filter_df]
    else:
        return 'Error: the name of the lab you typed was not found'

In [None]:
test = show_one_school(result_df, 'bicimad', 'Colegio Público Adolfo Suárez')
test

## Bonus 1

In [None]:
# This bonus was done in '.py'

## Bonus 2

In [None]:
def find_nearest_bicimad(df, name_school):
    # With this function it is possible to get the best match for 'school_name' in the different school names
    best_match = process.extractOne(name_school, result_df['Place of interest'])
    best_match = process.extractOne(school_name, df['Place of interest'])
    print(best_match)

    # If the coincidence is higher than 80%, show the BiciMAD station
    if best_match[1] >= 80:  
        # Extract the row of the chosen school
        choice_school = df.loc[result_df['Place of interest'] == best_match[0]]
        # Extract the value bicimad station name
        bicimad_nearest = choice_school['BiciMAD station'].values
        # Show the result
        return f"The nearest BiciMAD station to the school {name_school} is {bicimad_nearest}."
    else:
        return "No close match was found for the school name."
    
result_str = find_nearest_bicimad(result_df, 'adolfo suarez')
result_str

In [None]:
df = result_df
best_match = process.extractOne('adolfo suarez', df['Place of interest'])
if best_match[1] >= 80:
    filter_df = df['Place of interest'] == best_match[0]
    result_df = df[filter_df]

result_all = result_df
result_df['BiciMAD station'][0]

filter_station = bicimad_df['name'] == result_df['BiciMAD station'][0]
result_station_df =bicimad_df[filter_station]
result_station_df.head()

# Create list with only the interesting columns
interesting_columns = ['total_bases', 'dock_bikes', 'free_bases']
# Create list with the new names to rename the columns
new_columns_names = ['Total bases', 'Dock bikes', 'Free bases']

# Extract the free bases from the dataframe and include that with result_df
data_to_insert = list(result_station_df[interesting_columns].values)
print(data_to_insert)
# Store this data in the dataframe with the other information
result_all[interesting_columns] = data_to_insert

result_all

## Bonus 3

In [None]:
df_data = import_update_json()