<a href="https://colab.research.google.com/github/Dante7/ODD20_semovi/blob/master/read_from_web.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Instalación de dependencias

En este caso vamos a usar geopandas para poder cargar los archivos con formato *.shp*

In [1]:
!pip install geopandas --quiet
!pip install shapely --quiet

[K     |████████████████████████████████| 1.0 MB 5.0 MB/s 
[K     |████████████████████████████████| 6.3 MB 48.1 MB/s 
[K     |████████████████████████████████| 16.7 MB 183 kB/s 
[?25h

## Importación de dependencias

Para este caso usaremos las librerías necesarias para hacer las peticiones (requests) a las páginas, montar el almacenamiento en Drive, crear el dataset y modificar las carpetas de Drive

In [2]:
from bs4 import BeautifulSoup
from google.colab import drive
from zipfile import ZipFile
from shapely.geometry import Point, LineString

import geopandas as gpd
import json
import os
import pandas as pd
import multiprocessing as mp
import requests

In [3]:
from firebase_admin import initialize_app, delete_app, get_app
from firebase_admin import credentials
from firebase_admin import firestore

In [4]:
# Turn on Colab's custom widget manager for this session.
# NOTE(review): no widget is created in the cells visible here — confirm this is still needed.
from google.colab import output
output.enable_custom_widget_manager()

## Definición de funciones adicionales


In [5]:
def get_links(url, label, css_class=None):
  """Collect the ``href`` of every ``label`` tag found on the page at ``url``.

  Args:
    url: Address of the HTML page to download.
    label: Tag name to search for (e.g. ``'a'``).
    css_class: Optional CSS class the tag must carry; ``None`` matches any class.

  Returns:
    List of ``href`` values in document order. Tags that have no ``href``
    attribute are skipped instead of raising ``KeyError``.

  Raises:
    requests.HTTPError: If the server answers with a 4xx/5xx status.
    requests.Timeout: If the server does not answer in time.
  """
  # A timeout keeps the cell from hanging forever on an unresponsive server.
  page = requests.get(url, timeout=30)
  # Fail loudly instead of scraping an error page and silently returning no links.
  page.raise_for_status()
  soup = BeautifulSoup(page.content, 'html.parser')
  css_condition = {"class": css_class} if css_class else {}
  tags = soup.find_all(label, css_condition)
  # .get returns None when the attribute is missing, so such tags are dropped.
  return [tag.get('href') for tag in tags if tag.get('href') is not None]

def save_files(links, path):
  """Download every URL in ``links`` into the directory ``path``.

  Each file is stored under the last ``/``-separated segment of its URL.

  Args:
    links: Iterable of URLs to download.
    path: Destination directory; created together with any missing parent
      folders. A trailing separator is optional.

  Raises:
    requests.HTTPError: If a download answers with a 4xx/5xx status.
  """
  # makedirs also creates missing parents and tolerates an existing directory.
  os.makedirs(path, exist_ok=True)
  for link in links:
    file_name = link.rsplit('/', 1)[-1]
    # os.path.join gives the same result whether or not ``path`` ends with a separator.
    target = os.path.join(path, file_name)
    # The context manager releases the streamed connection even if writing fails.
    with requests.get(link, stream=True, timeout=30) as response:
      # Do not persist an HTML error page under the archive's name.
      response.raise_for_status()
      with open(target, "wb") as out_file:
        for chunk in response.iter_content(chunk_size=1024):
          if chunk:
            out_file.write(chunk)

def delete_collection(coll_ref):
    """Delete every document of a collection, logging each one first.

    Args:
        coll_ref: Collection reference whose ``stream()`` yields document
            snapshots exposing ``id``, ``to_dict()`` and ``reference``.
    """
    for snapshot in coll_ref.stream():
        doc_id = snapshot.id
        payload = snapshot.to_dict()
        print(f'Deleting doc {doc_id} => {payload}')
        snapshot.reference.delete()


## Definición de almacenamiento

Montaje de la carpeta de Google Drive y definición de las rutas de almacenamiento

In [6]:
# Mount Google Drive at /gdrive so the folders below are reachable from the notebook.
drive.mount('/gdrive')
# Base folder for the datasets on Drive.
root_path = '/gdrive/My Drive/datasets/'
# Folder the download cells write to; the trailing slash matters because
# later cells build file names with plain string concatenation (path + "...").
path = root_path + 'cetram/'

Mounted at /gdrive


## Obtención de datos de CETRAM

En esta parte se hace scraping de la página donde se encuentran los datos para posteriormente almacenarlos en Drive

In [None]:
# Scrape the CETRAM dataset page for resource download links and store the files on Drive.
# HTTPS is used (the GTFS download already uses it for this host) so the page
# and the links scraped from it cannot be altered in transit.
cetram_url = "https://datos.cdmx.gob.mx/dataset/ubicacion-de-centros-de-transferencia-modal-cetram"
# 'resource-url-analytics' is the CSS class used to select the download anchors.
cetram_links = get_links(cetram_url, 'a', 'resource-url-analytics')
save_files(cetram_links, path)

## Obtención de datos de GTFS

En esta parte se hace scraping de la página donde se encuentran los datos para posteriormente almacenarlos en Drive

In [None]:
# Scrape the GTFS dataset page for resource download links and store the files on Drive.
gtfs_url = 'https://datos.cdmx.gob.mx/dataset/gtfs'
# 'resource-url-analytics' is the CSS class used to select the download anchors.
gtfs_links = get_links(gtfs_url, 'a', 'resource-url-analytics')
# NOTE(review): the files are written to `path`, the same folder used for the CETRAM
# downloads; the later ZipFile(path + "gtfs.zip") read depends on that, so change both together.
save_files(gtfs_links, path)

## Generación de dataframe para datos de CETRAM

Se extraen los datos y se genera el dataframe correspondiente

In [None]:
# Unpack the downloaded CETRAM archive into the local working directory.
# The context manager closes the zip handle as soon as extraction finishes.
with ZipFile(path + "cetram_shp.zip", 'r') as shp:
  shp.extractall('shp/')

# Load the extracted shapefile as a GeoDataFrame.
gdf_cetram = gpd.read_file('shp/cetram_shp/CETRAM.shp')

## Generación de dataframes para los datos de GTFS

In [None]:
# Unpack the downloaded GTFS feed; the context manager closes the archive after extraction.
with ZipFile(path + "gtfs.zip", 'r') as gtfs:
  gtfs.extractall('gtfs/')

# One dataframe per GTFS table.
df_agency = pd.read_csv('gtfs/agency.txt')
df_stop_times = pd.read_csv('gtfs/stop_times.txt')
df_routes = pd.read_csv('gtfs/routes.txt')
df_stops = pd.read_csv('gtfs/stops.txt')
df_trips = pd.read_csv('gtfs/trips.txt')

df_calendar = pd.read_csv('gtfs/calendar.txt')
df_shapes = pd.read_csv('gtfs/shapes.txt')
df_frequencies = pd.read_csv('gtfs/frequencies.txt')

In [None]:
# Preview the first rows of the trips table to check its columns.
df_trips.head()

Unnamed: 0,trip_id,route_id,service_id,trip_headsign,trip_short_name,direction_id,shape_id
0,010001C000_1,CMX01001C,0,Canal de Chalco - Barranca del Muerto,C.Base Periférico Canal de Chalco to Base Barr...,1,SH01001C000_1
1,010001C000_0,CMX01001C,0,Barranca del Muerto - Canal de Chalco,C.Base Barranca del Muerto to Base Periférico ...,0,SH01001C000_0
2,010001B000_1,CMX01001B,0,Canal de Chalco - Tacubaya,B.Base Periférico Canal de Chalco to Base Tacu...,1,SH01001B000_1
3,010001B000_0,CMX01001B,0,Tacubaya - Canal de Chalco,B.Base Tacubaya to Base Periférico Canal de Ch...,0,SH01001B000_0
4,010001A000_1,CMX01001A,0,Canal de Chalco - Cuatro Caminos,A.Base Canal de Chalco to Base Cuatro Caminos,1,SH01001A000_1


agency > routes > trips > stop_times < stops 

shapes < trips > frequencies

calendar es solo de metro

In [None]:
# --- Stops per route: stops -> stop_times -> trips ---
# Left joins keep stops without stop_times or trips; their missing values become ''.
df_stops_times = pd.merge(df_stops, df_stop_times, how='left', on='stop_id').fillna(value='')
df_stops_complete = pd.merge(df_stops_times, df_trips, how='left', on='trip_id').fillna(value='')
# One row per distinct (route, stop) combination.
df_stops_routes = df_stops_complete[['route_id', 'stop_id','stop_name', 'stop_lat', 'stop_lon', 'wheelchair_boarding']].drop_duplicates()

# --- Trip geometry and schedule: trips -> shapes -> frequencies ---
# Inner joins keep only trips that have both shape points and a frequency entry.
df_trips_shapes = pd.merge(df_trips, df_shapes, how='inner', on='shape_id')
df_trips_schedule = pd.merge(df_trips_shapes, df_frequencies, how='inner', on='trip_id')

# Shape coordinates as text, rounded to 6 decimals.
shape_lat_text = df_trips_schedule['shape_pt_lat'].round(6).astype(str)
shape_lon_text = df_trips_schedule['shape_pt_lon'].round(6).astype(str)

# .assign returns a new frame, so the extra columns are not written onto a
# slice of df_trips_schedule (which triggers SettingWithCopyWarning).
df_trips_complete = df_trips_schedule[['trip_id', 'route_id', 'trip_headsign', 'trip_short_name', 'direction_id', 'shape_id', 'shape_dist_traveled', 'end_time', 'start_time', 'shape_pt_lat', 'shape_pt_lon']].assign(
    lat=shape_lat_text,
    lon=shape_lon_text,
    # "lat,lon" text per shape point, built with vectorised string concatenation
    # instead of a Python-level join on every row.
    point=shape_lat_text + ',' + shape_lon_text,
)

# Join with agency
df_route_agency = pd.merge(df_routes, df_agency, how='left', on='agency_id')

In [None]:
# Reuse the default Firebase app when this cell is re-run; create it on the first run.
try:
  app = get_app()
except ValueError:
  # Reached when get_app() finds no default app to return.
  cred = credentials.Certificate("/gdrive/My Drive/credentials/public-transport-cdmx-firebase-adminsdk-bxcsj-0263214255.json")
  # Keep the returned handle so `app` is defined on both paths.
  app = initialize_app(cred)

# Firestore client and the three target collections.
db = firestore.client()
doc_routes = db.collection('gtfs_agency_routes')
doc_stops = db.collection('gtfs_routes_stops')
doc_trips = db.collection('gtfs_routes_trips')

In [None]:
# Destructive: removes every document in 'gtfs_agency_routes'
# (presumably so that re-running the upload does not duplicate documents — confirm).
delete_collection(doc_routes)

In [None]:
def _agency_routes(agency_rows):
  """Pack the routes of one agency group into a single ``routes`` cell."""
  route_records = agency_rows[['route_id','route_long_name', 'route_color']].to_dict('records')
  return pd.Series({"routes": route_records})

# One record per (agency_id, agency_name, agency_lang) with its list of routes.
routes_by_agency = (
    df_route_agency
    .groupby(['agency_id','agency_name','agency_lang'])
    .apply(_agency_routes)
    .reset_index()
)
# Serialise to JSON text; parsing it back below yields plain Python dicts and lists.
routes_json = routes_by_agency.to_json(orient='records')

# Add one document per agency; the list of results is the cell's output.
[doc_routes.add(agency_doc) for agency_doc in json.loads(routes_json)]

In [None]:
# Destructive: removes every document in 'gtfs_routes_stops'
# (presumably so that re-running the upload does not duplicate documents — confirm).
delete_collection(doc_stops)

In [None]:
def _route_stops(route_rows):
  """Pack the stops of one route group into a single ``stops`` cell."""
  stop_records = route_rows[['stop_id', 'stop_name', 'stop_lat', 'stop_lon', 'wheelchair_boarding']].to_dict('records')
  return pd.Series({"stops": stop_records})

# One record per route_id with its list of stops.
stops_by_route = (
    df_stops_routes
    .groupby(['route_id'])
    .apply(_route_stops)
    .reset_index()
)
# Serialise to JSON text; parsing it back below yields plain Python dicts and lists.
stops_json = stops_by_route.to_json(orient='records')

# Add one document per route; the list of results is the cell's output.
[doc_stops.add(route_doc) for route_doc in json.loads(stops_json)]

In [None]:
# Destructive: removes every document in 'gtfs_routes_trips'.
# NOTE(review): the upload that would refill this collection is commented out
# in the next cell, so running this leaves the collection empty.
delete_collection(doc_trips)

In [None]:
# Build one document per route holding its trips; each trip carries its
# polyline ("lat,lon" strings) and its length.
# NOTE(review): 'line' keeps the row order of df_trips_complete; shape_pt_sequence is
# not among its columns, so point order relies on shapes.txt being pre-sorted — confirm.
trips_json = (df_trips_complete.groupby(['trip_id', 'route_id', 'trip_headsign', 'trip_short_name', 'direction_id', 'shape_id', 'end_time', 'start_time'])
        .apply(lambda x: pd.Series({
          "line": x['point'].tolist(),
          # GTFS defines shape_dist_traveled as the cumulative distance from the first
          # shape point, so the trip length is its largest value; summing the running
          # totals overstates it. With no distances at all this yields null rather than 0.
          "distance": x['shape_dist_traveled'].max()
        }))
        .reset_index()
        .groupby(['route_id'])
        .apply(lambda x: pd.Series({
            "trips": x[['trip_id', 'trip_headsign', 'trip_short_name', 'direction_id', 'shape_id', 'start_time', 'end_time', 'line', 'distance']].to_dict('records')
        }))
        .reset_index()
        .to_json(orient='records')
)

# Inspect the generated documents before publishing them.
d = json.loads(trips_json)
for trip_doc in d:
  print(trip_doc)

# Upload is currently disabled; to publish the documents run:
# list(map(lambda x: doc_trips.add(x), json.loads(trips_json)))