In [None]:
import requests
import json
import pandas as pd

# Metro station data import and conversion to tabular

In [None]:
# Metro station data import
# GeoJSON is defined as UTF-8 text; naming the encoding explicitly keeps any
# non-ASCII characters decoded the same way regardless of the platform locale.
with open('/content/drive/MyDrive/metro.geojson', encoding='utf-8') as f:
    data = json.load(f)

# conversion to tabular: one row per entry of data["features"], with nested
# keys flattened into dotted column names (e.g. "geometry.coordinates")
from pandas import json_normalize
metro_df = json_normalize(data["features"])

# split coordinate into longitude and latitude
# (element 0 of each coordinate pair is stored as "long", element 1 as "lat");
# iterating the column directly avoids depending on the index labels being 0..n-1
long = [point[0] for point in metro_df["geometry.coordinates"]]
lat = [point[1] for point in metro_df["geometry.coordinates"]]
metro_df["long"] = long
metro_df["lat"] = lat

In [None]:
# feature selection, cleaning and renaming
feature = ['properties.name','properties.description','long','lat']
# .copy() turns the column selection into an independent frame, so the
# assignment below writes to metro_df itself rather than to a temporary slice
# (the element-wise chained assignment is what raised SettingWithCopyWarning).
metro_df = metro_df[feature].copy()

# Remove the literal 'Area STIBM: ' prefix from every description in a single
# vectorised call. regex=False treats the pattern as plain text; missing
# descriptions stay missing, so no explicit NaN test (and no NaN import, which
# is unavailable under that name in NumPy 2.x) is required.
metro_df['properties.description'] = metro_df['properties.description'].str.replace('Area STIBM: ', '', regex=False)

# rename returns a new frame; rebinding the name avoids in-place mutation
metro_df = metro_df.rename(columns={"properties.name":"name","properties.description":"Area STIBM"})

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  metro_df['properties.description'][i] = metro_df['properties.description'][i].replace('Area STIBM: ', '')


In [None]:
# Display the cleaned metro station table (columns: name, Area STIBM, long, lat)
metro_df

Unnamed: 0,name,Area STIBM,long,lat
0,Gorgonzola,Mi5,9.403618,45.536545
1,Cologno Nord,Mi3,9.291136,45.534292
2,Cologno Centro,Mi3,9.282962,45.527505
3,Rho Fieramilano,Mi3,9.087378,45.519782
4,Dergano,Mi1,9.179567,45.505544
...,...,...,...,...
114,Marche,Mi1,9.194969,45.496352
115,Isola,Mi1,9.191294,45.487612
116,Dateo,,9.218447,45.468001
117,Argonne,,9.231908,45.468083


# Train station data import and conversion to tabular

In [None]:
# Train station data import
# GeoJSON is defined as UTF-8 text; naming the encoding explicitly keeps any
# non-ASCII characters decoded the same way regardless of the platform locale.
with open('/content/drive/MyDrive/treno.geojson', encoding='utf-8') as f:
    data = json.load(f)

# conversion to tabular: one row per entry of data["features"], with nested
# keys flattened into dotted column names (e.g. "geometry.coordinates")
from pandas import json_normalize
train_df = json_normalize(data["features"])

# split coordinate into longitude and latitude
# (element 0 of each coordinate pair is stored as "long", element 1 as "lat");
# iterating the column directly avoids depending on the index labels being 0..n-1
long = [point[0] for point in train_df["geometry.coordinates"]]
lat = [point[1] for point in train_df["geometry.coordinates"]]
train_df["long"] = long
train_df["lat"] = lat

In [None]:
# feature selection, cleaning and renaming
feature = ['properties.name','properties.description','long','lat']
# .copy() turns the column selection into an independent frame, so the
# assignment below writes to train_df itself rather than to a temporary slice
# (the element-wise chained assignment is what raised SettingWithCopyWarning).
train_df = train_df[feature].copy()

# Remove the literal 'Area STIBM: ' prefix from every description in a single
# vectorised call. regex=False treats the pattern as plain text; missing
# descriptions stay missing, so no explicit NaN test (and no NaN import, which
# is unavailable under that name in NumPy 2.x) is required.
train_df['properties.description'] = train_df['properties.description'].str.replace('Area STIBM: ', '', regex=False)

# rename returns a new frame; rebinding the name avoids in-place mutation
train_df = train_df.rename(columns={"properties.name":"name","properties.description":"Area STIBM"})

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df['properties.description'][i] = train_df['properties.description'][i].replace('Area STIBM: ', '')


In [None]:
# Display the cleaned train station table (columns: name, Area STIBM, long, lat)
train_df

Unnamed: 0,name,Area STIBM,long,lat
0,Milano Cadorna,Mi1,9.175523,45.468405
1,Milano Porta Garibaldi (superficie),Mi1,9.187309,45.484530
2,Milano San Cristoforo,,9.130128,45.442302
3,Milano Greco Pirelli,Mi1,9.214188,45.512889
4,Rho,Mi4,9.043565,45.524102
...,...,...,...,...
60,Borgolombardo,Mi4,9.271834,45.403127
61,San Donato Milanese,Mi3,9.253189,45.418503
62,Segrate,Mi3,9.298554,45.480806
63,Milano Romolo,Mi1,9.167850,45.443268


# Train and metro join

In [None]:
# Tag every row with the kind of station it describes, so the origin of each
# row is still known once the two tables are concatenated.
train_df['station_type']='train'
metro_df['station_type']='metro'

In [None]:
# Stack train stations on top of metro stations in a single table;
# ignore_index=True renumbers the rows from 0 instead of keeping each source
# frame's own index (which would contain duplicates).
public_transport_df = pd.concat([train_df,metro_df], ignore_index=True)

In [None]:
# public_transport_df.to_csv('public_transport.csv')

# OSRM API

In [None]:
def OSMR(A, B, timeout=30):
  """Return the driving travel time and distance from A to B.

  The route is requested from the public OSRM server at
  router.project-osrm.org using the ``driving`` profile.

  Parameters
  ----------
  A, B : str
      Origin and destination, each formatted as "long,lat"
      (longitude first, then latitude).
  timeout : float, optional
      Seconds to wait for the server before giving up (default 30).
      Without a timeout a stalled connection would block the notebook
      indefinitely.

  Returns
  -------
  dict
      ``{'travel_time': duration, 'distance': distance}`` taken from the
      first leg of the first route in the response; travel time is in
      seconds and distance in meters.

  Raises
  ------
  requests.HTTPError
      If the server answers with an HTTP error status.
  requests.Timeout
      If no answer arrives within ``timeout`` seconds.
  ValueError
      If the response does not report success or contains no route.
  """
  api_url_root = "https://router.project-osrm.org/route/v1/driving/"
  params = "?steps=false&overview=false&annotations=false"
  api_url = api_url_root + A + ";" + B + params

  response = requests.get(api_url, timeout=timeout)
  # Surface HTTP failures here instead of as a confusing KeyError further down.
  response.raise_for_status()
  response_json = response.json()

  # The routing service reports its own status in the "code" field; anything
  # other than "Ok", or an empty route list, means there is nothing to read.
  if response_json.get("code") != "Ok" or not response_json.get("routes"):
    raise ValueError(
      "OSRM returned no route from " + A + " to " + B + ": "
      + str(response_json.get("code")) + " " + str(response_json.get("message", ""))
    )

  leg = response_json["routes"][0]["legs"][0]
  return {'travel_time': leg["duration"], 'distance': leg["distance"]}

In [None]:
# Example: route between two points given as "longitude,latitude" strings
ind1 = "13.388860,52.517037" # !!! longitude,latitude !!!
ind2 = "13.397634,52.529407" # !!! longitude,latitude !!!
OSMR(ind1,ind2)

{'travel_time': 307.5, 'distance': 2126}

## k-means
To reduce the time needed to route every address to the desired destination, k-means geo-clustering is performed on a per-town/hamlet (*frazione*) basis. The travel time and distance from each centroid to the destination are then used as a proxy for all the points in the corresponding cluster, which reduces both the number of API calls and the execution time.

In [None]:
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
def OSMR_address_cluster(address,B,n,random_state=None):
  """Cluster addresses with k-means and route each cluster center to B.

  Parameters
  ----------
  address : pandas.DataFrame
      Addresses to cluster; must contain the columns 'long' and 'lat'.
  B : str
      Destination, formatted as "long,lat" (passed unchanged to OSMR).
  n : number
      Reduction factor (must be > 0). The number of clusters is
      ``int(len(address)/n) + 1``, capped at the number of addresses.
  random_state : int or None, optional
      Seed forwarded to KMeans. The centroids are initialised randomly, so
      pass an integer to get the same clusters on every run. The default
      (None) leaves the initialisation unseeded.

  Returns
  -------
  dict
      'centers'         : array of cluster centers as (long, lat), in the
                          original (unscaled) coordinates.
      'centers_routing' : list with one OSMR result dict
                          ('travel_time', 'distance') per center.
      'lables'          : array with the cluster label of each address
                          (the key keeps its historical spelling).

  Raises
  ------
  ValueError
      If ``address`` has no rows.
  """
  n_addresses = address.shape[0]
  if n_addresses == 0:
    raise ValueError("address must contain at least one row")

  # int(n_addresses/n) + 1 exceeds the number of addresses when n <= 1 (for
  # example n = 1 gives n_addresses + 1), and KMeans refuses to fit more
  # clusters than samples, so cap k at the sample count.
  k = min(int(n_addresses/n) + 1, n_addresses)

  # Standardise both coordinates before clustering, then map the centers
  # back to real longitude/latitude for routing.
  scaler = StandardScaler()
  scaled_features = scaler.fit_transform(address[['long', 'lat']])
  kmeans = KMeans(init="random",n_clusters=k,n_init=10,max_iter=300,
                  random_state=random_state)
  kmeans.fit(scaled_features)
  centers = scaler.inverse_transform(kmeans.cluster_centers_)
  lables = kmeans.labels_

  # One routing request per center instead of one per address.
  centers_routing = []
  for center in centers:
    add = str(center[0]) + ',' + str(center[1])
    centers_routing.append(OSMR(add,B))

  return {'centers':centers,
          'centers_routing':centers_routing,'lables':lables}


In [None]:
# Example: cluster the metro stations (reduction factor 10) and route each
# cluster center to the destination (Bicocca), given as "long,lat"
OSMR_address_cluster(metro_df,'9.211132,45.513698',10)

{'centers': array([[ 9.1478062 , 45.40563215],
        [ 9.14539851, 45.47854222],
        [ 9.20020307, 45.48578543],
        [ 9.20938043, 45.43919234],
        [ 9.09972257, 45.50429778],
        [ 9.18303223, 45.46546632],
        [ 9.2383656 , 45.46555303],
        [ 9.38298324, 45.53001326],
        [ 9.22638819, 45.50142501],
        [ 9.27038606, 45.52527856],
        [ 9.19677009, 45.51805275],
        [ 9.13224175, 45.46088282]]),
 'centers_routing': [{'travel_time': 1575.7, 'distance': 16808.9},
  {'travel_time': 786.2, 'distance': 8253.9},
  {'travel_time': 390.4, 'distance': 3669.9},
  {'travel_time': 1116.1, 'distance': 10357.6},
  {'travel_time': 1107.4, 'distance': 12019.3},
  {'travel_time': 851.7, 'distance': 7927.9},
  {'travel_time': 930.2, 'distance': 8704},
  {'travel_time': 1686.3, 'distance': 23269.1},
  {'travel_time': 497.8, 'distance': 3640.8},
  {'travel_time': 682.5, 'distance': 7198},
  {'travel_time': 380.1, 'distance': 2328},
  {'travel_time': 1052.3, 'd

# Nearest point
Find the metro or train station nearest to a given point.

In [None]:
import geopy.distance
# long, lat: longitude and latitude of the house
def nearest_point(long, lat, stations=None):
  """Find the station closest to a point.

  Parameters
  ----------
  long, lat : float
      Longitude and latitude of the point (e.g. the house).
  stations : pandas.DataFrame, optional
      Table of candidate stations with 'long' and 'lat' columns. Defaults to
      the notebook-level ``metro_df``; pass ``train_df`` or the combined
      public transport table to search those instead.

  Returns
  -------
  dict
      'index'    : position (0-based row number, suitable for ``.iloc``) of
                   the nearest station in ``stations``.
      'distance' : geodesic distance to that station in kilometres, rounded
                   to one decimal.

  Raises
  ------
  ValueError
      If ``stations`` has no rows.
  """
  if stations is None:
    stations = metro_df

  # geopy reads coordinate tuples as (latitude, longitude). Passing
  # (longitude, latitude) measures between the wrong pair of points on the
  # globe and distorts every distance, so the order is swapped here.
  coords_1 = (lat, long)
  distance = [
    geopy.distance.geodesic(coords_1, (station_lat, station_long)).km
    for station_long, station_lat in zip(stations['long'], stations['lat'])
  ]

  nearest = min(range(len(distance)), key=distance.__getitem__)
  return {'index': nearest, 'distance': round(distance[nearest], 1)}

In [None]:
# destination = bicocca; arguments are given as longitude first, then latitude
nearest_point(9.211132,45.513698)

{'index': 111, 'distance': 0.6}

In [None]:
# Show the metro station at the row position returned by nearest_point above
metro_df.iloc[111]

name            Bicocca
Area STIBM          Mi1
long            9.20536
lat           45.514733
Name: 111, dtype: object

In [None]:
# destination = bicocca
# NOTE(review): the saved output of this cell (index 3) corresponds to a row
# of train_df, although nearest_point as written in this notebook searches
# metro_df. Presumably the function was edited to use train_df and re-run
# before this cell was executed; confirm, and make the station table explicit
# so the result is reproducible with Restart & Run All.
nearest_point(9.211132,45.513698)

{'index': 3, 'distance': 0.4}

In [None]:
# Show the train station at the row position reported by the previous cell
train_df.iloc[3]

name          Milano Greco Pirelli
Area STIBM                     Mi1
long                      9.214188
lat                      45.512889
Name: 3, dtype: object

# Address -> lat, long (Scraping immobiliare.it)

In [None]:
from bs4 import BeautifulSoup
import requests

In [None]:
url="https://www.immobiliare.it/annunci/103487334"

# Fail fast on a stalled connection or an HTTP error page instead of trying
# to parse it as a listing.
response = requests.get(url, timeout=30)
response.raise_for_status()
html_content = response.text
soup = BeautifulSoup(html_content, "lxml")

# The listing data is read from the JSON embedded in the page's
# <script type="application/json" id="__NEXT_DATA__"> tag.
next_data_tag = soup.find('script', type='application/json',id="__NEXT_DATA__")
if next_data_tag is None:
    # soup.find returns None when the tag is absent (e.g. a block page);
    # report that explicitly rather than failing with an AttributeError.
    raise ValueError("__NEXT_DATA__ script tag not found in " + url)
dati = json.loads(next_data_tag.text)

NameError: ignored

In [None]:
# Latitude of the first property in the listing's embedded JSON
dati['props']['pageProps']['detailData']['realEstate']['properties'][0]['location']['latitude']

45.4561

In [None]:
# Longitude of the first property in the listing's embedded JSON
dati['props']['pageProps']['detailData']['realEstate']['properties'][0]['location']['longitude']

9.24316

# Moovit

In [None]:
# Fetch a directions page and parse it with BeautifulSoup.
# NOTE(review): although this section is titled "Moovit", url1 points to a
# Google Maps directions page (Melzo -> Piazza della Scienza).
#url="https://realpython.com/beautiful-soup-web-scraper-python/#inspect-the-site-using-developer-tools"

url1="https://www.google.it/maps/dir/Melzo,+20066+Melzo+MI/Piazza+della+scienza,+Piazza+della+Scienza,+Milano,+MI/@45.5385547,9.2436457,12z/data=!3m1!4b1!4m14!4m13!1m5!1m1!1s0x4786ca65eb3c2ccf:0xfb63389ce7e834cb!2m2!1d9.4191211!2d45.5021544!1m5!1m1!1s0x4786c76eb68a7eeb:0x72255a7778c1c3c3!2m2!1d9.2113466!2d45.5136573!3e0?entry=ttu"
html_content = requests.get(url1).text
soup = BeautifulSoup(html_content, "lxml")

# Dumps the whole parsed document as plain text
print(soup)

## Find All <a> Tags
#print(soup.find(class_="duration"))

<!DOCTYPE html>
<html itemscope="" itemtype="http://schema.org/Place" lang="en"> <head> <link href="/maps/preview/opensearch.xml?hl=en" rel="search" title="Google Maps" type="application/opensearchdescription+xml"/> <title>  Google Maps  </title> <meta content=" Find local businesses, view maps and get driving directions in Google Maps. " name="Description"/> <meta content="initial-scale=1.0, maximum-scale=1.0, minimum-scale=1.0, user-scalable=no" name="viewport"/> <meta content="chrome=1" http-equiv="X-UA-Compatible"/> <meta content="notranslate" name="google"/> <meta content="origin" name="referrer"/> <meta content="ByHT0GXztW_RcGxS0o86DBf1WtNu02FfqlcT8njnSqU" name="google-site-verification"/> <meta content="Diln__r3p9-tt39P2Cl2Amvx6oFB4PATnxuFBaw6ej8" name="google-site-verification"/> <meta content="Melzo to Piazza della scienza" itemprop="name"/> <meta content="Melzo to Piazza della scienza" property="og:title"/> <meta content="https://maps.google.com/maps/api/staticmap?center=45.5

In [None]:
# NOTE(review): the saved output is a CloudFront "403 ERROR" page, whereas the
# previous cell printed Google Maps HTML for the same variable. Presumably
# `soup` had already been reassigned by a later request when this cell was
# run; the output is not reproducible in notebook order.
soup

<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">

<html><head><meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>
<title>ERROR: The request could not be satisfied</title>
</head><body>
<h1>403 ERROR</h1>
<h2>The request could not be satisfied.</h2>
<hr noshade="" size="1px"/>
Request blocked.
We can't connect to the server for this app or website at this time. There might be too much traffic or a configuration error. Try again later, or contact the app or website owner.
<br clear="all"/>
If you provide content to customers through CloudFront, you can find steps to troubleshoot and help prevent this error by reviewing the CloudFront documentation.
<br clear="all"/>
<hr noshade="" size="1px"/>
<pre>
Generated by cloudfront (CloudFront)
Request ID: WmDh_FHQu0AerPQJ7zbVm7Y5iy3L2hvgGsijTW0tsdY_6OjNngC_1w==
</pre>
<address>
</address>
</body></html>

In [None]:
# Request a Moovit page and parse it with BeautifulSoup.
# The fll value (45.536545_9.403618) matches the Gorgonzola row of metro_df
# written as latitude_longitude; presumably fll/tll are the trip's origin and
# destination (TODO confirm against the Moovit URL format).
url="https://moovitapp.com/milano_e_lombardia-223/poi/Piazza%20della%20Scienza%202/Piazza%20Leonardo%20da%20Vinci%20154/it?fll=45.536545_9.403618&tll=45.513862_9.211366"


# No status check is made: an error page is parsed just like a real result.
html_content = requests.get(url).text
soup = BeautifulSoup(html_content, "lxml")

In [None]:
# Inspect the parsed Moovit response. The saved output is a CloudFront
# "403 ERROR / Request blocked" page, i.e. the request was rejected.
soup

<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
<html><head><meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>
<title>ERROR: The request could not be satisfied</title>
</head><body>
<h1>403 ERROR</h1>
<h2>The request could not be satisfied.</h2>
<hr noshade="" size="1px"/>
Request blocked.
We can't connect to the server for this app or website at this time. There might be too much traffic or a configuration error. Try again later, or contact the app or website owner.
<br clear="all"/>
If you provide content to customers through CloudFront, you can find steps to troubleshoot and help prevent this error by reviewing the CloudFront documentation.
<br clear="all"/>
<hr noshade="" size="1px"/>
<pre>
Generated by cloudfront (CloudFront)
Request ID: zAS4OnsEydS6toPyC1YM9XuHVSr1yxCo9cu6ogY0XiA9j_TvhPHvyQ==
</pre>
<address>
</address>
</body></html>