In [None]:
Complete work on understanding the airports data

In [39]:
# Bootstrap PySpark for this notebook.
import findspark

findspark.init()

# pyspark is imported only after findspark.init() has run.
import pyspark
from pyspark.context import SparkContext
from pyspark.sql import SparkSession

# Obtain the notebook's Spark session (an existing one is reused if present).
spark = SparkSession.builder.getOrCreate()
# SparkContext that belongs to that session.
sc = spark.sparkContext
from pyspark.sql.types import  (StructType, 
                                StructField, 
                                DateType, 
                                BooleanType,
                                DoubleType,
                                IntegerType,
                                StringType,
                                DecimalType,
                                LongType,
                                ArrayType,
                                TimestampType)

# Spark could infer the schema of the CSV, but supplying a pre-defined schema
# makes reading faster. It also lets us control the column names, e.g. to avoid
# spaces in them. This schema covers every column of the airports CSV.

# NOTE(review): machine-specific absolute path; point this at your own copy of
# airports.csv (ideally via a configurable data directory).
AIRPORTS_CSV_PATH = "C:/PFE/TEST/airports.csv"

# (column name, Spark type) pairs, in CSV column order. Every column is nullable.
_AIRPORT_COLUMNS = [
    ("id", IntegerType()),
    ("ident", StringType()),
    ("type", StringType()),
    ("name", StringType()),
    ("latitude_deg", DoubleType()),
    ("longitude_deg", DoubleType()),
    ("elevation_ft", DoubleType()),
    ("continent", StringType()),
    ("iso_country", StringType()),
    ("iso_region", StringType()),
    ("municipality", StringType()),
    ("scheduled_service", StringType()),
    ("gps_code", StringType()),
    ("iata_code", StringType()),
    ("local_code", StringType()),
    ("home_link", StringType()),
    ("wikipedia_link", StringType()),
    ("keywords", StringType()),
]
airport_schema = StructType(
    [StructField(column_name, column_type, True)
     for column_name, column_type in _AIRPORT_COLUMNS]
)

airport = spark.read.csv(AIRPORTS_CSV_PATH,
                         header=True,
                         schema=airport_schema)

# count() is a Spark action that scans the data, so run it once and reuse the
# result for both printed lines.
airport_row_count = airport.count()
print(airport_row_count)
airport.show(5)
print((airport_row_count, len(airport.columns)))

55536
+------+-----+-------------+--------------------+-----------------+------------------+------------+---------+-----------+----------+------------+-----------------+--------+---------+----------+---------+--------------+--------+
|    id|ident|         type|                name|     latitude_deg|     longitude_deg|elevation_ft|continent|iso_country|iso_region|municipality|scheduled_service|gps_code|iata_code|local_code|home_link|wikipedia_link|keywords|
+------+-----+-------------+--------------------+-----------------+------------------+------------+---------+-----------+----------+------------+-----------------+--------+---------+----------+---------+--------------+--------+
|  6523|  00A|     heliport|   Total Rf Heliport|   40.07080078125|-74.93360137939453|        11.0|       NA|         US|     US-PA|    Bensalem|               no|     00A|     null|       00A|     null|          null|    null|
|323361| 00AA|small_airport|Aero B Ranch Airport|        38.704022|       -101.473

In [40]:
# Count airports per ISO country code.
# (Despite its name, `groupedByRegion` is keyed by `iso_country`, not `iso_region`.)
groupedByRegion = airport.groupBy(airport.iso_country).count()
groupedByRegion.head(10)

[Row(iso_country='DZ', count=61),
 Row(iso_country='LT', count=58),
 Row(iso_country='MM', count=75),
 Row(iso_country='CI', count=26),
 Row(iso_country='TC', count=8),
 Row(iso_country='AZ', count=35),
 Row(iso_country='FI', count=111),
 Row(iso_country='SC', count=16),
 Row(iso_country='PM', count=2),
 Row(iso_country='UA', count=192)]

In [41]:
# Number of airports registered for Tunisia (ISO country code 'TN').
Tunisia = groupedByRegion.where(groupedByRegion.iso_country == 'TN')
Tunisia.show()

+-----------+-----+
|iso_country|count|
+-----------+-----+
|         TN|   15|
+-----------+-----+



In [42]:
# Show the Tunisian airports.
# The predicate is built from `airport`'s own `iso_country` column. The previous
# version used a Column taken from `groupedByRegion`, a different DataFrame.
airport.filter(airport['iso_country']=='TN').select('ident','type','name','latitude_deg','longitude_deg','municipality','elevation_ft').show()


+-------+--------------+--------------------+------------------+------------------+------------+------------+
|  ident|          type|                name|      latitude_deg|     longitude_deg|municipality|elevation_ft|
+-------+--------------+--------------------+------------------+------------------+------------+------------+
|   DT70| small_airport|    Medenine Airport| 33.35020065307617|10.444000244140625|    Medenine|       427.0|
|   DTKA|medium_airport|Tabarka 7 Novembr...| 36.97999954223633|  8.87693977355957|     Tabarka|       230.0|
|   DTMB|medium_airport|Monastir Habib Bo...| 35.75809860229492| 10.75469970703125|    Monastir|         9.0|
|   DTTA| large_airport|Tunis Carthage In...| 36.85100173950195| 10.22719955444336|       Tunis|        22.0|
|   DTTB|medium_airport| Sidi Ahmed Air Base|         37.245398|           9.79145|  Sidi Ahmed|        20.0|
|   DTTD|medium_airport|     Remada Air Base| 32.30619812011719|10.382100105285645|      Remada|      1004.0|
|   DTTF|m

In [43]:
# Create a geohash of each airport's location with a precision of 12 characters.
import geohash2
import pyspark.sql.functions as F
from pyspark.sql.functions import udf
# NOTE(review): wildcard import kept so that names other cells may rely on stay
# in scope; prefer importing only the types that are actually used.
from pyspark.sql.types import *
from pyspark.sql.types import StringType
import pandas as pd
from math import radians, cos, sin, asin, sqrt
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter

# Build a "location" view over the airports and keep only ident, type, name,
# latitude_deg, longitude_deg, municipality and elevation_ft.
airport.createOrReplaceTempView("location")
airport_sqlDF = spark.sql("SELECT ident,type,name,latitude_deg,longitude_deg,municipality,elevation_ft FROM location")

# The numeric columns are cast to float before encoding, as in the original cell.
# NOTE(review): the schema reads these columns as doubles, so this cast narrows
# them to single precision, which limits how meaningful the last characters of
# a precision-12 geohash are. Consider encoding the original doubles instead.
airport_sqlDF_rows = airport_sqlDF.select(
        airport_sqlDF.ident,
        airport_sqlDF.type,
        airport_sqlDF.name,
        airport_sqlDF.municipality,
        airport_sqlDF.latitude_deg.cast("float"),
        airport_sqlDF.longitude_deg.cast("float"),
        airport_sqlDF.elevation_ft.cast("float")
    )


def _encode_geohash(latitude, longitude):
    """Return the 12-character geohash of a coordinate pair.

    Returns None when either coordinate is missing, so a row with a null
    latitude or longitude yields a null Hashcode instead of failing the job.
    """
    if latitude is None or longitude is None:
        return None
    return geohash2.encode(latitude, longitude, precision=12)


# The return type is stated explicitly: the geohash is a string.
udf1 = F.udf(_encode_geohash, StringType())
airport_sqlDF_rows = airport_sqlDF_rows.withColumn('Hashcode', udf1('latitude_deg', 'longitude_deg'))
airport_sqlDF_rows.show(10)

+-----+-------------+--------------------+------------+------------+-------------+------------+------------+
|ident|         type|                name|municipality|latitude_deg|longitude_deg|elevation_ft|    Hashcode|
+-----+-------------+--------------------+------------+------------+-------------+------------+------------+
|  00A|     heliport|   Total Rf Heliport|    Bensalem|     40.0708|     -74.9336|        11.0|dr4eyyej2d9j|
| 00AA|small_airport|Aero B Ranch Airport|       Leoti|    38.70402|   -101.47391|      3435.0|9wzu4vugq8d3|
| 00AK|small_airport|        Lowell Field|Anchor Point|     59.9492|     -151.696|       450.0|bdsjh16m7hb9|
| 00AL|small_airport|        Epps Airpark|     Harvest|     34.8648|     -86.7703|       820.0|dn4q36zy1mtw|
| 00AR|       closed|Newport Hospital ...|     Newport|     35.6087|     -91.2549|       237.0|9yr4e6q5wyb0|
| 00AS|small_airport|      Fulton Airport|        Alex|   34.942802|   -97.818016|      1100.0|9y4qy15he42x|
| 00AZ|small_airpor

In [44]:
from pyspark.sql.functions import col

# Look airports up by a fragment of their name: first the ones containing
# "Carthage" (includes Tunis Carthage), then the ones containing "Monastir Habib".
for name_fragment in ("Carthage", "Monastir Habib"):
    airport_sqlDF_rows.filter(col('name').contains(name_fragment)).show()

+-----+-------------+--------------------+------------+------------+-------------+------------+------------+
|ident|         type|                name|municipality|latitude_deg|longitude_deg|elevation_ft|    Hashcode|
+-----+-------------+--------------------+------------+------------+-------------+------------+------------+
| 0TX3|     heliport|UT Health East Te...|    Carthage|   32.166428|     -94.3461|       296.0|9vsyvper089n|
| DTTA|large_airport|Tunis Carthage In...|       Tunis|      36.851|      10.2272|        22.0|snx38sx0xh8e|
| K08M|small_airport|Carthage-Leake Co...|    Carthage|     32.7612|     -89.5301|       454.0|djb66eypvd23|
+-----+-------------+--------------------+------------+------------+-------------+------------+------------+

+-----+--------------+--------------------+------------+------------+-------------+------------+------------+
|ident|          type|                name|municipality|latitude_deg|longitude_deg|elevation_ft|    Hashcode|
+-----+---------

In [47]:
# Decode the geohash of Tunis Carthage airport back into a (latitude, longitude) pair.
geohash2.decode("snx38sx0xh8e")

('36.851002', '10.227199')

In [49]:
# Forward-geocode a place name with Nominatim to obtain its address and its
# latitude/longitude, e.g. to cross-check an airport's name and position.
from geopy.geocoders import Nominatim
from geopy.distance import geodesic
geolocator = Nominatim(user_agent="change name to address")
location = geolocator.geocode("Tunis Carthage")
# geocode() returns None when nothing matches; fail with a clear message rather
# than an AttributeError on the first attribute access below.
if location is None:
    raise ValueError("Nominatim returned no result for 'Tunis Carthage'")
print(location.address)
print(location.point)
print((location.latitude, location.longitude))
print(location.raw)

قرطاج, قرطاج الشاطئ, قرطاج, تونس, 2016, تونس
36 51m 5.8572s N, 10 19m 49.5876s E
(36.851627, 10.330441)
{'place_id': 12223039, 'licence': 'Data © OpenStreetMap contributors, ODbL 1.0. https://osm.org/copyright', 'osm_type': 'node', 'osm_id': 1136727225, 'boundingbox': ['36.691627', '37.011627', '10.170441', '10.490441'], 'lat': '36.851627', 'lon': '10.330441', 'display_name': 'قرطاج, قرطاج الشاطئ, قرطاج, تونس, 2016, تونس', 'class': 'place', 'type': 'city', 'importance': 0.650740883747435, 'icon': 'https://nominatim.openstreetmap.org/images/mapicons/poi_place_city.p.20.png'}


In [50]:
# Geodesic distance, in kilometres, between two (latitude, longitude) points.
from geopy.distance import geodesic

Tunis = (36.851627, 10.330441)
Monastir = (35.75809860229492, 10.75469970703125)
print(geodesic(Monastir, Tunis).kilometers)

127.18489907304748


In [51]:
# Distance between two airports given as geohashes, decoded with geohash2.decode.
# 'snx38sx0xh8e' is the Tunis Carthage hash; 'snrektx648tq' is presumably the
# Monastir airport hash (TODO confirm against the Hashcode column).
tunis_from_hash = geohash2.decode('snx38sx0xh8e')
monastir_from_hash = geohash2.decode('snrektx648tq')
print(geodesic(tunis_from_hash, monastir_from_hash).kilometers)

# Same distance from explicit coordinates. Note that `Tunis` here is the
# geocoded "Tunis Carthage" point, not the airport position decoded above, so
# the two printed values are not expected to match.
Monastir = (35.75809860229492, 10.75469970703125)
Tunis = (36.851627, 10.330441)
print(geodesic(Tunis, Monastir).km)

130.1989667289029
127.18489907304748


In [53]:
#have Name position from laltiude , longitude 
>>> location = geolocator.reverse("35.75809860229492, 10.75469970703125")
>>> print(location.address)

المطار الدولي المنستير الحبيب بورقيبة, RR 92 طج, صقانص, المنستير, تونس


In [54]:
# Calculate the haversine distance between two points A and B, in metres
import math

def haversine(coord1, coord2):
    """Return the great-circle (haversine) distance between two points, in metres.

    Args:
        coord1: (latitude, longitude) of the first point, in decimal degrees.
        coord2: (latitude, longitude) of the second point, in decimal degrees.

    Returns:
        The distance as a float, computed on a sphere of radius 6372800 m.
    """
    R = 6372800  # Earth radius in meters
    lat1, lon1 = coord1
    lat2, lon2 = coord2

    phi1, phi2 = math.radians(lat1), math.radians(lat2)
    dphi       = math.radians(lat2 - lat1)
    dlambda    = math.radians(lon2 - lon1)

    a = math.sin(dphi/2)**2 + \
        math.cos(phi1)*math.cos(phi2)*math.sin(dlambda/2)**2
    # Floating-point rounding can push `a` marginally outside [0, 1] for
    # (near-)antipodal points, which would make sqrt(1 - a) raise ValueError.
    a = min(1.0, max(0.0, a))

    return 2*R*math.atan2(math.sqrt(a), math.sqrt(1 - a))
# Haversine distance (metres) between the geocoded Tunis Carthage point and
# the Monastir airport coordinates.
x = haversine(
    (36.851627, 10.330441),
    (35.75809860229492, 10.75469970703125),
)
print(x)

# Other example: geodesic distances from London to a few cities.
london_coord = (51.5073219, -0.1276474)
cities = dict(
    berlin=(52.5170365, 13.3888599),
    vienna=(48.2083537, 16.3725042),
    sydney=(-33.8548157, 151.2164539),
    madrid=(40.4167047, -3.7035825),
)
# Not referenced again in this cell.
distancesgeo = [
    930723.2019867426,
    1235650.1412429416,
    16997984.55171465,
    1263769.8859593808,
]
for city in cities:
    coord = cities[city]
    distance = geodesic(london_coord, coord)

    print(city, distance)
 

127435.05556109041
berlin 933.4107641236288 km
vienna 1238.8047757673298 km
sydney 16988.546466908156 km
madrid 1263.1019239179498 km


In [75]:
# Use pandas and folium to visualise the Tunisian airports data.
import folium
import pandas as pd
# Enable Arrow-based columnar data transfers
spark.conf.set("spark.sql.execution.arrow.enabled", "true")
tunisia_airports_sdf = (
    airport
    .filter(airport['iso_country']=='TN')
    .select('ident','type','name','latitude_deg','longitude_deg','municipality','elevation_ft')
)
# toPandas() is the conversion the Arrow setting above applies to; the earlier
# collect() + DataFrame.from_records() route bypassed it.
Tunisia_DataFrame = tunisia_airports_sdf.toPandas()

Tunisia_DataFrame.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15 entries, 0 to 14
Data columns (total 7 columns):
ident            15 non-null object
type             15 non-null object
name             15 non-null object
latitude_deg     15 non-null float64
longitude_deg    15 non-null float64
municipality     15 non-null object
elevation_ft     14 non-null float64
dtypes: float64(3), object(4)
memory usage: 920.0+ bytes


In [76]:
# Preview the first five Tunisian airports (head() defaults to 5 rows).
Tunisia_DataFrame.head()

Unnamed: 0,ident,type,name,latitude_deg,longitude_deg,municipality,elevation_ft
0,DT70,small_airport,Medenine Airport,33.350201,10.444,Medenine,427.0
1,DTKA,medium_airport,Tabarka 7 Novembre Airport,36.98,8.87694,Tabarka,230.0
2,DTMB,medium_airport,Monastir Habib Bourguiba International Airport,35.758099,10.7547,Monastir,9.0
3,DTTA,large_airport,Tunis Carthage International Airport,36.851002,10.2272,Tunis,22.0
4,DTTB,medium_airport,Sidi Ahmed Air Base,37.245398,9.79145,Sidi Ahmed,20.0


In [77]:
# Folium is a powerful, interactive visualisation tool: plot the Tunisian
# airports on a map.
from folium.plugins import FloatImage

AIRPORT_ICON_URL = 'https://cdn.icon-icons.com/icons2/973/PNG/512/Airport_icon-icons.com_74913.png'

# generate a new map
folium_map = folium.Map()

# Coerce the coordinates to numeric (unparseable values become NaN). assign()
# returns a new frame, so re-running this cell gives the same result.
Tunisia_DataFrame = Tunisia_DataFrame.assign(
    latitude_deg=pd.to_numeric(Tunisia_DataFrame['latitude_deg'], errors='coerce'),
    longitude_deg=pd.to_numeric(Tunisia_DataFrame['longitude_deg'], errors='coerce'),
)
# 'Longitude' mirrors longitude_deg; it is kept because the original cell added it.
Tunisia_DataFrame['Longitude'] = Tunisia_DataFrame['longitude_deg'].astype(float)
# Drop only rows that cannot be placed on the map. A bare dropna() also removed
# airports whose only missing value was in another column (e.g. elevation_ft).
Tunisia_DataFrame = Tunisia_DataFrame.dropna(subset=['latitude_deg', 'longitude_deg'])

# Not used by the markers below (each marker builds its own icon); retained in
# case other cells reference `icon`.
icon = folium.features.CustomIcon(AIRPORT_ICON_URL, icon_size=(50, 50))
for index, row in Tunisia_DataFrame.iterrows():
    folium.Marker([float(row['latitude_deg']), float(row['longitude_deg'])],
         popup = ('ident 	: ' + str(row['ident'] ).capitalize() + '<br>'
                 'type: ' + str(row['type']) + '<br>'
                 'name: ' + str(row['name']) +'<br>'
                ),
         icon=folium.features.CustomIcon(AIRPORT_ICON_URL, icon_size=(50, 50))
         ).add_to(folium_map)
# Extra reference marker, unrelated to the airport rows.
folium.Marker([51.5183, 0.5206],
              popup='East London',
              icon=folium.Icon(color='red',icon='university', prefix='fa')
             ).add_to(folium_map)
# Compass-rose image overlaid on the map.
url = ('https://raw.githubusercontent.com/SECOORA/static_assets/'
       'master/maps/img/rose.png')

FloatImage(url, bottom=40, left=80).add_to(folium_map)
folium_map