# Importing Modules 

In [1]:
import logging
import os
import re

import findspark

# findspark has to run BEFORE the first `import pyspark`: it is what puts the
# Spark installation's Python packages on sys.path. Calling it after the
# pyspark imports only works when pyspark is already importable by other means.
# SPARK_HOME, when set, overrides the original hard-coded Windows location so
# the notebook can run on other machines.
findspark.init(os.environ.get('SPARK_HOME', 'D:\\Spark\\'))

import numpy as np
import pandas as pd
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql import SQLContext
from pyspark.sql.functions import col, udf
from pyspark.sql.functions import trim
from pyspark.sql.types import StringType

# Creating SparkSession

In [2]:
# Build (or reuse) a Spark session on the 'local' master, registered under the
# application name 'Udacity'.
spark = (
    SparkSession.builder
    .master('local')
    .appName('Udacity')
    .getOrCreate()
)

In [3]:
# Bare expression as the last line of the cell: the notebook displays the
# session object's representation as the cell output.
spark

# Loading and Inspecting Dataframe

In [4]:
# Read the airport codes CSV, treating its first line as the column header.
# No schema is supplied, so the columns are loaded without explicit types.
df = spark.read.option('header', True).csv('data/airport-codes_csv.csv')

In [5]:
# Print the column names and types of the loaded DataFrame.
df.printSchema()

root
 |-- ident: string (nullable = true)
 |-- type: string (nullable = true)
 |-- name: string (nullable = true)
 |-- elevation_ft: string (nullable = true)
 |-- continent: string (nullable = true)
 |-- iso_country: string (nullable = true)
 |-- iso_region: string (nullable = true)
 |-- municipality: string (nullable = true)
 |-- gps_code: string (nullable = true)
 |-- iata_code: string (nullable = true)
 |-- local_code: string (nullable = true)
 |-- coordinates: string (nullable = true)



In [6]:
# Preview the first rows of the raw data (the defaults, spelled out:
# 20 rows, long cell values truncated).
df.show(n=20, truncate=True)

+-----+-------------+--------------------+------------+---------+-----------+----------+------------+--------+---------+----------+--------------------+
|ident|         type|                name|elevation_ft|continent|iso_country|iso_region|municipality|gps_code|iata_code|local_code|         coordinates|
+-----+-------------+--------------------+------------+---------+-----------+----------+------------+--------+---------+----------+--------------------+
|  00A|     heliport|   Total Rf Heliport|          11|       NA|         US|     US-PA|    Bensalem|     00A|     null|       00A|-74.9336013793945...|
| 00AA|small_airport|Aero B Ranch Airport|        3435|       NA|         US|     US-KS|       Leoti|    00AA|     null|      00AA|-101.473911, 38.7...|
| 00AK|small_airport|        Lowell Field|         450|       NA|         US|     US-AK|Anchor Point|    00AK|     null|      00AK|-151.695999146, 5...|
| 00AL|small_airport|        Epps Airpark|         820|       NA|         US|     

# Cleaning Data 

+ ~~NA in continent~~ 
+ ~~Nulls in iata Code~~

In [7]:
# Row counts per continent value.
# NOTE(review): 'NA' is the largest group and the previewed 'NA' rows have
# iso_country 'US', so it is presumably the North America code rather than a
# missing-value marker - confirm before treating it as null.
(
    df.groupBy('continent')
      .count()
      .show()
)

+---------+-----+
|continent|count|
+---------+-----+
|       NA|27719|
|       SA| 7709|
|       AS| 5350|
|       AN|   28|
|       OC| 3067|
|       EU| 7840|
|       AF| 3362|
+---------+-----+



In [8]:
# Airports without an iata_code cannot be joined on that key; collect them
# and count how many there are.
missing_iata = df['iata_code'].isNull()
df_nulls = df.filter(missing_iata)
df_nulls.count()

45886

In [9]:
# Keep only the airports that have an iata_code, then remove the continent
# column from the result.
# NOTE(review): the continent column is dropped entirely; if 'NA' there means
# North America rather than a missing value, this discards valid data - confirm
# that no downstream step needs it.
df_clean = (
    df.where(col('iata_code').isNotNull())
      .drop('continent')
)

In [10]:
# Persist the cleaned airports table as CSV with a header row, replacing any
# output already present at the target path.
(
    df_clean.write
    .mode('overwrite')
    .option('header', True)
    .csv('processed/airport.csv')
)