In [0]:
# List the files uploaded to the DBFS FileStore tables folder
# (dbutils form of the `%fs ls` magic).
display(dbutils.fs.ls("/FileStore/tables/"))

path,name,size,modificationTime
dbfs:/FileStore/tables/AllBroadcasts.csv,AllBroadcasts.csv,22178362,1655660919000
dbfs:/FileStore/tables/airports.csv,airports.csv,542810,1659415390000
dbfs:/FileStore/tables/country_vaccinations.csv,country_vaccinations.csv,6801916,1627998693000


In [0]:
# Load the airports CSV from DBFS: the first row is treated as the header and
# column types are inferred from the data.
airports_reader = (
    spark.read
    .option("header", "true")
    .option("inferSchema", "true")
)
df = airports_reader.csv("/FileStore/tables/airports.csv")

In [0]:
# Render the loaded DataFrame as an interactive table.
display(df)

Airport_ID,IATA,Name,Lat,Long,Alt,Timezone,DST
1,GKA,Goroka Airport,-6.081689834590001,145.391998291,5282,10,U
2,MAG,Madang Airport,-5.20707988739,145.789001465,20,10,U
3,HGU,Mount Hagen Kagamuga Airport,-5.826789855957031,144.29600524902344,5388,10,U
4,LAE,Nadzab Airport,-6.569803,146.725977,239,10,U
5,POM,Port Moresby Jacksons International Airport,-9.44338035583496,147.22000122070312,146,10,U
6,WWK,Wewak International Airport,-3.58383011818,143.669006348,19,10,U
7,UAK,Narsarsuaq Airport,61.1604995728,-45.4259986877,112,-3,E
8,GOH,Godthaab / Nuuk Airport,64.19090271,-51.6781005859,283,-3,E
9,SFJ,Kangerlussuaq Airport,67.0122218992,-50.7116031647,165,-3,E
10,THU,Thule Air Base,76.5311965942,-68.7032012939,251,-4,E


In [0]:
# Print the column names and the types that were inferred while reading the CSV.
df.printSchema()

root
 |-- Airport_ID: integer (nullable = true)
 |-- IATA: string (nullable = true)
 |-- Name: string (nullable = true)
 |-- Lat: double (nullable = true)
 |-- Long: double (nullable = true)
 |-- Alt: integer (nullable = true)
 |-- Timezone: string (nullable = true)
 |-- DST: string (nullable = true)



In [0]:
# Plain-text preview of the first ten rows.
df.show(n=10)

+----------+----+--------------------+------------------+------------------+----+--------+---+
|Airport_ID|IATA|                Name|               Lat|              Long| Alt|Timezone|DST|
+----------+----+--------------------+------------------+------------------+----+--------+---+
|         1| GKA|      Goroka Airport|-6.081689834590001|     145.391998291|5282|      10|  U|
|         2| MAG|      Madang Airport|    -5.20707988739|     145.789001465|  20|      10|  U|
|         3| HGU|Mount Hagen Kagam...|-5.826789855957031|144.29600524902344|5388|      10|  U|
|         4| LAE|      Nadzab Airport|         -6.569803|        146.725977| 239|      10|  U|
|         5| POM|Port Moresby Jack...| -9.44338035583496|147.22000122070312| 146|      10|  U|
|         6| WWK|Wewak Internation...|    -3.58383011818|     143.669006348|  19|      10|  U|
|         7| UAK|  Narsarsuaq Airport|     61.1604995728|    -45.4259986877| 112|      -3|  E|
|         8| GOH|Godthaab / Nuuk A...|       64.19

# Transformar número em string para numérico

In [0]:
# Replace the string-typed Timezone column with a double-typed version of itself.
# NOTE(review): entries that cannot be parsed as numbers presumably become null
# after the cast -- confirm before feeding this column to a model.
timezone_as_double = df["Timezone"].cast("double")
df = df.withColumn("Timezone", timezone_as_double)
df.show(n=10)

+----------+----+--------------------+------------------+------------------+----+--------+---+
|Airport_ID|IATA|                Name|               Lat|              Long| Alt|Timezone|DST|
+----------+----+--------------------+------------------+------------------+----+--------+---+
|         1| GKA|      Goroka Airport|-6.081689834590001|     145.391998291|5282|    10.0|  U|
|         2| MAG|      Madang Airport|    -5.20707988739|     145.789001465|  20|    10.0|  U|
|         3| HGU|Mount Hagen Kagam...|-5.826789855957031|144.29600524902344|5388|    10.0|  U|
|         4| LAE|      Nadzab Airport|         -6.569803|        146.725977| 239|    10.0|  U|
|         5| POM|Port Moresby Jack...| -9.44338035583496|147.22000122070312| 146|    10.0|  U|
|         6| WWK|Wewak Internation...|    -3.58383011818|     143.669006348|  19|    10.0|  U|
|         7| UAK|  Narsarsuaq Airport|     61.1604995728|    -45.4259986877| 112|    -3.0|  E|
|         8| GOH|Godthaab / Nuuk A...|       64.19

In [0]:
# Re-check the schema after the cast: Timezone should now be reported as double.
df.printSchema()

root
 |-- Airport_ID: integer (nullable = true)
 |-- IATA: string (nullable = true)
 |-- Name: string (nullable = true)
 |-- Lat: double (nullable = true)
 |-- Long: double (nullable = true)
 |-- Alt: integer (nullable = true)
 |-- Timezone: double (nullable = true)
 |-- DST: string (nullable = true)



# Transformar texto para numérico com One-Hot Vector

In [0]:
from pyspark.sql.functions import countDistinct

# Number of rows for each DST code.
dst_counts = df.groupBy('DST').count()
dst_counts.show()

# Number of distinct DST codes.
distinct_dst = df.select(countDistinct('DST'))
distinct_dst.show()

+---+-----+
|DST|count|
+---+-----+
| \N|  353|
|  E| 1610|
|  U| 1862|
|  O|  225|
|  Z|   57|
|  A| 1777|
|  N| 1402|
|  S|  412|
+---+-----+

+-------------------+
|count(DISTINCT DST)|
+-------------------+
|                  8|
+-------------------+



In [0]:
from pyspark.ml import feature

# Stage 1: map every DST code to a numeric index stored in the 'index' column.
indexer = (
    feature.StringIndexer()
    .setInputCol('DST')
    .setOutputCol('index')
)

In [0]:
# Stage 2: turn the numeric 'index' column into a one-hot vector column 'factor'.
encoder = (
    feature.OneHotEncoder()
    .setInputCol('index')
    .setOutputCol('factor')
)

In [0]:
# Stage 3: concatenate the numeric columns and the one-hot DST vector into a
# single vector column.
# NOTE: the output column is called 'labels' although it holds the assembled
# features; the name is kept because later cells select 'labels'.
#
# handleInvalid='keep': Timezone was inferred as a string column and then cast
# to double, so any non-numeric entries (presumably '\N', as seen in DST --
# TODO confirm) are now null. With the default handleInvalid='error' the
# assembler raises as soon as an action reaches such a row; 'keep' emits NaN
# for those values instead, so the full dataset can be transformed.
assembler = feature.VectorAssembler(
    inputCols=['Airport_ID', 'Lat', 'Long', 'Alt', 'Timezone', 'factor'],
    outputCol='labels',
    handleInvalid='keep',
)

In [0]:
from pyspark.ml import Pipeline

# Chain the three stages in execution order: index DST -> one-hot encode -> assemble.
pipeline_stages = [indexer, encoder, assembler]
pipe = Pipeline().setStages(pipeline_stages)

In [0]:
# Fit all pipeline stages on df, then apply the fitted pipeline to the same data.
pipe_model = pipe.fit(df)
new_df = pipe_model.transform(df)
new_df.show(n=10)

+----------+----+--------------------+------------------+------------------+----+--------+---+-----+-------------+--------------------+
|Airport_ID|IATA|                Name|               Lat|              Long| Alt|Timezone|DST|index|       factor|              labels|
+----------+----+--------------------+------------------+------------------+----+--------+---+-----+-------------+--------------------+
|         1| GKA|      Goroka Airport|-6.081689834590001|     145.391998291|5282|    10.0|  U|  0.0|(7,[0],[1.0])|(12,[0,1,2,3,4,5]...|
|         2| MAG|      Madang Airport|    -5.20707988739|     145.789001465|  20|    10.0|  U|  0.0|(7,[0],[1.0])|(12,[0,1,2,3,4,5]...|
|         3| HGU|Mount Hagen Kagam...|-5.826789855957031|144.29600524902344|5388|    10.0|  U|  0.0|(7,[0],[1.0])|(12,[0,1,2,3,4,5]...|
|         4| LAE|      Nadzab Airport|         -6.569803|        146.725977| 239|    10.0|  U|  0.0|(7,[0],[1.0])|(12,[0,1,2,3,4,5]...|
|         5| POM|Port Moresby Jack...| -9.443380

# Entendendo a representação do One-Hot Vector

In [0]:
# Toy example showing how StringIndexer + OneHotEncoder represent categories.
# `spark` is used (as for the CSV load earlier in the notebook) instead of the
# legacy `sqlContext` entry point.
df_2 = spark.createDataFrame(
    [
        (0, "a"),
        (1, "b"),
        (2, "c"),
        (3, "a"),
        (4, "a"),
        (5, "c"),
    ],
    ["id", "category"],
)

# Map each category string to a numeric index.
stringIndexer = feature.StringIndexer(inputCol="category", outputCol="categoryIndex")
model = stringIndexer.fit(df_2)
indexed = model.transform(df_2)

# Dedicated names are used here so that the global `encoder` (already a stage
# of `pipe`) is not rebound, and so that the fitted model and the resulting
# DataFrame do not share one variable name.
category_encoder = feature.OneHotEncoder(inputCol="categoryIndex", outputCol="categoryVec")
category_encoder_model = category_encoder.fit(indexed)
encoded = category_encoder_model.transform(indexed)
encoded.show()

+---+--------+-------------+-------------+
| id|category|categoryIndex|  categoryVec|
+---+--------+-------------+-------------+
|  0|       a|          0.0|(2,[0],[1.0])|
|  1|       b|          2.0|    (2,[],[])|
|  2|       c|          1.0|(2,[1],[1.0])|
|  3|       a|          0.0|(2,[0],[1.0])|
|  4|       a|          0.0|(2,[0],[1.0])|
|  5|       c|          1.0|(2,[1],[1.0])|
+---+--------+-------------+-------------+



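Cada `categoryIndex` é codificado como um vetor esparso de tamanho 2; o último índice (2) é descartado e fica representado apenas por zeros:<br>
0  -> 10<br>
1  -> 01<br>
2  -> 00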

# Interpretando os labels

In [0]:
# Text preview of the assembled vector column (long vectors are truncated by show()).
labels_preview = new_df.select('labels')
labels_preview.show()

+--------------------+
|              labels|
+--------------------+
|(12,[0,1,2,3,4,5]...|
|(12,[0,1,2,3,4,5]...|
|(12,[0,1,2,3,4,5]...|
|(12,[0,1,2,3,4,5]...|
|(12,[0,1,2,3,4,5]...|
|(12,[0,1,2,3,4,5]...|
|(12,[0,1,2,3,4,7]...|
|(12,[0,1,2,3,4,7]...|
|(12,[0,1,2,3,4,7]...|
|(12,[0,1,2,3,4,7]...|
|(12,[0,1,2,3,8],[...|
|(12,[0,1,2,3,8],[...|
|(12,[0,1,2,3,8],[...|
|(12,[0,1,2,3,8],[...|
|(12,[0,1,2,3,8],[...|
|(12,[0,1,2,3,8],[...|
|(12,[0,1,2,3,8],[...|
|(12,[0,1,2,3,8],[...|
|(12,[0,1,2,3,8],[...|
|(12,[0,1,2,3,8],[...|
+--------------------+
only showing top 20 rows



In [0]:
# Rich display of the assembled vectors so their indices and values are fully visible.
labels_only = new_df.select('labels')
display(labels_only)

labels
"Map(vectorType -> sparse, length -> 12, indices -> List(0, 1, 2, 3, 4, 5), values -> List(1.0, -6.081689834590001, 145.391998291, 5282.0, 10.0, 1.0))"
"Map(vectorType -> sparse, length -> 12, indices -> List(0, 1, 2, 3, 4, 5), values -> List(2.0, -5.20707988739, 145.789001465, 20.0, 10.0, 1.0))"
"Map(vectorType -> sparse, length -> 12, indices -> List(0, 1, 2, 3, 4, 5), values -> List(3.0, -5.826789855957031, 144.29600524902344, 5388.0, 10.0, 1.0))"
"Map(vectorType -> sparse, length -> 12, indices -> List(0, 1, 2, 3, 4, 5), values -> List(4.0, -6.569803, 146.725977, 239.0, 10.0, 1.0))"
"Map(vectorType -> sparse, length -> 12, indices -> List(0, 1, 2, 3, 4, 5), values -> List(5.0, -9.44338035583496, 147.22000122070312, 146.0, 10.0, 1.0))"
"Map(vectorType -> sparse, length -> 12, indices -> List(0, 1, 2, 3, 4, 5), values -> List(6.0, -3.58383011818, 143.669006348, 19.0, 10.0, 1.0))"
"Map(vectorType -> sparse, length -> 12, indices -> List(0, 1, 2, 3, 4, 7), values -> List(7.0, 61.1604995728, -45.4259986877, 112.0, -3.0, 1.0))"
"Map(vectorType -> sparse, length -> 12, indices -> List(0, 1, 2, 3, 4, 7), values -> List(8.0, 64.19090271, -51.6781005859, 283.0, -3.0, 1.0))"
"Map(vectorType -> sparse, length -> 12, indices -> List(0, 1, 2, 3, 4, 7), values -> List(9.0, 67.0122218992, -50.7116031647, 165.0, -3.0, 1.0))"
"Map(vectorType -> sparse, length -> 12, indices -> List(0, 1, 2, 3, 4, 7), values -> List(10.0, 76.5311965942, -68.7032012939, 251.0, -4.0, 1.0))"


In [0]:
# Show the first seven source rows for comparison with the assembled vectors.
df.show(n=7)

+----------+----+--------------------+------------------+------------------+----+--------+---+
|Airport_ID|IATA|                Name|               Lat|              Long| Alt|Timezone|DST|
+----------+----+--------------------+------------------+------------------+----+--------+---+
|         1| GKA|      Goroka Airport|-6.081689834590001|     145.391998291|5282|    10.0|  U|
|         2| MAG|      Madang Airport|    -5.20707988739|     145.789001465|  20|    10.0|  U|
|         3| HGU|Mount Hagen Kagam...|-5.826789855957031|144.29600524902344|5388|    10.0|  U|
|         4| LAE|      Nadzab Airport|         -6.569803|        146.725977| 239|    10.0|  U|
|         5| POM|Port Moresby Jack...| -9.44338035583496|147.22000122070312| 146|    10.0|  U|
|         6| WWK|Wewak Internation...|    -3.58383011818|     143.669006348|  19|    10.0|  U|
|         7| UAK|  Narsarsuaq Airport|     61.1604995728|    -45.4259986877| 112|    -3.0|  E|
+----------+----+--------------------+------------