In [1]:
# findspark must be initialised before pyspark is imported so that the
# local Spark installation is on sys.path.
import findspark
findspark.init()
import pyspark
from pyspark.sql import SparkSession

# Reuse the active session if one exists, otherwise start a new one.
spark = SparkSession.builder.getOrCreate()

In [2]:
# Path to the raw fire-calls CSV, relative to the notebook's working directory.
filepath = 'sf-fire-calls.csv'

# First row is the header; let Spark infer column types from the data.
df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv(filepath)
)

# Preview the first two rows.
df.show(2)

+----------+------+--------------+----------------+----------+----------+--------------------+--------------------+--------------------+----+-------+---------+-----------+----+----------------+--------+-------------+-------+-------------+---------+--------+--------------------------+----------------------+------------------+--------------------+--------------------+-------------+-----+
|CallNumber|UnitID|IncidentNumber|        CallType|  CallDate| WatchDate|CallFinalDisposition|       AvailableDtTm|             Address|City|Zipcode|Battalion|StationArea| Box|OriginalPriority|Priority|FinalPriority|ALSUnit|CallTypeGroup|NumAlarms|UnitType|UnitSequenceInCallDispatch|FirePreventionDistrict|SupervisorDistrict|        Neighborhood|            Location|        RowID|Delay|
+----------+------+--------------+----------------+----------+----------+--------------------+--------------------+--------------------+----+-------+---------+-----------+----+----------------+--------+-------------+------

In [3]:
# Print the column names and the types Spark inferred while reading the CSV.
df.printSchema()

root
 |-- CallNumber: integer (nullable = true)
 |-- UnitID: string (nullable = true)
 |-- IncidentNumber: integer (nullable = true)
 |-- CallType: string (nullable = true)
 |-- CallDate: string (nullable = true)
 |-- WatchDate: string (nullable = true)
 |-- CallFinalDisposition: string (nullable = true)
 |-- AvailableDtTm: string (nullable = true)
 |-- Address: string (nullable = true)
 |-- City: string (nullable = true)
 |-- Zipcode: integer (nullable = true)
 |-- Battalion: string (nullable = true)
 |-- StationArea: string (nullable = true)
 |-- Box: string (nullable = true)
 |-- OriginalPriority: string (nullable = true)
 |-- Priority: string (nullable = true)
 |-- FinalPriority: integer (nullable = true)
 |-- ALSUnit: boolean (nullable = true)
 |-- CallTypeGroup: string (nullable = true)
 |-- NumAlarms: integer (nullable = true)
 |-- UnitType: string (nullable = true)
 |-- UnitSequenceInCallDispatch: integer (nullable = true)
 |-- FirePreventionDistrict: string (nullable = true)
 

# Note:
When you split the data, save the resulting splits to Parquet files so that the same rows, in the same order, are guaranteed to be reused later.


In [4]:
# Keep only the three columns used by the rest of the notebook.
df = df.select('FinalPriority', 'NumAlarms', 'Delay')

# show() prints the table itself and returns None, so it must not be wrapped
# in print() (that only appends a stray "None" to the output).
df.show(3, truncate=False)

# Weights are [test, train] = [20%, 80%]; the seed is fixed so the split can
# be repeated on the same input.
test, train = df.randomSplit([0.2, 0.8], seed=24)

# Row counts of each split as a quick sanity check on the proportions.
print(test.count())
print(train.count())


+-------------+---------+---------+
|FinalPriority|NumAlarms|Delay    |
+-------------+---------+---------+
|3            |1        |2.95     |
|3            |1        |4.7      |
|3            |1        |2.4333334|
+-------------+---------+---------+
only showing top 3 rows

None
35013
140283


In [7]:
from pyspark.ml.feature import VectorAssembler

# Single-column feature vector built from FinalPriority only.
vecAssemble = VectorAssembler(
    inputCols=['FinalPriority'],
    outputCol='features',
)

# Three-column feature vector.
# NOTE(review): 'Delay' is part of this feature vector; if Delay is the value
# to be predicted later, this leaks the target into the features - confirm.
vecAssemble2 = VectorAssembler(
    inputCols=['FinalPriority', 'NumAlarms', 'Delay'],
    outputCol='features',
)

# Append the assembled 'features' column to the training split.
vecTrain = vecAssemble.transform(train)
vecTrain2 = vecAssemble2.transform(train)

In [8]:
# Narrow the assembled training frame to the two raw inputs plus the vector
# column, then display the first rows (show() defaults apply).
vecTrain2 = vecTrain2.select('FinalPriority', 'NumAlarms', 'features')
vecTrain2.show()

+-------------+---------+--------------------+
|FinalPriority|NumAlarms|            features|
+-------------+---------+--------------------+
|            2|        1|[2.0,1.0,0.28333333]|
|            2|        1|[2.0,1.0,0.28333333]|
|            2|        1|       [2.0,1.0,0.3]|
|            2|        1|[2.0,1.0,0.31666666]|
|            2|        1|[2.0,1.0,0.33333334]|
|            2|        1|[2.0,1.0,0.36666667]|
|            2|        1|[2.0,1.0,0.38333333]|
|            2|        1|[2.0,1.0,0.38333333]|
|            2|        1|[2.0,1.0,0.38333333]|
|            2|        1|[2.0,1.0,0.38333333]|
|            2|        1|[2.0,1.0,0.38333333]|
|            2|        1|[2.0,1.0,0.38333333]|
|            2|        1|[2.0,1.0,0.38333333]|
|            2|        1|       [2.0,1.0,0.4]|
|            2|        1|       [2.0,1.0,0.4]|
|            2|        1|[2.0,1.0,0.41666666]|
|            2|        1|[2.0,1.0,0.41666666]|
|            2|        1|[2.0,1.0,0.43333334]|
|            