## Examples of PySpark ML

In [1]:
from pyspark.sql import SparkSession

In [2]:
# Get the active Spark session, or start a new one if none exists yet.
# The application name is what shows up in the Spark UI and logs.
spark = (
    SparkSession.builder
    .appName('Missing')
    .getOrCreate()
)

In [3]:
# Load the semicolon-separated file; the first row supplies the column names
# and column types are inferred from the data.
training = spark.read.csv(
    'test4.csv',
    sep=';',
    header=True,
    inferSchema=True,
)
training.show()

+-----+---+----------+------+
| Name|Age|Experience|Salary|
+-----+---+----------+------+
|  Max| 31|        10| 30000|
|Abdel| 30|         8| 25000|
| Manu| 29|         4| 20000|
| Paul| 24|         3| 20000|
|Kevin| 21|         1| 15000|
|Jonas| 23|         2| 18000|
+-----+---+----------+------+



In [4]:
# Combine the Age and Experience columns into a single vector column; that vector is the independent (input) feature for the model.

In [5]:
from pyspark.ml.feature import VectorAssembler

In [10]:
# Transformer that packs the two numeric columns into one vector column,
# in the order the input columns are listed here.
featureassembler = VectorAssembler(
    inputCols=['Age', 'Experience'],
    outputCol='Independent features',
)
featureassembler

VectorAssembler_32ed72d7a19f

In [8]:
# Apply the assembler: the result keeps the original columns and gains
# the 'Independent features' vector column.
output = featureassembler.transform(training)
output.show()

+-----+---+----------+------+--------------------+
| Name|Age|Experience|Salary|Independent features|
+-----+---+----------+------+--------------------+
|  Max| 31|        10| 30000|         [31.0,10.0]|
|Abdel| 30|         8| 25000|          [30.0,8.0]|
| Manu| 29|         4| 20000|          [29.0,4.0]|
| Paul| 24|         3| 20000|          [24.0,3.0]|
|Kevin| 21|         1| 15000|          [21.0,1.0]|
|Jonas| 23|         2| 18000|          [23.0,2.0]|
+-----+---+----------+------+--------------------+



In [12]:
# Keep only what the regression needs: the feature vector and the label.
finalized_data = output.select(
    'Independent features',
    'Salary',
)
finalized_data.show()

+--------------------+------+
|Independent features|Salary|
+--------------------+------+
|         [31.0,10.0]| 30000|
|          [30.0,8.0]| 25000|
|          [29.0,4.0]| 20000|
|          [24.0,3.0]| 20000|
|          [21.0,1.0]| 15000|
|          [23.0,2.0]| 18000|
+--------------------+------+



In [14]:
from pyspark.ml.regression import LinearRegression

# Split roughly 75% / 25% into train and test sets. A fixed seed makes the
# split repeatable across runs (for the same input data and partitioning);
# without it the fitted coefficients and the test rows change on every run.
# NOTE: with only six rows the test split can be very small (a single row in
# the recorded output), so treat the evaluation as illustrative only.
train_data, test_data = finalized_data.randomSplit([0.75, 0.25], seed=42)

# Unfitted estimator: reads features from 'Independent features' and the
# label from 'Salary'. It has its own name so that `regressor` is not first
# bound to the estimator and then rebound to a different kind of object.
lr_estimator = LinearRegression(featuresCol='Independent features', labelCol='Salary')

# `regressor` is the fitted model; later cells read its coefficients and
# intercept and call `evaluate` on it.
regressor = lr_estimator.fit(train_data)

In [15]:
# Fitted weights of the linear model, one per entry of the feature vector
# (Age, Experience — the order given to the VectorAssembler).
regressor.coefficients

DenseVector([-90.5483, 1608.7819])

In [16]:
# Intercept (constant term) of the fitted linear model.
regressor.intercept

16079.136690647425

In [17]:
# Evaluate the fitted model on the held-out test split; the returned summary
# object is kept so its predictions can be inspected in the next cell.
pred_results=regressor.evaluate(test_data)

In [18]:
# Show the test rows with the model output in the 'prediction' column,
# next to the actual 'Salary' for comparison.
pred_results.predictions.show()

+--------------------+------+-----------------+
|Independent features|Salary|       prediction|
+--------------------+------+-----------------+
|          [23.0,2.0]| 18000|17214.09079632846|
+--------------------+------+-----------------+

