In [1]:
import pyspark
from pyspark.sql import SparkSession

# MLlib
MLlib is Spark's machine learning library.

MLlib exposes two different APIs for machine learning:  
1. the RDD-based API (`pyspark.mllib`)
2. the DataFrame-based API (`pyspark.ml`)

We will focus on the DataFrame-based API.

In [3]:
# Start (or reuse) the Spark session that every DataFrame below hangs off.
session_builder = SparkSession.builder.appName('MLlib-intro')
spark = session_builder.getOrCreate()
spark

In [4]:
# Load the semicolon-separated sample file; the first row holds the column
# names and the column types are inferred from the values.
csv_reader = (
    spark.read
    .option('header', True)
    .option('inferSchema', True)
    .option('delimiter', ';')
)
data = csv_reader.csv('data/test_tut06.csv')
data.show()

+-------+---+----------+------+
|   Name|Age|Experience|Salary|
+-------+---+----------+------+
|  Krish| 31|        10|190000|
|  Sunny| 30|         8|150000|
|  Sudfa| 29|         4|100000|
| Harsha| 24|         3|100000|
|Shubham| 21|         1| 50000|
|   Paul| 23|         2| 65000|
+-------+---+----------+------+



We are going to predict **Salary** from `Age` and `Experience`.

In [5]:
from pyspark.ml.feature import VectorAssembler

In [6]:
# Pack the numeric predictor columns into the single vector column that
# Spark ML estimators take as their features input.
independent_features = VectorAssembler(
    inputCols=['Age', 'Experience'],
    outputCol='Combination')
# `data` is rebound to the assembled frame, so re-running this cell would fail
# because the output column already exists. Dropping it first (a no-op when
# the column is absent) keeps the cell safe to re-run.
data = independent_features.transform(data.drop('Combination'))
data.show()

+-------+---+----------+------+-----------+
|   Name|Age|Experience|Salary|Combination|
+-------+---+----------+------+-----------+
|  Krish| 31|        10|190000|[31.0,10.0]|
|  Sunny| 30|         8|150000| [30.0,8.0]|
|  Sudfa| 29|         4|100000| [29.0,4.0]|
| Harsha| 24|         3|100000| [24.0,3.0]|
|Shubham| 21|         1| 50000| [21.0,1.0]|
|   Paul| 23|         2| 65000| [23.0,2.0]|
+-------+---+----------+------+-----------+



**VectorAssembler** combines all of our feature columns into a single vector column (one combined independent feature).

Now we want to predict `Salary` using our new single `Combination` feature.

In [7]:
# Keep only what the regression needs: the feature vector and the label.
final_data = data.select('Combination', 'Salary')
final_data.show()

+-----------+------+
|Combination|Salary|
+-----------+------+
|[31.0,10.0]|190000|
| [30.0,8.0]|150000|
| [29.0,4.0]|100000|
| [24.0,3.0]|100000|
| [21.0,1.0]| 50000|
| [23.0,2.0]| 65000|
+-----------+------+



In [23]:
from pyspark.ml.regression import LinearRegression

# Seed the split so that the train/test rows, and therefore the fitted
# coefficients and the metrics further down, do not change on every run.
train, test = final_data.randomSplit([0.65, 0.35], seed=42)

# Keep the unfitted estimator and the fitted model under separate names so
# that `model` always refers to the fitted model.
regressor = LinearRegression(featuresCol='Combination',
                             labelCol='Salary')
model = regressor.fit(train)
model.coefficients, model.intercept

(DenseVector([9912.2807, 4692.9825]), -162850.87719294647)

How do we make a prediction?

In [26]:
# Show the held-out rows, i.e. the part of the split the model was not fitted on.
test.show()

+-----------+------+
|Combination|Salary|
+-----------+------+
| [29.0,4.0]|100000|
| [30.0,8.0]|150000|
+-----------+------+



In [24]:
# Score the fitted model on the held-out split; the returned summary object
# carries the predictions and the error metrics read in the cells below.
pred_result = model.evaluate(test)

In [25]:
# Held-out rows with the model's `prediction` column next to the true `Salary`.
pred_result.predictions.show()

+-----------+------+------------------+
|Combination|Salary|        prediction|
+-----------+------+------------------+
| [29.0,4.0]|100000|143377.19298244864|
| [30.0,8.0]|150000|172061.40350877013|
+-----------+------+------------------+



The evaluation summary also exposes error metrics:

In [31]:
# Mean squared error of the predictions on the held-out split.
pred_result.meanSquaredError

1184143197.9066834

In [32]:
# Mean absolute error of the predictions on the held-out split.
pred_result.meanAbsoluteError

32719.29824560939