In [1]:
# Installing PySpark
!pip install pyspark



In [2]:
# Importing libraries

import os
import io
import time
import json
import struct
import requests 
import pyspark

from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.streaming import StreamingContext
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression

In [3]:
# Build the notebook's Spark session (getOrCreate reuses an existing one if present)
# and display its summary as the cell output.
spark = (
    SparkSession.builder
    .appName('SalaryExperience')
    .getOrCreate()
)
spark

# BASICS

In [4]:
# Load the semicolon-delimited CSV: the first row supplies the column names and
# column types are inferred from the data.
df_pyspark = (
    spark.read
    .option("delimiter", ";")
    .csv('SalaryExperience.csv', header=True, inferSchema=True)
)
df_pyspark.show()

+-----+---+----------+------+------+
| Name|Age|Experience|Sector|Salary|
+-----+---+----------+------+------+
| Mark| 21|         3|    IT|  1500|
| John| 30|        12|    IT|  6000|
|Sarah| 22|         4| Sales|  2000|
| Carl| 45|        27|    IT| 13500|
| Lisa| 23|         5| Sales|  2500|
| Paul| 56|        38|    IT| 19000|
+-----+---+----------+------+------+



In [5]:
# Project a subset of the columns and display it
name_age_salary = df_pyspark.select('Name', 'Age', 'Salary')
name_age_salary.show()

+-----+---+------+
| Name|Age|Salary|
+-----+---+------+
| Mark| 21|  1500|
| John| 30|  6000|
|Sarah| 22|  2000|
| Carl| 45| 13500|
| Lisa| 23|  2500|
| Paul| 56| 19000|
+-----+---+------+



In [6]:
# Number of rows in the DataFrame (one row per employee in this dataset)
employee_count = df_pyspark.count()
employee_count

6

In [7]:
# Keep only the employees younger than 30 and display them
under_30 = df_pyspark.where(df_pyspark['Age'] < 30)
under_30.show()

+-----+---+----------+------+------+
| Name|Age|Experience|Sector|Salary|
+-----+---+----------+------+------+
| Mark| 21|         3|    IT|  1500|
|Sarah| 22|         4| Sales|  2000|
| Lisa| 23|         5| Sales|  2500|
+-----+---+----------+------+------+



In [8]:
# Mean salary over all employees (single-row result)
mean_salary = df_pyspark.select(avg('Salary'))
mean_salary.show()

+-----------------+
|      avg(Salary)|
+-----------------+
|7416.666666666667|
+-----------------+



In [9]:
# Mean salary within each sector
rows_by_sector = df_pyspark.groupBy('Sector')
rows_by_sector.avg('Salary').show()

+------+-----------+
|Sector|avg(Salary)|
+------+-----------+
| Sales|     2250.0|
|    IT|    10000.0|
+------+-----------+



In [10]:
# Highest salary within each sector
(
    df_pyspark
    .groupBy('Sector')
    .max('Salary')
    .show()
)

+------+-----------+
|Sector|max(Salary)|
+------+-----------+
| Sales|       2500|
|    IT|      19000|
+------+-----------+



# ML

In [11]:
# Load the data used for the regression. This reads the same CSV file, with the
# same options, as the exploratory section of the notebook.
csv_reader = spark.read.option("delimiter", ";")
training = csv_reader.csv('SalaryExperience.csv', header=True, inferSchema=True)
training.show()

+-----+---+----------+------+------+
| Name|Age|Experience|Sector|Salary|
+-----+---+----------+------+------+
| Mark| 21|         3|    IT|  1500|
| John| 30|        12|    IT|  6000|
|Sarah| 22|         4| Sales|  2000|
| Carl| 45|        27|    IT| 13500|
| Lisa| 23|         5| Sales|  2500|
| Paul| 56|        38|    IT| 19000|
+-----+---+----------+------+------+



In [12]:
# Assemble the predictor columns into one vector column, which is the input
# format the Spark ML regressor is given below.
predictor_columns = ["Age", "Experience"]
featureAssembler = VectorAssembler(
    inputCols=predictor_columns,
    outputCol="IndependentFeatures",
)

In [13]:
# Append the assembled "IndependentFeatures" vector column to the training rows
output = featureAssembler.transform(dataset=training)
output.show()

+-----+---+----------+------+------+-------------------+
| Name|Age|Experience|Sector|Salary|IndependentFeatures|
+-----+---+----------+------+------+-------------------+
| Mark| 21|         3|    IT|  1500|         [21.0,3.0]|
| John| 30|        12|    IT|  6000|        [30.0,12.0]|
|Sarah| 22|         4| Sales|  2000|         [22.0,4.0]|
| Carl| 45|        27|    IT| 13500|        [45.0,27.0]|
| Lisa| 23|         5| Sales|  2500|         [23.0,5.0]|
| Paul| 56|        38|    IT| 19000|        [56.0,38.0]|
+-----+---+----------+------+------+-------------------+



In [14]:
# Keep only what the model needs: the feature vector and the label column
model_columns = ["IndependentFeatures", "Salary"]
data = output.select(model_columns)
data.show()

+-------------------+------+
|IndependentFeatures|Salary|
+-------------------+------+
|         [21.0,3.0]|  1500|
|        [30.0,12.0]|  6000|
|         [22.0,4.0]|  2000|
|        [45.0,27.0]| 13500|
|         [23.0,5.0]|  2500|
|        [56.0,38.0]| 19000|
+-------------------+------+



In [15]:
# Splitting data into train and test set.
# Spark normalises the weights, so [0.66, 0.33] is roughly a 2:1 split. Rows are assigned
# at random, so the actual sizes vary; with very few rows one side can end up tiny or empty.
# A fixed seed keeps the split repeatable when the notebook is re-run on the same data.
train_data, test_data = data.randomSplit([0.66, 0.33], seed=42)

In [16]:
# Creating and training the regressor.
# The unfitted estimator keeps its own name instead of being overwritten by the fitted
# model, so `regressor` always refers to one kind of object: the fitted model that the
# following cells evaluate.
lr_estimator = LinearRegression(featuresCol="IndependentFeatures", labelCol="Salary")
regressor = lr_estimator.fit(train_data)

In [17]:
# Evaluate the fitted model on the held-out rows and display its predictions
pred_results = regressor.evaluate(test_data)
test_predictions = pred_results.predictions
test_predictions.show()

+-------------------+------+-----------------+
|IndependentFeatures|Salary|       prediction|
+-------------------+------+-----------------+
|         [23.0,5.0]|  2500|2499.999999999998|
|        [45.0,27.0]| 13500|          13500.0|
+-------------------+------+-----------------+



In [19]:
# Mean absolute error on the test rows.
# NOTE(review): in the rows printed when the CSV was loaded, Salary equals 500 x Experience
# and Age equals Experience + 18 for every row, so a near-zero error is expected here and
# says little about how the model would generalise.
test_mae = pred_results.meanAbsoluteError
test_mae

9.094947017729282e-13