# Spark ML-Regression
Using age and experience to estimate salary

In [2]:
from pyspark.sql import SparkSession

In [4]:
# Start (or reuse) the Spark session that every later cell reads data through.
spark = (
    SparkSession.builder
    .appName('ML')
    .getOrCreate()
)



In [5]:
# Load the salary dataset: the first line supplies the column names and
# Spark infers each column's type from the data.
df = spark.read.csv(
    'test4.csv',
    header=True,
    inferSchema=True,
)

In [7]:
# Print the loaded rows as a text table to check the data was read as expected.
df.show()

+---------+---+----------+------+
|     Name|age|Experience|Salary|
+---------+---+----------+------+
|    Krish| 31|        10| 30000|
|Sudhanshu| 30|         8| 25000|
|    Sunny| 29|         4| 20000|
|     Paul| 24|         3| 20000|
|   Harsha| 21|         1| 15000|
|  Shubham| 23|         2| 18000|
+---------+---+----------+------+



In [10]:
# `training` is another name for the same DataFrame; no copy or filtering is applied here.
training = df
# List the column names so the feature and label columns can be chosen.
training.columns

['Name', 'age', 'Experience', 'Salary']

Feature columns (model inputs): ['age', 'Experience']  
Label column (value to predict): 'Salary'

In [11]:
from pyspark.ml.feature import VectorAssembler
# Pack the two numeric predictors into the single vector column that is
# later passed to the regressor as its features column.
featureassembler = VectorAssembler(
    inputCols=['age', 'Experience'],
    outputCol='Grouped Inputs',
)

In [12]:
# Append the assembled feature vector as a new column; the original columns are kept.
output = featureassembler.transform(training)
output.show() # VectorAssembler only packs the input columns into one vector column; the values themselves are unchanged (shown as doubles)

+---------+---+----------+------+--------------+
|     Name|age|Experience|Salary|Grouped Inputs|
+---------+---+----------+------+--------------+
|    Krish| 31|        10| 30000|   [31.0,10.0]|
|Sudhanshu| 30|         8| 25000|    [30.0,8.0]|
|    Sunny| 29|         4| 20000|    [29.0,4.0]|
|     Paul| 24|         3| 20000|    [24.0,3.0]|
|   Harsha| 21|         1| 15000|    [21.0,1.0]|
|  Shubham| 23|         2| 18000|    [23.0,2.0]|
+---------+---+----------+------+--------------+



In [14]:
# Keep only what the regressor needs: the feature vector and the label.
final_dataset = output.select('Grouped Inputs', 'Salary')
final_dataset.show()

+--------------+------+
|Grouped Inputs|Salary|
+--------------+------+
|   [31.0,10.0]| 30000|
|    [30.0,8.0]| 25000|
|    [29.0,4.0]| 20000|
|    [24.0,3.0]| 20000|
|    [21.0,1.0]| 15000|
|    [23.0,2.0]| 18000|
+--------------+------+



In [15]:
from pyspark.ml.regression import LinearRegression
# Fix the seed so the train/test partition (and therefore the fitted
# coefficients and the evaluation below) is reproducible on Restart & Run All.
# Without a seed, randomSplit draws a different random partition on every run.
# NOTE: the dataset has only 6 rows and the 75/25 weights are approximate, so
# the held-out set is tiny; if a seed ever yields an empty test split, pick
# another one. Treat the resulting numbers as a demo, not a real evaluation.
train_data, test_data = final_dataset.randomSplit([0.75, 0.25], seed=42)
# Keep the unfitted estimator under its own name instead of overwriting it
# with the fitted model.
lr = LinearRegression(featuresCol='Grouped Inputs', labelCol='Salary')
# fit() returns a LinearRegressionModel; `regressor` is the fitted model that
# the following cells inspect and evaluate.
regressor = lr.fit(train_data)
regressor

LinearRegressionModel: uid=LinearRegression_fa9f20ebebec, numFeatures=2

In [16]:
# Learned weights, in the order of the assembler's inputCols: [age, Experience].
regressor.coefficients

DenseVector([47.619, 1285.7143])

In [17]:
# Learned bias term: the predicted Salary when every feature is 0.
regressor.intercept

13619.047619047662

In [19]:
# Score the fitted model on the held-out split. The returned summary object
# exposes the per-row predictions (`.predictions`), read in the next cell.
pred_results = regressor.evaluate(test_data)
pred_results

<pyspark.ml.regression.LinearRegressionSummary at 0x27affacf240>

In [20]:
# Compare each held-out row's actual Salary with the model's `prediction` column.
pred_results.predictions.show()

+--------------+------+-----------------+
|Grouped Inputs|Salary|       prediction|
+--------------+------+-----------------+
|    [23.0,2.0]| 18000|17285.71428571428|
|   [31.0,10.0]| 30000|27952.38095238097|
+--------------+------+-----------------+



In [24]:
# Re-display the full assembled dataset (all rows, before the train/test split) for reference.
output.show()

+---------+---+----------+------+--------------+
|     Name|age|Experience|Salary|Grouped Inputs|
+---------+---+----------+------+--------------+
|    Krish| 31|        10| 30000|   [31.0,10.0]|
|Sudhanshu| 30|         8| 25000|    [30.0,8.0]|
|    Sunny| 29|         4| 20000|    [29.0,4.0]|
|     Paul| 24|         3| 20000|    [24.0,3.0]|
|   Harsha| 21|         1| 15000|    [21.0,1.0]|
|  Shubham| 23|         2| 18000|    [23.0,2.0]|
+---------+---+----------+------+--------------+



# Multiple Linear Regression

In [26]:
# Reach the builder through the SparkSession class rather than through the
# existing `spark` instance, so this cell does not rely on `spark` already
# being defined. getOrCreate() returns the already-running session if there
# is one, so re-running this cell is harmless.
spark = SparkSession.builder.appName('ML').getOrCreate()

In [27]:
# Load the tips dataset: the first line supplies the column names and
# Spark infers each column's type from the data.
df = spark.read.csv(
    'tips.csv',
    header=True,
    inferSchema=True,
)


In [28]:
# Print the first rows of the tips data to see which columns are numeric and which are categorical strings.
df.show()

+----------+----+------+------+---+------+----+
|total_bill| tip|   sex|smoker|day|  time|size|
+----------+----+------+------+---+------+----+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2|
|     24.59|3.61|Female|    No|Sun|Dinner|   4|
|     25.29|4.71|  Male|    No|Sun|Dinner|   4|
|      8.77| 2.0|  Male|    No|Sun|Dinner|   2|
|     26.88|3.12|  Male|    No|Sun|Dinner|   4|
|     15.04|1.96|  Male|    No|Sun|Dinner|   2|
|     14.78|3.23|  Male|    No|Sun|Dinner|   2|
|     10.27|1.71|  Male|    No|Sun|Dinner|   2|
|     35.26| 5.0|Female|    No|Sun|Dinner|   4|
|     15.42|1.57|  Male|    No|Sun|Dinner|   2|
|     18.43| 3.0|  Male|    No|Sun|Dinner|   4|
|     14.83|3.02|Female|    No|Sun|Dinner|   2|
|     21.58|3.92|  Male|    No|Sun|Dinner|   2|
|     10.33|1.67|Female|    No|Sun|Dinner|   3|
|     16.29|3.71|  Male|    No|Sun|Dinne

We want to use `tip`, `sex`, `smoker`, `day`, `time`, and `size` as features to predict `total_bill`.

In [34]:
# To handle categorical features
from pyspark.ml.feature import StringIndexer

In [35]:
# Map the string column `sex` to a numeric index column `sex_idx`.
indexer = StringIndexer(
    inputCol='sex',
    outputCol='sex_idx',
)
# Fit learns the label-to-index mapping from `df`; transform then adds the
# index column, leaving the original columns in place.
df_r = (
    indexer
    .fit(df)
    .transform(df)
)
df_r.show()

+----------+----+------+------+---+------+----+-------+
|total_bill| tip|   sex|smoker|day|  time|size|sex_idx|
+----------+----+------+------+---+------+----+-------+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|    1.0|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|    0.0|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|    0.0|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2|    0.0|
|     24.59|3.61|Female|    No|Sun|Dinner|   4|    1.0|
|     25.29|4.71|  Male|    No|Sun|Dinner|   4|    0.0|
|      8.77| 2.0|  Male|    No|Sun|Dinner|   2|    0.0|
|     26.88|3.12|  Male|    No|Sun|Dinner|   4|    0.0|
|     15.04|1.96|  Male|    No|Sun|Dinner|   2|    0.0|
|     14.78|3.23|  Male|    No|Sun|Dinner|   2|    0.0|
|     10.27|1.71|  Male|    No|Sun|Dinner|   2|    0.0|
|     35.26| 5.0|Female|    No|Sun|Dinner|   4|    1.0|
|     15.42|1.57|  Male|    No|Sun|Dinner|   2|    0.0|
|     18.43| 3.0|  Male|    No|Sun|Dinner|   4|    0.0|
|     14.83|3.02|Female|    No|Sun|Dinner|   2| 

In [36]:
# Index several categorical columns in one pass.
idx_cols = ['smoker_idx', 'day_idx', 'time_idx']
indexer = StringIndexer(inputCols=['smoker', 'day', 'time'], outputCols=idx_cols)
# Make the cell safe to re-run: `df_r` is overwritten below, so on a second
# execution it already contains the index columns and the transform would fail
# because its output columns exist. DataFrame.drop ignores names that are not
# present, so this is a no-op on the first run.
df_r = df_r.drop(*idx_cols)
df_r = indexer.fit(df_r).transform(df_r)
df_r.show()

+----------+----+------+------+---+------+----+-------+----------+-------+--------+
|total_bill| tip|   sex|smoker|day|  time|size|sex_idx|smoker_idx|day_idx|time_idx|
+----------+----+------+------+---+------+----+-------+----------+-------+--------+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|    1.0|       0.0|    1.0|     0.0|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|    0.0|       0.0|    1.0|     0.0|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|    0.0|       0.0|    1.0|     0.0|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2|    0.0|       0.0|    1.0|     0.0|
|     24.59|3.61|Female|    No|Sun|Dinner|   4|    1.0|       0.0|    1.0|     0.0|
|     25.29|4.71|  Male|    No|Sun|Dinner|   4|    0.0|       0.0|    1.0|     0.0|
|      8.77| 2.0|  Male|    No|Sun|Dinner|   2|    0.0|       0.0|    1.0|     0.0|
|     26.88|3.12|  Male|    No|Sun|Dinner|   4|    0.0|       0.0|    1.0|     0.0|
|     15.04|1.96|  Male|    No|Sun|Dinner|   2|    0.0|       0.0|    1.0|  

In [46]:
# Largest index assigned to `day`; indices start at 0.0 (see the table above),
# so max + 1 is the number of distinct day labels.
df_r.agg({'day_idx': 'max'}).show()

+------------+
|max(day_idx)|
+------------+
|         3.0|
+------------+



In [47]:
# Check the column types: the new *_idx columns should be numeric, while the original categorical columns stay strings.
df_r.dtypes

[('total_bill', 'double'),
 ('tip', 'double'),
 ('sex', 'string'),
 ('smoker', 'string'),
 ('day', 'string'),
 ('time', 'string'),
 ('size', 'int'),
 ('sex_idx', 'double'),
 ('smoker_idx', 'double'),
 ('day_idx', 'double'),
 ('time_idx', 'double')]

In [48]:
from pyspark.ml.feature import VectorAssembler
# Assemble every predictor of total_bill into one vector column.
# NOTE(review): day_idx takes values 0-3, so the linear model will treat `day`
# as an ordered numeric quantity; one-hot encoding it may be more appropriate
# — confirm before relying on the coefficients.
featureassembler = VectorAssembler(
    inputCols=['tip', 'size', 'sex_idx', 'smoker_idx', 'day_idx', 'time_idx'],
    outputCol='Independent Feats',
)
output = featureassembler.transform(df_r)
# Show only the assembled vector column.
output.select('Independent Feats').show()

+--------------------+
|   Independent Feats|
+--------------------+
|[1.01,2.0,1.0,0.0...|
|[1.66,3.0,0.0,0.0...|
|[3.5,3.0,0.0,0.0,...|
|[3.31,2.0,0.0,0.0...|
|[3.61,4.0,1.0,0.0...|
|[4.71,4.0,0.0,0.0...|
|[2.0,2.0,0.0,0.0,...|
|[3.12,4.0,0.0,0.0...|
|[1.96,2.0,0.0,0.0...|
|[3.23,2.0,0.0,0.0...|
|[1.71,2.0,0.0,0.0...|
|[5.0,4.0,1.0,0.0,...|
|[1.57,2.0,0.0,0.0...|
|[3.0,4.0,0.0,0.0,...|
|[3.02,2.0,1.0,0.0...|
|[3.92,2.0,0.0,0.0...|
|[1.67,3.0,1.0,0.0...|
|[3.71,3.0,0.0,0.0...|
|[3.5,3.0,1.0,0.0,...|
|(6,[0,1],[3.35,3.0])|
+--------------------+
only showing top 20 rows



In [49]:
from pyspark.ml.regression import LinearRegression
# Train/test split. A fixed seed keeps the partition, and therefore the
# coefficients and metrics shown in the cells below, reproducible across kernel
# restarts; without it randomSplit draws a new random partition on every run.
train_data, test_data = output.randomSplit([0.75, 0.25], seed=42)
# Keep the unfitted estimator under its own name instead of overwriting it
# with the fitted model.
lr = LinearRegression(featuresCol='Independent Feats', labelCol='total_bill')
# `regressor` is the fitted LinearRegressionModel used by the following cells.
regressor = lr.fit(train_data)

In [50]:
# Learned weights, in the assembler's inputCols order:
# [tip, size, sex_idx, smoker_idx, day_idx, time_idx].
regressor.coefficients

DenseVector([2.992, 4.0129, -0.8691, 2.615, -0.2392, -0.4504])

In [51]:
# Learned bias term: the predicted total_bill when every feature is 0.
regressor.intercept

0.01605366012098355

In [52]:
# Score the fitted model on the held-out split; the summary provides the
# predictions and the metrics read in the next cells.
pred_results = regressor.evaluate(test_data)

In [53]:
# Compare the actual total_bill with the model's `prediction` column for held-out rows.
pred_results.predictions.show()

+----------+----+------+------+----+------+----+-------+----------+-------+--------+--------------------+------------------+
|total_bill| tip|   sex|smoker| day|  time|size|sex_idx|smoker_idx|day_idx|time_idx|   Independent Feats|        prediction|
+----------+----+------+------+----+------+----+-------+----------+-------+--------+--------------------+------------------+
|      9.94|1.56|  Male|    No| Sun|Dinner|   2|    0.0|       0.0|    1.0|     0.0|[1.56,2.0,0.0,0.0...|12.470197995321872|
|     10.09| 2.0|Female|   Yes| Fri| Lunch|   2|    1.0|       1.0|    3.0|     1.0|[2.0,2.0,1.0,1.0,...|14.603947480192696|
|     10.27|1.71|  Male|    No| Sun|Dinner|   2|    0.0|       0.0|    1.0|     0.0|[1.71,2.0,0.0,0.0...| 12.91899763852952|
|     10.34|1.66|  Male|    No| Sun|Dinner|   3|    0.0|       0.0|    1.0|     0.0|[1.66,3.0,0.0,0.0...|16.782296160962563|
|     10.65| 1.5|Female|    No|Thur| Lunch|   2|    1.0|       0.0|    2.0|     1.0|[1.5,2.0,1.0,0.0,...|10.732079905177986|


In [54]:
# Held-out fit quality, displayed as a tuple in this order:
# R², mean absolute error, mean squared error.
(
    pred_results.r2,
    pred_results.meanAbsoluteError,
    pred_results.meanSquaredError,
)

(0.5202421090995031, 4.908860843722387, 41.35817674907305)