In [1]:
from pyspark.sql import SparkSession

In [3]:
spark = SparkSession.builder.appName('SparkPredictions').getOrCreate()

In [7]:
df = spark.read.csv('tips.csv', header=True, inferSchema=True)

In [8]:
df.show()

+----------+----+------+------+---+---------+----+
|total_bill| tip|   sex|smoker|day|     time|size|
+----------+----+------+------+---+---------+----+
|     16.99|1.01|Female|    No|Sun|   Dinner|   2|
|     10.34|1.66|  Male|    No|Sun|   Dinner|   3|
|     21.01| 3.5|  Male|    No|Sat|   Dinner|   3|
|     23.68|3.31|  Male|    No|Fri|   Dinner|   2|
|     24.59|3.61|Female|    No|Sun|   Dinner|   4|
|     25.29|4.71|  Male|    No|Mon|   Dinner|   4|
|      8.77| 2.0|  Male|    No|Sun|Breakfast|   2|
|     26.88|3.12|  Male|    No|Tue|   Dinner|   4|
|     15.04|1.96|  Male|    No|Sun|   Dinner|   2|
|     14.78|3.23|  Male|    No|Sun|   Dinner|   2|
|     10.27|1.71|  Male|    No|Sun|    Lunch|   2|
|     35.26| 5.0|Female|    No|Sun|   Dinner|   4|
|     15.42|1.57|  Male|    No|Sun|   Dinner|   2|
|     18.43| 3.0|  Male|    No|Sun|   Dinner|   4|
|     14.83|3.02|Female|    No|Sun|   Supper|   2|
|     21.58|3.92|  Male|    No|Sun|   Dinner|   2|
|     10.33|1.67|Female|    No|

### Handling categorical features

In [6]:
from pyspark.ml.feature import StringIndexer

In [9]:
indexer = StringIndexer(inputCols=['sex', 'smoker', 'day', 'time'],
                        outputCols=['sex_indexed', 'smoker_indexed', 'day_indexed', 'time_indexed'])

df_indexed = indexer.fit(df).transform(df)
df_indexed.show()

+----------+----+------+------+---+---------+----+-----------+--------------+-----------+------------+
|total_bill| tip|   sex|smoker|day|     time|size|sex_indexed|smoker_indexed|day_indexed|time_indexed|
+----------+----+------+------+---+---------+----+-----------+--------------+-----------+------------+
|     16.99|1.01|Female|    No|Sun|   Dinner|   2|        1.0|           0.0|        1.0|         0.0|
|     10.34|1.66|  Male|    No|Sun|   Dinner|   3|        0.0|           0.0|        1.0|         0.0|
|     21.01| 3.5|  Male|    No|Sat|   Dinner|   3|        0.0|           0.0|        0.0|         0.0|
|     23.68|3.31|  Male|    No|Fri|   Dinner|   2|        0.0|           0.0|        3.0|         0.0|
|     24.59|3.61|Female|    No|Sun|   Dinner|   4|        1.0|           0.0|        1.0|         0.0|
|     25.29|4.71|  Male|    No|Mon|   Dinner|   4|        0.0|           0.0|        4.0|         0.0|
|      8.77| 2.0|  Male|    No|Sun|Breakfast|   2|        0.0|           

In [10]:
from pyspark.ml.feature import VectorAssembler

feature_assembler = VectorAssembler(
    inputCols=['tip', 'size', 'sex_indexed', 'smoker_indexed', 'day_indexed', 'time_indexed'],
    outputCol='Independent Features'
)

output = feature_assembler.transform(df_indexed)

In [11]:
output.show()

+----------+----+------+------+---+---------+----+-----------+--------------+-----------+------------+--------------------+
|total_bill| tip|   sex|smoker|day|     time|size|sex_indexed|smoker_indexed|day_indexed|time_indexed|Independent Features|
+----------+----+------+------+---+---------+----+-----------+--------------+-----------+------------+--------------------+
|     16.99|1.01|Female|    No|Sun|   Dinner|   2|        1.0|           0.0|        1.0|         0.0|[1.01,2.0,1.0,0.0...|
|     10.34|1.66|  Male|    No|Sun|   Dinner|   3|        0.0|           0.0|        1.0|         0.0|[1.66,3.0,0.0,0.0...|
|     21.01| 3.5|  Male|    No|Sat|   Dinner|   3|        0.0|           0.0|        0.0|         0.0| (6,[0,1],[3.5,3.0])|
|     23.68|3.31|  Male|    No|Fri|   Dinner|   2|        0.0|           0.0|        3.0|         0.0|[3.31,2.0,0.0,0.0...|
|     24.59|3.61|Female|    No|Sun|   Dinner|   4|        1.0|           0.0|        1.0|         0.0|[3.61,4.0,1.0,0.0...|
|     25

In [12]:
finalized_data = output.select(['Independent Features', 'total_bill'])

In [13]:
from pyspark.ml.regression import LinearRegression

train_data, test_data = finalized_data.randomSplit([0.75, 0.25])
regression = LinearRegression(featuresCol='Independent Features', labelCol='total_bill')
regressor = regression.fit(train_data)

23/01/10 13:51:56 WARN Instrumentation: [7ce2f583] regParam is zero, which might cause numerical instability and overfitting.


In [14]:
regressor.coefficients

DenseVector([3.1087, 3.3866, -0.6019, 2.9328, -0.1926, -1.0811])

In [15]:
regressor.intercept

1.4762055716500542

In [17]:
pred = regressor.evaluate(test_data)
pred.predictions.show()
pred.r2

+--------------------+----------+------------------+
|Independent Features|total_bill|        prediction|
+--------------------+----------+------------------+
|(6,[0,1],[1.45,2.0])|      9.55|12.757121126170256|
| (6,[0,1],[2.0,3.0])|     16.31|17.853554016158448|
|(6,[0,1],[2.01,2.0])|     20.23|14.498000765440375|
|(6,[0,1],[2.24,3.0])|     16.04|18.599645290131356|
|(6,[0,1],[2.31,3.0])|     18.69|18.817255245040123|
|(6,[0,1],[3.18,2.0])|     19.82|18.135195726058306|
| (6,[0,1],[3.6,3.0])|     24.06|22.827495842644502|
|(6,[0,1],[5.92,3.0])|     29.03| 30.03971149104928|
|[1.44,2.0,0.0,1.0...|      7.74|15.658794826325868|
|[1.5,2.0,0.0,0.0,...|     19.08|11.446286182544954|
|[1.5,2.0,0.0,1.0,...|     11.59|15.845317644819096|
|[1.5,2.0,0.0,1.0,...|     15.69|15.652715759717934|
|[1.61,2.0,1.0,1.0...|     10.59|15.585355551316209|
|[1.63,2.0,1.0,0.0...|     11.87|11.248498361873141|
|[1.64,2.0,0.0,1.0...|     15.36|16.280537554636627|
|[1.66,3.0,0.0,0.0...|     10.34|   16.6039894

0.47970762948749046