In [0]:
fileLoc = "/FileStore/tables/reg.csv"
fileType = "csv"

dataFrame = spark.read.csv(fileLoc,header = True, inferSchema = True)
dataFrame.show()

dataFrame.printSchema() #for printing the dataset schema, column names of the dataset, and column data types. 









dataFrame.columns #will print column names in the dataFrame. 

+----------+---+----------+------------+----+------+
|total_bill|tip|first_name|   last_name|size|   sex|
+----------+---+----------+------------+----+------+
|       724|  6|    Esdras|      Athowe|   2|  Male|
|       348|  7| Gabriello|       Beers|  10|  Male|
|       795|  1|     Janel|   Southouse|   8|Female|
|       291|  5|    Cullie|    Petrenko|   2|  Male|
|       167|  4|   Alexine|  Van Giffen|   3|Female|
|       189|  9|       Rob|Dominichetti|   6|  Male|
|       757|  6|     Doyle|       Reary|   8|  Male|
|       756|  9|     Kevon|   Wiltshear|   3|  Male|
|       867|  8|  Mallissa|       Geill|   2|Female|
|       443|  4|     Moore|      Cowsby|   2|  Male|
|       363|  9|     Julie|     Junifer|   6|Female|
|       249|  3|    Almeda|    Welbrock|   6|Female|
|       112|  7|   Rubetta|       Balch|  10|Female|
|       352|  6| Ferdinand|        Lace|   6|  Male|
|       473|  2|      Arni|     Ziebart|   7|  Male|
|       150|  6|  Cordelia|     Valenta|   2|F

In [0]:
#Handling categorical features

from pyspark.ml.feature import StringIndexer

dataFrame.show()


indexer = StringIndexer(inputCol = "sex", outputCol = "sex_indexed") 
newDF = indexer.fit(dataFrame).transform(dataFrame) 
newDF.show() #indexing sex feature

#we can see from the output that the males are indexed with 1.0 and the females are indexed with 0.0



indexer = StringIndexer(inputCols = ["tip","total_bill"], outputCols = ["tip_indexed", "total_bill_indexed"]) #indexing tip and total_bill features 
newDF = indexer.fit(newDF).transform(newDF)
newDF.show()

+----------+---+----------+------------+----+------+
|total_bill|tip|first_name|   last_name|size|   sex|
+----------+---+----------+------------+----+------+
|       724|  6|    Esdras|      Athowe|   2|  Male|
|       348|  7| Gabriello|       Beers|  10|  Male|
|       795|  1|     Janel|   Southouse|   8|Female|
|       291|  5|    Cullie|    Petrenko|   2|  Male|
|       167|  4|   Alexine|  Van Giffen|   3|Female|
|       189|  9|       Rob|Dominichetti|   6|  Male|
|       757|  6|     Doyle|       Reary|   8|  Male|
|       756|  9|     Kevon|   Wiltshear|   3|  Male|
|       867|  8|  Mallissa|       Geill|   2|Female|
|       443|  4|     Moore|      Cowsby|   2|  Male|
|       363|  9|     Julie|     Junifer|   6|Female|
|       249|  3|    Almeda|    Welbrock|   6|Female|
|       112|  7|   Rubetta|       Balch|  10|Female|
|       352|  6| Ferdinand|        Lace|   6|  Male|
|       473|  2|      Arni|     Ziebart|   7|  Male|
|       150|  6|  Cordelia|     Valenta|   2|F

In [0]:
from pyspark.ml.feature import VectorAssembler

newDF.columns

 #VectorAssembler accepts only numerical or boolean datatypes.
featureMerger = VectorAssembler(
                inputCols = ['total_bill','tip', 'size', 'sex_indexed', 'total_bill_indexed'], 
                outputCol = "Independent Features"
                ) #merging the inputCols and creating a single column named as "Independent Features" consisting of the merged input features. 

output = featureMerger.transform(newDF) #transforming the dataset as specified in the featureMerger (adding the column named "Independent Features")

output.show()






+----------+---+----------+------------+----+------+-----------+-----------+------------------+--------------------+
|total_bill|tip|first_name|   last_name|size|   sex|sex_indexed|tip_indexed|total_bill_indexed|Independent Features|
+----------+---+----------+------------+----+------+-----------+-----------+------------------+--------------------+
|       724|  6|    Esdras|      Athowe|   2|  Male|        1.0|        3.0|             519.0|[724.0,6.0,2.0,1....|
|       348|  7| Gabriello|       Beers|  10|  Male|        1.0|        8.0|               9.0|[348.0,7.0,10.0,1...|
|       795|  1|     Janel|   Southouse|   8|Female|        0.0|        4.0|             216.0|[795.0,1.0,8.0,0....|
|       291|  5|    Cullie|    Petrenko|   2|  Male|        1.0|        1.0|             344.0|[291.0,5.0,2.0,1....|
|       167|  4|   Alexine|  Van Giffen|   3|Female|        0.0|        7.0|              26.0|[167.0,4.0,3.0,0....|
|       189|  9|       Rob|Dominichetti|   6|  Male|        1.0|

In [0]:
final_data = output.select("Independent Features", "total_bill") #selecting only the data under "Independent Features" and "total_bill"
final_data.show() #showing the selected data

+--------------------+----------+
|Independent Features|total_bill|
+--------------------+----------+
|[724.0,6.0,2.0,1....|       724|
|[348.0,7.0,10.0,1...|       348|
|[795.0,1.0,8.0,0....|       795|
|[291.0,5.0,2.0,1....|       291|
|[167.0,4.0,3.0,0....|       167|
|[189.0,9.0,6.0,1....|       189|
|[757.0,6.0,8.0,1....|       757|
|[756.0,9.0,3.0,1....|       756|
|[867.0,8.0,2.0,0....|       867|
|[443.0,4.0,2.0,1....|       443|
|[363.0,9.0,6.0,0....|       363|
|[249.0,3.0,6.0,0....|       249|
|[112.0,7.0,10.0,0...|       112|
|[352.0,6.0,6.0,1....|       352|
|[473.0,2.0,7.0,1....|       473|
|[150.0,6.0,2.0,0....|       150|
|[949.0,3.0,2.0,1....|       949|
|[506.0,1.0,5.0,0....|       506|
|[982.0,6.0,5.0,1....|       982|
|[68.0,3.0,7.0,0.0...|        68|
+--------------------+----------+
only showing top 20 rows



In [0]:

from pyspark.ml.regression import LinearRegression 

#splitting train data and test data 
trainData, testData = final_data.randomSplit([0.70,0.30])
regressor = LinearRegression(featuresCol = "Independent Features", labelCol = "total_bill")
regressor = regressor.fit(trainData)

regressor.coefficients #regression coefficients for each feature


regressor.intercept

Out[63]: -4.2161937848691774e-13

In [0]:
#Predictions 
pred_results = regressor.evaluate(testData)

#Comparison between prediction and total_bill
pred_results.predictions.show() # calling pred_results.predictions.show() to show all predictions of the feature values.


#Performance metrics (r square , mean absolute error , and mean squared error)
pred_results.r2, pred_results.meanAbsoluteError, pred_results.meanSquaredError


+--------------------+----------+------------------+
|Independent Features|total_bill|        prediction|
+--------------------+----------+------------------+
|[8.0,1.0,3.0,1.0,...|         8| 7.999999999999617|
|[8.0,1.0,7.0,1.0,...|         8| 7.999999999999788|
|[9.0,1.0,5.0,0.0,...|         9| 8.999999999999526|
|[14.0,4.0,4.0,0.0...|        14|13.999999999999622|
|[16.0,6.0,6.0,0.0...|        16|  15.9999999999997|
|[22.0,3.0,1.0,1.0...|        22|21.999999999999496|
|[29.0,4.0,7.0,1.0...|        29|28.999999999999837|
|[38.0,2.0,3.0,1.0...|        38| 37.99999999999968|
|[43.0,3.0,6.0,0.0...|        43|42.999999999999794|
|[47.0,1.0,10.0,0....|        47|46.999999999999964|
|[49.0,9.0,1.0,0.0...|        49|48.999999999999574|
|[51.0,2.0,10.0,0....|        51| 51.00000000000002|
|[51.0,6.0,5.0,1.0...|        51| 50.99999999999981|
|[51.0,7.0,8.0,1.0...|        51| 50.99999999999993|
|[52.0,8.0,2.0,0.0...|        52|51.999999999999616|
|[57.0,2.0,7.0,1.0...|        57|56.9999999999