Лабораторная работа №2 - 1 

In [1]:
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.sql import SparkSession

from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler, StringIndexer, VectorIndexer, MinMaxScaler
from pyspark.ml.regression import LinearRegression
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import RegressionEvaluator

spark = SparkSession.builder.master("local[*]").getOrCreate()
filename_data = 'us-dealers-used-cleaned.csv'
csv = spark.read.csv(filename_data, inferSchema=True, header=True)
csv = csv.withColumn('miles', csv.miles.cast(IntegerType()))
csv.show(10)

+-------------+-----+------+----+-------------+-------------+---------+------------+-----------+
|           id|price| miles|year|         make|        model|body_type|transmission|engine_size|
+-------------+-----+------+----+-------------+-------------+---------+------------+-----------+
|38b2f52e-8f5d|20998|115879|2015|    Chevrolet|Express Cargo|Cargo Van|   Automatic|        4.8|
|97ba4955-ccf0|27921|  7339|2018|          BMW|           i3|Hatchback|   Automatic|        0.6|
|be1da9fd-0f34|11055| 39798|2018|   Mitsubishi|    Mirage G4|    Sedan|   Automatic|        1.2|
|84327e45-6cb6|52997| 28568|2019|    Chevrolet|     Colorado|   Pickup|   Automatic|        2.8|
|43847b9a-6fed| 3995|137537|2000|        Dodge|   Ram Pickup|   Pickup|      Manual|        5.2|
|8d10256f-3be9| 6500| 74274|2010|    Chevrolet|          HHR| Mini Mpv|   Automatic|        2.2|
|3c539e0f-3eb8|23024|131286|2016|    Chevrolet|     Colorado|   Pickup|   Automatic|        2.8|
|dffc4e35-48e7|16995|110615|20

In [2]:
splits = csv.randomSplit([0.7, 0.3])
train = splits[0]
test = splits[1].withColumnRenamed("price", "truePrice")
train_rows = train.count()
test_rows = test.count()
print("Training Rows:", train_rows, " Testing Rows:", test_rows)

Training Rows: 1607404  Testing Rows: 688792


In [3]:
strIdx = StringIndexer(inputCols = ['make', 'model', 'body_type','transmission'], 
                       outputCols = ['make_index', 'model_index', 'body_type_index', 'transmission_index'], 
                       handleInvalid = "keep")
catVect = VectorAssembler(inputCols = ['make_index', 'model_index', 'body_type_index','transmission_index'], 
                          outputCol="features_cat")
catIdx = VectorIndexer(inputCol = catVect.getOutputCol(), 
                       outputCol = "features_index", 
                       handleInvalid = "keep")
numVect = VectorAssembler(inputCols = ["engine_size",'miles'], 
                          outputCol="features_num", 
                          handleInvalid = "keep")
minMax = MinMaxScaler(inputCol = numVect.getOutputCol(), 
                      outputCol="features_norm")
featVect = VectorAssembler(inputCols=["features_index", "features_norm"], 
                           outputCol="features", 
                           handleInvalid = "keep")
lr = LinearRegression(featuresCol = 'features', 
                      labelCol='price', 
                      maxIter=10, 
                      regParam=0.3, 
                      elasticNetParam=0.8)
pipeline = Pipeline(stages=[strIdx, catVect, catIdx, numVect, minMax, featVect, lr])

In [4]:
pipelineModel = pipeline.fit(train)

In [5]:
pred_df = pipelineModel.transform(test)
pred_df.select("features", "prediction", "truePrice").show()

+--------------------+------------------+---------+
|            features|        prediction|truePrice|
+--------------------+------------------+---------+
|[21.0,85.0,0.0,0....|32377.773150161076|    43997|
|[5.0,16.0,0.0,0.0...| 29764.23458282672|    25999|
|[1.0,10.0,1.0,0.0...|21590.414538826146|    18491|
|[0.0,22.0,1.0,0.0...| 19981.94686854893|    12999|
|[26.0,100.0,4.0,1...| 25157.98761059677|    27993|
|[29.0,146.0,0.0,1...|  31129.0693842735|    54873|
|[0.0,0.0,2.0,0.0,...| 24035.75080278493|    27990|
|[14.0,56.0,0.0,0....|  21032.2452135954|    17944|
|[5.0,43.0,0.0,0.0...|29325.670086563667|    21995|
|[14.0,57.0,0.0,0....| 24867.92754636355|    29289|
|[4.0,6.0,3.0,0.0,...| 13160.78566134814|    16988|
|[9.0,59.0,0.0,0.0...|24559.122671455534|    34490|
|[10.0,186.0,4.0,0...| 4579.998900574454|     7495|
|[2.0,3.0,0.0,0.0,...|20572.826375476154|    22749|
|[10.0,31.0,0.0,0....|29873.409730857184|    30584|
|[6.0,28.0,0.0,0.0...|25001.958227539213|    35295|
|[3.0,1.0,0.

In [6]:
from pyspark.ml.evaluation import RegressionEvaluator

regressionEvaluator = RegressionEvaluator(predictionCol="prediction", labelCol="truePrice", metricName="rmse")

In [7]:
# RMSE
rmse = regressionEvaluator.evaluate(pred_df)
print(f"The RMSE for the linear regression model is {rmse:0.2f}")
# MSE
mse = regressionEvaluator.setMetricName("mse").evaluate(pred_df)
print(f"The MSE for the linear regression model is {mse:0.2f}")
# R2
r2 = regressionEvaluator.setMetricName("r2").evaluate(pred_df)
print(f"The R2 for the linear regression model is {r2:0.2f}")
# MAE
mae = regressionEvaluator.setMetricName("mae").evaluate(pred_df)
print(f"The MAE for the linear regression model is {mae:0.2f}")

The RMSE for the linear regression model is 7559.14
The MSE for the linear regression model is 57140578.32
The R2 for the linear regression model is 0.52
The MAE for the linear regression model is 5805.28


In [16]:
param_grid = ParamGridBuilder().\
    addGrid(lr.regParam, [0.1, 0.4, 0.9]).\
    addGrid(lr.maxIter, [10, 20, 30]).\
    addGrid(lr.solver, ['auto', 'normal', 'l-bfgs']).\
    build()

In [17]:
cv = CrossValidator(estimator=pipeline, \
                    estimatorParamMaps=param_grid, \
                    evaluator=RegressionEvaluator(
                                predictionCol="prediction", \
                                labelCol="price", \
                                metricName="rmse"), \
                    numFolds=2)

In [18]:
cv_model = cv.fit(train)

In [19]:
newPrediction = cv_model.transform(test)

In [20]:
# RMSE
rmse = regressionEvaluator.evaluate(newPrediction)
print(f"The RMSE for the linear regression model is {rmse:0.2f}")
# MSE
mse = regressionEvaluator.setMetricName("mse").evaluate(newPrediction)
print(f"The MSE for the linear regression model is {mse:0.2f}")
# R2
r2 = regressionEvaluator.setMetricName("r2").evaluate(newPrediction)
print(f"The R2 for the linear regression model is {r2:0.2f}")
# MAE
mae = regressionEvaluator.setMetricName("mae").evaluate(newPrediction)
print(f"The MAE for the linear regression model is {mae:0.2f}")

The RMSE for the linear regression model is 5805.28
The MSE for the linear regression model is 57140580.58
The R2 for the linear regression model is 0.52
The MAE for the linear regression model is 5805.28
