In [1]:
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.sql import SparkSession

from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler, StringIndexer, VectorIndexer, MinMaxScaler
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import RegressionEvaluator, BinaryClassificationEvaluator

spark = SparkSession.builder.master("local[*]").getOrCreate()
filename_data = 'datasetelectro.csv'
csv = spark.read.csv(filename_data, inferSchema=True, header=True)
csv = csv.withColumn('mileage', csv.mileage.cast(IntegerType()))
csv.show(10)

+---+----------+---------+----------+----+-------+---------+------------+--------------+------+-------+
|_c0|     brand|    model|  offer_id|year|mileage|fuel_type|engine_power|  transmission| wheel|  price|
+---+----------+---------+----------+----+-------+---------+------------+--------------+------+-------+
|  0|    Toyota|     Vitz|1102756899|2001|   6236|   Бензин|        70.0|автоматическая|Правый| 230000|
|  1|LADA (ВАЗ)|     2107|1102754237|1995|  96000|   Бензин|        75.0|  механическая| Левый|  35000|
|  2|    Daewoo|   Nubira|1102763224|1998|  50000|   Бензин|       106.0|  механическая| Левый|  45000|
|  3| Chevrolet|     Aveo|1102757199|2014| 119120|   Бензин|       115.0|  механическая| Левый| 475000|
|  4| Chevrolet|  Lacetti|1102760232|2011|  87650|   Бензин|       109.0|  механическая| Левый| 479000|
|  5|       Kia|   Cerato|1102753937|2010| 160000|   Бензин|       126.0|  механическая| Левый| 520000|
|  6|    Nissan| Wingroad|1102757232|2011| 132300|   Бензин|    

In [2]:
splits = csv.randomSplit([0.7, 0.3])
train = splits[0]
test = splits[1].withColumnRenamed("price", "truePrice")
train_rows = train.count()
test_rows = test.count()
print("Training Rows:", train_rows, " Testing Rows:", test_rows)

Training Rows: 181606  Testing Rows: 77793


In [3]:
strIdx = StringIndexer(inputCols = ['brand', 'model', 'fuel_type','transmission', 'wheel'], 
                       outputCols = ['brand_index', 'model_index', 'fuel_type_index', 'transmission_index', 'wheel_index'], 
                       handleInvalid = "keep")
catVect = VectorAssembler(inputCols = ['brand_index', 'model_index', 'fuel_type_index','transmission_index', 'wheel_index'],
                          outputCol="features_cat")
catIdx = VectorIndexer(inputCol = catVect.getOutputCol(), 
                       outputCol = "features_index", 
                       handleInvalid = "keep")
numVect = VectorAssembler(inputCols = ["mileage",'engine_power', 'year'], 
                          outputCol="features_num", 
                          handleInvalid = "keep")
minMax = MinMaxScaler(inputCol = numVect.getOutputCol(), 
                      outputCol="features_norm")
featVect = VectorAssembler(inputCols=["features_index", "features_norm"], 
                           outputCol="features", 
                           handleInvalid = "keep")
rfr = RandomForestRegressor(featuresCol = 'features', 
                      labelCol='price',
                      numTrees = 10,
                      maxDepth=2,
                      maxBins = 181834)
pipeline = Pipeline(stages=[strIdx, catVect, catIdx, numVect, minMax, featVect, rfr])

In [4]:
pipelineModel = pipeline.fit(train)

In [5]:
pred_df = pipelineModel.transform(test)
pred_df.select("features", "prediction", "truePrice").show()

+--------------------+------------------+---------+
|            features|        prediction|truePrice|
+--------------------+------------------+---------+
|[4.0,57.0,0.0,0.0...| 669068.1614277184|   520000|
|[2.0,215.0,0.0,2....| 492916.3665875532|   584000|
|(8,[1,5,6,7],[16....|166562.25895872113|   120000|
|[2.0,404.0,1.0,0....| 642843.5537908359|   750000|
|[10.0,135.0,0.0,2...| 785747.4556966085|   889000|
|[4.0,30.0,0.0,0.0...| 750908.2812455347|   875000|
|[3.0,99.0,0.0,1.0...| 549124.6197953405|   530000|
|[6.0,36.0,0.0,0.0...| 285668.5128573777|   250000|
|[18.0,15.0,0.0,0....|166562.25895872113|    45000|
|(8,[5,6,7],[0.203...| 440836.8789195012|   539990|
|(8,[1,5,6,7],[5.0...|166562.25895872113|    75000|
|[3.0,68.0,0.0,1.0...| 559820.0316274439|   500000|
|(8,[5,6,7],[0.173...|371715.54959993827|   561000|
|[4.0,30.0,0.0,0.0...| 750908.2812455347|   777000|
|[13.0,167.0,1.0,1...| 549124.6197953405|   500000|
|[3.0,276.0,0.0,1....|267964.90474274184|   300000|
|[6.0,21.0,0

In [6]:
from pyspark.ml.evaluation import RegressionEvaluator

regressionEvaluator = RegressionEvaluator(predictionCol="prediction", labelCol="truePrice", metricName="rmse")

In [7]:
# RMSE
rmse = regressionEvaluator.evaluate(pred_df)
print(f"The RMSE for the random forest regression model is {rmse:0.2f}")
# MSE
mse = regressionEvaluator.setMetricName("mse").evaluate(pred_df)
print(f"The MSE for the random forest regression model is {mse:0.2f}")
# R2
r2 = regressionEvaluator.setMetricName("r2").evaluate(pred_df)
print(f"The R2 for the random forest regression model is {r2:0.2f}")
# MAE
mae = regressionEvaluator.setMetricName("mae").evaluate(pred_df)
print(f"The MAE for the random forest regression model is {mae:0.2f}")

The RMSE for the random forest regression model is 180333.89
The MSE for the random forest regression model is 32520311016.46
The R2 for the random forest regression model is 0.70
The MAE for the random forest regression model is 128254.86


In [8]:
param_grid = ParamGridBuilder().\
    addGrid(rfr.numTrees, [10, 15, 20]).\
    addGrid(rfr.maxDepth, [1, 2, 4]).\
    addGrid(rfr.maxBins , [181834, 362432, 724864]).\
    build()

In [9]:
cv = CrossValidator(estimator=pipeline, \
                    estimatorParamMaps=param_grid, \
                    evaluator=RegressionEvaluator(
                                predictionCol="prediction", \
                                labelCol="price", \
                                metricName="rmse"), \
                    numFolds=2)

In [10]:
cv_model = cv.fit(train)

In [11]:
newPrediction = cv_model.transform(test)

In [12]:
# RMSE
rmse = regressionEvaluator.evaluate(newPrediction)
print(f"The RMSE for the random forest regression model is {rmse:0.2f}")
# MSE
mse = regressionEvaluator.setMetricName("mse").evaluate(newPrediction)
print(f"The MSE for the random forest regression model is {mse:0.2f}")
# R2
r2 = regressionEvaluator.setMetricName("r2").evaluate(newPrediction)
print(f"The R2 for the random forest regression model is {r2:0.2f}")
# MAE
mae = regressionEvaluator.setMetricName("mae").evaluate(newPrediction)
print(f"The MAE for the random forest regression model is {mae:0.2f}")

The RMSE for the random forest regression model is 91192.43
The MSE for the random forest regression model is 17543341114.54
The R2 for the random forest regression model is 0.84
The MAE for the random forest regression model is 91192.43


2 часть

In [13]:
csv = csv.drop(csv._c0).withColumn('label', when(col('wheel') =='Левый', 1).otherwise(0))
csv.drop(csv.wheel).show()

+----------+---------+----------+----+-------+---------+------------+----------------+-------+-----+
|     brand|    model|  offer_id|year|mileage|fuel_type|engine_power|    transmission|  price|label|
+----------+---------+----------+----+-------+---------+------------+----------------+-------+-----+
|    Toyota|     Vitz|1102756899|2001|   6236|   Бензин|        70.0|  автоматическая| 230000|    0|
|LADA (ВАЗ)|     2107|1102754237|1995|  96000|   Бензин|        75.0|    механическая|  35000|    1|
|    Daewoo|   Nubira|1102763224|1998|  50000|   Бензин|       106.0|    механическая|  45000|    1|
| Chevrolet|     Aveo|1102757199|2014| 119120|   Бензин|       115.0|    механическая| 475000|    1|
| Chevrolet|  Lacetti|1102760232|2011|  87650|   Бензин|       109.0|    механическая| 479000|    1|
|       Kia|   Cerato|1102753937|2010| 160000|   Бензин|       126.0|    механическая| 520000|    1|
|    Nissan| Wingroad|1102757232|2011| 132300|   Бензин|       109.0|        вариатор| 5840

In [14]:
splits = csv.randomSplit([0.85, 0.15])
train = splits[0]
test = splits[1].withColumnRenamed("label", "trueLabel")
print("Training Rows:", train.count(), " Testing Rows:", test.count())

Training Rows: 220316  Testing Rows: 39083


In [15]:
strIdx = StringIndexer(inputCols = ['brand', 'model', 'fuel_type','transmission'], 
                       outputCols = ['brand_index', 'model_index', 'fuel_type_index', 'transmission_index',], 
                       handleInvalid = "keep")
catVect = VectorAssembler(inputCols = ['brand_index', 'model_index', 'fuel_type_index', 'transmission_index',], 
                          outputCol="features_cat")
catIdx = VectorIndexer(inputCol = catVect.getOutputCol(), 
                       outputCol = "features_index", 
                       handleInvalid = "keep")
numVect = VectorAssembler(inputCols = ["mileage",'engine_power', 'year'], 
                          outputCol="features_num", 
                          handleInvalid = "keep")
minMax = MinMaxScaler(inputCol = numVect.getOutputCol(), 
                      outputCol="features_norm")
featVect = VectorAssembler(inputCols=["features_index", "features_norm"], 
                           outputCol="features", 
                           handleInvalid = "keep")
lr = LogisticRegression(labelCol="label", 
                        featuresCol="features", 
                        maxIter=10,
                        regParam=0.3)
pipeline = Pipeline(stages=[strIdx, catVect, catIdx, numVect, minMax, featVect, lr])

In [16]:
pipelineModel = pipeline.fit(train)

In [17]:
pred_df = pipelineModel.transform(test)
pred_df.select("features", "prediction", "trueLabel").show()

+--------------------+----------+---------+
|            features|prediction|trueLabel|
+--------------------+----------+---------+
|[65.0,1127.0,0.0,...|       1.0|        1|
|[56.0,1163.0,0.0,...|       1.0|        1|
|[56.0,1163.0,0.0,...|       1.0|        1|
|[56.0,1305.0,0.0,...|       1.0|        1|
|[17.0,131.0,0.0,0...|       1.0|        1|
|[17.0,131.0,0.0,0...|       1.0|        1|
|[17.0,131.0,0.0,0...|       1.0|        1|
|[17.0,131.0,0.0,0...|       1.0|        1|
|[17.0,131.0,0.0,0...|       1.0|        1|
|[17.0,131.0,2.0,0...|       1.0|        1|
|[17.0,131.0,0.0,0...|       1.0|        1|
|[17.0,117.0,0.0,0...|       1.0|        1|
|[17.0,117.0,0.0,0...|       1.0|        1|
|[17.0,117.0,0.0,0...|       1.0|        1|
|[17.0,117.0,0.0,0...|       1.0|        1|
|[17.0,117.0,0.0,0...|       1.0|        1|
|[17.0,117.0,0.0,0...|       1.0|        1|
|[17.0,117.0,0.0,0...|       1.0|        1|
|[17.0,117.0,0.0,1...|       1.0|        1|
|[17.0,117.0,0.0,0...|       1.0

In [18]:
tp = float(pred_df.filter("prediction == 1.0 AND truelabel == 1").count())
fp = float(pred_df.filter("prediction == 1.0 AND truelabel == 0").count())
tn = float(pred_df.filter("prediction == 0.0 AND truelabel == 0").count())
fn = float(pred_df.filter("prediction == 0.0 AND truelabel == 1").count())
pr = tp / (tp + fp)
re = tp / (tp + fn)
metrics = spark.createDataFrame([
 ("TP", tp),
 ("FP", fp),
 ("TN", tn),
 ("FN", fn),
 ("Precision", pr),
 ("Recall", re),
 ("F1", 2*pr*re/(re+pr))],["metric", "value"])
metrics.show()

+---------+------------------+
|   metric|             value|
+---------+------------------+
|       TP|           35878.0|
|       FP|            3203.0|
|       TN|               0.0|
|       FN|               2.0|
|Precision|0.9180420153015532|
|   Recall|0.9999442586399108|
|       F1|0.9572444337722281|
+---------+------------------+



In [19]:
evaluator = BinaryClassificationEvaluator(labelCol="trueLabel", rawPredictionCol="rawPrediction", metricName="areaUnderROC")
aur = evaluator.evaluate(pred_df)
print ("AUR = ", aur)

AUR =  0.9056657751181565


In [20]:
paramGrid = ParamGridBuilder().\
    addGrid(lr.maxIter, [10, 20, 30]).\
    addGrid(lr.regParam, [0.3, 0.5, 0.7]).build()
cv = CrossValidator(estimator=pipeline, evaluator=BinaryClassificationEvaluator(metricName='areaUnderPR'), estimatorParamMaps=paramGrid, 
                    numFolds=2)

In [21]:
cv_model = cv.fit(train)

In [22]:
newPrediction = cv_model.transform(test)

In [23]:
# Recalculate confusion matrix
tp2 = float(newPrediction.filter("prediction == 1.0 AND truelabel == 1").count())
fp2 = float(newPrediction.filter("prediction == 1.0 AND truelabel == 0").count())
tn2 = float(newPrediction.filter("prediction == 0.0 AND truelabel == 0").count())
fn2 = float(newPrediction.filter("prediction == 0.0 AND truelabel == 1").count())
pr2 = tp2 / (tp2 + fp2)
re2 = tp2 / (tp2 + fn2)
metrics2 = spark.createDataFrame([
 ("TP", tp2),
 ("FP", fp2),
 ("TN", tn2),
 ("FN", fn2),
 ("Precision", pr2),
 ("Recall", re2),
 ("F1", 2*pr2*re2/(re2+pr2))],["metric", "value"])
metrics2.show()

+---------+------------------+
|   metric|             value|
+---------+------------------+
|       TP|           35878.0|
|       FP|            3203.0|
|       TN|               0.0|
|       FN|               2.0|
|Precision|0.9180420153015532|
|   Recall|0.9999442586399108|
|       F1|0.9572444337722281|
+---------+------------------+



In [24]:
# Recalculate the Area Under ROC
evaluator2 = BinaryClassificationEvaluator(labelCol="trueLabel", rawPredictionCol="prediction", metricName="areaUnderROC")
aur2 = evaluator2.evaluate(newPrediction)
print( "AUR2 = ", aur2)

AUR2 =  0.4999721293199554
