In [2]:
import findspark
# Run findspark's setup before the first `pyspark` import (which happens in the
# next cell). NOTE(review): this is expected to put the local Spark installation
# on the import path — confirm SPARK_HOME is set in the target environment.
findspark.init()

In [3]:
from pyspark.sql import SparkSession

# Start a Spark session named "RegressionTask", or reuse one that is already running.
# Every later cell reads data and runs ML stages through this `spark` handle.
spark = (
    SparkSession.builder
    .appName("RegressionTask")
    .getOrCreate()
)


In [4]:
# NOTE(review): `lit` is not used in any cell visible here — confirm it is
# needed elsewhere before dropping it.
from pyspark.sql.functions import lit

# Load the reimbursement report from JSON. The path is relative to the
# notebook's working directory, and no schema is passed, so Spark infers one.
df = spark.read.json('./reimbursement_report')

# Preview the loaded rows. Without this call the cell prints nothing, so the
# table recorded under this cell could not be reproduced on a fresh run.
df.show()

+-----+-----------------------------+--------------------+-------+
|price|reimbursement_for_delay_total|             trip_id|user_id|
+-----+-----------------------------+--------------------+-------+
|123.8|                       16.094|75adb101-9183-418...|   5356|
|200.5|                        601.0|028ce82f-e2bc-4a9...|   5358|
|200.5|                          0.0|a974b1ea-dfff-40a...|    123|
|100.5|                        100.5|028ce82f-e2bc-4a9...|   5359|
| 76.8|                        9.984|75adb101-9183-418...|   5357|
+-----+-----------------------------+--------------------+-------+



In [5]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, VectorAssembler

# Map each distinct user_id / trip_id value to a numeric index column
indexer_user = StringIndexer(
    inputCol="user_id",
    outputCol="user_id_index",
)
indexer_trip = StringIndexer(
    inputCol="trip_id",
    outputCol="trip_id_index",
)

# Pack the two index columns and the price into a single "features" vector.
# NOTE(review): the indices enter the vector as plain numbers, so a linear
# model will read them as ordered quantities — confirm this is intended
# (one-hot encoding is the usual alternative for identifier-like columns).
assembler = VectorAssembler(
    inputCols=["user_id_index", "trip_id_index", "price"],
    outputCol="features",
)

# Chain the stages, fit them on df, then apply the fitted stages to that same df
pipeline = Pipeline(stages=[indexer_user, indexer_trip, assembler])
pipeline_model = pipeline.fit(df)
df_transformed = pipeline_model.transform(df)

# Preview the result, including the new index and feature columns
df_transformed.show()


+-----+-----------------------------+--------------------+-------+-------------+-------------+---------------+
|price|reimbursement_for_delay_total|             trip_id|user_id|user_id_index|trip_id_index|       features|
+-----+-----------------------------+--------------------+-------+-------------+-------------+---------------+
|123.8|                       16.094|75adb101-9183-418...|   5356|          1.0|          1.0|[1.0,1.0,123.8]|
|200.5|                        601.0|028ce82f-e2bc-4a9...|   5358|          3.0|          0.0|[3.0,0.0,200.5]|
|200.5|                          0.0|a974b1ea-dfff-40a...|    123|          0.0|          2.0|[0.0,2.0,200.5]|
|100.5|                        100.5|028ce82f-e2bc-4a9...|   5359|          4.0|          0.0|[4.0,0.0,100.5]|
| 76.8|                        9.984|75adb101-9183-418...|   5357|          2.0|          1.0| [2.0,1.0,76.8]|
+-----+-----------------------------+--------------------+-------+-------------+-------------+---------------+



In [6]:
from pyspark.ml.regression import LinearRegression

# Linear regression that predicts the total delay reimbursement from the
# assembled "features" vector
lr = LinearRegression(
    featuresCol="features",
    labelCol="reimbursement_for_delay_total",
)

# Fit on the full transformed DataFrame (no rows are held out in this cell)
lr_model = lr.fit(df_transformed)

# Report the learned parameters of the fitted model
print(f"Coefficients: {lr_model.coefficients}")
print(f"Intercept: {lr_model.intercept}")


Coefficients: [-52.574388518838546,-333.7264736689342,2.895235062094774]
Intercept: 111.0966485534763


In [7]:
from pyspark.ml.evaluation import RegressionEvaluator

# Score the same DataFrame the model was fitted on, so the metrics below are
# in-sample (training) figures rather than an estimate on unseen data.
predictions = lr_model.transform(df_transformed)

# Put the actual label next to the model's prediction for each row
predictions.select("features", "reimbursement_for_delay_total", "prediction").show()

# One evaluator serves both metrics; only its metricName changes between runs.
evaluator = RegressionEvaluator(
    labelCol="reimbursement_for_delay_total",
    predictionCol="prediction",
    metricName="rmse",
)
rmse = evaluator.evaluate(predictions)
print(f"Root Mean Squared Error (RMSE): {rmse}")

# Re-target the same evaluator at R-squared
evaluator.setMetricName("r2")
r2 = evaluator.evaluate(predictions)
print(f"R-squared (R2): {r2}")


+---------------+-----------------------------+-------------------+
|       features|reimbursement_for_delay_total|         prediction|
+---------------+-----------------------------+-------------------+
|[1.0,1.0,123.8]|                       16.094|   83.2258870530366|
|[3.0,0.0,200.5]|                        601.0|  533.8681129469629|
|[0.0,2.0,200.5]|                          0.0| 24.138331165610154|
|[4.0,0.0,100.5]|                        100.5| 191.77021821864696|
| [2.0,1.0,76.8]|                        9.984|-105.42454938425631|
+---------------+-----------------------------+-------------------+

Root Mean Squared Error (RMSE): 79.05115502599796
R-squared (R2): 0.8824414160094576
