In [29]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("ComputerPricePrediction").getOrCreate()
df = spark.read.csv("computer_prices_all.csv", header=True, inferSchema=True)
df.show(5)

ConnectionRefusedError: [Errno 61] Connection refused

In [None]:
df.printSchema()

root
 |-- device_type: string (nullable = true)
 |-- brand: string (nullable = true)
 |-- model: string (nullable = true)
 |-- release_year: integer (nullable = true)
 |-- os: string (nullable = true)
 |-- form_factor: string (nullable = true)
 |-- cpu_brand: string (nullable = true)
 |-- cpu_model: string (nullable = true)
 |-- cpu_tier: integer (nullable = true)
 |-- cpu_cores: integer (nullable = true)
 |-- cpu_threads: integer (nullable = true)
 |-- cpu_base_ghz: double (nullable = true)
 |-- cpu_boost_ghz: double (nullable = true)
 |-- gpu_brand: string (nullable = true)
 |-- gpu_model: string (nullable = true)
 |-- gpu_tier: integer (nullable = true)
 |-- vram_gb: integer (nullable = true)
 |-- ram_gb: integer (nullable = true)
 |-- storage_type: string (nullable = true)
 |-- storage_gb: integer (nullable = true)
 |-- storage_drive_count: integer (nullable = true)
 |-- display_type: string (nullable = true)
 |-- display_size_in: double (nullable = true)
 |-- resolution: string (n

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, avg, count, when
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression


df.select([count(when(col(c).isNull(), c)).alias(c) for c in df.columns]).show()
df = df.na.drop()

+-----------+-----+-----+------------+---+-----------+---------+---------+--------+---------+-----------+------------+-------------+---------+---------+--------+-------+------+------------+----------+-------------------+------------+---------------+----------+----------+----------+-------------+---------+----+---------+---------+---------------+-----+
|device_type|brand|model|release_year| os|form_factor|cpu_brand|cpu_model|cpu_tier|cpu_cores|cpu_threads|cpu_base_ghz|cpu_boost_ghz|gpu_brand|gpu_model|gpu_tier|vram_gb|ram_gb|storage_type|storage_gb|storage_drive_count|display_type|display_size_in|resolution|refresh_hz|battery_wh|charger_watts|psu_watts|wifi|bluetooth|weight_kg|warranty_months|price|
+-----------+-----+-----+------------+---+-----------+---------+---------+--------+---------+-----------+------------+-------------+---------+---------+--------+-------+------+------------+----------+-------------------+------------+---------------+----------+----------+----------+----------

In [None]:
df.describe().show()



+-------+-----------+-------+---------------+------------------+--------+-----------+---------+----------------+------------------+------------------+-----------------+------------------+------------------+---------+----------------+-----------------+-----------------+------------------+------------+-----------------+-------------------+------------+------------------+----------+-----------------+----------------+------------------+-----------------+-------+------------------+------------------+-----------------+------------------+
|summary|device_type|  brand|          model|      release_year|      os|form_factor|cpu_brand|       cpu_model|          cpu_tier|         cpu_cores|      cpu_threads|      cpu_base_ghz|     cpu_boost_ghz|gpu_brand|       gpu_model|         gpu_tier|          vram_gb|            ram_gb|storage_type|       storage_gb|storage_drive_count|display_type|   display_size_in|resolution|       refresh_hz|      battery_wh|     charger_watts|        psu_watts|   wifi|

                                                                                

In [None]:
if 'brand' in df.columns:
    df.groupBy("brand").agg(avg("price").alias("avg_price")).orderBy("avg_price", ascending=False).show()


+--------+------------------+
|   brand|         avg_price|
+--------+------------------+
|   Apple|2362.2958329837716|
|   Razer| 2079.525773161878|
| Samsung|1930.3900743863956|
|     MSI| 1905.564325180663|
|    Dell|1882.8194894681794|
|Gigabyte|1866.3038775510154|
|  Lenovo| 1865.952356178231|
|      HP|1857.3487218366067|
|    ASUS|1848.1082202973764|
|    Acer|1760.3520151134458|
+--------+------------------+



In [None]:
# numeric columns
numeric_cols = [c for c, t in df.dtypes if t in ('int', 'double') and c != 'price']

# feature engineering
assembler = VectorAssembler(inputCols=numeric_cols, outputCol="features")
data = assembler.transform(df).select("features", "price")

# train-test split
train, test = data.randomSplit([0.8, 0.2], seed=42)

# Linear regressionmodel 
lr = LinearRegression(featuresCol="features", labelCol="price")
model = lr.fit(train)

# evaluation
predictions = model.transform(test)
predictions.select("price", "prediction").show(20)

print("Coefficients:", model.coefficients)
print("Intercept:", model.intercept)
print("RMSE:", model.summary.rootMeanSquaredError)
print("R2:", model.summary.r2)


25/10/22 15:53:24 WARN Instrumentation: [c7601bfb] regParam is zero, which might cause numerical instability and overfitting.


+-------+------------------+
|  price|        prediction|
+-------+------------------+
|1997.99|1350.0340566083105|
|1051.99|1336.3545680391471|
|1309.99|1300.1578800893913|
| 920.99| 948.5824920558152|
|1114.99|  1169.68976784587|
|1074.99|1210.2081830532916|
|1461.99| 1656.472985235414|
|1464.99|1210.7840702807443|
|1159.99|1187.0648645503243|
|1441.99|1339.1769672460432|
|1577.99|1421.1194999229192|
|1503.99|1519.2351413439537|
| 985.99| 1256.844232535077|
|1684.99|1248.6257351835156|
|1584.99|1315.7037679325294|
|1211.99|1406.0564778254702|
| 747.99| 919.1283089115459|
|1653.99|1094.8672736397712|
|1318.99| 1275.864413227515|
| 885.99|1195.6573717177234|
+-------+------------------+
only showing top 20 rows
Coefficients: [26.453038427292917,244.4657413376703,45.80916339063594,-24.10944894467012,-673.9418925611168,-3.0209216933495995,172.5717768312047,3.5210563784840048,1.742563796730069,0.0667082338454519,-1.1275195086279242,0.32823219851083246,1.0321337058377777,-0.038685014184948

Random Forest

In [None]:
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.evaluation import RegressionEvaluator


rf = RandomForestRegressor(featuresCol="features", labelCol="price", numTrees=100, maxDepth=8)
model_rf = rf.fit(train)


25/10/22 15:57:08 WARN DAGScheduler: Broadcasting large task binary with size 1112.0 KiB
25/10/22 15:57:09 WARN DAGScheduler: Broadcasting large task binary with size 2.1 MiB
25/10/22 15:57:10 WARN DAGScheduler: Broadcasting large task binary with size 4.1 MiB
25/10/22 15:57:11 WARN DAGScheduler: Broadcasting large task binary with size 1240.6 KiB
                                                                                

In [None]:
predictions_rf = model_rf.transform(test)
predictions_rf.select("price", "prediction").show(30)

# ewaluacja
evaluator_rmse = RegressionEvaluator(labelCol="price", predictionCol="prediction", metricName="rmse")
evaluator_r2 = RegressionEvaluator(labelCol="price", predictionCol="prediction", metricName="r2")

rmse_rf = evaluator_rmse.evaluate(predictions_rf)
r2_rf = evaluator_r2.evaluate(predictions_rf)

print(f"RMSE: {rmse_rf:.2f}")
print(f"R²: {r2_rf:.3f}")

+-------+------------------+
|  price|        prediction|
+-------+------------------+
|1997.99| 1394.142307086425|
|1051.99|1335.4535545267042|
|1309.99|1296.4490209636915|
| 920.99|1136.3329430875688|
|1114.99|1270.4449936118144|
|1074.99|1310.1638068632535|
|1461.99|1519.6133822147356|
|1464.99|1310.0563168501699|
|1159.99|1277.8582098247568|
|1441.99| 1418.989153116662|
|1577.99|1474.1185316301455|
|1503.99|1531.4727215011947|
| 985.99|1323.0443930141973|
|1684.99|1310.0005958873778|
|1584.99|1348.1871301242654|
|1211.99|1422.5928694586585|
| 747.99|1139.6609670942055|
|1653.99|1233.0906316237688|
|1318.99| 1329.683781995384|
| 885.99|1315.1289170191706|
|1134.99| 1273.564941531458|
|1877.99|1329.5294026169713|
|1059.99|1219.2507323935874|
|1200.99|1348.2899942675497|
|1161.99|1440.0109405002272|
|1655.99| 1333.243178896953|
|1110.99| 1434.411082582065|
|1208.99|1363.3033702714629|
|1218.99|1426.8773300551727|
|1510.99|1365.8289520151468|
+-------+------------------+
only showing t

In [None]:
from catboost import CatBoostRegressor, Pool
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

pdf = df.toPandas()

y = pdf['price']
X = pdf.drop(columns=['price'])

# podział na train/test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
cat_features = X.select_dtypes(include=['object']).columns.tolist()
print("Categorical columns:", cat_features)

Categorical columns: ['device_type', 'brand', 'model', 'os', 'form_factor', 'cpu_brand', 'cpu_model', 'gpu_brand', 'gpu_model', 'storage_type', 'display_type', 'resolution', 'wifi']


In [None]:
model_CB = CatBoostRegressor(
    iterations=500,
    learning_rate=0.05,
    depth=8,
    cat_features=cat_features,
    loss_function='RMSE',
    random_seed=42,
    verbose=100
)

model_CB.fit(X_train, y_train, cat_features=cat_features)

0:	learn: 560.6273552	total: 120ms	remaining: 59.9s
100:	learn: 224.6379408	total: 6.1s	remaining: 24.1s
200:	learn: 217.9873165	total: 11.9s	remaining: 17.7s
300:	learn: 214.8832598	total: 17.7s	remaining: 11.7s
400:	learn: 212.7531089	total: 23.8s	remaining: 5.87s
499:	learn: 211.1072684	total: 30.1s	remaining: 0us


<catboost.core.CatBoostRegressor at 0x1420e9d00>

In [None]:

from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

y_pred_CB = model_CB.predict(X_test)

rmse_CB = mean_squared_error(y_test, y_pred_CB)
r2_CB = r2_score(y_test, y_pred_CB)

print(f"RMSE: {rmse_CB:.2f}")
print(f"R²: {r2_CB:.3f}")

RMSE: 38449.69
R²: 0.884


In [None]:
import matplotlib.pyplot as plt

feature_importance = model_CB.get_feature_importance(prettified=True)
print(feature_importance)



             Feature Id  Importances
0              gpu_tier    20.599049
1                ram_gb    18.303036
2              cpu_tier    10.712142
3          display_type     8.488830
4            resolution     6.942143
5                 brand     6.732908
6                    os     6.605258
7           form_factor     3.847601
8            storage_gb     2.698919
9          release_year     2.165603
10      display_size_in     1.672739
11           refresh_hz     1.604658
12            cpu_brand     1.580651
13           battery_wh     1.346456
14         storage_type     1.243887
15        charger_watts     1.162570
16          device_type     0.744118
17            weight_kg     0.728996
18            cpu_cores     0.716039
19            gpu_model     0.641338
20            psu_watts     0.369945
21              vram_gb     0.276011
22                 wifi     0.254895
23            gpu_brand     0.149148
24         cpu_base_ghz     0.114065
25            cpu_model     0.103173
2

25/10/22 17:50:49 WARN HeartbeatReceiver: Removing executor driver with no recent heartbeats: 1015618 ms exceeds timeout 120000 ms
25/10/22 17:50:49 WARN SparkContext: Killing executors is not supported by current scheduler.
25/10/22 17:50:49 ERROR Inbox: Ignoring error
org.apache.spark.SparkException: Exception thrown in awaitResult: 
	at org.apache.spark.util.SparkThreadUtils$.awaitResult(SparkThreadUtils.scala:53)
	at org.apache.spark.util.ThreadUtils$.awaitResult(ThreadUtils.scala:342)
	at org.apache.spark.rpc.RpcTimeout.awaitResult(RpcTimeout.scala:75)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRefByURI(RpcEnv.scala:102)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRef(RpcEnv.scala:110)
	at org.apache.spark.util.RpcUtils$.makeDriverRef(RpcUtils.scala:36)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.driverEndpoint$lzycompute(BlockManagerMasterEndpoint.scala:132)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.org$apache$spark$storage$BlockManagerMasterEndpoint$

In [30]:
spark.stop()

ConnectionRefusedError: [Errno 61] Connection refused