In [72]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator

In [52]:
# Obtain the SparkSession for this notebook (reuses an existing session if one is
# already active, otherwise creates one) under the application name "practica".
spark = (
    SparkSession.builder
    .appName("practica")
    .getOrCreate()
)

In [53]:
# Load the processed training set: the first CSV row supplies the column names and
# column types are inferred from the data rather than read as plain strings.
df_train = spark.read.csv(
    path="../data/processed/train/train.csv",
    header=True,
    inferSchema=True,
)

In [54]:
# Preview the training frame (first 20 rows, long cell values truncated).
df_train.show(n=20, truncate=True)

+---+-----------+-----+----------+-----------+----------+------+---------+--------+-----+------+--------------+---------------+--------------+---------+-----+-----+-------+------------------+
|Ram|Price_euros|Intel|Intel_baja|Intel_media|Intel_alta|Gaming|Ultrabook|Notebook|  AMD|Nvidia|Nvidia_Geforce|Nvidia_graphics|Intel_graphics|Screen_HD|  SSD|  HDD|HDD+SSD|               ppi|
+---+-----------+-----+----------+-----------+----------+------+---------+--------+-----+------+--------------+---------------+--------------+---------+-----+-----+-------+------------------+
|  8|      749.0| true|     false|      false|     false| false|    false|    true|false| false|         false|          false|         false|    false|false| true|  false|100.45466986113651|
|  4|      449.0| true|     false|      false|     false| false|    false|    true|false| false|         false|           true|          true|    false|false| true|  false|100.45466986113651|
|  8|     1460.0| true|     false|      

In [55]:
# Load the processed test set with the same reader settings as the training set
# (header row for column names, inferred column types).
df_test = spark.read.csv(
    path="../data/processed/test/test.csv",
    header=True,
    inferSchema=True,
)

In [56]:
# Preview the test frame (first 20 rows, long cell values truncated).
df_test.show(n=20, truncate=True)

+---+-----+----------+-----------+----------+------+---------+--------+-----+------+--------------+---------------+--------------+---------+-----+-----+-------+------------------+
|Ram|Intel|Intel_baja|Intel_media|Intel_alta|Gaming|Ultrabook|Notebook|  AMD|Nvidia|Nvidia_Geforce|Nvidia_graphics|Intel_graphics|Screen_HD|  SSD|  HDD|HDD+SSD|               ppi|
+---+-----+----------+-----------+----------+------+---------+--------+-----+------+--------------+---------------+--------------+---------+-----+-----+-------+------------------+
| 16| true|     false|      false|     false| false|     true|   false|false| false|         false|           true|          true|     true| true|false|  false| 331.2642361026012|
|  8|false|     false|      false|     false| false|    false|    true| true| false|         false|          false|         false|    false| true|false|  false|100.45466986113651|
|  4| true|     false|      false|     false| false|    false|    true|false| false|         false| 

In [57]:
# Split the training frame into predictors (X) and target (y).
# The target name is spelled exactly as in the CSV header ("Price_euros"), matching the
# labelCol used when fitting the model. DataFrame.drop silently does nothing when the
# name matches no column, so a differently-cased spelling only works while Spark
# resolves column names case-insensitively (its default); under
# spark.sql.caseSensitive=true the target would stay in X and leak into the features.
X = df_train.drop("Price_euros")
y = df_train.select("Price_euros")

In [58]:
# Preview the predictor columns (first 20 rows, long cell values truncated).
X.show(n=20, truncate=True)

+---+-----+----------+-----------+----------+------+---------+--------+-----+------+--------------+---------------+--------------+---------+-----+-----+-------+------------------+
|Ram|Intel|Intel_baja|Intel_media|Intel_alta|Gaming|Ultrabook|Notebook|  AMD|Nvidia|Nvidia_Geforce|Nvidia_graphics|Intel_graphics|Screen_HD|  SSD|  HDD|HDD+SSD|               ppi|
+---+-----+----------+-----------+----------+------+---------+--------+-----+------+--------------+---------------+--------------+---------+-----+-----+-------+------------------+
|  8| true|     false|      false|     false| false|    false|    true|false| false|         false|          false|         false|    false|false| true|  false|100.45466986113651|
|  4| true|     false|      false|     false| false|    false|    true|false| false|         false|           true|          true|    false|false| true|  false|100.45466986113651|
|  8| true|     false|      false|     false| false|    false|    true|false| false|         false| 

In [59]:
# Preview the target column (first 20 rows).
y.show(n=20, truncate=True)

+-----------+
|price_euros|
+-----------+
|      749.0|
|      449.0|
|     1460.0|
|    2868.99|
|    1713.37|
|     1099.0|
|      415.0|
|     1142.8|
|     1193.0|
|     1600.0|
|      309.0|
|      855.0|
|     1119.0|
|      649.0|
|      499.0|
|    1096.16|
|      629.0|
|      549.0|
|     4899.0|
|      854.0|
+-----------+
only showing top 20 rows



In [65]:
# Pack every column of X into a single vector column named "features".
# The assembler is applied to df_train (not X) so the resulting frame keeps the
# Price_euros column alongside the feature vector.
vector_assembler_train = VectorAssembler(
    outputCol="features",
    inputCols=X.columns,
)
X_assembled = vector_assembler_train.transform(df_train)

In [66]:
# Preview the assembled training frame, including the new "features" column
# (first 20 rows, long cell values truncated).
X_assembled.show(n=20, truncate=True)

+---+-----------+-----+----------+-----------+----------+------+---------+--------+-----+------+--------------+---------------+--------------+---------+-----+-----+-------+------------------+--------------------+
|Ram|Price_euros|Intel|Intel_baja|Intel_media|Intel_alta|Gaming|Ultrabook|Notebook|  AMD|Nvidia|Nvidia_Geforce|Nvidia_graphics|Intel_graphics|Screen_HD|  SSD|  HDD|HDD+SSD|               ppi|            features|
+---+-----------+-----+----------+-----------+----------+------+---------+--------+-----+------+--------------+---------------+--------------+---------+-----+-----+-------+------------------+--------------------+
|  8|      749.0| true|     false|      false|     false| false|    false|    true|false| false|         false|          false|         false|    false|false| true|  false|100.45466986113651|(18,[0,1,7,15,17]...|
|  4|      449.0| true|     false|      false|     false| false|    false|    true|false| false|         false|           true|          true|    fa

In [68]:
# Configure a linear regression on the "features" vector with "Price_euros" as the
# label and fit it on the assembled training frame in one expression, so lr_model is
# only ever bound to the fitted model.
lr_model = LinearRegression(
    labelCol="Price_euros",
    featuresCol="features",
).fit(X_assembled)

In [69]:
# Assemble the test features from the SAME input columns, in the SAME order, as the
# training assembler. Taking the list from df_test.columns instead would silently
# misalign the feature vector with the fitted coefficients if the test CSV ever had a
# different column order or an extra column; with this form a missing column raises
# an error at transform time rather than producing wrong predictions.
vector_assembler_test = VectorAssembler(
    inputCols=vector_assembler_train.getInputCols(),
    outputCol="features",
)
test_assembled = vector_assembler_test.transform(df_test)

In [70]:
# Preview the assembled test frame, including the new "features" column
# (first 20 rows, long cell values truncated).
test_assembled.show(n=20, truncate=True)

+---+-----+----------+-----------+----------+------+---------+--------+-----+------+--------------+---------------+--------------+---------+-----+-----+-------+------------------+--------------------+
|Ram|Intel|Intel_baja|Intel_media|Intel_alta|Gaming|Ultrabook|Notebook|  AMD|Nvidia|Nvidia_Geforce|Nvidia_graphics|Intel_graphics|Screen_HD|  SSD|  HDD|HDD+SSD|               ppi|            features|
+---+-----+----------+-----------+----------+------+---------+--------+-----+------+--------------+---------------+--------------+---------+-----+-----+-------+------------------+--------------------+
| 16| true|     false|      false|     false| false|     true|   false|false| false|         false|           true|          true|     true| true|false|  false| 331.2642361026012|(18,[0,1,6,11,12,...|
|  8|false|     false|      false|     false| false|    false|    true| true| false|         false|          false|         false|    false| true|false|  false|100.45466986113651|(18,[0,7,8,14,17]

In [71]:
# Score the assembled test frame with the fitted model; the result carries an added
# "prediction" column, previewed below (first 20 rows, long cell values truncated).
predictions_test = lr_model.transform(test_assembled)
predictions_test.show(n=20, truncate=True)

+---+-----+----------+-----------+----------+------+---------+--------+-----+------+--------------+---------------+--------------+---------+-----+-----+-------+------------------+--------------------+------------------+
|Ram|Intel|Intel_baja|Intel_media|Intel_alta|Gaming|Ultrabook|Notebook|  AMD|Nvidia|Nvidia_Geforce|Nvidia_graphics|Intel_graphics|Screen_HD|  SSD|  HDD|HDD+SSD|               ppi|            features|        prediction|
+---+-----+----------+-----------+----------+------+---------+--------+-----+------+--------------+---------------+--------------+---------+-----+-----+-------+------------------+--------------------+------------------+
| 16| true|     false|      false|     false| false|     true|   false|false| false|         false|           true|          true|     true| true|false|  false| 331.2642361026012|(18,[0,1,6,11,12,...| 2518.680798491453|
|  8|false|     false|      false|     false| false|    false|    true| true| false|         false|          false|     