In [26]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import isnan, when, count, col, isnull
from pyspark.ml.feature import Imputer,VectorAssembler
from pyspark.ml.regression import LinearRegression

In [15]:
spark = SparkSession.builder.appName("practica").getOrCreate()

In [16]:
df_train = spark.read.csv("../data/raw/train.csv", header=True, inferSchema=True)

In [17]:
df_train.show()

+---+----------+---------+-------------------+-----+-----------+
| id|      date|store_nbr|             family|sales|onpromotion|
+---+----------+---------+-------------------+-----+-----------+
|  0|2013-01-01|        1|         AUTOMOTIVE|  0.0|          0|
|  1|2013-01-01|        1|          BABY CARE|  0.0|          0|
|  2|2013-01-01|        1|             BEAUTY|  0.0|          0|
|  3|2013-01-01|        1|          BEVERAGES|  0.0|          0|
|  4|2013-01-01|        1|              BOOKS|  0.0|          0|
|  5|2013-01-01|        1|       BREAD/BAKERY|  0.0|          0|
|  6|2013-01-01|        1|        CELEBRATION|  0.0|          0|
|  7|2013-01-01|        1|           CLEANING|  0.0|          0|
|  8|2013-01-01|        1|              DAIRY|  0.0|          0|
|  9|2013-01-01|        1|               DELI|  0.0|          0|
| 10|2013-01-01|        1|               EGGS|  0.0|          0|
| 11|2013-01-01|        1|       FROZEN FOODS|  0.0|          0|
| 12|2013-01-01|        1

In [18]:
df_train.printSchema()

root
 |-- id: integer (nullable = true)
 |-- date: date (nullable = true)
 |-- store_nbr: integer (nullable = true)
 |-- family: string (nullable = true)
 |-- sales: double (nullable = true)
 |-- onpromotion: integer (nullable = true)



In [19]:
assembler =VectorAssembler(inputCols=["store_nbr","sales"], outputCol="features")

In [20]:
output = assembler.transform(df_train)

In [22]:
output.show()

+---+----------+---------+-------------------+-----+-----------+-------------------+
| id|      date|store_nbr|             family|sales|onpromotion|independant_feature|
+---+----------+---------+-------------------+-----+-----------+-------------------+
|  0|2013-01-01|        1|         AUTOMOTIVE|  0.0|          0|          [1.0,0.0]|
|  1|2013-01-01|        1|          BABY CARE|  0.0|          0|          [1.0,0.0]|
|  2|2013-01-01|        1|             BEAUTY|  0.0|          0|          [1.0,0.0]|
|  3|2013-01-01|        1|          BEVERAGES|  0.0|          0|          [1.0,0.0]|
|  4|2013-01-01|        1|              BOOKS|  0.0|          0|          [1.0,0.0]|
|  5|2013-01-01|        1|       BREAD/BAKERY|  0.0|          0|          [1.0,0.0]|
|  6|2013-01-01|        1|        CELEBRATION|  0.0|          0|          [1.0,0.0]|
|  7|2013-01-01|        1|           CLEANING|  0.0|          0|          [1.0,0.0]|
|  8|2013-01-01|        1|              DAIRY|  0.0|          0| 

In [29]:
data_final = output.select("independant_feature","sales")

In [30]:
train_data, test_data = data_final.randomSplit([0.75,0.25])

regressor= LinearRegression(featuresCol="features",labelCol="sales")

regressor=regressor.fit(train_data)

In [32]:
regressor.coefficients

DenseVector([0.0, 1.0])

In [33]:
regressor.intercept

0.0

In [34]:
resultados = regressor.evaluate(test_data)

In [35]:
resultados.predictions.show()

+-------------------+-----+----------+
|independant_feature|sales|prediction|
+-------------------+-----+----------+
|          [1.0,0.0]|  0.0|       0.0|
|          [1.0,0.0]|  0.0|       0.0|
|          [1.0,0.0]|  0.0|       0.0|
|          [1.0,0.0]|  0.0|       0.0|
|          [1.0,0.0]|  0.0|       0.0|
|          [1.0,0.0]|  0.0|       0.0|
|          [1.0,0.0]|  0.0|       0.0|
|          [1.0,0.0]|  0.0|       0.0|
|          [1.0,0.0]|  0.0|       0.0|
|          [1.0,0.0]|  0.0|       0.0|
|          [1.0,0.0]|  0.0|       0.0|
|          [1.0,0.0]|  0.0|       0.0|
|          [1.0,0.0]|  0.0|       0.0|
|          [1.0,0.0]|  0.0|       0.0|
|          [1.0,0.0]|  0.0|       0.0|
|          [1.0,0.0]|  0.0|       0.0|
|          [1.0,0.0]|  0.0|       0.0|
|          [1.0,0.0]|  0.0|       0.0|
|          [1.0,0.0]|  0.0|       0.0|
|          [1.0,0.0]|  0.0|       0.0|
+-------------------+-----+----------+
only showing top 20 rows



In [36]:
resultados.meanAbsoluteError, resultados.meanSquaredError

(0.0, 0.0)