In [1]:
import pyspark
from pyspark.context import SparkContext
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import *
from pyspark.ml.regression import *
from pyspark.ml.evaluation import *
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from datetime import datetime
import seaborn as sns

import torch
import torch.nn as nn
from sparktorch import serialize_torch_obj, SparkTorch

In [2]:
# Start (or reuse) the Spark session that every later cell reads from.
# An earlier variant of this cell built a pyspark.SparkConf pointing at
# spark://node-master:7077 with 15g executors; it was disabled and is omitted here.
spark = (
    SparkSession.builder
    .appName("Vu dep trai")
    .config("spark.executor.memory", "10g")
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/02/04 12:03:43 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [4]:
# Load the four CSV files under data/ba-walmart. Every file has a header row,
# and Spark is asked to infer column types rather than reading all as strings.
csv_options = {"header": True, "inferSchema": True}

df_stores_raw = spark.read.csv("data/ba-walmart/stores.csv", **csv_options)
df_feature_raw = spark.read.csv("data/ba-walmart/features.csv", **csv_options)
df_train_raw = spark.read.csv("data/ba-walmart/train.csv", **csv_options)
df_test_raw = spark.read.csv("data/ba-walmart/test.csv", **csv_options)

In [5]:
# The five MarkDown columns are dropped before joining.
df_feature = df_feature_raw.drop("MarkDown1", "MarkDown2", "MarkDown3", "MarkDown4", "MarkDown5")


def _attach_features_and_store(sales):
    """Left-join the feature rows and store metadata onto `sales`.

    Feature rows are matched on (Store, Date, IsHoliday) and store rows on
    Store. CPI and Unemployment are then cast to float.
    """
    joined = (
        sales
        .join(df_feature, on=["Store", "Date", "IsHoliday"], how="left")
        .join(df_stores_raw, on=["Store"], how="left")
    )
    return (
        joined
        .withColumn("CPI", joined["CPI"].cast(FloatType()))
        .withColumn("Unemployment", joined["Unemployment"].cast(FloatType()))
    )


# Train and test go through exactly the same enrichment.
df = _attach_features_and_store(df_train_raw)
df_test = _attach_features_and_store(df_test_raw)


In [6]:
def _add_calendar_columns(frame):
    """Append Year, Month and Week (derived from Date) and cast IsHoliday to integer."""
    dated = (
        frame
        .withColumn("Year", year("Date"))
        .withColumn("Month", month("Date"))
        .withColumn("Week", weekofyear("Date"))
    )
    return dated.withColumn("IsHoliday", dated["IsHoliday"].cast(IntegerType()))


# Same derived columns on both the training and the test frame.
df = _add_calendar_columns(df)
df_test = _add_calendar_columns(df_test)

In [7]:
# Keep only rows whose Weekly_Sales is strictly between 0 and 450000
# (presumably to drop returns and extreme outliers seen in the EDA -- confirm).
weekly_sales = df["Weekly_Sales"]
df_clean = df.where((weekly_sales > 0) & (weekly_sales < 450000))

In [8]:
# Encode the store Type as an integer: distinct values are collected to the
# driver, sorted, and numbered from 0 in that order.
types = sorted(df_clean.select("Type").distinct().collect())
mapping = {row.Type: str(code) for code, row in enumerate(types)}


def _encode_type(frame):
    """Replace each Type value with its code from `mapping`, then cast the column to integer."""
    recoded = frame.replace(mapping, subset=["Type"])
    return recoded.withColumn("Type", recoded["Type"].cast(IntegerType()))


# The mapping is derived from the training rows and applied to both frames.
df_clean = _encode_type(df_clean)
df_test = _encode_type(df_test)

                                                                                

In [9]:
# Feature columns selected from the EDA, in the order they are assembled
# into the feature vector, followed by the name of the regression label.
input_col = [
    'Store',
    'IsHoliday',
    'Type',
    'Size',
    'Week',
    'Dept',
    'Year',
]
target = 'Weekly_Sales'

In [10]:
# Random (not chronological) split of the cleaned rows: roughly 80% for
# training and 20% for validation, with a fixed seed.
df_train, df_valid = df_clean.randomSplit(weights=[0.8, 0.2], seed=1234)

In [11]:
# Pack the selected columns into a single vector column named "features".
assembler = VectorAssembler(
    inputCols=input_col,
    outputCol="features",
)

# Standardize "features" (centered and scaled to unit standard deviation)
# into "scaledFeatures", which the network consumes.
scaler = StandardScaler(
    inputCol="features",
    outputCol="scaledFeatures",
    withMean=True,
    withStd=True,
)

In [14]:
# Define the regression network and wrap it as a SparkTorch pipeline stage.

# Seed torch so the initial layer weights are the same on every
# Restart & Run All (same seed value as the train/validation split).
torch.manual_seed(1234)

# One hidden layer of 128 ReLU units and a single regression output.
# The input width is derived from the feature list instead of being
# hard-coded, so editing `input_col` cannot silently mismatch the first layer.
net = nn.Sequential(
    nn.Linear(len(input_col), 128),
    nn.ReLU(),
    nn.Linear(128, 1)
)

# Bundle model, loss and optimizer class so SparkTorch can ship them to workers.
torch_obj = serialize_torch_obj(
    model=net,
    criterion=nn.MSELoss(),
    optimizer=torch.optim.Adam,
    lr=0.0001
)

# NOTE(review): in the recorded run the per-partition training loss is
# essentially unchanged between iteration 0 and iteration 1, and predictions
# stay near 0.15 while Weekly_Sales is in the tens of thousands. Two
# iterations at lr=1e-4 against an unscaled target do not train the network;
# consider scaling/log-transforming the label and raising `iters` -- TODO.
spark_model = SparkTorch(
    inputCol='scaledFeatures',
    labelCol=target,
    predictionCol='predictions',
    torchObj=torch_obj,
    iters=2,
    verbose=1
)

In [15]:
# Chain feature assembly, standardization and the SparkTorch estimator,
# then fit all three stages on the training split.
p = pyspark.ml.Pipeline().setStages([
    assembler,
    scaler,
    spark_model,
])
model = p.fit(df_train)

Partition: 2a47eb9b-6e07-491f-8347-81a263ee2298. Iteration: 0. Distributed Loss: None Partition Training Loss: 549430144.0, Partition Validation Loss: None
Partition: f4c48d52-e982-4e5c-b351-e53af77c92ca. Iteration: 0. Distributed Loss: None Partition Training Loss: 293181184.0, Partition Validation Loss: None
Partition: 682b4bed-c9e4-4303-b657-18300f9b7000. Iteration: 0. Distributed Loss: None Partition Training Loss: 736884608.0, Partition Validation Loss: None
Partition: 6e89eef3-5492-44d0-823e-169a66bc4bd5. Iteration: 0. Distributed Loss: None Partition Training Loss: 1049317376.0, Partition Validation Loss: None
Partition: f4c48d52-e982-4e5c-b351-e53af77c92ca. Iteration: 1. Distributed Loss: None Partition Training Loss: 293181088.0, Partition Validation Loss: None
Partition: 2a47eb9b-6e07-491f-8347-81a263ee2298. Iteration: 1. Distributed Loss: None Partition Training Loss: 549430016.0, Partition Validation Loss: None
Partition: 682b4bed-c9e4-4303-b657-18300f9b7000. Iteration: 1. 

In [16]:
# Run the fitted pipeline on the validation split and preview the first
# 20 rows (long cell values truncated) to eyeball the "predictions" column.
pred = model.transform(df_valid)
pred.show(n=20, truncate=True)

[Stage 31:>                                                         (0 + 1) / 1]

+-----+-------------------+---------+----+------------+-----------+----------+---------+------------+----+------+----+-----+----+--------------------+--------------------+-------------------+
|Store|               Date|IsHoliday|Dept|Weekly_Sales|Temperature|Fuel_Price|      CPI|Unemployment|Type|  Size|Year|Month|Week|            features|      scaledFeatures|        predictions|
+-----+-------------------+---------+----+------------+-----------+----------+---------+------------+----+------+----+-----+----+--------------------+--------------------+-------------------+
|    1|2010-02-05 00:00:00|        0|   2|    50605.27|      42.31|     2.572|211.09636|       8.106|   0|151315|2010|    2|   5|[1.0,0.0,0.0,1513...|[-1.6577908705381...|0.15043896436691284|
|    1|2010-02-05 00:00:00|        0|   5|    32229.38|      42.31|     2.572|211.09636|       8.106|   0|151315|2010|    2|   5|[1.0,0.0,0.0,1513...|[-1.6577908705381...|0.14201930165290833|
|    1|2010-02-05 00:00:00|        0|  2

                                                                                

In [17]:
# Mean absolute error of "predictions" against the label on the validation split.
evaluator = RegressionEvaluator(
    labelCol=target,
    predictionCol="predictions",
    metricName="mae",
)
mae = evaluator.evaluate(pred)
print("MAE: ", mae)



MAE:  16070.635356372743


                                                                                