In [1]:
!pip install pyarrow

Collecting pyarrow
  Downloading pyarrow-11.0.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (35.0 MB)
     |████████████████████████████████| 35.0 MB 3.3 MB/s            
Installing collected packages: pyarrow
Successfully installed pyarrow-11.0.0


In [2]:
import pyspark
from pyspark.context import SparkContext
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import *
from pyspark.ml.regression import *
from pyspark.ml.evaluation import *
# import pyspark.pandas
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from datetime import datetime
import seaborn as sns

import torch
import torch.nn as nn
from sparktorch import serialize_torch_obj, SparkTorch

In [3]:
# Start (or reuse) the notebook's Spark session, requesting 10g of executor memory.
spark = (
    SparkSession.builder
    .appName("Vu dep trai")
    .config("spark.executor.memory", "10g")
    .getOrCreate()
)

# Disabled alternative, kept for reference: build the session from an explicit
# SparkConf that points at a standalone master.
# conf = pyspark.SparkConf().setMaster("spark://node-master:7077")\
#         .setAppName("Vu dep trai")\
#         .set("spark.executor.memory","15g")
# # sc = SparkContext.getOrCreate(conf=conf)
# # spark.stop()
# sc = SparkContext(conf = conf)
# spark = SparkSession(sc)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/02/07 15:14:03 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
23/02/07 15:14:04 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [4]:
# Load the four Walmart CSV files. The first line of each file is used as the header
# and Spark infers the column types.
df_stores_raw = spark.read.options(header=True, inferSchema=True).csv("data/ba-walmart/stores.csv")
df_feature_raw = spark.read.options(header=True, inferSchema=True).csv("data/ba-walmart/features.csv")
df_train_raw = spark.read.options(header=True, inferSchema=True).csv("data/ba-walmart/train.csv")
df_test_raw = spark.read.options(header=True, inferSchema=True).csv("data/ba-walmart/test.csv")

In [5]:
# Discard the five MarkDown columns from the feature table.
df_feature = df_feature_raw.drop(
    "MarkDown1",
    "MarkDown2",
    "MarkDown3",
    "MarkDown4",
    "MarkDown5",
)

# Left-join the per-store/per-date features, then the store attributes, onto the
# training rows and onto the test rows.
df = (
    df_train_raw
    .join(df_feature, on=["Store", "Date", "IsHoliday"], how="left")
    .join(df_stores_raw, on=["Store"], how="left")
)
df_test = (
    df_test_raw
    .join(df_feature, on=["Store", "Date", "IsHoliday"], how="left")
    .join(df_stores_raw, on=["Store"], how="left")
)

# Cast CPI and Unemployment to float in both frames.
df = (
    df
    .withColumn("CPI", df["CPI"].cast(FloatType()))
    .withColumn("Unemployment", df["Unemployment"].cast(FloatType()))
)
df_test = (
    df_test
    .withColumn("CPI", df_test["CPI"].cast(FloatType()))
    .withColumn("Unemployment", df_test["Unemployment"].cast(FloatType()))
)



In [6]:
# Derive Year / Month / Week from Date and turn the IsHoliday flag into an integer,
# applying the same steps to the training and the test frame.
df = (
    df
    .withColumn("Year", year("Date"))
    .withColumn("Month", month("Date"))
    .withColumn("Week", weekofyear("Date"))
    .withColumn("IsHoliday", df["IsHoliday"].cast(IntegerType()))
)
df_test = (
    df_test
    .withColumn("Year", year("Date"))
    .withColumn("Month", month("Date"))
    .withColumn("Week", weekofyear("Date"))
    .withColumn("IsHoliday", df_test["IsHoliday"].cast(IntegerType()))
)

In [7]:
# Keep only rows whose Weekly_Sales is strictly positive and strictly below 450000.
# NOTE(review): 450000 looks like an outlier cap chosen during EDA -- confirm.
df_clean = df.filter((df["Weekly_Sales"] > 0) & (df["Weekly_Sales"] < 450000))

In [8]:
# Encode the store Type as an integer code: the distinct values present in df_clean
# are sorted and numbered from 0, and the same mapping is applied to the test frame.
types = sorted(df_clean.select("Type").distinct().collect())
mapping = {row.Type: str(code) for code, row in enumerate(types)}

# replace() swaps each Type value for its code as a string; the cast makes it an integer.
df_clean = df_clean.replace(mapping, subset=["Type"])
df_clean = df_clean.withColumn("Type", df_clean["Type"].cast(IntegerType()))

df_test = df_test.replace(mapping, subset=["Type"])
df_test = df_test.withColumn("Type", df_test["Type"].cast(IntegerType()))

In [9]:
## Column selection decided during EDA
# Columns removed before modelling.
drop_col = [
    "Date",
    "Temperature",
    "Fuel_Price",
    "CPI",
    "Unemployment",
    "Month",
]
# Inputs that remain: Store, IsHoliday, Type, Size, Week, Dept, Year
# Categorical columns that get one-hot encoded.
onehot_col = ["Store", "Type"]
# Regression target.
target = "Weekly_Sales"

In [10]:
# target_scale = df_clean.agg({"Weekly_Sales": "mean"}).collect()[0][0]

In [11]:
# Keep only the modelling columns and the rows without missing values, then collect
# the result into pandas (this brings the whole cleaned training set onto the driver).
df_clean = df_clean.drop(*drop_col)
df_clean = df_clean.na.drop()
df_clean_pd = df_clean.toPandas()

# Remember the raw target range so scaled values can be mapped back to sales units later.
min_target = df_clean_pd[target].min()
max_target = df_clean_pd[target].max()

# One-hot encode all categorical columns in a single call; the new columns are named
# "<column>_<value>" and appended after the remaining columns.
# dtype=float keeps the indicators numeric: newer pandas versions return bool dummies,
# for which the subtraction in the min-max step below is not defined.
df_clean_pd = pd.get_dummies(df_clean_pd, columns=onehot_col, dtype=float)

# Min-max scale every column (features and target) to [0, 1].
feature_min = df_clean_pd.min()
feature_range = df_clean_pd.max() - feature_min
# A constant column has zero range. Dividing by zero would turn the whole column into
# NaN and the dropna() below would then discard every row, so such columns are divided
# by 1 instead and end up as all zeros.
feature_range = feature_range.replace(0, 1)
df_clean_pd = (df_clean_pd - feature_min) / feature_range

df_clean_pd = df_clean_pd.dropna().reset_index(drop=True)
df_clean_pd.head()

Unnamed: 0,IsHoliday,Dept,Weekly_Sales,Size,Year,Week,Store_1,Store_2,Store_3,Store_4,...,Store_39,Store_40,Store_41,Store_42,Store_43,Store_44,Store_45,Type_0,Type_1,Type_2
0,0.0,0.0,0.05902,0.630267,0.0,0.078431,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,1.0,0.0,0.109019,0.630267,0.0,0.098039,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,0.0,0.0,0.098496,0.630267,0.0,0.117647,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,0.0,0.0,0.045947,0.630267,0.0,0.137255,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,0.0,0.0,0.051687,0.630267,0.0,0.156863,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [12]:
# Turn the scaled, one-hot encoded pandas frame back into a Spark DataFrame.
# df_clean is rebound here: from this point on it holds the scaled data.
df_clean = spark.createDataFrame(df_clean_pd)

  for column, series in pdf.iteritems():
  for column, series in pdf.iteritems():


In [13]:
# Pack every column except the target into a single "features" vector column.
# No scaling is done in this cell; the values were already min-max scaled in pandas.
all_col = list(df_clean.columns)
all_col.remove(target)

mm_assembler = VectorAssembler(outputCol="features", inputCols=all_col)
mm_pipeline = pyspark.ml.Pipeline().setStages([mm_assembler]).fit(df_clean)
df_clean = mm_pipeline.transform(df_clean)

In [14]:
# Peek at the first five (scaled) target values without truncating them.
df_clean.select("Weekly_Sales").show(n=5, truncate=False)

23/02/07 15:16:10 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
23/02/07 15:16:10 WARN TaskSetManager: Stage 16 contains a task of very large size (16633 KiB). The maximum recommended task size is 1000 KiB.


[Stage 16:>                                                         (0 + 1) / 1]

23/02/07 15:16:14 WARN PythonRunner: Detected deadlock while completing task 0.0 in stage 16 (TID 24): Attempting to kill Python Worker
+-------------------+
|Weekly_Sales       |
+-------------------+
|0.05901994249481135|
|0.10901918001495786|
|0.0984961529339467 |
|0.04594658606039068|
|0.05168734897215822|
+-------------------+
only showing top 5 rows



                                                                                

In [15]:
# Random split with weights 0.8 / 0.2 into training and validation rows
# (rows are assigned at random, not by position); the seed fixes the assignment.
df_train, df_valid = df_clean.randomSplit(weights=[0.8, 0.2], seed=1234)

In [16]:
# Seed torch so the initial network weights are the same on every run
# (same seed value as the train/validation split).
torch.manual_seed(1234)

# Size the input layer from the assembled feature columns instead of hard-coding it
# (53 in the recorded run), so the network stays in sync if the feature set changes.
n_features = len(all_col)

# Fully connected regression network: n_features -> 128 -> 256 -> 512 -> 512 -> 1.
net = nn.Sequential(
    nn.Linear(n_features, 128),
    nn.ReLU(),
    nn.Linear(128, 256),
    nn.ReLU(),
    nn.Linear(256, 512),
    nn.ReLU(),
    nn.Linear(512, 512),
    nn.ReLU(),
    nn.Linear(512, 1)
)

# Bundle the network, the MSE loss, the Adam optimizer class and the learning rate
# into the serialized object that SparkTorch takes as torchObj.
torch_obj = serialize_torch_obj(
    model=net,
    criterion=nn.MSELoss(),
    optimizer=torch.optim.Adam,
    lr=0.0001
)

# Spark ML estimator that reads the "features" vector and the scaled target and
# writes its output to "prediction".
# NOTE(review): only 15 iterations at lr=1e-4; the recorded validation predictions are
# nearly constant, so the model may be under-trained -- confirm before relying on it.
spark_model = SparkTorch(
    inputCol='features',
    labelCol=target,
    predictionCol="prediction",
    torchObj=torch_obj,
    # miniBatch=1000,
    iters=15,
    verbose=1
)

In [17]:
# Wrap the SparkTorch estimator in a one-stage pipeline and fit it on the training split.
p = pyspark.ml.Pipeline().setStages([spark_model])
model = p.fit(df_train)

23/02/07 15:16:15 WARN TaskSetManager: Stage 17 contains a task of very large size (16633 KiB). The maximum recommended task size is 1000 KiB.


Partition: cca273c7-e826-4017-a8d8-91c91dcef7a9. Iteration: 0. Distributed Loss: None Partition Training Loss: 0.004410120192915201, Partition Validation Loss: None
Partition: a2ee3eef-1ac9-4fa0-9672-df93ac573d92. Iteration: 0. Distributed Loss: None Partition Training Loss: 0.0019760557916015387, Partition Validation Loss: None
Partition: 66353d31-b216-44dc-97b0-d032ae5851f2. Iteration: 0. Distributed Loss: None Partition Training Loss: 0.003026119200512767, Partition Validation Loss: None
Partition: f91421cc-2f72-45d4-a4e3-ffa4a3872a9c. Iteration: 0. Distributed Loss: None Partition Training Loss: 0.004651714116334915, Partition Validation Loss: None
Partition: 6e8d3f4c-b9fb-4376-b7f7-b01dca17ed8b. Iteration: 0. Distributed Loss: None Partition Training Loss: 0.0037486867513507605, Partition Validation Loss: None
Partition: cf3d473a-96a2-4e00-a1cc-b1d4c4d29a67. Iteration: 0. Distributed Loss: None Partition Training Loss: 0.006622850429266691, Partition Validation Loss: None
Partitio

In [18]:
# Score the validation split, then undo the min-max scaling on both the prediction
# and the target so they are shown in original sales units.
scored = model.transform(df_valid)
span = max_target - min_target
pred = scored.select(
    (scored["prediction"] * span + min_target).alias("prediction"),
    (scored[target] * span + min_target).alias(target),
)
pred.show()

[Stage 18:>                                                         (0 + 0) / 1]

23/02/07 15:18:19 WARN TaskSetManager: Stage 18 contains a task of very large size (16633 KiB). The maximum recommended task size is 1000 KiB.


[Stage 18:>                                                         (0 + 1) / 1]

+------------------+-----------------+
|        prediction|     Weekly_Sales|
+------------------+-----------------+
|13293.761825739146|           3552.7|
|13322.428919199705|          3930.41|
|13199.183386234045|          4726.45|
|13113.186825492381|          4790.03|
|13317.029651020765|          4799.63|
|13340.914175931215|          4808.68|
|13077.778512874842|          4896.15|
|13043.775079768897|          5172.73|
|13364.540693854093|          5625.99|
|13566.802439976931|          5794.35|
|13545.655306276083|          5855.34|
|13666.100519498586|          5991.01|
|13431.801856943368|          6041.07|
|13578.161040283441|          6121.23|
|12996.342697602511|          6453.58|
|13554.603743747473|          6635.58|
|13593.572238150835|          6755.83|
|13782.387729866505|           6762.8|
|  13621.0531287539|6926.879999999999|
|13546.284591611624|          7205.82|
+------------------+-----------------+
only showing top 20 rows



                                                                                

In [19]:
# Mean absolute error on the validation split, in original sales units.
pred_pd = pred.toPandas()
pred_pd = pred_pd.assign(diff=pred_pd["prediction"] - pred_pd[target])
pred_pd["diff"].abs().mean()

[Stage 19:>                                                        (0 + 0) / 12]

23/02/07 15:18:35 WARN TaskSetManager: Stage 19 contains a task of very large size (16633 KiB). The maximum recommended task size is 1000 KiB.


                                                                                

14759.186176181876

In [20]:
# Mean absolute error on the training split, in original sales units.
# Score the training rows and undo the min-max scaling on prediction and target.
span = max_target - min_target
pred = model.transform(df_train).select("prediction", target)
pred = pred.withColumn("prediction", pred["prediction"] * span + min_target)
pred = pred.withColumn(target, pred[target] * span + min_target)

# Aggregate the error inside Spark rather than calling toPandas(): only one float is
# returned to the driver instead of every training-row prediction.
# pred_pd is intentionally not reassigned by this cell.
RegressionEvaluator(
    predictionCol="prediction",
    labelCol=target,
    metricName="mae",
).evaluate(pred)

23/02/07 15:46:38 WARN TaskSetManager: Stage 20 contains a task of very large size (16633 KiB). The maximum recommended task size is 1000 KiB.


ERROR:root:KeyboardInterrupt while sending command.               (0 + 12) / 12]
Traceback (most recent call last):
  File "/opt/bitnami/python/lib/python3.8/site-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/opt/bitnami/python/lib/python3.8/site-packages/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
  File "/opt/bitnami/python/lib/python3.8/socket.py", line 669, in readinto
    return self._sock.recv_into(b)
KeyboardInterrupt


KeyboardInterrupt: 

23/02/07 16:39:26 ERROR Executor: Exception in task 2.0 in stage 20.0 (TID 52): Connection reset
23/02/07 16:39:26 ERROR Executor: Exception in task 7.0 in stage 20.0 (TID 57): Connection reset
23/02/07 16:39:26 ERROR Executor: Exception in task 6.0 in stage 20.0 (TID 56): Connection reset
23/02/07 16:39:26 ERROR Executor: Exception in task 5.0 in stage 20.0 (TID 55): Connection reset
23/02/07 16:39:26 ERROR Executor: Exception in task 10.0 in stage 20.0 (TID 60): Connection reset
23/02/07 16:39:26 ERROR Executor: Exception in task 11.0 in stage 20.0 (TID 61): Connection reset
23/02/07 16:39:26 ERROR Executor: Exception in task 8.0 in stage 20.0 (TID 58): Connection reset
23/02/07 16:39:26 ERROR Executor: Exception in task 0.0 in stage 20.0 (TID 50): Connection reset
23/02/07 16:39:26 ERROR Executor: Exception in task 4.0 in stage 20.0 (TID 54): Connection reset
23/02/07 16:39:26 ERROR Executor: Exception in task 1.0 in stage 20.0 (TID 51): Connection reset
23/02/07 16:39:26 ERROR Exec

----------------------------------------
Exception happened during processing of request from ('127.0.0.1', 35880)
Traceback (most recent call last):
  File "/opt/bitnami/python/lib/python3.8/socketserver.py", line 316, in _handle_request_noblock
    self.process_request(request, client_address)
  File "/opt/bitnami/python/lib/python3.8/socketserver.py", line 347, in process_request
    self.finish_request(request, client_address)
  File "/opt/bitnami/python/lib/python3.8/socketserver.py", line 360, in finish_request
    self.RequestHandlerClass(request, client_address, self)
  File "/opt/bitnami/python/lib/python3.8/socketserver.py", line 747, in __init__
    self.handle()
  File "/opt/bitnami/spark/python/pyspark/accumulators.py", line 281, in handle
    poll(accum_updates)
  File "/opt/bitnami/spark/python/pyspark/accumulators.py", line 253, in poll
    if func():
  File "/opt/bitnami/spark/python/pyspark/accumulators.py", line 257, in accum_updates
    num_updates = read_int(self.r