In [1]:
!pip install pyarrow



In [1]:
import pyspark
from pyspark.context import SparkContext
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import *
from pyspark.ml.regression import *
from pyspark.ml.evaluation import *
# import pyspark.pandas
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from datetime import datetime
import seaborn as sns

import torch
import torch.nn as nn
from sparktorch import serialize_torch_obj, SparkTorch

In [2]:
# Create (or reuse) the notebook's SparkSession, requesting 10 GB per executor.
spark = (
    SparkSession.builder
    .appName("Vu dep trai")
    .config("spark.executor.memory", "10g")
    .getOrCreate()
)

# Alternative kept for reference (disabled): build the session from an explicit
# SparkConf that targets the standalone master with 15 GB executors.
# conf = (
#     pyspark.SparkConf()
#     .setMaster("spark://node-master:7077")
#     .setAppName("Vu dep trai")
#     .set("spark.executor.memory", "15g")
# )
# sc = SparkContext(conf=conf)
# spark = SparkSession(sc)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/02/05 03:18:29 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# Load the four raw Walmart CSV files. Every file has a header row and the
# column types are inferred by Spark.
csv_options = {"header": True, "inferSchema": True}

df_stores_raw = spark.read.csv("data/ba-walmart/stores.csv", **csv_options)
df_feature_raw = spark.read.csv("data/ba-walmart/features.csv", **csv_options)
df_train_raw = spark.read.csv("data/ba-walmart/train.csv", **csv_options)
df_test_raw = spark.read.csv("data/ba-walmart/test.csv", **csv_options)

In [4]:
# The five MarkDown columns are removed from the feature table before joining.
df_feature = df_feature_raw.drop(
    "MarkDown1", "MarkDown2", "MarkDown3", "MarkDown4", "MarkDown5"
)


def _attach_store_features(sales):
    """Left-join per-week features and per-store attributes onto ``sales``.

    The weekly features are matched on (Store, Date, IsHoliday) and the store
    table on Store. CPI and Unemployment are then cast to float.
    """
    joined = sales.join(df_feature, on=["Store", "Date", "IsHoliday"], how="left")
    joined = joined.join(df_stores_raw, on=["Store"], how="left")
    for name in ("CPI", "Unemployment"):
        joined = joined.withColumn(name, joined[name].cast(FloatType()))
    return joined


# Train and test go through exactly the same joins and casts.
df = _attach_store_features(df_train_raw)
df_test = _attach_store_features(df_test_raw)


In [5]:
def _add_calendar_columns(frame):
    """Add Year/Month/Week columns derived from Date and make IsHoliday an integer."""
    with_calendar = (
        frame
        .withColumn("Year", year("Date"))
        .withColumn("Month", month("Date"))
        .withColumn("Week", weekofyear("Date"))
    )
    return with_calendar.withColumn(
        "IsHoliday", with_calendar["IsHoliday"].cast(IntegerType())
    )


# Same calendar features for the training and the test frame.
df = _add_calendar_columns(df)
df_test = _add_calendar_columns(df_test)

In [6]:
# Keep only rows with strictly positive weekly sales below 450000.
weekly_sales = col("Weekly_Sales")
df_clean = df.where((weekly_sales > 0) & (weekly_sales < 450000))

In [7]:
# Build the encoding from the distinct Type values seen in the cleaned training
# data: sorted order -> "0", "1", ... (kept as strings for DataFrame.replace).
types = sorted(df_clean.select("Type").distinct().collect())
mapping = {row.Type: str(code) for code, row in enumerate(types)}


def _encode_type(frame):
    """Replace Type labels using ``mapping`` and cast the column to integer."""
    replaced = frame.replace(mapping, subset=["Type"])
    return replaced.withColumn("Type", replaced["Type"].cast(IntegerType()))


# The test frame reuses the mapping learned from the training frame.
df_clean = _encode_type(df_clean)
df_test = _encode_type(df_test)

In [8]:
# Column selection (per the original note, chosen from EDA).

# Columns removed before modelling.
drop_col = [
    'Date',
    'Temperature',
    'Fuel_Price',
    'CPI',
    'Unemployment',
    'Month',
]

# Columns that get one-hot encoded.
onehot_col = [
    'Store',
    'Type',
]

# Regression target.
target = 'Weekly_Sales'

# Earlier explicit input list, kept for reference (unused):
# input_col = ['Store', 'IsHoliday', 'Type', 'Size', 'Week', 'Dept', 'Year']

In [9]:
# target_scale = df_clean.agg({"Weekly_Sales": "mean"}).collect()[0][0]

In [10]:
# Drop the columns listed in `drop_col`, discard rows with missing values and
# collect the result to the driver as a pandas DataFrame.
df_clean = df_clean.drop(*drop_col)
df_clean = df_clean.na.drop()
df_clean_pd = df_clean.toPandas()

# Remember the raw target range; it is needed later to map scaled predictions
# back to the original Weekly_Sales scale.
min_target = df_clean_pd[target].min()
max_target = df_clean_pd[target].max()

# One-hot encode the categorical columns in a single call. The dummy columns
# are appended after the remaining columns and named "<column>_<value>".
# dtype=float is required: recent pandas versions emit boolean dummies, and
# boolean columns cannot be subtracted in the scaling step below.
df_clean_pd = pd.get_dummies(df_clean_pd, columns=onehot_col, dtype=float)

# Min-max scale every column (target included) to [0, 1].
# A constant column has a zero range; dividing by it would produce NaN in every
# row and the dropna() below would then silently delete the whole dataset, so
# the zero range is replaced by 1 (the column becomes all zeros instead).
col_min = df_clean_pd.min()
col_range = (df_clean_pd.max() - col_min).replace(0, 1)
df_clean_pd = (df_clean_pd - col_min) / col_range

df_clean_pd = df_clean_pd.dropna()
df_clean_pd = df_clean_pd.reset_index(drop=True)
df_clean_pd.head()

Unnamed: 0,IsHoliday,Dept,Weekly_Sales,Size,Year,Week,Store_1,Store_2,Store_3,Store_4,...,Store_39,Store_40,Store_41,Store_42,Store_43,Store_44,Store_45,Type_0,Type_1,Type_2
0,0.0,0.0,0.05902,0.630267,0.0,0.078431,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,1.0,0.0,0.109019,0.630267,0.0,0.098039,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,0.0,0.0,0.098496,0.630267,0.0,0.117647,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,0.0,0.0,0.045947,0.630267,0.0,0.137255,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,0.0,0.0,0.051687,0.630267,0.0,0.156863,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [11]:
# Hand the scaled pandas frame back to Spark for the ML stages that follow.
df_clean = spark.createDataFrame(data=df_clean_pd)

  for column, series in pdf.iteritems():
  for column, series in pdf.iteritems():


In [12]:
# Assemble every column except the target into a single "features" vector.
# (Scaling already happened in pandas; this step only packs the columns.)
all_col = list(df_clean.columns)
all_col.remove(target)

mm_assembler = VectorAssembler(inputCols=all_col, outputCol="features")
mm_pipeline = pyspark.ml.Pipeline().setStages([mm_assembler])
mm_pipeline = mm_pipeline.fit(df_clean)
df_clean = mm_pipeline.transform(df_clean)

In [13]:
# Peek at the first five scaled target values without truncating them.
df_clean.select("Weekly_Sales").show(n=5, truncate=False)

23/02/05 03:20:43 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
23/02/05 03:20:43 WARN TaskSetManager: Stage 16 contains a task of very large size (16633 KiB). The maximum recommended task size is 1000 KiB.


[Stage 16:>                                                         (0 + 1) / 1]

23/02/05 03:20:47 WARN PythonRunner: Detected deadlock while completing task 0.0 in stage 16 (TID 24): Attempting to kill Python Worker
+-------------------+
|Weekly_Sales       |
+-------------------+
|0.05901994249481135|
|0.10901918001495786|
|0.0984961529339467 |
|0.04594658606039068|
|0.05168734897215822|
+-------------------+
only showing top 5 rows



                                                                                

In [14]:
# Random 80/20 split into training and validation sets (rows are sampled, not
# taken in order), with a fixed seed passed to randomSplit.
splits = df_clean.randomSplit(weights=[0.8, 0.2], seed=1234)
df_train, df_valid = splits

In [15]:
# Seed PyTorch so the randomly initialised layer weights are the same on every
# run of the notebook (same seed value as the train/validation split).
torch.manual_seed(1234)

# Width of the assembled "features" vector. It is derived from the assembler
# instead of being hard-coded (it was 53), so the network keeps matching the
# data when the set of stores, types or selected columns changes.
n_features = len(mm_assembler.getInputCols())

# Fully connected regression network: n_features -> 128 -> 256 -> 512 -> 512 -> 1
# with ReLU activations between the linear layers.
net = nn.Sequential(
    nn.Linear(n_features, 128),
    nn.ReLU(),
    nn.Linear(128, 256),
    nn.ReLU(),
    nn.Linear(256, 512),
    nn.ReLU(),
    nn.Linear(512, 512),
    nn.ReLU(),
    nn.Linear(512, 1)
)

# Bundle model, loss and optimizer (Adam, lr=1e-4) for SparkTorch.
torch_obj = serialize_torch_obj(
    model=net,
    criterion=nn.MSELoss(),
    optimizer=torch.optim.Adam,
    lr=0.0001
)

# SparkTorch estimator: reads the "features" vector and the scaled target and
# writes its output to the "prediction" column.
spark_model = SparkTorch(
    inputCol='features',
    labelCol=target,
    predictionCol="prediction",
    torchObj=torch_obj,
    miniBatch=1000,
    iters=50,
    verbose=1
)

In [16]:
# Wrap the SparkTorch estimator in a one-stage pipeline and train it on the
# training split.
p = pyspark.ml.Pipeline().setStages([spark_model])
model = p.fit(df_train)

23/02/05 03:20:48 WARN TaskSetManager: Stage 17 contains a task of very large size (16633 KiB). The maximum recommended task size is 1000 KiB.


Partition: 0ad56e10-0b7e-41b7-b0ae-fe40176f00ba. Iteration: 0. Distributed Loss: None Partition Training Loss: 0.017471155151724815, Partition Validation Loss: None
Partition: 69280c3d-7378-4001-a023-b189787b9ef2. Iteration: 0. Distributed Loss: None Partition Training Loss: 0.015252096578478813, Partition Validation Loss: None
Partition: b1c961da-5ce9-407b-b22a-8b5d1f901f7e. Iteration: 0. Distributed Loss: None Partition Training Loss: 0.02142767794430256, Partition Validation Loss: None
Partition: 79a76d55-3a6c-4479-870f-3e402b8459e8. Iteration: 0. Distributed Loss: None Partition Training Loss: 0.027571167796850204, Partition Validation Loss: None
Partition: bcfb8cc1-4aca-4afa-9f97-405e6462834a. Iteration: 0. Distributed Loss: None Partition Training Loss: 0.015256104990839958, Partition Validation Loss: None
Partition: d82c30d9-78ce-44d3-985c-457583762c8b. Iteration: 0. Distributed Loss: None Partition Training Loss: 0.02002730593085289, Partition Validation Loss: None
Partition: 7

In [17]:
# Score the validation split, then undo the min-max scaling on both the
# prediction and the true target so they are on the original sales scale.
target_span = max_target - min_target
pred = model.transform(df_valid).select("prediction", target)
for scaled_col in ("prediction", target):
    pred = pred.withColumn(scaled_col, pred[scaled_col] * target_span + min_target)
pred.select("prediction", target).show()

23/02/05 03:24:22 WARN TaskSetManager: Stage 18 contains a task of very large size (16633 KiB). The maximum recommended task size is 1000 KiB.


[Stage 18:>                                                         (0 + 1) / 1]

+------------------+-----------------+
|        prediction|     Weekly_Sales|
+------------------+-----------------+
| 10989.66031428337|           3552.7|
|  11022.9180442667|          3930.41|
|10765.458534936904|          4726.45|
|11553.518853487969|          4790.03|
|11715.301820402145|          4799.63|
|11737.037335891724|          4808.68|
|11414.865269081592|          4896.15|
| 10787.73208938837|          5172.73|
|11758.769704954624|          5625.99|
|12581.705016801357|          5794.35|
|11223.581405637264|          5855.34|
|12775.735710735322|          5991.01|
|11832.732756867408|          6041.07|
|12598.233196139336|          6121.23|
|10814.643476762772|          6453.58|
| 12565.27123026371|          6635.58|
|12631.748933110237|          6755.83|
|12969.675158295631|           6762.8|
|12691.273032999039|6926.879999999999|
|11934.041403036117|          7205.82|
+------------------+-----------------+
only showing top 20 rows



                                                                                

In [18]:
# get pred target max
pred_pd = pred.toPandas()
pred_pd["diff"] = pred_pd["prediction"] - pred_pd[target]
pred_pd["diff"].abs().mean()

23/02/05 03:24:25 WARN TaskSetManager: Stage 19 contains a task of very large size (16633 KiB). The maximum recommended task size is 1000 KiB.


                                                                                

-1474.282908094142