In [1]:
import pyspark
from pyspark.context import SparkContext
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import *
from pyspark.ml.regression import *
from pyspark.ml.evaluation import *
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from datetime import datetime
import seaborn as sns

In [2]:
spark = SparkSession.builder.appName("Vu dep trai").config("spark.executor.memory","12g").getOrCreate()
# conf = pyspark.SparkConf().setMaster("spark://node-master:7077")\
#         .setAppName("Vu dep trai")\
#         .set("spark.executor.memory","15g")
# # sc = SparkContext.getOrCreate(conf=conf)
# # spark.stop()
# sc = SparkContext(conf = conf)
# spark = SparkSession(sc)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/02/05 16:40:25 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
23/02/05 16:40:26 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [3]:
df_stores_raw = spark.read.csv("data/ba-walmart/stores.csv", header=True, inferSchema=True)
df_feature_raw = spark.read.csv("data/ba-walmart/features.csv", header=True, inferSchema=True)
df_train_raw = spark.read.csv("data/ba-walmart/train.csv", header=True, inferSchema=True)
df_test_raw = spark.read.csv("data/ba-walmart/test.csv", header=True, inferSchema=True)

In [4]:
df_feature = df_feature_raw.drop("MarkDown1", "MarkDown2", "MarkDown3", "MarkDown4", "MarkDown5")
df = df_train_raw.join(df_feature, how="left", on=["Store", "Date", "IsHoliday"], ).join(df_stores_raw, how="left", on=["Store"])
df_test = df_test_raw.join(df_feature, how="left", on=["Store", "Date", "IsHoliday"]).join(df_stores_raw, how="left", on=["Store"])
df = df.withColumn("CPI", df["CPI"].cast(FloatType())).withColumn("Unemployment", df["Unemployment"].cast(FloatType()))
df_test = df_test.withColumn("CPI", df_test["CPI"].cast(FloatType())).withColumn("Unemployment", df_test["Unemployment"].cast(FloatType()))


In [7]:
df = df.withColumn("Year", year("Date")).withColumn("Month", month("Date")).withColumn("Week", weekofyear("Date"))
df_test = df_test.withColumn("Year", year("Date")).withColumn("Month", month("Date")).withColumn("Week", weekofyear("Date"))
df = df.withColumn("IsHoliday", df["IsHoliday"].cast(IntegerType()))
df_test = df_test.withColumn("IsHoliday", df_test["IsHoliday"].cast(IntegerType()))

In [10]:
# df_clean = df.filter(df["Weekly_Sales"] > 0)
# df_clean = df_clean.filter(df_clean["Weekly_Sales"] < 450000)
df_clean = df

In [11]:
types = df_clean.select("Type").distinct().collect()
types.sort()
mapping = {t.Type: str(i) for i, t in enumerate(types)}
df_clean = df_clean.replace(mapping, subset=["Type"])
df_test = df_test.replace(mapping, subset=["Type"])
df_clean = df_clean.withColumn("Type", df_clean["Type"].cast(IntegerType()))
df_test = df_test.withColumn("Type", df_test["Type"].cast(IntegerType()))

In [12]:
## From EDA select important columns
input_col = ['Store', 'IsHoliday', 'Type', 'Size', 'Week','Dept','Year']
target = 'Weekly_Sales'

In [13]:
df_train, df_valid = df_clean.randomSplit([0.8, 0.2], seed=1234)

In [21]:
assembler = VectorAssembler(inputCols=input_col, outputCol="features_")
minmax = MinMaxScaler(inputCol="features_", outputCol="features")

In [22]:
linear = LinearRegression(featuresCol="features", labelCol=target, maxIter=10, regParam=0.3, elasticNetParam=0.8)
pipeline = pyspark.ml.Pipeline(stages=[assembler, minmax, linear])
model = pipeline.fit(df_train)
pred = model.transform(df_valid)
evaluator = RegressionEvaluator(labelCol=target, predictionCol="prediction", metricName="mae")
mae = evaluator.evaluate(pred)
print("MAE: ", mae)

                                                                                

MAE:  14467.416166461382


                                                                                

In [23]:
factor_reg = FMRegressor(featuresCol="features", labelCol=target)
pipeline = pyspark.ml.Pipeline(stages=[assembler, minmax, factor_reg])
model = pipeline.fit(df_train)
pred = model.transform(df_valid)
evaluator = RegressionEvaluator(labelCol=target, predictionCol="prediction", metricName="mae")
mae = evaluator.evaluate(pred)
print("MAE: ", mae)

Exception ignored in: <function JavaWrapper.__del__ at 0x7fdc96b903a0>
Traceback (most recent call last):
  File "/opt/bitnami/spark/python/pyspark/ml/wrapper.py", line 53, in __del__
    if SparkContext._active_spark_context and self._java_obj is not None:
AttributeError: 'FMRegressor' object has no attribute '_java_obj'
                                                                                

MAE:  14366.208175575988


In [26]:
gradient = GBTRegressor(featuresCol="features_", labelCol=target)
pipeline = pyspark.ml.Pipeline(stages=[assembler, gradient])
model = pipeline.fit(df_train)
pred = model.transform(df_valid)
evaluator = RegressionEvaluator(labelCol=target, predictionCol="prediction", metricName="mae")
mae = evaluator.evaluate(pred)
print("MAE: ", mae)



MAE:  8219.484677523957


                                                                                

In [28]:
random_forest = RandomForestRegressor(featuresCol="features_", labelCol=target, numTrees=20, maxDepth=25, maxBins=100000)
pipeline = pyspark.ml.Pipeline(stages=[assembler, random_forest])
model = pipeline.fit(df_train)
pred = model.transform(df_valid)
evaluator = RegressionEvaluator(labelCol=target, predictionCol="prediction", metricName="mae")
mae = evaluator.evaluate(pred)
print("MAE: ", mae)


                                                                                

23/02/05 16:59:04 WARN DAGScheduler: Broadcasting large task binary with size 1532.2 KiB


                                                                                

23/02/05 16:59:05 WARN DAGScheduler: Broadcasting large task binary with size 2.8 MiB


                                                                                

23/02/05 16:59:06 WARN DAGScheduler: Broadcasting large task binary with size 5.0 MiB




23/02/05 16:59:07 WARN DAGScheduler: Broadcasting large task binary with size 1452.5 KiB


                                                                                

23/02/05 16:59:08 WARN DAGScheduler: Broadcasting large task binary with size 8.6 MiB




23/02/05 16:59:09 WARN DAGScheduler: Broadcasting large task binary with size 2.3 MiB


                                                                                

23/02/05 16:59:11 WARN DAGScheduler: Broadcasting large task binary with size 14.2 MiB




23/02/05 16:59:13 WARN DAGScheduler: Broadcasting large task binary with size 3.5 MiB


                                                                                

23/02/05 16:59:15 WARN DAGScheduler: Broadcasting large task binary with size 22.3 MiB




23/02/05 16:59:18 WARN DAGScheduler: Broadcasting large task binary with size 5.0 MiB


[Stage 532:>                                                        (0 + 0) / 4]

23/02/05 16:59:21 WARN DAGScheduler: Broadcasting large task binary with size 33.2 MiB




23/02/05 16:59:24 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB


[Stage 534:>                                                        (0 + 0) / 4]

23/02/05 16:59:29 WARN DAGScheduler: Broadcasting large task binary with size 46.8 MiB




23/02/05 16:59:32 WARN DAGScheduler: Broadcasting large task binary with size 8.3 MiB


[Stage 536:>                                                        (0 + 0) / 4]

23/02/05 16:59:40 WARN DAGScheduler: Broadcasting large task binary with size 63.1 MiB




23/02/05 16:59:44 WARN DAGScheduler: Broadcasting large task binary with size 9.8 MiB


[Stage 538:>                                                        (0 + 0) / 4]

23/02/05 16:59:54 WARN DAGScheduler: Broadcasting large task binary with size 81.5 MiB




23/02/05 16:59:59 WARN DAGScheduler: Broadcasting large task binary with size 10.9 MiB


[Stage 540:>                                                        (0 + 0) / 4]

23/02/05 17:00:08 WARN DAGScheduler: Broadcasting large task binary with size 96.1 MiB




23/02/05 17:00:14 WARN DAGScheduler: Broadcasting large task binary with size 10.9 MiB


[Stage 542:>                                                        (0 + 0) / 4]

23/02/05 17:00:24 WARN DAGScheduler: Broadcasting large task binary with size 114.9 MiB




23/02/05 17:00:30 WARN DAGScheduler: Broadcasting large task binary with size 10.9 MiB


[Stage 544:>                                                        (0 + 0) / 4]

23/02/05 17:00:41 WARN DAGScheduler: Broadcasting large task binary with size 138.1 MiB




23/02/05 17:00:48 WARN DAGScheduler: Broadcasting large task binary with size 10.9 MiB


[Stage 546:>                                                        (0 + 0) / 4]

23/02/05 17:01:00 WARN DAGScheduler: Broadcasting large task binary with size 155.8 MiB




23/02/05 17:01:06 WARN DAGScheduler: Broadcasting large task binary with size 10.3 MiB


[Stage 548:>                                                        (0 + 0) / 4]

23/02/05 17:01:20 WARN DAGScheduler: Broadcasting large task binary with size 171.7 MiB




23/02/05 17:01:27 WARN DAGScheduler: Broadcasting large task binary with size 8.9 MiB


[Stage 550:>                                                        (0 + 0) / 4]

23/02/05 17:01:41 WARN DAGScheduler: Broadcasting large task binary with size 184.9 MiB




23/02/05 17:01:47 WARN DAGScheduler: Broadcasting large task binary with size 7.3 MiB


[Stage 552:>                                                        (0 + 0) / 4]

23/02/05 17:02:01 WARN DAGScheduler: Broadcasting large task binary with size 195.2 MiB




23/02/05 17:02:08 WARN DAGScheduler: Broadcasting large task binary with size 5.6 MiB


                                                                                

23/02/05 17:02:10 WARN DAGScheduler: Broadcasting large task binary with size 19.7 MiB


                                                                                

23/02/05 17:02:11 WARN DAGScheduler: Broadcasting large task binary with size 20.4 MiB


                                                                                

23/02/05 17:02:13 WARN DAGScheduler: Broadcasting large task binary with size 10.3 MiB




MAE:  3483.14591347027


                                                                                

In [None]:
# assembler = VectorAssembler(inputCols=input_col, outputCol="features")
# rf = RandomForestRegressor(featuresCol="features", labelCol="Weekly_Sales", maxDepth=30, numTrees=20, seed=42, maxBins=100000)
# pipeline = pyspark.ml.Pipeline(stages=[assembler, rf])
# model = pipeline.fit(df_train)
# rf = RandomForestRegressor(featuresCol="features", labelCol=target)
# pipeline = pyspark.ml.Pipeline(stages=[assembler, rf])
# paramGrid = pyspark.ml.tuning.ParamGridBuilder()\
#     .addGrid(rf.numTrees, [10])\
#     .addGrid(rf.maxDepth, [20])\
#     .addGrid(rf.maxBins, [100])\
#     .addGrid(rf.featureSubsetStrategy, ["0.8"])\
#     .addGrid(rf.subsamplingRate, [0.2])\
#     .build()
# evaluator = RegressionEvaluator(labelCol=target, predictionCol="prediction", metricName="mse")
# crossval = pyspark.ml.tuning.CrossValidator(estimator=pipeline,
#                             estimatorParamMaps=paramGrid,
#                             evaluator=evaluator,
#                             numFolds=3)
# cvModel = crossval.fit(df_train)

In [None]:
# bestModel = cvModel.bestModel
# bestModel.stages[-1].extractParamMap()

In [None]:
# train_pred = model.transform(df_train)
# valid_pred = model.transform(df_valid)
# evaluator = RegressionEvaluator(labelCol=target, predictionCol="prediction", metricName="mae")
# train_mae = evaluator.evaluate(train_pred)
# valid_mae = evaluator.evaluate(valid_pred)
# print("Train MAE: ", train_mae)
# print("Valid MAE: ", valid_mae)