In [1]:
import pyspark as sp
from pyspark.sql import SparkSession
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import VectorAssembler
import pyspark.ml as ml
spark = SparkSession.builder.appName('BigMartSales').getOrCreate()
spark

In [2]:
df = spark.read.csv('constants/quick_fit.csv', header=True, inferSchema=True)
df.printSchema()

root
 |-- Item_Weight: double (nullable = true)
 |-- Item_Fat_Content: integer (nullable = true)
 |-- Item_Visibility: double (nullable = true)
 |-- Item_Type: integer (nullable = true)
 |-- Item_MRP: double (nullable = true)
 |-- outletAge: integer (nullable = true)
 |-- Outlet_Size: integer (nullable = true)
 |-- Outlet_Location_Type: integer (nullable = true)
 |-- Outlet_Type: integer (nullable = true)
 |-- Item_Outlet_Sales: double (nullable = true)



In [3]:
independent_variables = ["Item_Weight", "Item_Fat_Content", "Item_Visibility", "Item_Type", "Item_MRP", "outletAge", "Outlet_Size", "Outlet_Location_Type", "Outlet_Type"]
featuresVector = VectorAssembler(inputCols=independent_variables,outputCol="featuresVector")
output = featuresVector.transform(df)

In [4]:
output = output.withColumnRenamed('Item_Outlet_Sales', 'totalSales').select('featuresVector', 'totalSales')

In [5]:
output.show()

+--------------------+----------+
|      featuresVector|totalSales|
+--------------------+----------+
|[9.3,0.0,0.016047...|  3735.138|
|[5.92,1.0,0.01927...|  443.4228|
|[17.5,0.0,0.01676...|   2097.27|
|[19.2,1.0,0.0,6.0...|    732.38|
|[8.93,0.0,0.0,9.0...|  994.7052|
|[10.395,1.0,0.0,0...|  556.6088|
|[13.65,1.0,0.0127...|  343.5528|
|[19.0,0.0,0.12746...| 4022.7636|
|[16.2,1.0,0.01668...| 1076.5986|
|[19.2,1.0,0.09444...|  4710.535|
|[11.8,0.0,0.0,6.0...| 1516.0266|
|[18.5,1.0,0.04546...|  2187.153|
|[15.1,1.0,0.10001...| 1589.2646|
|[17.6,1.0,0.04725...| 2145.2076|
|[16.35,0.0,0.0680...|  1977.426|
|[9.0,1.0,0.069088...| 1547.3192|
|[11.8,0.0,0.00859...| 1621.8888|
|[9.0,1.0,0.069196...|  718.3982|
|[8.26,0.0,0.03423...|  2303.668|
|[13.35,0.0,0.1024...| 2748.4224|
+--------------------+----------+
only showing top 20 rows



In [6]:
train,test = output.randomSplit([0.8, 0.2])

In [7]:
def autoTest(report, label, pred):
    metrics = {"mae": 0,
               'mse': 0,
               'rmse': 0,
               'r2': 0,
               'var': 0}
    for metric in metrics.keys():
        evaluator = ml.evaluation.RegressionEvaluator(labelCol=label, predictionCol=pred, metricName=metric)
        metrics[metric] = evaluator.evaluate(report)
    print(*[f'{met}: {result}' for met, result in metrics.items()],sep='\n')

In [8]:
model = ml.regression.DecisionTreeRegressor(featuresCol='featuresVector', labelCol='totalSales', maxBins=2048)
model = model.fit(train)

In [9]:
prediction = model.transform(train)

In [10]:
autoTest(report=prediction, label='totalSales', pred='prediction')

mae: 755.7722394585275
mse: 1150792.787984912
rmse: 1072.750105096668
r2: 0.6117819359658123
var: 1813502.0107330377


In [11]:
model.write().overwrite().save('model')