In [58]:
import pyspark as sp
from pyspark.sql import SparkSession
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
spark = SparkSession.builder.appName('BigMartSales').getOrCreate()

In [52]:
df = spark.read.csv('data/cleaned_train.csv', header=True, inferSchema=True)

In [53]:
df.show()

+---------------+-----------+----------------+---------------+--------------------+--------+-----------------+-------------------------+-----------+--------------------+-----------+-----------------+
|Item_Identifier|Item_Weight|Item_Fat_Content|Item_Visibility|           Item_Type|Item_MRP|Outlet_Identifier|Outlet_Establishment_Year|Outlet_Size|Outlet_Location_Type|Outlet_Type|Item_Outlet_Sales|
+---------------+-----------+----------------+---------------+--------------------+--------+-----------------+-------------------------+-----------+--------------------+-----------+-----------------+
|          FDA15|        9.3|         Low Fat|    0.016047301|               Dairy|249.8092|           OUT049|                     1999|     Medium|                   1|         S1|         3735.138|
|          DRC01|       5.92|         Regular|    0.019278216|         Soft Drinks| 48.2692|           OUT018|                     2009|     Medium|                   3|         S2|         443.4228|


In [54]:
df.printSchema()

root
 |-- Item_Identifier: string (nullable = true)
 |-- Item_Weight: double (nullable = true)
 |-- Item_Fat_Content: string (nullable = true)
 |-- Item_Visibility: double (nullable = true)
 |-- Item_Type: string (nullable = true)
 |-- Item_MRP: double (nullable = true)
 |-- Outlet_Identifier: string (nullable = true)
 |-- Outlet_Establishment_Year: integer (nullable = true)
 |-- Outlet_Size: string (nullable = true)
 |-- Outlet_Location_Type: integer (nullable = true)
 |-- Outlet_Type: string (nullable = true)
 |-- Item_Outlet_Sales: double (nullable = true)



In [56]:
df.describe().show()

+-------+---------------+------------------+----------------+-------------------+-------------+-----------------+-----------------+-------------------------+-----------+--------------------+-----------+------------------+
|summary|Item_Identifier|       Item_Weight|Item_Fat_Content|    Item_Visibility|    Item_Type|         Item_MRP|Outlet_Identifier|Outlet_Establishment_Year|Outlet_Size|Outlet_Location_Type|Outlet_Type| Item_Outlet_Sales|
+-------+---------------+------------------+----------------+-------------------+-------------+-----------------+-----------------+-------------------------+-----------+--------------------+-----------+------------------+
|  count|           8523|              8523|            8523|               8523|         8523|             8523|             8523|                     8523|       8523|                8523|       8523|              8523|
|   mean|           NULL|12.875361375103129|            NULL|0.06613202877895127|         NULL|140.9927819781768

In [65]:
to_encode = {'Item_Identifier': 'itemID',
             'Item_Fat_Content': 'isLF',
             'Item_Type': 'itemTypeID',
             'Outlet_Size': 'outletSize'}

for column, new_column in to_encode.items():
    indexer = StringIndexer(inputCol=column, outputCol=new_column)
    indexed_df = indexer.fit(df).transform(df)

In [71]:
indexed_df = indexed_df.drop('Item_Identifier').drop('Item_Fat_Content').drop('Item_Type').drop('Outlet_Size').drop('Outlet_Identifier')

In [72]:
indexed_df.show()

+-----------+---------------+--------+-------------------------+--------------------+-----------+-----------------+----------+
|Item_Weight|Item_Visibility|Item_MRP|Outlet_Establishment_Year|Outlet_Location_Type|Outlet_Type|Item_Outlet_Sales|outletSize|
+-----------+---------------+--------+-------------------------+--------------------+-----------+-----------------+----------+
|        9.3|    0.016047301|249.8092|                     1999|                   1|         S1|         3735.138|       0.0|
|       5.92|    0.019278216| 48.2692|                     2009|                   3|         S2|         443.4228|       0.0|
|       17.5|    0.016760075| 141.618|                     1999|                   1|         S1|          2097.27|       0.0|
|       19.2|            0.0| 182.095|                     1998|                   3|          G|           732.38|       1.0|
|       8.93|            0.0| 53.8614|                     1987|                   3|         S1|         994.7

In [None]:
featureasVector = VectorAssembler(inputCols=["Item_Identifier","Item_Weight", "Item_Fat_Content", "Item_Visibility", "Item_Type", "Item_MRP", "Outlet_Establishment_Year", "Outlet_Size", "Outlet_Location_Type", "Outlet_Type"],outputCol="Independent Features")