In [3]:
pip install pyspark

Note: you may need to restart the kernel to use updated packages.


In [1]:
from pyspark.sql import SparkSession
spark=SparkSession.builder.appName("Customers").getOrCreate()

In [2]:
from pyspark.ml.regression import LinearRegression

In [4]:
dataset=spark.read.csv("Customer.csv",inferSchema=True,header=True)

In [7]:
dataset

DataFrame[index: int, Order ID: string, Cust ID: int, Gender: string, Age: int, Age Group: string, Date: string, Month: string, Status: string, Channel : string, SKU: string, Category: string, Size: string, Qty: int, currency: string, Amount: int, ship-city: string, ship-state: string, ship-postal-code: int, ship-country: string, B2B: boolean]

In [8]:
dataset.show()

+-----+-------------------+-------+------+---+---------+----------+-----+---------+--------+--------------------+-------------+----+---+--------+------+-------------------+--------------+----------------+------------+-----+
|index|           Order ID|Cust ID|Gender|Age|Age Group|      Date|Month|   Status|Channel |                 SKU|     Category|Size|Qty|currency|Amount|          ship-city|    ship-state|ship-postal-code|ship-country|  B2B|
+-----+-------------------+-------+------+---+---------+----------+-----+---------+--------+--------------------+-------------+----+---+--------+------+-------------------+--------------+----------------+------------+-----+
|    1|171-1029312-3038738|1029312| Women| 44|    Adult|04-12-2022|  Dec|Delivered|  Myntra|JNE1233-BLUE-KR-0...|        kurta| XXL|  1|     INR|   376|             MOHALI|        PUNJAB|          140301|          IN|false|
|    2|405-2183842-2225946|2183842| Women| 29| Teenager|04-12-2022|  Dec|Delivered|    Ajio|      SET414

In [10]:
dataset.printSchema()

root
 |-- index: integer (nullable = true)
 |-- Order ID: string (nullable = true)
 |-- Cust ID: integer (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Age Group: string (nullable = true)
 |-- Date: string (nullable = true)
 |-- Month: string (nullable = true)
 |-- Status: string (nullable = true)
 |-- Channel : string (nullable = true)
 |-- SKU: string (nullable = true)
 |-- Category: string (nullable = true)
 |-- Size: string (nullable = true)
 |-- Qty: integer (nullable = true)
 |-- currency: string (nullable = true)
 |-- Amount: integer (nullable = true)
 |-- ship-city: string (nullable = true)
 |-- ship-state: string (nullable = true)
 |-- ship-postal-code: integer (nullable = true)
 |-- ship-country: string (nullable = true)
 |-- B2B: boolean (nullable = true)



In [12]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

In [18]:
featureassembler=VectorAssembler(inputCols=["Qty","Age","Amount"],outputCol="Independent Features")

In [22]:
output=featureassembler.transform(dataset)

In [23]:
output.show()

+-----+-------------------+-------+------+---+---------+----------+-----+---------+--------+--------------------+-------------+----+---+--------+------+-------------------+--------------+----------------+------------+-----+--------------------+
|index|           Order ID|Cust ID|Gender|Age|Age Group|      Date|Month|   Status|Channel |                 SKU|     Category|Size|Qty|currency|Amount|          ship-city|    ship-state|ship-postal-code|ship-country|  B2B|Independent Features|
+-----+-------------------+-------+------+---+---------+----------+-----+---------+--------+--------------------+-------------+----+---+--------+------+-------------------+--------------+----------------+------------+-----+--------------------+
|    1|171-1029312-3038738|1029312| Women| 44|    Adult|04-12-2022|  Dec|Delivered|  Myntra|JNE1233-BLUE-KR-0...|        kurta| XXL|  1|     INR|   376|             MOHALI|        PUNJAB|          140301|          IN|false|    [1.0,44.0,376.0]|
|    2|405-2183842-2

In [24]:
output.select("Independent Features").show()

+--------------------+
|Independent Features|
+--------------------+
|    [1.0,44.0,376.0]|
|    [1.0,29.0,376.0]|
|    [1.0,67.0,376.0]|
|    [1.0,20.0,376.0]|
|    [1.0,62.0,376.0]|
|    [1.0,49.0,376.0]|
|    [1.0,23.0,376.0]|
|    [1.0,70.0,376.0]|
|    [1.0,75.0,376.0]|
|    [1.0,43.0,376.0]|
|    [1.0,76.0,517.0]|
|    [1.0,45.0,399.0]|
|    [1.0,18.0,786.0]|
|    [1.0,44.0,911.0]|
|    [1.0,52.0,967.0]|
|    [1.0,18.0,523.0]|
|   [1.0,30.0,1115.0]|
|    [1.0,48.0,563.0]|
|    [1.0,24.0,473.0]|
|    [1.0,46.0,545.0]|
+--------------------+
only showing top 20 rows



In [25]:
finalized_data=output.select("Independent Features","Amount")

In [26]:
finalized_data.show()

+--------------------+------+
|Independent Features|Amount|
+--------------------+------+
|    [1.0,44.0,376.0]|   376|
|    [1.0,29.0,376.0]|   376|
|    [1.0,67.0,376.0]|   376|
|    [1.0,20.0,376.0]|   376|
|    [1.0,62.0,376.0]|   376|
|    [1.0,49.0,376.0]|   376|
|    [1.0,23.0,376.0]|   376|
|    [1.0,70.0,376.0]|   376|
|    [1.0,75.0,376.0]|   376|
|    [1.0,43.0,376.0]|   376|
|    [1.0,76.0,517.0]|   517|
|    [1.0,45.0,399.0]|   399|
|    [1.0,18.0,786.0]|   786|
|    [1.0,44.0,911.0]|   911|
|    [1.0,52.0,967.0]|   967|
|    [1.0,18.0,523.0]|   523|
|   [1.0,30.0,1115.0]|  1115|
|    [1.0,48.0,563.0]|   563|
|    [1.0,24.0,473.0]|   473|
|    [1.0,46.0,545.0]|   545|
+--------------------+------+
only showing top 20 rows



In [32]:
train_data,test_data=finalized_data.randomSplit([0.75,0.25])

In [35]:
regressor=LinearRegression(featuresCol="Independent Features",labelCol="Amount")
regressor=regressor.fit(train_data)

In [37]:
regressor.coefficients

DenseVector([0.0, 0.0, 1.0])

In [38]:
regressor.intercept

2.8301090837239876e-14

In [41]:
pred_results=regressor.evaluate(test_data)

In [45]:
pred_results.predictions.show()

+--------------------+------+------------------+
|Independent Features|Amount|        prediction|
+--------------------+------+------------------+
|    [1.0,18.0,291.0]|   291|291.00000000000006|
|    [1.0,18.0,292.0]|   292|292.00000000000006|
|    [1.0,18.0,301.0]|   301|301.00000000000006|
|    [1.0,18.0,301.0]|   301|301.00000000000006|
|    [1.0,18.0,301.0]|   301|301.00000000000006|
|    [1.0,18.0,307.0]|   307|307.00000000000006|
|    [1.0,18.0,314.0]|   314|314.00000000000006|
|    [1.0,18.0,318.0]|   318|318.00000000000006|
|    [1.0,18.0,323.0]|   323|323.00000000000006|
|    [1.0,18.0,329.0]|   329|329.00000000000006|
|    [1.0,18.0,329.0]|   329|329.00000000000006|
|    [1.0,18.0,330.0]|   330|330.00000000000006|
|    [1.0,18.0,345.0]|   345|345.00000000000006|
|    [1.0,18.0,345.0]|   345|345.00000000000006|
|    [1.0,18.0,346.0]|   346|346.00000000000006|
|    [1.0,18.0,349.0]|   349|349.00000000000006|
|    [1.0,18.0,353.0]|   353|353.00000000000006|
|    [1.0,18.0,362.0