In [1]:
pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.1.tar.gz (317.0 MB)
     ---------------------------------------- 0.0/317.0 MB ? eta -:--:--
     ---------------------------------------- 0.0/317.0 MB ? eta -:--:--
     -------------------------------------- 0.0/317.0 MB 330.3 kB/s eta 0:16:00
     -------------------------------------- 0.0/317.0 MB 330.3 kB/s eta 0:16:00
     -------------------------------------- 0.1/317.0 MB 525.1 kB/s eta 0:10:04
     -------------------------------------- 0.1/317.0 MB 655.8 kB/s eta 0:08:04
     -------------------------------------- 0.3/317.0 MB 983.0 kB/s eta 0:05:23
     ---------------------------------------- 0.3/317.0 MB 1.1 MB/s eta 0:04:41
     ---------------------------------------- 0.5/317.0 MB 1.3 MB/s eta 0:03:56
     ---------------------------------------- 0.6/317.0 MB 1.4 MB/s eta 0:03:48
     ---------------------------------------- 0.7/317.0 MB 1.6 MB/s eta 0:03:22
     ---------------------------------------- 0.9/317.0 MB 1.8 MB/s e

In [2]:
import pyspark

In [3]:
from pyspark.sql import SparkSession

In [4]:
# Obtain the Spark session (application name "practice") used by the cells below.
spark = (
    SparkSession.builder
    .appName("practice")
    .getOrCreate()
)

In [7]:
# Load the CSV: the first row supplies the column names and column types are inferred.
spark_df = spark.read.csv(
    'dataset.csv',
    header=True,
    inferSchema=True,
)

In [8]:
# Print the loaded DataFrame as a text table.
spark_df.show()

+---------+--------------+-----------+------+--------+
|ProductID|   ProductName|   Category| Price|Quantity|
+---------+--------------+-----------+------+--------+
|        1|  Coffee Maker| Appliances| 79.99|     120|
|        2|       Blender| Appliances| 39.99|      80|
|        3|     Desk Lamp|  Furniture| 19.99|     150|
|        4|  Dining Chair|  Furniture| 45.99|     100|
|        5| Running Shoes|   Clothing| 89.99|     200|
|        6|        Jacket|   Clothing|149.99|      90|
|        7| Laptop Sleeve|Electronics| 29.99|      50|
|        8|Wireless Mouse|Electronics| 19.99|     180|
+---------+--------------+-----------+------+--------+



In [9]:
# Show each column's inferred type and nullability.
spark_df.printSchema()

root
 |-- ProductID: integer (nullable = true)
 |-- ProductName: string (nullable = true)
 |-- Category: string (nullable = true)
 |-- Price: double (nullable = true)
 |-- Quantity: integer (nullable = true)



In [10]:
# Confirm that the loaded object is a PySpark DataFrame.
type(spark_df)

pyspark.sql.dataframe.DataFrame

In [11]:
# List the column names.
spark_df.columns

['ProductID', 'ProductName', 'Category', 'Price', 'Quantity']

In [13]:
# Fetch the first three rows as a list of Row objects.
spark_df.head(3)

[Row(ProductID=1, ProductName='Coffee Maker', Category='Appliances', Price=79.99, Quantity=120),
 Row(ProductID=2, ProductName='Blender', Category='Appliances', Price=39.99, Quantity=80),
 Row(ProductID=3, ProductName='Desk Lamp', Category='Furniture', Price=19.99, Quantity=150)]

In [15]:
# Project a single column and display it.
spark_df.select('ProductName').show()

+--------------+
|   ProductName|
+--------------+
|  Coffee Maker|
|       Blender|
|     Desk Lamp|
|  Dining Chair|
| Running Shoes|
|        Jacket|
| Laptop Sleeve|
|Wireless Mouse|
+--------------+



In [16]:
# Project two columns, in the order given, and display them.
spark_df.select('ProductName', 'ProductID').show()

+--------------+---------+
|   ProductName|ProductID|
+--------------+---------+
|  Coffee Maker|        1|
|       Blender|        2|
|     Desk Lamp|        3|
|  Dining Chair|        4|
| Running Shoes|        5|
|        Jacket|        6|
| Laptop Sleeve|        7|
|Wireless Mouse|        8|
+--------------+---------+



In [18]:
# Summary statistics (count, mean, stddev, min, max) for every column.
spark_df.describe().show()

+-------+-----------------+--------------+----------+----------------+------------------+
|summary|        ProductID|   ProductName|  Category|           Price|          Quantity|
+-------+-----------------+--------------+----------+----------------+------------------+
|  count|                8|             8|         8|               8|                 8|
|   mean|              4.5|          NULL|      NULL|           59.49|            121.25|
| stddev|2.449489742783178|          NULL|      NULL|44.8712443712961|51.668586753876454|
|    min|                1|       Blender|Appliances|           19.99|                50|
|    max|                8|Wireless Mouse| Furniture|          149.99|               200|
+-------+-----------------+--------------+----------+----------------+------------------+



In [20]:
# Derive a new frame with an extra newPrice column holding twice the Price.
spark_df_new = spark_df.withColumn(
    'newPrice',
    spark_df.Price * 2,
)

In [21]:
# Display the frame including the added newPrice column.
spark_df_new.show()

+---------+--------------+-----------+------+--------+--------+
|ProductID|   ProductName|   Category| Price|Quantity|newPrice|
+---------+--------------+-----------+------+--------+--------+
|        1|  Coffee Maker| Appliances| 79.99|     120|  159.98|
|        2|       Blender| Appliances| 39.99|      80|   79.98|
|        3|     Desk Lamp|  Furniture| 19.99|     150|   39.98|
|        4|  Dining Chair|  Furniture| 45.99|     100|   91.98|
|        5| Running Shoes|   Clothing| 89.99|     200|  179.98|
|        6|        Jacket|   Clothing|149.99|      90|  299.98|
|        7| Laptop Sleeve|Electronics| 29.99|      50|   59.98|
|        8|Wireless Mouse|Electronics| 19.99|     180|   39.98|
+---------+--------------+-----------+------+--------+--------+



In [22]:
# Preview the frame without newPrice. The result is not assigned, so
# spark_df_new itself still carries the column afterwards.
spark_df_new.drop('newPrice').show()

+---------+--------------+-----------+------+--------+
|ProductID|   ProductName|   Category| Price|Quantity|
+---------+--------------+-----------+------+--------+
|        1|  Coffee Maker| Appliances| 79.99|     120|
|        2|       Blender| Appliances| 39.99|      80|
|        3|     Desk Lamp|  Furniture| 19.99|     150|
|        4|  Dining Chair|  Furniture| 45.99|     100|
|        5| Running Shoes|   Clothing| 89.99|     200|
|        6|        Jacket|   Clothing|149.99|      90|
|        7| Laptop Sleeve|Electronics| 29.99|      50|
|        8|Wireless Mouse|Electronics| 19.99|     180|
+---------+--------------+-----------+------+--------+



In [23]:
# Preview newPrice renamed to doublePrice. The renamed frame is only displayed
# here; assign the result to a variable to keep the change.
spark_df_new.withColumnRenamed('newPrice', 'doublePrice').show()

+---------+--------------+-----------+------+--------+-----------+
|ProductID|   ProductName|   Category| Price|Quantity|doublePrice|
+---------+--------------+-----------+------+--------+-----------+
|        1|  Coffee Maker| Appliances| 79.99|     120|     159.98|
|        2|       Blender| Appliances| 39.99|      80|      79.98|
|        3|     Desk Lamp|  Furniture| 19.99|     150|      39.98|
|        4|  Dining Chair|  Furniture| 45.99|     100|      91.98|
|        5| Running Shoes|   Clothing| 89.99|     200|     179.98|
|        6|        Jacket|   Clothing|149.99|      90|     299.98|
|        7| Laptop Sleeve|Electronics| 29.99|      50|      59.98|
|        8|Wireless Mouse|Electronics| 19.99|     180|      39.98|
+---------+--------------+-----------+------+--------+-----------+

