<a href="https://colab.research.google.com/github/Devcoding17/ADF/blob/main/Welcome_To_Colab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# Prerequisite: PySpark must be installed in the kernel's environment.
# If it is missing, uncomment the next line and run it once:
# !pip install pyspark

from pyspark.sql import SparkSession

# Get the active session if there is one, otherwise start a new one
# registered under the application name "PySparkTutorial".
spark = (
    SparkSession.builder
    .appName("PySparkTutorial")
    .getOrCreate()
)

# Three-row demo table: each tuple is one row, in (Name, ID) order.
columns = ["Name", "ID"]
data = [("Alice", 1), ("Bob", 2), ("Charlie", 3)]
df = spark.createDataFrame(data, schema=columns)

print("Original DataFrame:")
df.show()

# Keep only the rows whose ID is greater than 1 (where() is an alias of filter()).
filtered_df = df.where(df["ID"] > 1)

print("Filtered DataFrame (ID > 1):")
filtered_df.show()

# The session is left running because later cells keep using `spark`.
# Uncomment to release it once the notebook is finished:
# spark.stop()


Original DataFrame:
+-------+---+
|   Name| ID|
+-------+---+
|  Alice|  1|
|    Bob|  2|
|Charlie|  3|
+-------+---+

Filtered DataFrame (ID > 1):
+-------+---+
|   Name| ID|
+-------+---+
|    Bob|  2|
|Charlie|  3|
+-------+---+



In [3]:
# Print the URL of the Spark web UI exposed by the session's SparkContext.
print(spark.sparkContext.uiWebUrl)

http://59bd17eb46fa:4040


In [5]:
# Load MegaMart.csv (path is relative to the working directory).
# header=True      -> the first line supplies the column names
# inferSchema=True -> Spark derives column types from the data instead of
#                     reading every column as a string
mega_csv = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("MegaMart.csv")
)

# Preview the first rows of the loaded data.
mega_csv.show()

+--------+-------+----------+----------+----------------+--------------------+--------+--------------+--------------+------------+
|order_id|user_id|order_date|product_id|product_category|        product_name|quantity|price_per_unit|payment_method|order_status|
+--------+-------+----------+----------+----------------+--------------------+--------+--------------+--------------+------------+
|    1001|   U188|2025-04-20|      P940|         Fashion|            Sneakers|       2|         58.53|        PayPal|   Cancelled|
|    1002|   U062|2025-04-16|      P794|         Fashion|             T-Shirt|       3|         83.76|           UPI|    Returned|
|    1003|   U058|2025-04-18|      P326|         Fashion|          Sunglasses|       2|         78.85|        PayPal|  Processing|
|    1004|   U011|2025-04-10|      P574|         Fashion|          Sunglasses|       5|         46.49|        PayPal|   Delivered|
|    1005|   U003|2025-04-19|      P988|      Home Decor|         Photo Frame|     

In [6]:
# Print the schema as a tree: one line per column with its type and nullability.
mega_csv.printSchema()
# Last expression of the cell: the list of column names.
mega_csv.columns


root
 |-- order_id: integer (nullable = true)
 |-- user_id: string (nullable = true)
 |-- order_date: date (nullable = true)
 |-- product_id: string (nullable = true)
 |-- product_category: string (nullable = true)
 |-- product_name: string (nullable = true)
 |-- quantity: integer (nullable = true)
 |-- price_per_unit: double (nullable = true)
 |-- payment_method: string (nullable = true)
 |-- order_status: string (nullable = true)



In [9]:
# Show summary statistics (count, mean, stddev, min, ...) for mega_csv's columns.
mega_csv.describe().show()

+-------+-----------------+-------+----------+----------------+--------------+----------------+------------------+--------------+------------+
|summary|         order_id|user_id|product_id|product_category|  product_name|        quantity|    price_per_unit|payment_method|order_status|
+-------+-----------------+-------+----------+----------------+--------------+----------------+------------------+--------------+------------+
|  count|             1000|   1000|      1000|            1000|          1000|            1000|              1000|          1000|        1000|
|   mean|           1500.5|   NULL|      NULL|            NULL|          NULL|           3.001|55.205360000000034|          NULL|        NULL|
| stddev|288.8194360957494|   NULL|      NULL|            NULL|          NULL|1.42864972615405|25.355789976960995|          NULL|        NULL|
|    min|             1001|   U001|      P101|           Books| AI Revolution|               1|             10.03|   Credit Card|   Cancelled|

In [12]:
# Project the orders down to the order_id column only and preview the result.
mega_csv1 = mega_csv.select(mega_csv["order_id"])
mega_csv1.show()

+--------+
|order_id|
+--------+
|    1001|
|    1002|
|    1003|
|    1004|
|    1005|
|    1006|
|    1007|
|    1008|
|    1009|
|    1010|
|    1011|
|    1012|
|    1013|
|    1014|
|    1015|
|    1016|
|    1017|
|    1018|
|    1019|
|    1020|
+--------+
only showing top 20 rows



In [23]:
# (order_id, product_id) pairs, highest order_id first.
# sort() is an alias of orderBy(); ascending=False gives descending order.
mega_csv2 = (
    mega_csv
    .select("order_id", "product_id")
    .sort("order_id", ascending=False)
)

mega_csv2.show()

+--------+----------+
|order_id|product_id|
+--------+----------+
|    2000|      P870|
|    1999|      P900|
|    1998|      P651|
|    1997|      P126|
|    1996|      P860|
|    1995|      P126|
|    1994|      P673|
|    1993|      P171|
|    1992|      P969|
|    1991|      P101|
|    1990|      P123|
|    1989|      P125|
|    1988|      P649|
|    1987|      P848|
|    1986|      P289|
|    1985|      P167|
|    1984|      P820|
|    1983|      P551|
|    1982|      P314|
|    1981|      P237|
+--------+----------+
only showing top 20 rows



In [29]:
# Expose the DataFrame to Spark SQL under the name "mega_csv_view".
# createOrReplaceTempView replaces any existing view of that name, so this
# cell can be re-run safely.
mega_csv.createOrReplaceTempView("mega_csv_view")

# List each order status once.
# DISTINCT is a modifier of SELECT, not a function, so it is written without
# call-style parentheses; it de-duplicates the whole selected row.
df1 = spark.sql("SELECT DISTINCT order_status FROM mega_csv_view")
df1.show()

+------------+
|order_status|
+------------+
|    Returned|
|  Processing|
|   Cancelled|
|   Delivered|
+------------+



In [32]:
# Query the "mega_csv_view" temp view for the product_id of every order whose
# order_status is 'Returned'.
# NOTE: this rebinds df1, so any DataFrame previously held in df1 is no longer
# reachable under that name.
df1 = spark.sql("""SELECT product_id
                 FROM mega_csv_view
                 WHERE order_status = 'Returned'  """)
df1.show()

+----------+
|product_id|
+----------+
|      P794|
|      P988|
|      P328|
|      P786|
|      P713|
|      P960|
|      P311|
|      P106|
|      P948|
|      P840|
|      P324|
|      P834|
|      P171|
|      P371|
|      P553|
|      P750|
|      P862|
|      P807|
|      P390|
|      P235|
+----------+
only showing top 20 rows



### More PySpark Examples

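This section covers more PySpark operations: selecting, adding, and dropping columns; handling missing values; aggregating; and joining DataFrames. The examples reuse the `spark` session and the `df` DataFrame created in the first code cell, so run that cell first.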

Let's start with some basic data manipulation operations.

In [None]:
# `when` must be imported here: without it this cell raises NameError when the
# notebook is run top to bottom.
from pyspark.sql.functions import when

# Keep only the "Name" column of the demo DataFrame `df`.
selected_cols_df = df.select("Name")
print("DataFrame with selected columns:")
selected_cols_df.show()

# Add a "Status" column derived from ID: "Senior" when ID > 2, "Junior" otherwise.
# withColumn returns a new DataFrame; `df` itself is left unchanged.
df_with_new_col = df.withColumn(
    "Status",
    when(df.ID > 2, "Senior").otherwise("Junior"),
)
print("DataFrame with new column:")
df_with_new_col.show()

# Remove the "ID" column from the DataFrame that now carries "Status".
df_without_id = df_with_new_col.drop("ID")
print("DataFrame after dropping a column:")
df_without_id.show()

Now let's look at handling missing values and performing aggregations.

In [None]:
from pyspark.sql.functions import avg, col

# Sample rows in (Name, ID, Score) order; Bob and David have no Score (None).
columns_missing = ["Name", "ID", "Score"]
data_missing = [
    ("Alice", 1, 100),
    ("Bob", 2, None),
    ("Charlie", 3, 150),
    ("David", 4, None),
]
df_missing = spark.createDataFrame(data_missing, schema=columns_missing)
print("DataFrame with missing values:")
df_missing.show()

# Discard the rows that contain missing values (dropna() is an alias of na.drop()).
df_no_missing = df_missing.dropna()
print("DataFrame after dropping rows with missing values:")
df_no_missing.show()

# Replace missing values with 0 instead of dropping rows
# (fillna() is an alias of na.fill()).
df_filled_missing = df_missing.fillna(0)
print("DataFrame after filling missing values:")
df_filled_missing.show()

# Average of the Score column, computed on the original (unfilled) DataFrame
# and returned as a one-row DataFrame with the column "AverageScore".
avg_score = df_missing.agg(
    avg(col("Score")).alias("AverageScore")
)
print("Average score:")
avg_score.show()

Finally, let's demonstrate joining DataFrames.

In [None]:
# Second demo table: one (Name, City) row per person.
columns_address = ["Name", "City"]
data_address = [
    ("Alice", "New York"),
    ("Bob", "Los Angeles"),
    ("Charlie", "Chicago"),
]
df_address = spark.createDataFrame(data_address, schema=columns_address)
print("Address DataFrame:")
df_address.show()

# Inner join of `df` with the address table on the shared "Name" column;
# only names present in both DataFrames appear in the result.
joined_df = df.join(df_address, "Name", "inner")
print("Joined DataFrame:")
joined_df.show()

In [33]:
from pyspark.sql.functions import lit
# Select three constant columns built with lit(): an int, a string and a float.
# The select is the cell's last expression and is not followed by .show(), so
# the notebook displays only the resulting DataFrame's repr (column names and
# types), not its rows.
df.select(lit(5), lit("five"), lit(5.0))

DataFrame[5: int, five: string, 5.0: double]

In [35]:
# NOTE(review): the wildcard import brings every public name of
# pyspark.sql.functions into the notebook namespace. It is kept as-is because
# other cells may rely on names it provides.
from pyspark.sql.functions import *

# Trimming and padding applied to constant string columns; the same values are
# produced for every row of `df`.
#   ltrim / rtrim / trim : strip leading / trailing / both-side spaces
#   lpad(..., 3, " ")    : target length 3 is shorter than "HELLO", so the
#                          value is cut down to "HEL"
#   rpad(..., 10, " ")   : right-pads "HELLO" with spaces up to length 10
df.select(
    ltrim(lit("  HELLO  ")).alias("ltrim"),
    rtrim(lit("  HELLO  ")).alias("rtrim"),
    trim(lit("  HELLO  ")).alias("trim"),
    lpad(lit("HELLO"), 3, " ").alias("lp"),
    rpad(lit("HELLO"), 10, " ").alias("rp"),
).show()

+-------+-------+-----+---+----------+
|  ltrim|  rtrim| trim| lp|        rp|
+-------+-------+-----+---+----------+
|HELLO  |  HELLO|HELLO|HEL|HELLO     |
|HELLO  |  HELLO|HELLO|HEL|HELLO     |
|HELLO  |  HELLO|HELLO|HEL|HELLO     |
+-------+-------+-----+---+----------+



In [41]:
# Ten rows (id 0-9), each stamped with the current date ("today") and the
# current timestamp ("now").
# current_date / current_timestamp are not imported in this cell; they are
# expected to already be in scope from an earlier import.
dateDF = (
    spark.range(10)
    .withColumn("today", current_date())
    .withColumn("now", current_timestamp())
)

# Make the same data available to Spark SQL as "dateTable".
dateDF.createOrReplaceTempView("dateTable")

dateDF.printSchema()
# Show the first 3 rows without truncating the long timestamp values.
dateDF.show(3, truncate=False)

root
 |-- id: long (nullable = false)
 |-- today: date (nullable = false)
 |-- now: timestamp (nullable = false)

+---+----------+--------------------------+
|id |today     |now                       |
+---+----------+--------------------------+
|0  |2025-08-30|2025-08-30 10:12:35.682768|
|1  |2025-08-30|2025-08-30 10:12:35.682768|
|2  |2025-08-30|2025-08-30 10:12:35.682768|
+---+----------+--------------------------+
only showing top 3 rows

