In [0]:
from pyspark.sql import *
from pyspark.sql.functions import *
from pyspark.sql.types import *

In [0]:
spark = SparkSession.builder.appName("MyFirstProject").getOrCreate()

In [0]:
df_read = spark.read.format("csv").option("inferSchema",True).option("Header",True).load("/FileStore/shared_uploads/prakhyatgurung4@gmail.com/iphone-1.csv")
df_read.printSchema()

root
 |-- TransactionID: integer (nullable = true)
 |-- Product: string (nullable = true)
 |-- Category: string (nullable = true)
 |-- Amount: integer (nullable = true)
 |-- Date: date (nullable = true)
 |-- Region: string (nullable = true)



In [0]:
df_read.display()

TransactionID,Product,Category,Amount,Date,Region
1,Laptop,Electronics,800,2024-12-01,North
2,Smartphone,Electronics,500,2024-12-02,South
3,Tablet,Electronics,300,2024-12-02,West
4,Laptop,Electronics,850,2024-12-03,East
5,Headphones,Accessories,150,2024-12-03,North
6,Keyboard,Accessories,100,2024-12-04,South
7,Mouse,Accessories,50,2024-12-04,West
8,Smartphone,Electronics,600,2024-12-05,North
9,Tablet,Electronics,350,2024-12-05,East
10,Headphones,Accessories,200,2024-12-06,South


In [0]:
filtered_df = df_read.filter(col("Amount")<100)
filtered_df.display()

TransactionID,Product,Category,Amount,Date,Region
7,Mouse,Accessories,50,2024-12-04,West


In [0]:
df_total_sales_category = filtered_df.groupBy("Category").agg(sum("Amount").alias("TotalSales"))
df_total_sales_category.display()

Category,TotalSales
Accessories,50


In [0]:
df_total_sales = df_read.groupBy("Region").agg(sum("Amount").alias("TotalSales"))
df_total_sales.display()

Region,TotalSales
South,800
East,1200
West,350
North,1550


In [0]:
highest_Sales_Electronics = df_read.filter(col("Category")=='Electronics').orderBy(col('Amount').desc())
highest_sales = highest_Sales_Electronics.select("Product","Amount").limit(1)
highest_sales.display()

Product,Amount
Laptop,850


In [0]:
df_write = highest_sales.write.format("csv").mode("append").save("/FileStore/shared_uploads/prakhyatgurung4@gmail.com/output/06-Dec")

In [0]:
dbutils.fs.ls("/FileStore/shared_uploads/prakhyatgurung4@gmail.com/output/06-Dec")

Out[40]: [FileInfo(path='dbfs:/FileStore/shared_uploads/prakhyatgurung4@gmail.com/output/06-Dec/_SUCCESS', name='_SUCCESS', size=0, modificationTime=1733455353000),
 FileInfo(path='dbfs:/FileStore/shared_uploads/prakhyatgurung4@gmail.com/output/06-Dec/_committed_5839386513000871577', name='_committed_5839386513000871577', size=112, modificationTime=1733455216000),
 FileInfo(path='dbfs:/FileStore/shared_uploads/prakhyatgurung4@gmail.com/output/06-Dec/_committed_9036362387702957869', name='_committed_9036362387702957869', size=112, modificationTime=1733455352000),
 FileInfo(path='dbfs:/FileStore/shared_uploads/prakhyatgurung4@gmail.com/output/06-Dec/_started_5839386513000871577', name='_started_5839386513000871577', size=0, modificationTime=1733455215000),
 FileInfo(path='dbfs:/FileStore/shared_uploads/prakhyatgurung4@gmail.com/output/06-Dec/_started_9036362387702957869', name='_started_9036362387702957869', size=0, modificationTime=1733455352000),
 FileInfo(path='dbfs:/FileStore/shared_

In [0]:
# Sample data
data1 = [(1, "Alice"), (2, "Bob"), (3, "Charlie")]
data2 = [(2, "Bob"), (3, "Charlie"), (4, "David")]
data3 = [(1, "Alice"), (2, "Bob"), (3, "Charlie")]
# Columns for the data
columns = ["id", "name"]


In [0]:
df1 = spark.createDataFrame(data1, columns)
df2 = spark.createDataFrame(data2, columns)
df3 = spark.createDataFrame(data3, columns)


In [0]:
# Union the DataFrames
df_union = df1.union(df2)

# Show the result
df_union.display()


id,name
1,Alice
2,Bob
3,Charlie
2,Bob
3,Charlie
4,David


In [0]:
# Union the DataFrames
df_union = df1.union(df3)

# Show the result
df_union.display()


id,name
1,Alice
2,Bob
3,Charlie
1,Alice
2,Bob
3,Charlie


In [0]:
# Union the DataFrames
df_union = df1.unionAll(df2)

# Show the result
df_union.display()


id,name
1,Alice
2,Bob
3,Charlie
2,Bob
3,Charlie
4,David
