# Apache Spark is written in the Scala programming language.
# To support Python with Spark, the Apache Spark community released a tool called PySpark.

In [13]:
## BASIC PYSPARK PROGRAM

# pip install pyspark

from pyspark.sql import SparkSession
from pyspark.sql.functions import col

# Initialize SparkSession (the entry point for the DataFrame and SQL APIs)
spark = SparkSession.builder \
    .appName("MyApp") \
    .getOrCreate()

# Create a DataFrame from a list of tuples
data = [("Alice", 1), ("Bob", 2), ("Cathy", 3)]
columns = ["Name", "Id"]
df = spark.createDataFrame(data, columns)

# Show the DataFrame
df.show()

# Show the schema
df.printSchema()

# Select a single column
df.select("Name").show()

# Filter rows with a condition
df.filter(df.Id > 1).show()

# Add a derived column computed from an existing one
df = df.withColumn("IdPlusOne", col("Id") + 1)
df.show()

# Run SQL queries: register the DataFrame as a temporary view, then query it by name
df.createOrReplaceTempView("people")
result = spark.sql("SELECT * FROM people WHERE Id > 1")
result.show()

# Writing to and reading from a CSV file
# Write first so that the path read below actually exists on a fresh run.
# mode("overwrite") keeps the cell re-runnable: without it the write fails
# as soon as the output path already exists.
csv_path = "path/to/output.csv"
df.write.mode("overwrite").csv(csv_path, header=True)

# Read the CSV back; header=True takes column names from the first line and
# inferSchema=True asks Spark to infer the column types from the data.
df = spark.read.csv(csv_path, header=True, inferSchema=True)



+-----+---+
| Name| Id|
+-----+---+
|Alice|  1|
|  Bob|  2|
|Cathy|  3|
+-----+---+

root
 |-- Name: string (nullable = true)
 |-- Id: long (nullable = true)

+-----+
| Name|
+-----+
|Alice|
|  Bob|
|Cathy|
+-----+

+-----+---+
| Name| Id|
+-----+---+
|  Bob|  2|
|Cathy|  3|
+-----+---+

+-----+---+---------+
| Name| Id|IdPlusOne|
+-----+---+---------+
|Alice|  1|        2|
|  Bob|  2|        3|
|Cathy|  3|        4|
+-----+---+---------+

+-----+---+---------+
| Name| Id|IdPlusOne|
+-----+---+---------+
|  Bob|  2|        3|
|Cathy|  3|        4|
+-----+---+---------+



In [None]:
# Working with aggregations
from pyspark.sql import SparkSession
# Import the functions module under an alias instead of importing sum/min/max by
# name: those names would replace Python's builtins in the notebook namespace
# for every cell that runs afterwards.
from pyspark.sql import functions as F

# Initialize SparkSession
spark = SparkSession.builder \
    .appName("AggregationExample") \
    .getOrCreate()

# Sample DataFrame
data = [("Alice", "A", 1), ("Bob", "B", 2), ("Alice", "A", 3), ("Bob", "B", 4), ("Alice", "C", 5)]
columns = ["Name", "Category", "Value"]
df = spark.createDataFrame(data, columns)
df.show()

# Aggregations: one output row per Name, one output column per aggregate
result = df.groupBy("Name").agg(
    F.count("Value").alias("Count"),
    F.sum("Value").alias("Sum"),
    F.avg("Value").alias("Average"),
    F.min("Value").alias("Min"),
    F.max("Value").alias("Max")
)

result.show()

# Stop the SparkSession
spark.stop()


+-----+--------+-----+
| Name|Category|Value|
+-----+--------+-----+
|Alice|       A|    1|
|  Bob|       B|    2|
|Alice|       A|    3|
|  Bob|       B|    4|
|Alice|       C|    5|
+-----+--------+-----+

+-----+-----+---+-------+---+---+
| Name|Count|Sum|Average|Min|Max|
+-----+-----+---+-------+---+---+
|Alice|    3|  9|    3.0|  1|  5|
|  Bob|    2|  6|    3.0|  2|  4|
+-----+-----+---+-------+---+---+



## LAB EXERCISES

In [1]:
# Question 1
# Write a PySpark program to square set of integers.

from pyspark.sql import SparkSession
from pyspark.sql.functions import col

spark = (
    SparkSession.builder
    .appName("SquareIntegers")
    .getOrCreate()
)
spark.sparkContext.setLogLevel("ERROR")

# Each row has to be a tuple, so every integer is wrapped as a one-element
# tuple (n,) -- the trailing comma is what makes it a tuple rather than an int.
data = [(n,) for n in range(1, 6)]

df = spark.createDataFrame(data, schema=["number"])

print("Original DataFrame:")
df.show()

# Append a column holding the square of each number.
# The result is displayed as a floating-point value (1.0, 4.0, ...).
squared_column = col("number") ** 2
df = df.withColumn("squared", squared_column)

print("Squared DataFrame:")
df.show()

spark.stop()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/08/03 15:00:19 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Original DataFrame:
+------+
|number|
+------+
|     1|
|     2|
|     3|
|     4|
|     5|
+------+

Squared DataFrame:
+------+-------+
|number|squared|
+------+-------+
|     1|    1.0|
|     2|    4.0|
|     3|    9.0|
|     4|   16.0|
|     5|   25.0|
+------+-------+



In [2]:
# Question 2 Write a PySpark program to find the maximum of given set of numbers.

from pyspark.sql import SparkSession
# Use the module alias rather than `from ... import max`, which would replace
# Python's builtin max() in the notebook namespace.
from pyspark.sql import functions as F

spark = SparkSession.builder \
    .appName("Find Maximum") \
    .getOrCreate()
spark.sparkContext.setLogLevel("ERROR")

data = [(1,), (5,), (3,), (9,), (7,)]

df = spark.createDataFrame(data, ["number"])
print("Original DataFrame:")
df.show()

# agg() returns a one-row, one-column DataFrame; collect() turns it into a list
# of rows, so [0][0] picks the first row's first field, i.e. the plain value.
max_value = df.agg(F.max("number")).collect()[0][0]
print(f"Maximum value: {max_value}")

# Stop the session exactly once, after the last Spark action in the cell.
spark.stop()


Original DataFrame:
+------+
|number|
+------+
|     1|
|     5|
|     3|
|     9|
|     7|
+------+

Maximum value: 9


In [2]:
# Question 3 Write a PySpark program to find average of N numbers

from pyspark.sql import SparkSession
from pyspark.sql.functions import avg

spark = SparkSession.builder \
    .appName("Find Average") \
    .getOrCreate()
spark.sparkContext.setLogLevel("ERROR")

data = [(1,), (5,), (3,), (9,), (7,)]

df = spark.createDataFrame(data, ["number"])
print("Original DataFrame:")
df.show()

# agg() returns a one-row, one-column DataFrame; collect() turns it into a list
# of rows, so [0][0] picks the first row's first field, i.e. the plain value.
avg_value = df.agg(avg("number")).collect()[0][0]
print(f"Average value: {avg_value}")

spark.stop()


Original DataFrame:
+------+
|number|
+------+
|     1|
|     5|
|     3|
|     9|
|     7|
+------+

Maximum value: 5.0


In [3]:
# Question 4 Demonstrate how to read a CSV file into a PySpark DataFrame.

from pyspark.sql import SparkSession
from pyspark.sql.functions import avg  # not used in this cell

spark = (
    SparkSession.builder
    .appName("Read CSV")
    .getOrCreate()
)

# Configure the reader option by option, then load the file:
#   header      -> take the column names from the first line
#   inferSchema -> ask Spark to infer the column types from the data
csv_reader = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
)
df = csv_reader.csv("industry.csv")
df.show()

+--------------------+
|            Industry|
+--------------------+
|  Accounting/Finance|
|Advertising/Publi...|
|  Aerospace/Aviation|
|Arts/Entertainmen...|
|          Automotive|
|    Banking/Mortgage|
|Business Development|
|Business Opportunity|
|Clerical/Administ...|
|Construction/Faci...|
|      Consumer Goods|
|    Customer Service|
|  Education/Training|
|    Energy/Utilities|
|         Engineering|
| Government/Military|
|               Green|
|          Healthcare|
|  Hospitality/Travel|
|     Human Resources|
+--------------------+
only showing top 20 rows



In [10]:
# Question 5 Use PySpark commands to display the first few rows and schema of a DataFrame.

from pyspark.sql import SparkSession
from pyspark.sql.functions import avg

spark = SparkSession.builder \
    .appName("Read top 5 rows CSV") \
    .getOrCreate()

# Generic loader: the file format and the CSV options are passed explicitly.
df = spark.read.load("industry.csv", format="csv", sep=",", header=True, inferSchema=True)

# Schema: column names, types and nullability
df.printSchema()

# First few rows, returned as a list of Row objects. Kept as the last
# expression so the notebook displays it as the cell's result.
df.head(5)

[Row(Industry='Accounting/Finance'),
 Row(Industry='Advertising/Public Relations'),
 Row(Industry='Aerospace/Aviation'),
 Row(Industry='Arts/Entertainment/Publishing'),
 Row(Industry='Automotive')]

In [11]:
# Question 6 Calculate basic summary statistics for a specific column in the DataFrame.

# Summary statistics (count, mean, stddev, min, max) for the whole DataFrame;
# works much like pandas' describe().
all_columns_summary = df.describe()
all_columns_summary.show()

# The same statistics restricted to a specific column.
industry_summary = df.select("Industry").describe()
industry_summary.show()

+-------+--------------------+
|summary|            Industry|
+-------+--------------------+
|  count|                  43|
|   mean|                NULL|
| stddev|                NULL|
|    min|  Accounting/Finance|
|    max|Transportation/Lo...|
+-------+--------------------+

