In [None]:
#1. Working with RDDs:
#    a) Write a Python program to create an RDD from a local data source.
#    b) Implement transformations and actions on the RDD to perform data processing tasks.
#    c) Analyze and manipulate data using RDD operations such as map, filter, reduce, or aggregate.
#Sol:
from pyspark import SparkConf, SparkContext

# Initialize SparkContext.
# getOrCreate() reuses a context that is already active in this kernel, so
# re-running the cell (e.g. after an earlier failure) does not raise because
# a second context was requested.
conf = SparkConf().setMaster("local").setAppName("RDD Example")
sc = SparkContext.getOrCreate(conf)

# Everything that touches the context runs inside try/finally so the context
# is always released, even when one of the steps below raises.
try:
    # Create an RDD from a local data source
    data = [1, 2, 3, 4, 5]
    rdd = sc.parallelize(data)

    # Implement transformations and actions on the RDD
    # Map operation: multiply each element by 2
    mapped_rdd = rdd.map(lambda x: x * 2)

    # Filter operation: keep even numbers.
    # NOTE: the values were doubled by the map above, so every element is
    # already even and this filter keeps all of them for this input.
    filtered_rdd = mapped_rdd.filter(lambda x: x % 2 == 0)

    # Reduce operation: sum all the elements
    sum_result = filtered_rdd.reduce(lambda x, y: x + y)

    # Print the result
    print("Sum of even numbers multiplied by 2:", sum_result)

    # Aggregate operation: calculate sum and count of elements.
    # The accumulator is a (running_sum, running_count) tuple; the first
    # function folds one value into an accumulator, the second merges two
    # accumulators.
    sum_count = filtered_rdd.aggregate(
        (0, 0),
        lambda acc, value: (acc[0] + value, acc[1] + 1),
        lambda acc1, acc2: (acc1[0] + acc2[0], acc1[1] + acc2[1]),
    )

    print("Sum:", sum_count[0])
    print("Count:", sum_count[1])
finally:
    # Stop SparkContext
    sc.stop()


In [None]:
# 2. Spark DataFrame Operations:
#    a) Write a Python program to load a CSV file into a Spark DataFrame.
#    b) Perform common DataFrame operations such as filtering, grouping, or joining.
#    c) Apply Spark SQL queries on the DataFrame to extract insights from the data.
#Sol:
from pyspark.sql import SparkSession

# Create SparkSession
spark = SparkSession.builder.appName("DataFrame Example").getOrCreate()

# All work happens inside try/finally so the session is stopped even when the
# CSV cannot be read or one of the referenced columns is missing.
try:
    # Load CSV file into DataFrame (first row is the header, column types are
    # inferred). The operations below use the "age" and "gender" columns.
    df = spark.read.csv("path/to/file.csv", header=True, inferSchema=True)

    # Display the DataFrame schema
    df.printSchema()

    # Perform filtering operation: rows with age greater than 30
    filtered_df = df.filter(df["age"] > 30)

    # Perform grouping operation: number of rows per gender
    grouped_df = df.groupBy("gender").count()

    # Perform joining operation: attach the per-gender count to each row
    joined_df = df.join(grouped_df, on="gender")

    # Display the filtered DataFrame
    filtered_df.show()

    # Display the grouped DataFrame
    grouped_df.show()

    # Display the joined DataFrame
    joined_df.show()

    # Apply Spark SQL queries on the DataFrame
    df.createOrReplaceTempView("people")

    # Query to get the average age
    average_age = spark.sql("SELECT AVG(age) FROM people")

    # Query to get the count of people by gender
    gender_count = spark.sql("SELECT gender, COUNT(*) FROM people GROUP BY gender")

    # Display the results
    average_age.show()
    gender_count.show()
finally:
    # Stop SparkSession
    spark.stop()


In [None]:
#3. Spark Streaming:
#   a) Write a Python program to create a Spark Streaming application.
#    b) Configure the application to consume data from a streaming source (e.g., Kafka or a socket).
#    c) Implement streaming transformations and actions to process and analyze the incoming data stream.
#Sol
from pyspark import SparkConf, SparkContext
from pyspark.streaming import StreamingContext
# NOTE(review): the DStream-based Kafka integration imported here is only
# present in Spark 2.x (and needs the spark-streaming-kafka-0-8 package on the
# classpath). On Spark 3+ this import fails and the job would have to move to
# Structured Streaming - confirm the target Spark version.
from pyspark.streaming.kafka import KafkaUtils

# Create the SparkContext this cell runs on. The cell previously referenced an
# undefined `sparkContext` name; creating the context here keeps the cell
# runnable on a fresh kernel. Two local threads follow the usual
# recommendation for local streaming runs.
conf = SparkConf().setMaster("local[2]").setAppName("Spark Streaming Example")
sc = SparkContext.getOrCreate(conf)

# Create a StreamingContext with a batch interval of 1 second
ssc = StreamingContext(sc, 1)

try:
    # Configure Kafka parameters
    kafka_params = {
        "bootstrap.servers": "localhost:9092",
        "auto.offset.reset": "smallest",
        "group.id": "spark-streaming"
    }

    # Create a DStream that represents the data stream from Kafka topic(s)
    kafka_topic = "my-topic"
    dstream = KafkaUtils.createDirectStream(ssc, [kafka_topic], kafka_params)

    # Perform transformations and actions on the DStream
    # Example: Count the occurrences of each word in the stream.
    # Each record is a (key, value) pair; only the value is split into words.
    # Records with a null value are skipped instead of raising on .split().
    word_counts = dstream \
        .flatMap(lambda x: x[1].split(" ") if x[1] is not None else []) \
        .map(lambda word: (word, 1)) \
        .reduceByKey(lambda a, b: a + b)

    # Print the word counts
    word_counts.pprint()

    # Start the streaming context
    ssc.start()

    # Wait for the streaming to finish
    ssc.awaitTermination()
finally:
    # Always release the streaming context and its SparkContext, including
    # when the cell is interrupted or the stream setup fails.
    ssc.stop(stopSparkContext=True)


In [None]:
#4. Spark SQL and Data Source Integration:
#    a) Write a Python program to connect Spark with a relational database (e.g., MySQL, PostgreSQL).
#    b) Perform SQL operations on the data stored in the database using Spark SQL.
#    c) Explore the integration capabilities of Spark with other data sources, such as Hadoop Distributed File System (HDFS) or Amazon S3.
#sol
import os

from pyspark.sql import SparkSession

# Create SparkSession
spark = SparkSession.builder.appName("Spark SQL Example").getOrCreate()

# All work happens inside try/finally so the session is stopped even when the
# database, HDFS or S3 cannot be reached.
try:
    # Connect Spark with a MySQL database.
    # Credentials are taken from the MYSQL_USER / MYSQL_PASSWORD environment
    # variables so real secrets never have to be written into the notebook;
    # the placeholder values are used only when the variables are not set.
    jdbc_url = "jdbc:mysql://localhost:3306/mydatabase"
    connection_properties = {
        "user": os.environ.get("MYSQL_USER", "username"),
        "password": os.environ.get("MYSQL_PASSWORD", "password"),
        # NOTE(review): newer MySQL Connector/J releases name this class
        # com.mysql.cj.jdbc.Driver - confirm against the installed driver jar.
        "driver": "com.mysql.jdbc.Driver"
    }

    # Load data from a MySQL table into a DataFrame using Spark SQL
    df = spark.read.jdbc(jdbc_url, "tablename", properties=connection_properties)

    # Perform SQL operations on the DataFrame
    df.createOrReplaceTempView("mytable")

    # Execute SQL queries using Spark SQL
    result = spark.sql("SELECT * FROM mytable WHERE column1 > 100")

    # Display the result
    result.show()

    # Read data from HDFS or S3 into a DataFrame
    # NOTE(review): the s3a:// scheme presumably needs the Hadoop AWS
    # connector and credentials configured on the cluster - confirm.
    hdfs_path = "hdfs://localhost:9000/path/to/file.csv"
    s3_path = "s3a://bucket-name/path/to/file.csv"

    hdfs_df = spark.read.csv(hdfs_path, header=True, inferSchema=True)
    s3_df = spark.read.csv(s3_path, header=True, inferSchema=True)

    # Perform operations on the DataFrames
    # Example: Display the schema of the HDFS DataFrame
    hdfs_df.printSchema()

    # Example: Display the first 10 rows of the S3 DataFrame
    s3_df.show(10)
finally:
    # Stop SparkSession
    spark.stop()
