In [1]:
import findspark

In [2]:
findspark.init()

In [3]:
import pyspark

In [4]:
sc = pyspark.SparkContext(appName = "A2")

In [6]:
from pyspark.sql import SparkSession
from pyspark.sql import Row
import os

# Initialize a Spark session
spark = SparkSession.builder \
    .appName("ReadAppendTextFile") \
    .getOrCreate()

# Step 1: Define the input file
input_file = r"D:\Data Science\abc.txt"

# Step 2: Read existing data from the text file
df = spark.read.text(input_file)

# Step 3: Show the original data
print("Original Data:")
df.show(truncate=False)

# Step 4: Add a new line to the DataFrame
new_line = "This is a new line added to the text file."  
new_row = spark.createDataFrame([Row(value=new_line)])

# Combine the original DataFrame with the new row
updated_df = df.union(new_row)

# Step 5: Collect the updated DataFrame back to Python
updated_lines = updated_df.select("value").rdd.flatMap(lambda x: x).collect()

# Step 6: Write the updated data back to the same text file using Python file handling
with open(input_file, 'w') as f:
    for line in updated_lines:
        f.write(line + '\n')

# Show the updated data
print("Updated Data:")
for line in updated_lines:
    print(line)

# Stop the Spark session
spark.stop()

Original Data:
+------------------------------------------+
|value                                     |
+------------------------------------------+
|Hello, this is assignment 5.              |
|Subject : Big data                        |
|This is a new line added to the text file.|
+------------------------------------------+

Updated Data:
Hello, this is assignment 5.
Subject : Big data
This is a new line added to the text file.
This is a new line added to the text file.


In [9]:
#Q2. Design a script in pyspark to count total words in a given doc. Also print unique words in it

from pyspark.sql.functions import explode, split, col

# Initialize a Spark session
spark = SparkSession.builder \
    .appName("WordCount") \
    .getOrCreate()

# Step 1: Define the input file
input_file = r"D:\Data Science\abc.txt"

# Step 2: Read the text file
df = spark.read.text(input_file)

# Step 3: Split each line into words and explode to get a word per row
words_df = df.select(explode(split(col("value"), "\s+")).alias("word"))

# Step 4: Remove any empty strings (in case of extra spaces)
words_df = words_df.filter(words_df.word != "")

# Step 5: Count total words
total_word_count = words_df.count()

# Step 6: Get unique words
unique_words_df = words_df.select("word").distinct()

# Step 7: Count unique words (optional, if you need the count)
unique_word_count = unique_words_df.count()

# Step 8: Show results
print(f"Total Word Count: {total_word_count}")
print(f"Unique Word Count: {unique_word_count}")

print("Unique Words:")
unique_words_df.show(truncate=False)

# Stop the Spark session
spark.stop()

Total Word Count: 19
Unique Word Count: 18
Unique Words:
+----------+
|word      |
+----------+
|assignment|
|new       |
|Subject   |
|is        |
|data      |
|5.        |
|the       |
|:         |
|Big       |
|This      |
|Hello,    |
|text      |
|a         |
|this      |
|line      |
|added     |
|file.     |
|to        |
+----------+



In [7]:
# Q3. Create a schema for storing employee details (eno , ename, eage , gender, salary, city). Insert appropriate records. Use streaming to fetch the data for following queries. 
# Find the details of employee genderwise.
# Display  ename and age with respect to salary and city in descending order

from pyspark.sql.types import StructType, StructField, IntegerType, StringType, FloatType
from pyspark.sql.functions import col

# Step 1: Initialize a Spark session
spark = SparkSession.builder \
    .appName("EmployeeDetails") \
    .getOrCreate()

# Step 2: Define the schema for employee data
schema = StructType([
    StructField("eno", IntegerType(), True),
    StructField("ename", StringType(), True),
    StructField("eage", IntegerType(), True),
    StructField("gender", StringType(), True),
    StructField("salary", FloatType(), True),
    StructField("city", StringType(), True)
])

# Step 3: Simulate streaming data by reading from a folder 
input_dir = r"D:\Data Science\Deep learning\employee\employee.csv"

# Reading the employee data as a streaming source
employee_stream = spark.readStream \
    .schema(schema) \
    .csv(input_dir)

In [9]:
# Perform aggregation: Compute the average salary per city and sort by the average salary in descending order
aggregated_data = employee_stream.groupBy("city").agg(avg("salary").alias("avg_salary"))

# Sort the aggregated data by average salary (DESC) and city (DESC)
sorted_aggregated_data = aggregated_data.orderBy(col("avg_salary").desc(), col("city").desc())

# Output the sorted results to the console
sorted_query = sorted_aggregated_data.writeStream \
    .outputMode("complete") \
    .format("console") \
    .start()

# Wait for the stream to finish
sorted_query.awaitTermination()

NameError: name 'avg' is not defined

In [8]:
# Query 1: Find the details of employees genderwise
genderwise_df = employee_stream.groupBy("gender").count()

# Query 2: Display ename and age with respect to salary and city in descending order
sorted_df = employee_stream.select("ename", "eage", "salary", "city") \
    .orderBy(col("salary").desc(), col("city").desc())

# Step 4: Output the results to the console
# Query 1: Gender-wise employee details
genderwise_query = genderwise_df.writeStream \
    .outputMode("complete") \
    .format("console") \
    .start()

# Query 2: Sorted ename and eage with respect to salary and city in descending order
sorted_query = sorted_df.writeStream \
    .outputMode("append") \
    .format("console") \
    .start()

# Wait for the streaming to finish
genderwise_query.awaitTermination()
sorted_query.awaitTermination()

AnalysisException: Sorting is not supported on streaming DataFrames/Datasets, unless it is on aggregated DataFrame/Dataset in Complete output mode;
Sort [salary#45 DESC NULLS LAST, city#46 DESC NULLS LAST], true
+- Project [ename#42, eage#43, salary#45, city#46]
   +- StreamingRelation DataSource(org.apache.spark.sql.SparkSession@410a940d,csv,List(),Some(StructType(StructField(eno,IntegerType,true),StructField(ename,StringType,true),StructField(eage,IntegerType,true),StructField(gender,StringType,true),StructField(salary,FloatType,true),StructField(city,StringType,true))),List(),None,Map(path -> D:\Data Science\Deep learning\employee\employee.csv),None), FileSource[D:\Data Science\Deep learning\employee\employee.csv], [eno#41, ename#42, eage#43, gender#44, salary#45, city#46]
