<a href="https://colab.research.google.com/github/Alby-Benny-IBM/PySpark/blob/main/02_AdvPySpark.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from pyspark import SparkContext, SparkConf

# Single-JVM ("local") Spark configuration used by every RDD example below.
conf = SparkConf().setAppName("MySparkApp").setMaster("local")

# Only one SparkContext may be active per process, so calling the constructor
# a second time (e.g. when this cell is re-run) raises an error.
# getOrCreate returns the already-running context when there is one and
# otherwise builds a new one from `conf`, which makes this cell idempotent.
sc = SparkContext.getOrCreate(conf=conf)

In [2]:
# Distribute a small list of integers as an RDD.
rdd = sc.parallelize([1, 2, 3, 4, 5])

# map: produce a new RDD holding every element doubled.
rdd_mapped = rdd.map(lambda value: value * 2)
doubled = rdd_mapped.collect()
print(doubled)

# filter: keep only the elements that leave no remainder when divided by 2.
rdd_filtered = rdd.filter(lambda value: not value % 2)
evens = rdd_filtered.collect()
print(evens)

[2, 4, 6, 8, 10]
[2, 4]


In [3]:
# reduce: fold the RDD down to a single value by repeatedly adding pairs.
rdd_reduce = rdd.reduce(lambda running_total, element: running_total + element)
print(rdd_reduce)

15


In [6]:
rdd = sc.parallelize([4, 9, 5])

# flatMap: each input expands to three consecutive integers starting at
# itself, and the per-element results are concatenated into one flat RDD.
rdd_flat = rdd.flatMap(lambda start: [start + offset for offset in range(3)])
print(rdd_flat.collect())

[4, 5, 6, 9, 10, 11, 5, 6, 7]


In [7]:
rdd = sc.parallelize(['hello', 'world'])

# flatMap with `list` splits every word into its characters and flattens
# the per-word character lists into a single RDD of characters.
rdd_flat = rdd.flatMap(list)
print(rdd_flat.collect())

['h', 'e', 'l', 'l', 'o', 'w', 'o', 'r', 'l', 'd']


In [8]:
# An RDD whose elements are themselves lists.
rdd = sc.parallelize([[1, 2], [3, 4], [5]])

# flatMap with the identity function removes one level of nesting.
rdd_flat = rdd.flatMap(lambda sublist: sublist)
flattened = rdd_flat.collect()
print(flattened)

[1, 2, 3, 4, 5]


In [11]:
rdd = sc.parallelize([1, 2, 3, 4, 5])

# groupBy: bucket the numbers under the key "even" or "odd".
rdd_group = rdd.groupBy(lambda number: "odd" if number % 2 else "even")

# Each group's values arrive as a lazy iterable; turn them into plain lists
# before collecting so the printed result is readable.
print(rdd_group.mapValues(list).collect())

[('odd', <pyspark.resultiterable.ResultIterable object at 0x7df5d8df5ad0>), ('even', <pyspark.resultiterable.ResultIterable object at 0x7df5d8dc5210>)]
[('odd', [1, 3, 5]), ('even', [2, 4])]


In [12]:
from pyspark.sql import SparkSession

# Obtain (or reuse) the SparkSession that fronts the DataFrame API.
spark = SparkSession.builder.appName("DataFrameOperations").getOrCreate()

# Sample employee records: (ID, Name, Department, Salary).
data = [
    (1, "John", "HR", 5000),
    (2, "Jane", "IT", 8000),
    (3, "Mike", "IT", 6000),
    (4, "Sara", "Finance", 7000),
    (5, "David", "HR", 5500),
]

# Column names, in the same order as the tuple fields above.
columns = ["ID", "Name", "Department", "Salary"]

# Build the DataFrame; column types are inferred from the Python values.
df = spark.createDataFrame(data, schema=columns)

# Print the DataFrame contents as a text table.
df.show()

+---+-----+----------+------+
| ID| Name|Department|Salary|
+---+-----+----------+------+
|  1| John|        HR|  5000|
|  2| Jane|        IT|  8000|
|  3| Mike|        IT|  6000|
|  4| Sara|   Finance|  7000|
|  5|David|        HR|  5500|
+---+-----+----------+------+



In [13]:
# Keep only the employees earning strictly more than 6000 and display them.
high_earners = df.where(df["Salary"] > 6000)
high_earners.show()

+---+----+----------+------+
| ID|Name|Department|Salary|
+---+----+----------+------+
|  2|Jane|        IT|  8000|
|  4|Sara|   Finance|  7000|
+---+----+----------+------+



In [14]:
# Remove the "Bonus" column from the DataFrame.
# NOTE(review): the DataFrame created earlier in this notebook only has the
# columns ID, Name, Department and Salary, so no "Bonus" column exists at this
# point; the next cell's output shows the columns unchanged. This looks like a
# leftover from a cell that added "Bonus" and was later removed - confirm.
df=df.drop("Bonus")

In [15]:
# Rename the "Salary" column to "Salary_After_Tax"; every later cell that
# uses this DataFrame refers to the column by its new name.
old_name, new_name = "Salary", "Salary_After_Tax"
df = df.withColumnRenamed(old_name, new_name)
df.show()

+---+-----+----------+----------------+
| ID| Name|Department|Salary_After_Tax|
+---+-----+----------+----------------+
|  1| John|        HR|            5000|
|  2| Jane|        IT|            8000|
|  3| Mike|        IT|            6000|
|  4| Sara|   Finance|            7000|
|  5|David|        HR|            5500|
+---+-----+----------+----------------+



In [16]:
from pyspark.sql.functions import avg

# Average salary per department, exposed under the column name "Avg_Salary".
avg_salary_expr = avg("Salary_After_Tax").alias("Avg_Salary")
salary_by_department = df.groupBy("Department").agg(avg_salary_expr)
salary_by_department.show()


+----------+----------+
|Department|Avg_Salary|
+----------+----------+
|        HR|    5250.0|
|   Finance|    7000.0|
|        IT|    7000.0|
+----------+----------+



In [17]:
# Number of employees in each department.
headcount_by_department = df.groupBy("Department").count()
headcount_by_department.show()

+----------+-----+
|Department|count|
+----------+-----+
|        HR|    2|
|   Finance|    1|
|        IT|    2|
+----------+-----+



In [19]:
# Order employees from highest to lowest salary and show the first three rows.
salary_descending = df["Salary_After_Tax"].desc()
df.orderBy(salary_descending).show(3)

+---+----+----------+----------------+
| ID|Name|Department|Salary_After_Tax|
+---+----+----------+----------------+
|  2|Jane|        IT|            8000|
|  4|Sara|   Finance|            7000|
|  3|Mike|        IT|            6000|
+---+----+----------+----------------+
only showing top 3 rows



In [21]:
# collect() pulls every row to the driver as a list of Row objects.
data_list = df.collect()

# Print one employee name per line, looking the field up by column name.
for employee in data_list:
    print(employee["Name"])

John
Jane
Mike
Sara
David


In [25]:
from pyspark.sql import Row

# Two Row records with named fields, distributed as an RDD.
people = [Row(name="Alice", age=25), Row(name="bob", age=30)]
rdd = sc.parallelize(people)

# Turn the RDD of Rows into a DataFrame; the Row field names become columns.
df = spark.createDataFrame(rdd)

# Keep people strictly older than 25, then project just the "name" column.
filtered_df = df.where(df["age"] > 25)
selected_df = filtered_df.select("name")
selected_df.show()

+----+
|name|
+----+
| bob|
+----+



In [26]:
from pyspark.sql import SparkSession

# Obtain (or reuse) a SparkSession for the Spark SQL examples.
spark = SparkSession.builder.appName("SparkSQLBasics").getOrCreate()

# Field order of every record in `data` below.
columns = ["EmpID", "Name", "Department", "Salary"]

# Sample employee records.
data = [
    (1, "Alice", "Sales", 3000),
    (2, "Bob", "IT", 4000),
    (3, "Cathy", "HR", 3500),
    (4, "David", "Sales", 4500),
    (5, "Eva", "IT", 4200),
]

# Build the DataFrame from the records and display it.
df = spark.createDataFrame(data, schema=columns)
df.show()

+-----+-----+----------+------+
|EmpID| Name|Department|Salary|
+-----+-----+----------+------+
|    1|Alice|     Sales|  3000|
|    2|  Bob|        IT|  4000|
|    3|Cathy|        HR|  3500|
|    4|David|     Sales|  4500|
|    5|  Eva|        IT|  4200|
+-----+-----+----------+------+



In [28]:
# Expose the DataFrame's rows as an RDD of Row objects.
rdd = df.rdd

# Project every Row down to a (Name, Salary) tuple and bring them to the driver.
name_salary_pairs = rdd.map(lambda row: (row["Name"], row["Salary"])).collect()
print("RDD Example", name_salary_pairs)

RDD Example [('Alice', 3000), ('Bob', 4000), ('Cathy', 3500), ('David', 4500), ('Eva', 4200)]


In [30]:
# SQL text to run against the temporary view registered below.
strSQL = "select * from employees"

# Register the DataFrame under the name "employees" so SQL can reference it
# (any existing temporary view with that name is replaced).
df.createOrReplaceTempView("employees")

# Run the query; the result is itself a DataFrame.
sql_result = spark.sql(strSQL)
sql_result.show()

+-----+-----+----------+------+
|EmpID| Name|Department|Salary|
+-----+-----+----------+------+
|    1|Alice|     Sales|  3000|
|    2|  Bob|        IT|  4000|
|    3|Cathy|        HR|  3500|
|    4|David|     Sales|  4500|
|    5|  Eva|        IT|  4200|
+-----+-----+----------+------+



In [31]:
# Field order of every record in `data` below.
columns = ["EmpID", "Name", "Department", "Salary"]

# A three-row employee sample used by the file read/write examples that follow.
data = [
    (1, "Alice", "Sales", 3000),
    (2, "Bob", "IT", 4000),
    (3, "Cathy", "HR", 3500),
]

df = spark.createDataFrame(data, schema=columns)
df.show()

+-----+-----+----------+------+
|EmpID| Name|Department|Salary|
+-----+-----+----------+------+
|    1|Alice|     Sales|  3000|
|    2|  Bob|        IT|  4000|
|    3|Cathy|        HR|  3500|
+-----+-----+----------+------+



In [32]:
# Write the DataFrame as JSON files into the "json_data" directory,
# replacing whatever an earlier run left there.
df.write.json("json_data", mode="overwrite")

In [33]:
# List the files inside the JSON output directory.
# NOTE(review): the write above targets the relative path "json_data"; this
# absolute path assumes the working directory is /content - confirm.
!ls /content/json_data/

part-00000-53abce00-0fac-4a14-940c-9563af0bd413-c000.json  _SUCCESS


In [34]:
!cat /content/json_data/part-00000-53abce00-0fac-4a14-940c-9563af0bd413-c000.json

{"EmpID":1,"Name":"Alice","Department":"Sales","Salary":3000}
{"EmpID":2,"Name":"Bob","Department":"IT","Salary":4000}
{"EmpID":3,"Name":"Cathy","Department":"HR","Salary":3500}


In [35]:
# Load the JSON directory back into a DataFrame; the schema is inferred
# from the JSON records themselves.
json_reader = spark.read.format("json")
json_df = json_reader.load("json_data")
json_df.show()

+----------+-----+-----+------+
|Department|EmpID| Name|Salary|
+----------+-----+-----+------+
|     Sales|    1|Alice|  3000|
|        IT|    2|  Bob|  4000|
|        HR|    3|Cathy|  3500|
+----------+-----+-----+------+



In [40]:
# Display the current contents of `df` before writing it out as CSV.
df.show()

+-----+-----+----------+------+
|EmpID| Name|Department|Salary|
+-----+-----+----------+------+
|    1|Alice|     Sales|  3000|
|    2|  Bob|        IT|  4000|
|    3|Cathy|        HR|  3500|
+-----+-----+----------+------+



In [41]:
# Output directory for the CSV files.
strPath = "csv_data"

# Write the DataFrame as CSV with a header line, replacing any earlier output.
df.write.csv(strPath, mode="overwrite", header="true")

In [42]:
# Read the CSV directory back, treating the first line of each file as the
# column names. No schema or inferSchema option is given here.
csv_df = spark.read.csv(strPath, header="true")
csv_df.show()

+-----+-----+----------+------+
|EmpID| Name|Department|Salary|
+-----+-----+----------+------+
|    1|Alice|     Sales|  3000|
|    2|  Bob|        IT|  4000|
|    3|Cathy|        HR|  3500|
+-----+-----+----------+------+



In [1]:
import random
import csv

# Pools of values from which each generated record draws at random.
names = ["John", "Jane", "Mike", "Sara", "David", "Emily", "George", "Nina", "Tom", "Anna"]
departments = ["Sales", "IT", "HR", "Finance", "Marketing"]
salaries = [3000, 4000, 5000, 6000, 7000, 8000, 9000, 10000]

# Write a header line followed by 30 records with IDs 1..30 to
# 'employee_data.csv' in the current working directory.
with open('employee_data.csv', mode='w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(["ID", "Name", "Department", "Salary"])
    # Each row picks, in order, a random name, department and salary.
    writer.writerows(
        [record_id, random.choice(names), random.choice(departments), random.choice(salaries)]
        for record_id in range(1, 31)
    )

print("CSV file 'employee_data.csv' has been generated successfully.")

CSV file 'employee_data.csv' has been generated successfully.


In [2]:
# Print the generated CSV file (header line plus the 30 records).
!cat employee_data.csv

ID,Name,Department,Salary
1,George,Marketing,8000
2,Tom,Marketing,4000
3,Jane,IT,4000
4,Anna,Marketing,7000
5,Jane,HR,9000
6,Anna,IT,3000
7,George,HR,8000
8,John,HR,4000
9,Anna,Marketing,4000
10,John,IT,8000
11,George,Marketing,4000
12,David,Marketing,8000
13,Sara,IT,8000
14,Mike,Sales,7000
15,David,Finance,8000
16,Mike,Sales,9000
17,Tom,IT,10000
18,Emily,Finance,6000
19,George,Marketing,4000
20,Mike,Sales,7000
21,Sara,HR,4000
22,Emily,Finance,6000
23,George,HR,3000
24,Jane,Finance,6000
25,Anna,Sales,6000
26,George,Finance,9000
27,George,Marketing,8000
28,David,IT,9000
29,Anna,IT,3000
30,David,IT,6000


In [5]:
# Create the "empdata" directory (no error if it already exists) and move the
# generated CSV into it.
# NOTE(review): the streaming reader later in this notebook reads from
# /content/empdata/, which matches this relative path only when the working
# directory is /content - confirm.
!mkdir -p empdata/
!mv employee_data.csv empdata/

In [10]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

# (column name, Spark type) pairs, in the order the columns are declared.
_column_types = [
    ("EmpID", IntegerType),
    ("Name", StringType),
    ("Department", StringType),
    ("Salary", IntegerType),
]

# Explicit schema for employee records: integer EmpID and Salary,
# string Name and Department.
schema = StructType(
    [StructField(column, spark_type()) for column, spark_type in _column_types]
)

In [44]:
from pyspark.sql import SparkSession

# Obtain (or reuse) a SparkSession for the streaming example.
spark = SparkSession.builder.appName("StreamingExample").getOrCreate()

# Streaming DataFrame over the CSV files in /content/empdata/.
# Streaming file sources need an explicit schema, supplied here by the
# `schema` object defined in an earlier cell.
# The CSV generated earlier in this notebook starts with a header line
# ("ID,Name,Department,Salary"). Without header=true that line is parsed as a
# data record, and its non-numeric "ID"/"Salary" values cannot be read as
# integers, so the stream would contain a bogus row.
stream_df = (
    spark.readStream
    .option("sep", ",")
    .option("header", "true")
    .schema(schema)
    .csv("/content/empdata/")
)


In [45]:
from pyspark.sql.functions import upper

# Add a "NameUpper" column holding the upper-cased value of "Name".
name_in_caps = upper(stream_df["Name"])
transform_df = stream_df.withColumn("NameUpper", name_in_caps)

In [48]:
# Configure a console sink in append mode and start the streaming query.
console_writer = transform_df.writeStream.format("console").outputMode("append")
query = console_writer.start()

# Block this cell until the query terminates. While it is blocked, no other
# cell in this kernel can run.
query.awaitTermination()

In [None]:
import random
import csv
import time
from google.colab import drive
import os

# Make Google Drive available under /content/drive.
drive.mount('/content/drive')

# Directory that receives the generated CSV files; created when missing.
file_dir = '/content/drive/MyDrive/newData/'
os.makedirs(file_dir, exist_ok=True)

# Pools of values from which each generated record draws at random.
names = ["John", "Jane", "Mike", "Sara", "David", "Emily", "George", "Nina", "Tom", "Anna"]
departments = ["Sales", "IT", "HR", "Finance", "Marketing"]
salaries = [3000, 4000, 5000, 6000, 7000, 8000, 9000, 10000]

i = 1          # ID assigned to the next record (keeps increasing across files)
j = 0          # Index of the current output file
temp = 0       # Number of records already written to the current file
max_rows = 10  # Records per file before moving on to the next file


def _write_csv_row(path, row, mode):
    """Open `path` with the given mode, write `row` as one CSV line, close it."""
    with open(path, mode=mode, newline='') as handle:
        csv.writer(handle).writerow(row)


# Runs until the cell is interrupted: emits one record per second and starts
# a new file after every `max_rows` records.
while True:
    # A record count of zero means a new file is due: create it with a header.
    if temp == 0:
        file_path = os.path.join(file_dir, f"streamed_data{j}.csv")
        _write_csv_row(file_path, ["ID", "Name", "Department", "Salary"], mode='w')
        print(f"Created new file: {file_path}")

    # Draw a random name, department and salary (in that order) for record i.
    record = [i, random.choice(names), random.choice(departments), random.choice(salaries)]
    print(record)

    # The file is reopened in append mode for every single record.
    _write_csv_row(file_path, record, mode='a')

    # Advance the counters; wrapping back to zero triggers the file rotation.
    i += 1
    temp = (temp + 1) % max_rows
    if temp == 0:
        j += 1

    time.sleep(1)


In [68]:
# Delete every streamed_data*.csv file from the Google Drive output directory
# used by the generator cell above.
!rm /content/drive/MyDrive/newData/streamed_data*.csv