# Spark Mastery



In [12]:
%load_ext autoreload
%autoreload 2

In [13]:
# make sure your run the cell above before running this
import helper

In [14]:
from pyspark.sql import SparkSession

spark = SparkSession \
    .builder \
    .appName("Python Spark") \
    .getOrCreate()

sc = spark.sparkContext

In [15]:
# create some test data
import csv
import random

# Define the number of lines and the headers
num_lines = 500000
headers = ['jobId', 'status']

# Status options for the 'status' field
status_options = ['Pending', 'In Progress', 'Completed', 'Failed']

# Creating and writing to the CSV file
with open('jobs_data.csv', 'w', newline='') as file:
    writer = csv.writer(file)
    
    # Writing the header
    writer.writerow(headers)
    
    # Writing the data rows
    for i in range(1, num_lines + 1):
        # Generate a random status
        status = random.choice(status_options)
        # Write the row
        writer.writerow([f"Job{i}", status])

# Inform the user that the file has been created
print('CSV file created with 500,000 lines.')


CSV file created with 500,000 lines.


In [16]:
# Spark Approach

In [17]:
df = spark.read.csv("jobs_data.csv", header=True, inferSchema=True)
print(df.head)
print(df.columns)
df.show()

<bound method DataFrame.head of DataFrame[jobId: string, status: string]>
['jobId', 'status']
+-----+-----------+
|jobId|     status|
+-----+-----------+
| Job1|    Pending|
| Job2|    Pending|
| Job3|     Failed|
| Job4|  Completed|
| Job5|     Failed|
| Job6|     Failed|
| Job7|In Progress|
| Job8|    Pending|
| Job9|In Progress|
|Job10|In Progress|
|Job11|     Failed|
|Job12|  Completed|
|Job13|    Pending|
|Job14|    Pending|
|Job15|     Failed|
|Job16|  Completed|
|Job17|In Progress|
|Job18|In Progress|
|Job19|  Completed|
|Job20|In Progress|
+-----+-----------+
only showing top 20 rows



In [19]:
# retrieve total statuses
helper.get_status_counts(df)

                                                                                

Status: Completed, Count: 125050
Status: In Progress, Count: 125412
Status: Failed, Count: 124797
Status: Pending, Count: 124741
