In [10]:
from pyspark import SparkContext, SparkConf

# Create a Spark configuration
conf = SparkConf().setAppName("SquareIntegers").setMaster("local[*]")

# Create a SparkContext
sc = SparkContext(conf=conf)

# Sample data: Replace this with your set of integers
data = [1, 2, 3, 4, 5]

# Parallelize the data into an RDD (Resilient Distributed Dataset)
rdd = sc.parallelize(data)

# Use map transformation to square each element
squared_rdd = rdd.map(lambda x: x**2)

# Collect the results back to the driver program
result = squared_rdd.collect()

# Print the squared values
print("Original Data: {}".format(data))
print("Squared Data:  {}".format(result))

# Stop the SparkContext
sc.stop()




Original Data: [1, 2, 3, 4, 5]
Squared Data:  [1, 4, 9, 16, 25]


In [11]:
from pyspark import SparkContext, SparkConf

# Create a Spark configuration
conf = SparkConf().setAppName("MaxOfNumbers").setMaster("local[*]")

# Create a SparkContext
sc = SparkContext(conf=conf)

# Sample data: Replace this with your set of numbers
data = [10, 5, 8, 15, 3]

# Parallelize the data into an RDD
rdd = sc.parallelize(data)

# Use the reduce action to find the maximum
max_number = rdd.reduce(lambda x, y: max(x, y))

# Print the maximum number
print("Original Data: {}".format(data))
print("Maximum Number: {}".format(max_number))

# Stop the SparkContext
sc.stop()




Original Data: [10, 5, 8, 15, 3]
Maximum Number: 15


In [12]:
from pyspark import SparkContext, SparkConf

# Create a Spark configuration
conf = SparkConf().setAppName("AverageOfNumbers").setMaster("local[*]")

# Create a SparkContext
sc = SparkContext(conf=conf)

# Sample data: Replace this with your set of numbers
data = [10, 5, 8, 15, 3]

# Parallelize the data into an RDD
rdd = sc.parallelize(data)

# Use the reduce action to calculate the sum
total_sum = rdd.reduce(lambda x, y: x + y)

# Calculate the average
count = rdd.count()
average = total_sum / count if count > 0 else 0

# Print the results
print("Original Data: {}".format(data))
print("Total Sum: {}".format(total_sum))
print("Count: {}".format(count))
print("Average: {}".format(average))

# Stop the SparkContext
sc.stop()




Original Data: [10, 5, 8, 15, 3]
Total Sum: 41
Count: 5
Average: 8.2


In [13]:
from pyspark.sql import SparkSession

# Create a Spark session
spark = SparkSession.builder.appName("ReadCSV").getOrCreate()

# Specify the path to your CSV file
csv_file_path = "Data.csv"

# Read the CSV file into a DataFrame
df = spark.read.csv(csv_file_path, header=True, inferSchema=True)

# Show the DataFrame
df.show()

# Stop the Spark session
spark.stop()




+----+--------------------+--------------------+------------+--------------------+-----+-----------------+
|year|industry_code_ANZSIC|industry_name_ANZSIC|rme_size_grp|            variable|value|             unit|
+----+--------------------+--------------------+------------+--------------------+-----+-----------------+
|2011|                   A|Agriculture, Fore...|         a_0|       Activity unit|46134|            COUNT|
|2011|                   A|Agriculture, Fore...|         a_0|Rolling mean empl...|    0|            COUNT|
|2011|                   A|Agriculture, Fore...|         a_0|Salaries and wage...|  279|DOLLARS(millions)|
|2011|                   A|Agriculture, Fore...|         a_0|Sales, government...| 8187|DOLLARS(millions)|
|2011|                   A|Agriculture, Fore...|         a_0|        Total income| 8866|DOLLARS(millions)|
|2011|                   A|Agriculture, Fore...|         a_0|   Total expenditure| 7618|DOLLARS(millions)|
|2011|                   A|Agricultur

In [14]:
from pyspark.sql import SparkSession

# Create a Spark session
spark = SparkSession.builder.appName("DisplayDataFrame").getOrCreate()

# Specify the path to your CSV file
csv_file_path = "Data.csv"

# Read the CSV file into a DataFrame
df = spark.read.csv(csv_file_path, header=True, inferSchema=True)

# Display the first few rows of the DataFrame
print("First few rows of the DataFrame:")
df.show()

# Display the schema of the DataFrame
print("DataFrame Schema:")
df.printSchema()

# Stop the Spark session
spark.stop()




First few rows of the DataFrame:
+----+--------------------+--------------------+------------+--------------------+-----+-----------------+
|year|industry_code_ANZSIC|industry_name_ANZSIC|rme_size_grp|            variable|value|             unit|
+----+--------------------+--------------------+------------+--------------------+-----+-----------------+
|2011|                   A|Agriculture, Fore...|         a_0|       Activity unit|46134|            COUNT|
|2011|                   A|Agriculture, Fore...|         a_0|Rolling mean empl...|    0|            COUNT|
|2011|                   A|Agriculture, Fore...|         a_0|Salaries and wage...|  279|DOLLARS(millions)|
|2011|                   A|Agriculture, Fore...|         a_0|Sales, government...| 8187|DOLLARS(millions)|
|2011|                   A|Agriculture, Fore...|         a_0|        Total income| 8866|DOLLARS(millions)|
|2011|                   A|Agriculture, Fore...|         a_0|   Total expenditure| 7618|DOLLARS(millions)|
|201

In [15]:
from pyspark.sql import SparkSession

# Create a Spark session
spark = SparkSession.builder.appName("ColumnSummaryStatistics").getOrCreate()

# Specify the path to your CSV file
csv_file_path = "Data.csv"

# Read the CSV file into a DataFrame
df = spark.read.csv(csv_file_path, header=True, inferSchema=True)

# Specify the column for which you want to calculate summary statistics
selected_column = "value"

# Calculate summary statistics for the selected column
column_summary = df.select(selected_column).describe()

# Display the summary statistics
print("Summary Statistics for '{}' column:".format(selected_column))
column_summary.show(truncate=False)

# Stop the Spark session
spark.stop()




Summary Statistics for 'value' column:
+-------+------------------+
|summary|value             |
+-------+------------------+
|count  |17028             |
|mean   |12858.970234173803|
|stddev |83791.06859238271 |
|min    |-1                |
|max    |C                 |
+-------+------------------+

