In [4]:
!pip install -q findspark

DEPRECATION: arcgis 2.0.0 has a non-standard dependency specifier keyring<=21.8.*,>=19. pip 24.1 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of arcgis or contact the author to suggest that they release a version with a conforming dependency specifiers. Discussion can be found at https://github.com/pypa/pip/issues/12063


In [10]:
!tar xf spark-3.5.1-bin-hadoop3.tgz
!pip install -q findspark


tar: Error opening archive: Failed to open 'spark-3.5.1-bin-hadoop3.tgz'
DEPRECATION: arcgis 2.0.0 has a non-standard dependency specifier keyring<=21.8.*,>=19. pip 24.1 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of arcgis or contact the author to suggest that they release a version with a conforming dependency specifiers. Discussion can be found at https://github.com/pypa/pip/issues/12063


In [11]:
import os
import findspark

os.environ["JAVA_HOME"] = "/Java/jdk-22/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/spark/spark-3.5.1-bin-hadoop3"

findspark.init()


In [2]:
import os
import findspark

# Configure Spark environment variables
os.environ["JAVA_HOME"] = "C:\\Java\\jdk-22"  # Adjust to your JDK path
os.environ["SPARK_HOME"] = "C:\\spark\\spark-3.5.1-bin-hadoop3"  # Adjust to your Spark path

# Initialize findspark
findspark.init()

# Import Spark libraries
from pyspark.sql import SparkSession

# Create a Spark session
spark = SparkSession.builder.appName("ExampleApp").getOrCreate()


In [4]:
# Task 1: Filter Airport Cities by Latitude
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("AirportLatitudeFilter").getOrCreate()

# Read the airport data
airport_df = spark.read.csv("in/cities.text", header=False, inferSchema=True)

# Filter airports with latitude > 40
filtered_airports = airport_df.filter(airport_df._c1 > 40)

# Select airport name and latitude
result = filtered_airports.select("_c0", "_c1")

# Save the result to text file
result.write.csv("out/cities_by_latitude.txt")

spark.stop()


In [5]:
# Task 2: Find Common Hosts in NASA Logs
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("CommonHosts").getOrCreate()

# Read the log files
log1_df = spark.read.csv("in/nasa_log_01.tsv", sep='\t', header=True)
log2_df = spark.read.csv("in/nasa_log_02.tsv", sep='\t', header=True)

# Select hosts
hosts1 = log1_df.select("host").distinct()
hosts2 = log2_df.select("host").distinct()

# Find common hosts
common_hosts = hosts1.intersect(hosts2)

# Save the result to CSV
common_hosts.write.csv("out/same_hosts_nasa_logs.csv")

spark.stop()


In [7]:
# Task 3: Sum of First 100 Prime Numbers
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("SumPrimes").getOrCreate()

# Read the primes file
primes_rdd = spark.sparkContext.textFile("in/primes.text")

# Convert each row of primes to a list of integers
primes_rdd = primes_rdd.flatMap(lambda line: [int(x) for x in line.split()])

# Sum the primes
sum_primes = primes_rdd.sum()

# Print the result
print("Sum of first 100 prime numbers:", sum_primes)

spark.stop()


Sum of first 100 prime numbers: 24133


In [8]:
# Task 4: Real Estate Data Analysis
from pyspark.sql import SparkSession
from pyspark.sql.functions import avg, max

spark = SparkSession.builder.appName("RealEstateAnalysis").getOrCreate()

# Read the real estate data
real_estate_df = spark.read.csv("in/RealEstate.csv", header=True, inferSchema=True)

# Group by location and calculate average price per SQ Ft and max price
aggregated_df = real_estate_df.groupBy("Location").agg(
    avg("Price SQ Ft").alias("avg_price_sq_ft"),
    max("Price").alias("max_price")
)

# Sort by average price per SQ Ft
sorted_df = aggregated_df.orderBy("avg_price_sq_ft", ascending=False)

# Show the result
sorted_df.show()

spark.stop()


+----------------+------------------+---------+
|        Location|   avg_price_sq_ft|max_price|
+----------------+------------------+---------+
|          Oceano|           1144.64|1195000.0|
|         Bradley|            606.06|1600000.0|
|     Avila Beach| 566.5500000000001|1999000.0|
|         Cambria| 491.9558333333334|2995000.0|
|     Pismo Beach|462.28416666666664|1799000.0|
| San Luis Obispo|458.91333333333336|2369000.0|
|      Santa Ynez|391.33000000000004|1395000.0|
|         Cayucos|             386.6|1500000.0|
|         Cayucos|            385.11| 695000.0|
|       Morro Bay|374.13750000000005| 982800.0|
|   Arroyo Grande| 361.4208333333333|5499000.0|
|     Pismo Beach|         357.01125| 920000.0|
|         Cambria| 347.5544444444444| 699900.0|
|       Morro Bay|345.90538461538466|1100000.0|
|         Creston|            322.75| 549000.0|
|     Out Of Area|            314.47|1195000.0|
|    Grover Beach|            308.78| 999000.0|
| San Luis Obispo|291.09357142857147| 89