In [None]:
from google.colab import output
output.enable_custom_widget_manager()
!pip install ipywidgets
!pip install opendatasets ipyopenlayers pandas

from ipyopenlayers import RasterTileLayer, ZoomSlider, Map
from ipyleaflet import Map, Marker, MarkerCluster,Popup
from ipywidgets import HTML
import opendatasets as od
import pandas as pd
import geopandas as gpd
from geopy.geocoders import Nominatim
from shapely.geometry import Point
import time
import os
import kagglehub
from tqdm import tqdm
import pickle

!pip install pyspark opendatasets kagglehub

from pyspark.sql import SparkSession
from pyspark.sql.functions import (
    avg,
    col,
    lit,
    max as spark_max,
    min as spark_min,
    substring,
    trim,
    upper,
    when,
)

# One SparkSession for the whole notebook. The session used to be built twice
# with two different app names; getOrCreate() hands back the already-running
# session on the second call, so the duplicate builder was dead code.
spark = SparkSession.builder.appName("QOLI-Spark").getOrCreate()

# 1. Load CSVs into Spark DataFrames
# Every input has a header row and lets Spark infer the column types, so the
# reader options are configured once and reused for all four files.
csv_reader = spark.read.option("header", True).option("inferSchema", True)

pollution_df = csv_reader.csv("pollution_data_spark_geocode.csv")
qof_df = csv_reader.csv("qof_data.csv")
col_df = csv_reader.csv("Cost_of_living_ranked.csv")
crime_df = csv_reader.csv("Crime_Ranked.csv")





In [None]:

# 2. Keep only the columns the index needs, under one naming scheme
# Pollution: yearly rows only; source columns are renamed to the shared
# lowercase keys (state / county / year) used by the other datasets.
pollution_df = (
    pollution_df
    .where(col("Granularity") == "yearly")
    .select(
        col("State").alias("state"),
        col("County").alias("county"),
        col("ranking_final_value").alias("pollution_ranking"),
        col("Year").alias("year"),
        "geometry",
    )
)

# qof_data: its generic "ranking" column becomes "unemployment_ranking".
qof_df = qof_df.select(
    "state",
    "county",
    col("ranking").alias("unemployment_ranking"),
    "geometry",
)

# Cost of living: columns already carry their final names.
col_df = col_df.select(
    col("state"),
    col("county"),
    col("cost_of_living_ranking"),
    col("geometry"),
)

# Crime: the only other dataset that carries a year column.
crime_df = crime_df.select("state", "county", "year", "crime_ranking", "geometry")

In [None]:

# 3. Normalize casing and whitespace
def normalize_join_keys(df, key_columns=("state", "county")):
    """Return `df` with each key column cast to string, upper-cased and trimmed.

    The datasets are joined on these columns, so differences in case or
    surrounding whitespace would otherwise stop matching rows from lining up.

    Args:
        df: Spark DataFrame containing every column named in `key_columns`.
        key_columns: names of the columns to normalize.

    Returns:
        A new DataFrame; the input DataFrame is not modified.
    """
    for key in key_columns:
        df = df.withColumn(key, trim(upper(col(key).cast("string"))))
    return df


# Explicit reassignment replaces the former eval()/globals() lookup by name,
# which hid the data flow and would break outside a module-level scope.
pollution_df = normalize_join_keys(pollution_df)
qof_df = normalize_join_keys(qof_df)
col_df = normalize_join_keys(col_df)
crime_df = normalize_join_keys(crime_df)

# 4. Merge all datasets
# Full outer joins keep every row from every source; gaps are filled in step 5.
# NOTE(review): "geometry" is part of every join key, so rows only line up when
# the geometry values are exactly equal across files — confirm that they are.
# NOTE(review): rows with no pollution match reach the crime join with a null
# year and presumably never match a crime row on "year" — confirm intended.
merged = (
    pollution_df
    .join(qof_df, ["state", "county", "geometry"], "outer")
    .join(col_df, ["state", "county", "geometry"], "outer")
    .join(crime_df, ["state", "county", "year", "geometry"], "outer")
)

# 5. Fill missing values with group averages, then with overall average
# Fallback order for a null ranking:
#   1. average of the same county (within its state),
#   2. average of the state,
#   3. average over the whole merged table.
for col_name in ["pollution_ranking", "unemployment_ranking", "cost_of_living_ranking", "crime_ranking"]:
    # All three averages are computed from the table as it stands at the start
    # of this iteration, i.e. before any value of `col_name` is filled in.
    # The county average is keyed on (state, county): county names repeat
    # across states, so grouping on "county" alone would blend unrelated
    # counties that merely share a name.
    county_avg = merged.groupBy("state", "county").agg(avg(col_name).alias("county_avg"))
    state_avg = merged.groupBy("state").agg(avg(col_name).alias("state_avg"))
    # Stays None when the column has no non-null value at all; the column then
    # simply remains null.
    overall_avg = merged.agg(avg(col_name).alias("overall_avg")).first()["overall_avg"]

    merged = (
        merged
        .join(county_avg, on=["state", "county"], how="left")
        .join(state_avg, on="state", how="left")
        .withColumn(
            col_name,
            # First non-null value wins, in fallback order.
            when(col(col_name).isNotNull(), col(col_name))
            .when(col("county_avg").isNotNull(), col("county_avg"))
            .when(col("state_avg").isNotNull(), col("state_avg"))
            .otherwise(lit(overall_avg)),
        )
        .drop("county_avg", "state_avg")
    )

# 6. Default the year
# Rows that already have a year keep it (cast to int); rows without one are
# stamped with 2022.
merged = merged.withColumn(
    "year",
    when(col("year").isNotNull(), col("year").cast("int")).otherwise(lit(2022)),
)


In [None]:

# 7. Compute QOLI (using assumptions as discussed)
# Each entry is (weight, indicator). Indicators with no data source in this
# notebook are held at the constant 1.
# NOTE(review): the weights add up to 1.2864, not 1 — confirm this is intended.
# NOTE(review): unemployment_ranking sits in the I_house_price slot —
# presumably used as a proxy; confirm.
qoli_weighted_terms = [
    (0.389, lit(1)),                          # I_pp
    (0.128, col("crime_ranking")),            # I_safety
    (0.103, lit(1)),                          # I_health
    (0.0855, lit(1)),                         # I_climate
    (0.2564, col("unemployment_ranking")),    # I_house_price
    (0.0256, col("cost_of_living_ranking")),  # I_cost_liv
    (0.128, lit(1)),                          # I_traffic
    (0.1709, col("pollution_ranking")),       # I_pollution
]

# Accumulate left to right so the resulting column expression adds the terms
# in the order listed above.
qoli_expr = None
for weight, indicator in qoli_weighted_terms:
    contribution = weight * indicator
    qoli_expr = contribution if qoli_expr is None else qoli_expr + contribution

merged = merged.withColumn("QOLI", qoli_expr)


In [None]:
# 8. Select final columns
# Location keys first, then the four component rankings, then the index.
output_columns = [
    "state",
    "county",
    "geometry",
    "year",
    "pollution_ranking",
    "unemployment_ranking",
    "cost_of_living_ranking",
    "crime_ranking",
    "QOLI",
]
final_df = merged.select(*output_columns)

# 9. Save to CSV
# Written with a header row; any existing output at this path is replaced.
# Spark writes the path as a directory of part files, not a single file.
final_df.write.mode("overwrite").option("header", True).csv("final_df_spark.csv")