In [None]:
from pyspark.sql.functions import *
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType
from pyspark.sql.window import Window
import pandas as pd
# import boto3
# import datetime as dt
from datetime import datetime, timedelta
# import proximityhash as pr
# import pygeohash as pgh
# import geojson
# import geohashlite

# import matplotlib.pyplot as plt
# from reportlab.lib.pagesizes import letter
# from reportlab.pdfgen import canvas

In [None]:
import geopandas as gpd
import folium
from folium import Choropleth
from shapely.geometry import shape
import json
from folium import LinearColormap
from selenium import webdriver
import time
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from fpdf import FPDF

In [None]:
# Start (or reuse) the Spark session used by this notebook
spark = (
    SparkSession.builder
    .appName("insights")
    .getOrCreate()
)

In [None]:
# Read the county-level CSV (first row supplies the column names) and
# preview the first five rows
county_count_csv = "/Users/nlblr135/Downloads/insights/geoid_data/new_full_county/county_count.csv"
df = spark.read.csv(county_count_csv, header=True)
df.show(5)

In [None]:
# Materialise the Spark DataFrame as a pandas DataFrame; the cleaning and
# mapping cells that follow operate on this pandas copy
pdf = df.toPandas()

In [None]:
# Parse 'Reach Estimate' as numbers (entries that cannot be parsed become NaN),
# then discard the rows left without a usable 'Reach Estimate'
reach_estimate = pd.to_numeric(pdf['Reach Estimate'], errors='coerce')
pdf = pdf.assign(**{'Reach Estimate': reach_estimate}).dropna(subset=['Reach Estimate'])

In [None]:
# Turn a raw "MultiPolygon <coordinates>" string into a GeoJSON-like dict
def convert_to_geojson(geometry_str):
    """Build a GeoJSON-style MultiPolygon mapping from a raw geometry string.

    Every occurrence of the text 'MultiPolygon ' is removed from
    ``geometry_str``; what remains is used verbatim as the JSON value of
    "coordinates" inside a ``{"type": "MultiPolygon", ...}`` object, and that
    object is parsed with ``json.loads``.

    Returns:
        The parsed dict with "type" and "coordinates" keys.

    Raises:
        json.JSONDecodeError: if the remaining text is not valid JSON.
    """
    # Splitting on the prefix and re-joining drops every occurrence of it
    coordinates_text = "".join(geometry_str.split('MultiPolygon '))
    wrapped = '{"type": "MultiPolygon", "coordinates": ' + coordinates_text + '}'
    return json.loads(wrapped)

# Convert to Shapely geometries.
# Values that are already geometries (they expose `geom_type`) are passed
# through untouched, so re-running this cell no longer fails by trying to
# parse a column that has already been converted.
def _to_shapely(value):
    """Return `value` as a Shapely geometry, parsing it first if it is still raw text."""
    if hasattr(value, 'geom_type'):
        return value
    return shape(convert_to_geojson(value))


pdf['geometry'] = pdf['geometry'].apply(_to_shapely)

In [None]:
# Wrap the pandas frame as a GeoDataFrame on its 'geometry' column and tag
# the coordinates with CRS EPSG:4326
gdf = gpd.GeoDataFrame(pdf, geometry=pdf['geometry']).set_crs(epsg=4326)

In [None]:
# Make sure 'Reach Estimate' is numeric before ranking on it
reach_col = 'Reach Estimate'
pdf[reach_col] = pd.to_numeric(pdf[reach_col], errors='coerce')

# Row with the largest 'Reach Estimate'
top_index = pdf[reach_col].idxmax()
max_count_row = pdf.loc[top_index]

# Geometry of the region sharing that row's GEOID (first match)
is_top_region = gdf['GEOID'] == max_count_row['GEOID']
max_count_geometry = gdf.loc[is_top_region, 'geometry'].values[0]

# Map centre is given as [y, x] of that geometry's centroid
top_centroid = max_count_geometry.centroid
center_coordinates = [top_centroid.y, top_centroid.x]

# Base folium map centred on the region with the largest 'Reach Estimate'
m = folium.Map(location=center_coordinates, zoom_start=9)

In [None]:
# Colour-scale bounds for 'Reach Estimate'.
# The lower bound stays at 1 unless the data itself goes below 1: folium's
# Choropleth raises ValueError when any value lies outside the supplied bins.
# (Written as a conditional rather than min(...) because
# `from pyspark.sql.functions import *` at the top of the notebook may shadow
# the built-in min/max.)
reach_values = pdf['Reach Estimate']
lowest_reach = reach_values.min()
min_count = lowest_reach if lowest_reach < 1 else 1
print(min_count)
max_count = reach_values.max()
print(max_count)

# 20 equal-width bins -> 21 edges. The final edge is set to max_count itself
# instead of min_count + 20 * bin_step, so floating-point rounding can never
# leave the largest value just above the top edge.
N_BINS = 20
bin_step = (max_count - min_count) / N_BINS
bins = [min_count + i * bin_step for i in range(N_BINS)] + [max_count]

In [None]:
# Choropleth layer: shade each region by its 'Reach Estimate'. Rows of `pdf`
# are matched to GeoJSON features through the GEOID property.
choropleth_layer = Choropleth(
    geo_data=gdf.to_json(),             # GeoDataFrame serialised to a GeoJSON string
    name='choropleth',
    data=pdf,
    columns=['GEOID', 'Reach Estimate'],  # [key column, value column]
    key_on='feature.properties.GEOID',
    fill_color='OrRd',
    fill_opacity=0.9,
    line_opacity=0.4,                   # region borders drawn semi-transparent
    legend_name='Reach Estimate',
    bins=bins,                          # explicit bin edges computed in the previous cell
)
choropleth_layer.add_to(m)

In [None]:
def _white_outline(feature):
    """Style callback for the outline layer: the same thin white line for every feature."""
    return {'color': 'white', 'weight': 0.9}


# Overlay the region outlines on top of the choropleth
folium.GeoJson(gdf, name='geojson', style_function=_white_outline).add_to(m)

In [None]:
# Write the map out as a standalone HTML file
heatmap_html_path = '/Users/nlblr135/Downloads/insights/final/heatmap.html'
m.save(heatmap_html_path)

# Leaving the map as the cell's last expression renders it inline
m

In [None]:
# Location of the chromedriver binary
driver_path = '/Users/nlblr135/Downloads/insights/chromedriver-mac-x64/chromedriver'

# Chrome flags, applied in this order
chrome_options = Options()
for chrome_flag in ('--headless', '--no-sandbox', '--disable-dev-shm-usage'):
    chrome_options.add_argument(chrome_flag)

# Launch Chrome through a Service bound to that chromedriver binary
service = Service(driver_path)
driver = webdriver.Chrome(service=service, options=chrome_options)

In [None]:
# Open the saved HTML map file
driver.get('file:///Users/nlblr135/Downloads/insights/final/heatmap.html')

# Pause before the screenshot cell runs: the map keeps drawing (tiles, layers)
# after the page itself has loaded, so capturing immediately can produce a
# blank or half-rendered image. Increase the wait on a slow connection.
MAP_RENDER_WAIT_SECONDS = 5
time.sleep(MAP_RENDER_WAIT_SECONDS)

In [None]:
# Save screenshot, and always shut the browser down afterwards - even when the
# capture fails - so no headless Chrome/chromedriver process is left running.
screenshot_path = '/Users/nlblr135/Downloads/insights/final/texas_heatmap_map_ss.png'
try:
    # save_screenshot reports a failed write by returning False instead of raising
    screenshot_saved = driver.save_screenshot(screenshot_path)
finally:
    driver.quit()

if not screenshot_saved:
    raise OSError(f"Could not write screenshot to {screenshot_path}")