## Explore the data, clean it, and perform EDA on it

In [None]:
import sys
from pathlib import Path
import os 
import tqdm 
import pandas as pd
import plotly.express as px

# Render DataFrames in full: no truncation of rows or columns
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
# Fixed-point floats with three decimals instead of scientific notation
pd.set_option('display.float_format', lambda value: '%.3f' % value)


# Extend the import search path with the project directory
# (presumably so the `src` package imported in the next cell resolves -- confirm)
sys.path.append('/Projects/regionintelligence/apn_insights/')

In [2]:
import numpy as np
import plotly as py
import polars as pl
from src.paths import DATA_DIR


# Location of the 2019 land-use CSV under the project's DATA_DIR
file_path = DATA_DIR / '2019_RI_LAND_USE.csv'

In [3]:
# Read the complete CSV into memory as a pandas DataFrame
df = pd.read_csv(file_path)

# Descriptive statistics from pandas' describe(); displayed in a later cell
stats = df.describe()

# Number of null entries in each column
missing_values = df.isna().sum()

# Report the null counts for every column
print("Missing Values for Each Column:", missing_values, sep="\n")

  df = pd.read_csv(file_path)


Missing Values for Each Column:
OID_                                  0
RI_PARCEL_ID                          0
APN_RAW_2019                       8645
COUNTY_NAME                           0
RI_COUNTY_ID                          0
CITY_NAME                             0
CITY_ID                               0
MULTIPART                             0
STACK                                 0
ACREAGE                               0
SLOPE                                 0
RI_DEM_GEO_ID_20                     34
APN_DUP                           47523
IMPROVEMENT_RATIO                247662
LAND_USE_2019                     10435
LAND_USE_SOURCE_2019                248
RI_UNIQUE_PARCEL_ID_2016            729
APN                                 729
CITY_GP_CODE                       9727
RI_GP_CODE                        12116
SP_INDEX                            729
CITY_SP_CODE                    4324966
RI_SP_CODE                      4342482
CITY_ZONE_CODE                    12806
RI_ZONE_

In [4]:
# Display the describe() summary computed when the CSV was loaded
stats

Unnamed: 0,OID_,RI_PARCEL_ID,RI_COUNTY_ID,CITY_ID,MULTIPART,STACK,ACREAGE,SLOPE,RI_DEM_GEO_ID_20,APN_DUP,IMPROVEMENT_RATIO,RI_UNIQUE_PARCEL_ID_2016,RI_GP_CODE,SP_INDEX,YEAR,PUBLIC_OWNED,BUILDING_SQFT,FIRE_HAZARD,SEARISE_1_METER,SEARISE_2_METER,FLOOD_PLAIN_ZONE,EQUAKE_ZONE,LIQUAFACTION_ZONE,LANDSLIDE_ZONE,PROTECTED_AREA,RIVER_WETLAND_AREA,WILDLIFE_AREA,CNDDB_RARE_SPECIES_AREA,HABITAT_RESERVE_AREA,WETLAND_AREA,URBANIZED_AREA,UNBUILT_SF,GROCERY_1_MILE,HEALTHCARE_1_MILE,OPENSPACE_1_MILE,HIGH_QUALITY_TRANSIT_AREA,JOB_CENTER,NEIGHBORHOOD_MOBILITY_AREA,ABSOLUTE_CONSTRAINT,VARIABLE_CONSTRAINT,ENVIRONMENT_JUSTICE_AREA,DISADVANTAGED_COMMUNITY_AREA,COMMUNITY_OF_CONCERN,ADU_SPACE_POSSIBILITY,SETBACK_REDUCTION_ADU,SMALL_ADU_POSSIBILITY,PARKING_EXEMPTION_ADU,SETBACK_SMALL_ADU,SETBACK_PARKING_ADU,SMALL_PARKING_ADU,SETBACK_SMALL_PARKING_ADU,Shape_Length,Shape_Area,Centroid_X,Centroid_Y
count,5131098.0,5131098.0,5131098.0,5131098.0,5131098.0,5131098.0,5131098.0,5131098.0,5131064.0,5083575.0,4883436.0,5130369.0,5118982.0,5130369.0,5131098.0,5131098.0,4266981.0,5131098.0,5131098.0,5131098.0,5131098.0,5131098.0,5131098.0,5131098.0,5131098.0,5131098.0,5131098.0,5131098.0,5131098.0,5131098.0,5131098.0,5131098.0,5131098.0,5131098.0,5131098.0,5131098.0,5131098.0,5131098.0,5131098.0,5131098.0,5131098.0,5131098.0,5131098.0,5131098.0,5131098.0,5131098.0,5131098.0,5131098.0,5131098.0,5131098.0,5131098.0,5131098.0,5131098.0,5131098.0,5131098.0
mean,2565549.5,539463307238.114,53.687,54233.802,1.004,5.257,4.951,1.639,60539238465446.37,0.092,1.924,537561271.406,1319.023,0.152,2019.0,0.036,7025.266,0.157,0.008,0.009,0.034,0.023,0.245,0.045,0.033,0.189,0.068,0.026,0.017,0.047,0.918,23472.223,2.31,31.145,13.28,0.321,0.143,0.187,0.06,0.539,0.539,0.24,0.142,0.47,0.493,0.491,0.487,0.497,0.494,0.492,0.497,297.105,29444.59,-117.798,34.011
std,1481220.55,194367385515.461,19.635,28897.713,0.105,27.009,56.39,4.315,194668669938.9,14.295,386.229,196073010.323,1038.198,0.359,0.0,0.187,26105.065,0.364,0.091,0.094,0.182,0.149,0.43,0.206,0.179,0.391,0.251,0.159,0.129,0.212,0.274,388789.197,4.408,78.563,20.5,0.467,0.35,0.39,0.237,0.498,0.498,0.427,0.349,0.499,0.5,0.5,0.5,0.5,0.5,0.5,0.5,744.412,333142.12,0.743,0.311
min,1.0,250000000001.0,25.0,296.0,1.0,1.0,0.0,0.0,60250101011000.0,0.0,0.0,250000001.0,1100.0,0.0,2019.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.017,0.0,-119.507,32.622
25%,1282775.25,375344021025.25,37.0,36770.0,1.0,1.0,0.136,0.0,60374333021019.0,0.0,0.369,371024353.0,1110.0,0.0,2019.0,0.0,1944.688,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2192.075,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,120.422,802.661,-118.3,33.821
50%,2565549.5,590006647226.5,59.0,48256.0,1.0,1.0,0.174,0.0,60590218163014.0,0.0,1.0,590072718.0,1112.0,0.0,2019.0,0.0,2477.901,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,4255.116,1.0,6.0,5.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,138.255,1030.415,-117.961,34.014
75%,3848323.75,650673041096.75,65.0,78120.0,1.0,1.0,0.359,0.0,60650451222017.0,0.0,2.355,650677633.0,1124.0,0.0,2019.0,0.0,3317.627,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,6708.329,3.0,32.0,19.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,203.412,2119.335,-117.382,34.165
max,5131098.0,1110901013090.0,111.0,99999.0,69.0,600.0,24318.681,25.0,61119901000009.0,12691.0,838260.636,1110255921.0,9999.0,1.0,2019.0,1.0,4027332.368,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,138981161.871,49.0,1389.0,232.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,127873.231,136734317.019,-114.137,35.808


In [5]:
# List every column name of the loaded DataFrame
df.columns

Index(['OID_', 'RI_PARCEL_ID', 'APN_RAW_2019', 'COUNTY_NAME', 'RI_COUNTY_ID',
       'CITY_NAME', 'CITY_ID', 'MULTIPART', 'STACK', 'ACREAGE', 'SLOPE',
       'RI_DEM_GEO_ID_20', 'APN_DUP', 'IMPROVEMENT_RATIO', 'LAND_USE_2019',
       'LAND_USE_SOURCE_2019', 'RI_UNIQUE_PARCEL_ID_2016', 'APN',
       'CITY_GP_CODE', 'RI_GP_CODE', 'SP_INDEX', 'CITY_SP_CODE', 'RI_SP_CODE',
       'CITY_ZONE_CODE', 'RI_ZONE_CODE', 'LAND_USE_2016', 'YEAR',
       'PUBLIC_OWNED', 'PUB_AGENCY_NAME', 'PUBLIC_TYPE', 'PUBLIC_SOURCE',
       'BUILDING_SQFT', 'EPA_BROWN_NAME', 'EPA_BROWN_TYPE', 'FIRE_HAZARD',
       'SEARISE_1_METER', 'SEARISE_2_METER', 'FLOOD_PLAIN_ZONE', 'EQUAKE_ZONE',
       'LIQUAFACTION_ZONE', 'LANDSLIDE_ZONE', 'PROTECTED_AREA',
       'RIVER_WETLAND_AREA', 'WILDLIFE_AREA', 'CNDDB_RARE_SPECIES_AREA',
       'HABITAT_RESERVE_AREA', 'WETLAND_AREA', 'URBANIZED_AREA', 'UNBUILT_SF',
       'GROCERY_1_MILE', 'HEALTHCARE_1_MILE', 'OPENSPACE_1_MILE',
       'OPPORTUNITY_LEVEL', 'HIGH_QUALITY_TRANSIT_A

# APN Insights based on 2019 Data
## Initial Questions we should ask regarding each City


**ACREAGE**
- What is the average Acreage of a parcel per city? Name the top 10 and bottom 10

**SLOPE** 
- What is the average Slope of a parcel per city? Name the top 10 and bottom 10

**IMPROVEMENT RATIO** 
- What is the average Improvement Ratio per City? Name the top 10 and bottom 10

**SQUARE FOOTAGE**
- What is the average unbuilt square footage per city?
- What is the average square footage of a parcel per city? Name the top 10 and bottom 10


In [6]:
# Mean parcel acreage for every city
df_average_acreage_per_city = (
    df.groupby('CITY_NAME')['ACREAGE']
    .mean()
    .reset_index(name='Average_Acreage')
)

# Ten cities with the HIGHEST mean acreage.
# The earlier version took `.tail(10)` of the descending sort, which charted the
# ten lowest cities under a "Top 10" title; `.head(10)` matches the title.
top_acreage_cities = df_average_acreage_per_city.sort_values(
    by='Average_Acreage', ascending=False
).head(10)

# The figure stays bound to `top_acreage_per_city`, as before; the ranked table
# now has its own name instead of being overwritten by the figure.
top_acreage_per_city = px.bar(
    top_acreage_cities,
    x='CITY_NAME',
    y='Average_Acreage',
    title='Top 10 Cities by Average Acreage',
)

top_acreage_per_city.show()

In [7]:
# Mean improvement ratio for every city (null ratios are skipped by mean())
df_average_improvement_ratio_per_city = (
    df.groupby('CITY_NAME')['IMPROVEMENT_RATIO']
    .mean()
    .reset_index(name='Average_Improvement_Ratio')
)

# Ten cities with the highest mean improvement ratio
top_improvement_ratio_per_city = df_average_improvement_ratio_per_city.sort_values(
    by='Average_Improvement_Ratio', ascending=False
).head(10)

# Bar chart of those ten cities. The figure previously lived in a variable named
# `fig_unbuilt_sqft_per_city` (copied from the unbuilt-square-footage cell); it now
# carries a name that matches what it plots.
fig_improvement_ratio_per_city = px.bar(
    top_improvement_ratio_per_city,
    x='CITY_NAME',
    y='Average_Improvement_Ratio',
    title='Top 10 Cities with Highest Average Improvement Ratio',
)

fig_improvement_ratio_per_city.show()

In [8]:
# Mean unbuilt square footage for every city
df_average_unbuilt_sqft_per_city = (
    df.groupby('CITY_NAME')['UNBUILT_SF']
    .mean()
    .reset_index(name='Average_Unbuilt_Square_Footage')
)

# Rank cities from largest to smallest mean and keep the first ten
top_cities_unbuilt_sqft = df_average_unbuilt_sqft_per_city.sort_values(
    'Average_Unbuilt_Square_Footage', ascending=False
).head(10)

# Bar chart of the ten cities with the largest mean unbuilt square footage
fig_unbuilt_sqft_per_city = px.bar(
    top_cities_unbuilt_sqft,
    x='CITY_NAME',
    y='Average_Unbuilt_Square_Footage',
    title='Average Unbuilt Square Footage per City',
)
fig_unbuilt_sqft_per_city.show()


In [9]:
# Mean building square footage for every city
df_average_building_sqft_per_city = (
    df.groupby('CITY_NAME')['BUILDING_SQFT']
    .mean()
    .reset_index(name='Average_Building_Square_Footage')
)

# Rank cities from largest to smallest mean and keep the first ten
top_cities_building_sqft = df_average_building_sqft_per_city.sort_values(
    'Average_Building_Square_Footage', ascending=False
).head(10)

# Bar chart of the ten cities with the largest mean building square footage
fig_building_sqft_per_city = px.bar(
    top_cities_building_sqft,
    x='CITY_NAME',
    y='Average_Building_Square_Footage',
    title='Average Building Square Footage per City',
)
fig_building_sqft_per_city.show()


## A Deeper Dive into Each City and Its APNs

### We are going to remove outliers and evenly sample the data from each "LAND_USE_CLASS_NAME" to get a better understanding of each city


In [10]:
# Define a function to calculate IQR and define outliers
def remove_outliers(df, column):
    """Keep only the rows whose `column` value lies within the 1.5 x IQR fences.

    Args:
        df: DataFrame to filter.
        column: Name of the numeric column the fences are computed on.

    Returns:
        The rows of `df` with Q1 - 1.5*IQR <= value <= Q3 + 1.5*IQR (bounds
        inclusive). Rows whose value is missing fail the comparison and are
        dropped as well.
    """
    values = df[column]
    first_quartile = values.quantile(0.25)
    third_quartile = values.quantile(0.75)
    spread = third_quartile - first_quartile
    # `between` is inclusive on both ends by default
    within_fences = values.between(
        first_quartile - 1.5 * spread,
        third_quartile + 1.5 * spread,
    )
    return df[within_fences]

In [11]:
# Function to sample up to n instances, or the city's total instances if fewer than n
def sample_n_or_less(group, n=30):
    """Return a reproducible random sample of at most `n` rows from `group`.

    Args:
        group: DataFrame (for example one groupby group) to sample from.
        n: Maximum number of rows to draw.

    Returns:
        `group` itself when it has no rows; otherwise a sample of
        min(n, len(group)) rows drawn with random_state=1.
    """
    available = len(group)
    if available == 0:
        return group
    return group.sample(n=min(n, available), random_state=1)

### Top 15 Building Square Footage across Single Family Residential Properties 

In [12]:
# Restrict to parcels whose LAND_USE_CLASS_NAME is 'Single Family Residential'
df_single_family = df[df['LAND_USE_CLASS_NAME'] == 'Single Family Residential']


# Within each city, drop parcels whose building square footage falls outside the IQR fences
df_no_outliers = df_single_family.groupby('CITY_NAME', group_keys=False).apply(
    lambda group: remove_outliers(group, 'BUILDING_SQFT')
)

# Draw the per-city sample (sample_n_or_less is called with its default size)
sampled_df = df_no_outliers.groupby('CITY_NAME', group_keys=False).apply(sample_n_or_less)

# Mean building square footage of the sampled single-family parcels, per city
df_average_building_sqft_sampled = (
    sampled_df.groupby('CITY_NAME')['BUILDING_SQFT']
    .mean()
    .reset_index(name='Average_Building_Square_Footage')
)

# Fifteen cities with the largest sampled mean
top_cities_building_sqft_sampled = (
    df_average_building_sqft_sampled.sort_values('Average_Building_Square_Footage', ascending=False)
    .head(15)
)

# One title shared by px.bar and the layout. Previously the px.bar title named
# 'Multi-Family Residential' and the layout title said "Top 10", although this cell
# charts fifteen single-family cities.
chart_title = "Top 15 Cities by Average Building Square Footage for 'Single Family Residential' (Sampled)"

# Bar chart of the sampled per-city means
fig_building_sqft_sampled = px.bar(
    top_cities_building_sqft_sampled,
    x='CITY_NAME',
    y='Average_Building_Square_Footage',
    title=chart_title,
    template='plotly_dark'
)

# High-contrast styling: black backgrounds, white text, centred title
fig_building_sqft_sampled.update_layout({
    'plot_bgcolor': 'black',  # plot area background
    'paper_bgcolor': 'black',  # background around the plot area
    'title': {'text': chart_title, 'x': 0.5, 'xanchor': 'center'},
    'title_font': {'size': 24, 'color': 'white'},
    'font': {'color': 'white'},  # default colour for all text
    'xaxis': {'title': {'standoff': 15}, 'tickfont': {'size': 14}},
    'yaxis': {'title': {'standoff': 15}, 'tickfont': {'size': 14}},
})

# Font size for text drawn on the bar traces
fig_building_sqft_sampled.update_traces(textfont_size=16)


fig_building_sqft_sampled.show()


Key Takeaways:


### Top 15 Building Square Footage across Multi-Family Residential Properties 

In [13]:
# Restrict to parcels whose LAND_USE_CLASS_NAME is 'Multi-Family Residential'
df_multi_family = df[df['LAND_USE_CLASS_NAME'] == 'Multi-Family Residential']


# Within each city, drop parcels whose building square footage falls outside the IQR fences
df_no_outliers = df_multi_family.groupby('CITY_NAME', group_keys=False).apply(
    lambda group: remove_outliers(group, 'BUILDING_SQFT')
)

# Draw the per-city sample (sample_n_or_less is called with its default size)
sampled_df = df_no_outliers.groupby('CITY_NAME', group_keys=False).apply(sample_n_or_less)

# Mean building square footage of the sampled multi-family parcels, per city
df_average_building_sqft_sampled = (
    sampled_df.groupby('CITY_NAME')['BUILDING_SQFT']
    .mean()
    .reset_index(name='Average_Building_Square_Footage')
)

# Fifteen cities with the largest sampled mean
top_cities_building_sqft_sampled = (
    df_average_building_sqft_sampled.sort_values('Average_Building_Square_Footage', ascending=False)
    .head(15)
)

# One title shared by px.bar and the layout. Previously the layout title said
# "Top 10" although this cell charts fifteen cities.
chart_title = "Top 15 Cities by Average Building Square Footage for 'Multi-Family Residential' (Sampled)"

# Bar chart of the sampled per-city means
fig_building_sqft_sampled = px.bar(
    top_cities_building_sqft_sampled,
    x='CITY_NAME',
    y='Average_Building_Square_Footage',
    title=chart_title,
    template='plotly_dark'
)

# High-contrast styling: black backgrounds, white text, centred title
fig_building_sqft_sampled.update_layout({
    'plot_bgcolor': 'black',  # plot area background
    'paper_bgcolor': 'black',  # background around the plot area
    'title': {'text': chart_title, 'x': 0.5, 'xanchor': 'center'},
    'title_font': {'size': 24, 'color': 'white'},
    'font': {'color': 'white'},  # default colour for all text
    'xaxis': {'title': {'standoff': 15}, 'tickfont': {'size': 14}},
    'yaxis': {'title': {'standoff': 15}, 'tickfont': {'size': 14}},
})

# Font size for text drawn on the bar traces
fig_building_sqft_sampled.update_traces(textfont_size=16)


fig_building_sqft_sampled.show()


Takeaways:

In [14]:
# Parcels classified as 'Vacant'
is_vacant = df['LAND_USE_CLASS_NAME'] == 'Vacant'
df_vacant = df[is_vacant]

# Per-city IQR trimming on acreage, done before any sampling
df_vacant_no_outliers = df_vacant.groupby('CITY_NAME', group_keys=False).apply(
    lambda city_parcels: remove_outliers(city_parcels, 'ACREAGE')
)

# Total trimmed vacant acreage per city, then the ten largest totals
vacant_acreage_totals = (
    df_vacant_no_outliers.groupby('CITY_NAME')['ACREAGE']
    .sum()
    .reset_index(name='Total_Vacant_Acreage')
)
top_cities_by_vacant_acreage = vacant_acreage_totals.sort_values(
    'Total_Vacant_Acreage', ascending=False
).head(10)

# Keep only the trimmed parcels that belong to those ten cities
top_cities_list = top_cities_by_vacant_acreage['CITY_NAME'].tolist()
in_top_cities = df_vacant_no_outliers['CITY_NAME'].isin(top_cities_list)
df_vacant_top_cities_no_outliers = df_vacant_no_outliers[in_top_cities]

# Draw the per-city sample (sample_n_or_less is called with its default size)
df_sampled_vacant = df_vacant_top_cities_no_outliers.groupby(
    'CITY_NAME', group_keys=False
).apply(sample_n_or_less)

# Mean acreage of the sampled parcels, per city
df_average_sampled_vacant_acreage = (
    df_sampled_vacant.groupby('CITY_NAME')['ACREAGE']
    .mean()
    .reset_index(name='Average_Sampled_Vacant_Acreage')
)

# Order by sampled mean, largest first, keeping at most ten rows
top_cities_sampled_vacant_acreage = df_average_sampled_vacant_acreage.sort_values(
    'Average_Sampled_Vacant_Acreage', ascending=False
).head(10)

# Bar chart of the sampled mean vacant acreage per city
fig_sampled_vacant_acreage = px.bar(
    top_cities_sampled_vacant_acreage,
    x='CITY_NAME',
    y='Average_Sampled_Vacant_Acreage',
    title="Top 10 Cities by Average Sampled Vacant Acreage",
    template='plotly_dark',
)

# High-contrast styling: black backgrounds, white text, centred title
fig_sampled_vacant_acreage.update_layout(
    plot_bgcolor='black',
    paper_bgcolor='black',
    title={'text': "Top 10 Cities by Average Sampled Vacant Acreage", 'x': 0.5, 'xanchor': 'center'},
    title_font={'size': 24, 'color': 'white'},
    font={'color': 'white'},
    xaxis={'title': {'standoff': 15}, 'tickfont': {'size': 14}},
    yaxis={'title': {'standoff': 15}, 'tickfont': {'size': 14}},
)

fig_sampled_vacant_acreage.update_traces(textfont_size=16)

fig_sampled_vacant_acreage.show()

In [15]:
# Vacant parcels, leaving out rows whose CITY_NAME is 'Unincorporated'
is_vacant = df['LAND_USE_CLASS_NAME'] == 'Vacant'
is_named_city = df['CITY_NAME'] != 'Unincorporated'
df_vacant = df[is_vacant & is_named_city]

# Number of vacant parcels in each city
df_count_vacant_instances = (
    df_vacant.groupby('CITY_NAME')
    .size()
    .reset_index(name='Count_Vacant_Instances')
)

# Cities ordered from most to fewest vacant parcels
df_sorted_cities_vacant_count = df_count_vacant_instances.sort_values(
    'Count_Vacant_Instances', ascending=False
)

# Bar chart with one bar per city
fig_count_vacant_instances = px.bar(
    df_sorted_cities_vacant_count,
    x='CITY_NAME',
    y='Count_Vacant_Instances',
    title="Cities by Count of Vacant Land Instances",
    template='plotly_dark',
)

# High-contrast styling: black backgrounds, white text, centred title
fig_count_vacant_instances.update_layout(
    plot_bgcolor='black',
    paper_bgcolor='black',
    title={'text': "Cities by Count of Vacant Land Instances", 'x': 0.5, 'xanchor': 'center'},
    title_font={'size': 24, 'color': 'white'},
    font={'color': 'white'},
    xaxis={'title': {'standoff': 15}, 'tickfont': {'size': 14}},
    yaxis={'title': {'standoff': 15}, 'tickfont': {'size': 14}},
)

fig_count_vacant_instances.update_traces(textfont_size=16)

fig_count_vacant_instances.show()

## Using Polars and PySpark to compare the data
### Pandas is a bit slow with the 3 GB file

In [16]:
import polars as pl

file_path = DATA_DIR / '2019_RI_LAND_USE.csv'

# Lazily scan the CSV and aggregate acreage for every (city, land-use class) pair.
# NOTE: this rebinds `df`, which until now held the pandas DataFrame from read_csv.
acreage = pl.col('ACREAGE')
df = (
    pl.scan_csv(file_path)
    .group_by(['CITY_NAME', 'LAND_USE_CLASS_NAME'])
    .agg([
        acreage.sum().alias('Total_Acreage'),
        acreage.mean().alias('Average_Acreage'),
        acreage.median().alias('Median_Acreage'),
    ])
    .collect()
)

In [33]:
# Display the per-city, per-land-use-class acreage aggregates
df

CITY_NAME,LAND_USE_CLASS_NAME,Total_Acreage,Average_Acreage,Median_Acreage
str,str,f64,f64,f64
"""Unincorporated…","""Vacant""",2.7991e6,10.796083,2.475218
"""Unincorporated…","""Commercial and…",21668.82415,2.782692,0.344191
"""Unincorporated…","""Single Family …",359504.197869,0.80412,0.187307
"""El Centro""","""Education""",227.069224,5.538274,1.758965
"""El Centro""","""Agriculture""",975.59837,44.34538,23.602504
"""Imperial""","""Open Space and…",122.16248,6.786804,2.063615
"""Imperial""","""Mixed Resident…",12.204874,0.762805,0.391869
"""Imperial""","""Mobile Homes a…",24.059245,1.145678,0.606234
"""Los Angeles""","""Multi-Family R…",221913.749549,1.099824,0.215001
"""Agoura Hills""","""Transportation…",14.565858,1.820732,2.000472


In [34]:
import dash
from dash import html, dcc, Input, Output
import plotly.express as px
import pandas as pd

# Aggregate acreage for every (city, land-use class) pair; the Dash layout and
# callback below read this frame through the name `df`
acreage = pl.col('ACREAGE')
df = (
    pl.scan_csv(file_path)
    .group_by(['CITY_NAME', 'LAND_USE_CLASS_NAME'])
    .agg([
        acreage.sum().alias('Total_Acreage'),
        acreage.mean().alias('Average_Acreage'),
        acreage.median().alias('Median_Acreage'),
    ])
    .collect()
)

app = dash.Dash(__name__)

# City choices in alphabetical order. polars' `unique()` gives no ordering
# guarantee by default, so without the sort the dropdown order is arbitrary.
city_options = [
    {'label': city, 'value': city}
    for city in df['CITY_NAME'].unique().sort()
]

# The three aggregate columns the chart can plot
metric_options = [
    {'label': 'Total Acreage', 'value': 'Total_Acreage'},
    {'label': 'Average Acreage', 'value': 'Average_Acreage'},
    {'label': 'Median Acreage', 'value': 'Median_Acreage'},
]

# Style keys use camelCase throughout (Dash's documented form), matching the
# 'backgroundColor' / 'fontFamily' keys already used on the outer container.
dropdown_style = {'width': "100%", 'color': 'black', 'backgroundColor': 'white'}

# APP LAYOUT
app.layout = html.Div([
    html.H1("Land Use Size Metrics by City", style={'textAlign': 'center', 'color': 'white'}),

    # Both dropdowns stacked in a centred flex column
    html.Div([
        dcc.Dropdown(id='city_dropdown', options=city_options,
                     multi=False, style=dict(dropdown_style),
                     placeholder='Filter by City...'),

        dcc.Dropdown(id='metric_dropdown', options=metric_options,
                     multi=False, style=dict(dropdown_style),
                     placeholder='Select Acreage Metric...')
    ], style={'display': 'flex', 'flexDirection': 'column', 'alignItems': 'center', 'width': '60%', 'margin': '0 auto'}),

    # Status text written by the callback
    html.Div(id='output_container', style={'color': 'white'}),
    html.Br(),

    # Chart written by the callback
    dcc.Graph(id='land_use_size_chart', style={'color': 'white'})
], style={'backgroundColor': 'black', 'color': 'white', 'fontFamily': 'Arial', 'padding': '10px', 'height': '100vh'})

@app.callback(
    [
        Output(component_id='output_container', component_property='children'),
        Output(component_id='land_use_size_chart', component_property='figure'),
    ],
    [
        Input(component_id='city_dropdown', component_property='value'),
        Input(component_id='metric_dropdown', component_property='value'),
    ],
)
def update_graph(selected_city, selected_metric):
    """Redraw the land-use bar chart for the chosen city and acreage metric.

    Args:
        selected_city: Current value of the city dropdown.
        selected_metric: Current value of the metric dropdown; used as the
            column to sort by and to plot on the y axis.

    Returns:
        A (message, figure) pair. While either dropdown is unset, the message
        prompts for a selection and the figure slot is `dash.no_update`.
    """
    if not (selected_city and selected_metric):
        return "Please select a city and metric to view the chart", dash.no_update

    # Filter in Polars, hand a pandas frame to Plotly, largest metric value first
    city_rows = (
        df.filter(pl.col('CITY_NAME') == selected_city)
        .to_pandas()
        .sort_values(by=selected_metric, ascending=False)
    )

    # One bar per land-use class
    fig = px.bar(
        city_rows,
        x="LAND_USE_CLASS_NAME",
        y=selected_metric,
        title=f'{selected_city}',
        labels={'LAND_USE_CLASS_NAME': 'Land Use Class', selected_metric: 'Acreage'},
    )

    # Text styling, white backgrounds, and the 'plotly_white' base theme
    arial_title = dict(size=18, family='Arial')
    arial_tick = dict(size=14, family='Arial')
    fig.update_layout(
        title=dict(x=0.5, xanchor='center', font=dict(size=24, color='blue', family='Arial')),
        xaxis=dict(title_font=dict(arial_title), tickfont=dict(arial_tick)),
        yaxis=dict(title_font=dict(arial_title), tickfont=dict(arial_tick)),
        plot_bgcolor='white',
        paper_bgcolor='white',
        template='plotly_white',
    )

    return f"Land Use Size Metrics for {selected_city}, Metric: {selected_metric}", fig



# Start the Dash development server when this cell/script is the entry point.
# `app.run` is the current entry point; `run_server` is the legacy spelling that
# newer Dash releases no longer accept.
if __name__ == '__main__':
    app.run(debug=True)

In [18]:
# NOTE(review): importing `sum` here rebinds the name at notebook scope, so the
# builtin `sum` is shadowed by the Spark column function in every later cell.
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, avg, sum, count, mean
 


## Dash App for APN Insights

### This is first built in Python and then converted to a Dash App. Pandas is pretty slow, so we will need to use polars to speed up the process.

**Dash Components:**
- Dropdown for City
- Dropdown for Land Use Class
- Dropdown for Top 10 or Bottom 10

**Dash Graphs:**
- 

**Dash Callbacks:**
- 