In [1]:
import ast
import glob
import html
import json
import os
import traceback

import folium
import geopandas as gpd
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from folium.plugins import MarkerCluster
from tqdm import tqdm


import glob


# Set up the notebook for better visualization
# Start from matplotlib's built-in 'ggplot' style sheet.
plt.style.use('ggplot')
# NOTE(review): sns.set() applies seaborn's own rc settings, so it probably
# overrides much of the ggplot style selected on the previous line -- confirm
# which of the two looks is actually intended.
sns.set(style="whitegrid")
%matplotlib inline
# Default figure size, (width, height) in inches; cells that pass their own
# figsize override this.
plt.rcParams['figure.figsize'] = (14, 8)

# Define the base directory where the data is stored (same as in extraction script)
# This is a relative path, so it is resolved against the kernel's current
# working directory. The cells below expect one sub-directory per state
# (named "US-..") containing *.csv and *.geojson files.
base_dir = "../data_output/transformers"

In [None]:
def load_state_data(state):
    """Load the extracted CSV for a single state.

    Looks for ``*.csv`` files in ``<base_dir>/<state>/`` and reads the first
    one in sorted filename order. ``glob`` returns matches in arbitrary,
    platform-dependent order, so sorting keeps the choice reproducible when a
    directory holds more than one CSV.

    Parameters
    ----------
    state : str
        Name of the state sub-directory under ``base_dir`` (e.g. ``"US-CA"``).

    Returns
    -------
    pandas.DataFrame or None
        The file contents with an added ``state`` column, or ``None`` when the
        directory has no CSV file or the chosen file is completely empty.
    """
    state_dir = os.path.join(base_dir, str(state))
    csv_files = sorted(glob.glob(os.path.join(state_dir, "*.csv")))
    if not csv_files:
        print(f"No data found for {state}")
        return None

    csv_file = csv_files[0]
    if len(csv_files) > 1:
        # Make the silent "first file wins" rule visible to the reader.
        print(f"Multiple CSV files found for {state}; using {os.path.basename(csv_file)}")

    try:
        df = pd.read_csv(csv_file)
    except pd.errors.EmptyDataError:
        # A zero-byte file should be skipped, not abort the whole load loop.
        print(f"No data found for {state}")
        return None

    df['state'] = state
    return df

# Get list of processed states from the directory structure.
# os.listdir() returns entries in arbitrary order, so sort them to make the
# load order (and therefore the row order of combined_df) reproducible.
processed_states = sorted(
    d for d in os.listdir(base_dir)
    if d.startswith("US-") and os.path.isdir(os.path.join(base_dir, d))
)

# Load all state data into a list of per-state dataframes
all_data = []
for state in tqdm(processed_states, desc="Loading state data"):
    print(f"Loading data for {state}")
    state_data = load_state_data(state)
    if state_data is not None:
        all_data.append(state_data)

# Combine all state data.
# Every later cell needs combined_df, so stop here with a clear message rather
# than letting those cells fail with a NameError (or silently reuse a stale
# combined_df left in the kernel from an earlier run).
if not all_data:
    raise FileNotFoundError(
        "No data found. Make sure the extraction script has been run."
    )

combined_df = pd.concat(all_data, ignore_index=True)
print(f"Loaded {len(combined_df)} power infrastructure elements in total")

In [None]:
# Count elements by type and state (rows: state, columns: value of 'power')
summary_by_state = combined_df.groupby(['state', 'power']).size().unstack(fill_value=0)

# Guarantee both element types exist as columns. If the data contains only one
# of them, the 'total' column below (and the sort on it) would otherwise fail
# with a KeyError.
for power_type in ('transformer', 'substation'):
    if power_type not in summary_by_state.columns:
        summary_by_state[power_type] = 0

# Add a total column (transformers + substations only)
summary_by_state['total'] = summary_by_state['transformer'] + summary_by_state['substation']

# Sort by total count
summary_by_state = summary_by_state.sort_values('total', ascending=False)

# Display the summary
print("Summary of transformer and substation counts by state:")
display(summary_by_state)

# Calculate totals for the entire dataset
power_values = combined_df['power']
total_transformers = int((power_values == 'transformer').sum())
total_substations = int((power_values == 'substation').sum())

# Poles whose tag string mentions a transformer. astype(str) keeps the .str
# accessor usable even when the column holds no strings at all (all-NaN);
# regex=False makes this a plain substring match.
mentions_transformer = combined_df['tags'].astype(str).str.contains('transformer', regex=False)
transformer_on_poles = int(((power_values == 'pole') & mentions_transformer).sum())

print(f"\nTotal transformers: {total_transformers}")
print(f"Total substations: {total_substations}")
print(f"Transformers on poles: {transformer_on_poles}")
print(f"Total power infrastructure elements: {total_transformers + total_substations + transformer_on_poles}")

In [None]:
### Visualization:
# Stacked bar plot of transformers and substations by state

# Get the top 15 states by total count for better visualization
top_states = summary_by_state.head(15).copy()

# Create the figure/axes explicitly and hand the axes to pandas.
# DataFrame.plot() opens its own figure when no `ax` is given, which left the
# figure created by plt.figure() behind as a blank output.
fig, ax = plt.subplots(figsize=(16, 10))
top_states[['transformer', 'substation']].plot(kind='bar', stacked=True, ax=ax)
ax.set_title('Distribution of Transformers and Substations by State (Top 15)', fontsize=16)
ax.set_xlabel('State', fontsize=14)
ax.set_ylabel('Count', fontsize=14)
plt.xticks(rotation=45)

# Add counts as text on bars. The label is offset in screen points rather than
# by a fixed data value, so it sits just above the bar at any count scale.
for i, total in enumerate(top_states['total']):
    ax.annotate(
        str(total),
        xy=(i, total),
        xytext=(0, 4),
        textcoords='offset points',
        ha='center',
        va='bottom',
        fontweight='bold',
    )

plt.tight_layout()
plt.show()

# Pie chart for the overall distribution
labels = ['Transformers', 'Substations', 'Transformers on Poles']
sizes = [total_transformers, total_substations, transformer_on_poles]
colors = ['#ff9999','#66b3ff','#99ff99']
explode = (0.1, 0, 0)  # explode the 1st slice (Transformers)

# A pie chart needs a non-zero total to compute slice fractions.
if sum(sizes) > 0:
    plt.figure(figsize=(10, 10))
    plt.pie(sizes, explode=explode, labels=labels, colors=colors, autopct='%1.1f%%',
            shadow=True, startangle=90)
    plt.axis('equal')  # Equal aspect ratio ensures that pie is drawn as a circle
    plt.title('Distribution of Power Infrastructure Elements', fontsize=16)
    plt.tight_layout()
    plt.show()
else:
    print("No power infrastructure elements to plot.")

In [None]:
# Function to extract all tags from the dataset
def extract_tags(df):
    """Collect every tag value in ``df['tags']``, grouped by tag key.

    Each string cell is expected to hold the text form of a Python dict
    literal. Cells are parsed with ``ast.literal_eval``, which only accepts
    literals, so text read from the data files is never executed (the previous
    ``eval`` call would run arbitrary expressions found in the data).

    Cells that are not strings, cannot be parsed, or do not parse to a dict are
    skipped.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``tags`` column.

    Returns
    -------
    dict
        Maps each tag key to the list of values seen for it, in row order.
    """
    all_tags = {}
    # Only the 'tags' column is needed, so iterate over it directly instead of
    # materialising every row with iterrows().
    for raw_tags in tqdm(df['tags'], total=len(df), desc="Extracting tags"):
        if not isinstance(raw_tags, str):
            continue
        try:
            tags = ast.literal_eval(raw_tags)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            continue  # Skip malformed tags
        if not isinstance(tags, dict):
            continue
        for key, value in tags.items():
            all_tags.setdefault(key, []).append(value)
    return all_tags

# Gather every tag key/value found in the combined dataset
all_tags = extract_tags(combined_df)

# One row per tag key with the number of values recorded for it, most common first
tag_counts = {key: len(values) for key, values in all_tags.items()}
tag_df = (
    pd.DataFrame.from_dict(tag_counts, orient='index', columns=['count'])
    .sort_values('count', ascending=False)
)

# Table of the 20 most frequent tag keys
print("Top 20 most common tags:")
display(tag_df.head(20))

# Bar chart of the 10 most frequent tag keys
top_tags = tag_df.head(10)
plt.figure(figsize=(12, 8))
sns.barplot(x=top_tags.index, y=top_tags['count'])
plt.title('Top 10 Most Common Tags', fontsize=16)
plt.xlabel('Tag', fontsize=14)
plt.ylabel('Count', fontsize=14)
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

In [None]:
# Function to create a map for a specific state with robust error handling
def create_state_map(state_code, verbose=False):
    """Build a clustered folium map of one state's power infrastructure.

    Reads the first ``*.geojson`` file (in sorted filename order) from
    ``<base_dir>/<state_code>/`` and adds one marker per feature: red for
    ``power == 'transformer'``, blue for ``'substation'``, green otherwise.

    Features of any geometry type are supported. Each marker is placed at the
    geometry's representative point, so lines and polygons no longer make the
    whole map fail the way direct ``.x``/``.y`` access does. Rows without a
    usable geometry are dropped.

    Popup text comes from the data file and is HTML-escaped before it is
    embedded in the map. Missing values (``None``/NaN/empty string) fall back
    to ``'Unknown'``, ``'Unnamed'`` and ``'Unknown operator'``.

    Parameters
    ----------
    state_code : str
        Name of the state sub-directory under ``base_dir`` (e.g. ``"US-CA"``).
    verbose : bool, default False
        When True, print the diagnostic details this function used to print
        unconditionally (search path, matched files, GeoDataFrame summary).

    Returns
    -------
    folium.Map or None
        The map, or ``None`` when no GeoJSON file exists, the file has no
        mappable features, or building the map raised an error (the error and
        traceback are printed).
    """
    # glob order is arbitrary; sort so the same file is chosen on every run.
    geojson_files = sorted(glob.glob(os.path.join(base_dir, state_code, "*.geojson")))
    if verbose:
        print(base_dir)
        print(geojson_files)
    if not geojson_files:
        print(f"No GeoJSON data found for {state_code}")
        return None
    geojson_file = geojson_files[0]

    try:
        gdf = gpd.read_file(geojson_file)
        if verbose:
            print("DataFrame Info:")
            gdf.info()
            print("\nFirst few rows:")
            print(gdf.head())
            print("\nGeometry types:")
            print(gdf.geometry.geom_type.value_counts())
            print(f"GeoDataFrame shape: {gdf.shape}")

        # Rows without a usable geometry cannot be placed on the map.
        gdf = gdf[gdf.geometry.notna() & ~gdf.geometry.is_empty]
        if gdf.empty:
            print(f"No mappable features found for {state_code}")
            return None

        # One point per feature, valid for points, lines and polygons alike.
        points = gdf.geometry.representative_point()

        # Centre the map on the mean marker position.
        center = [points.y.mean(), points.x.mean()]
        m = folium.Map(location=center, zoom_start=7)
        marker_cluster = MarkerCluster().add_to(m)

        # The column layout is the same for every row, so inspect it once.
        has_properties_column = 'properties' in gdf.columns
        has_power_column = 'power' in gdf.columns
        if not (has_properties_column or has_power_column):
            print("Warning: Could not determine properties structure; "
                  "markers will use default labels")

        for (idx, row), point in zip(gdf.iterrows(), points):
            try:
                props = _row_properties(row, has_properties_column, has_power_column)
                raw_power = props.get('power')
                color = _MARKER_COLORS.get(raw_power, 'green')

                popup_content = f"""
                <b>Type:</b> {_popup_text(raw_power, 'Unknown')}<br>
                <b>Name:</b> {_popup_text(props.get('name'), 'Unnamed')}<br>
                <b>Operator:</b> {_popup_text(props.get('operator'), 'Unknown operator')}<br>
                """

                folium.Marker(
                    location=[point.y, point.x],
                    popup=folium.Popup(popup_content, max_width=300),
                    icon=folium.Icon(color=color, icon='bolt', prefix='fa')
                ).add_to(marker_cluster)
            except Exception as e:
                # Best effort: one bad feature should not lose the whole map.
                print(f"Error processing row {idx}: {e}")
                continue

        return m

    except Exception as e:
        print(f"Error creating map for {state_code}: {e}")
        traceback.print_exc()
        return None


# Marker colour per 'power' value; anything else is drawn green.
_MARKER_COLORS = {'transformer': 'red', 'substation': 'blue'}


def _parse_properties(raw):
    """Parse a stringified properties mapping (JSON or Python literal) into a dict.

    Never executes the text. Returns an empty dict when it cannot be parsed or
    does not describe a mapping.
    """
    for parser in (json.loads, ast.literal_eval):
        try:
            parsed = parser(raw)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            continue
        if isinstance(parsed, dict):
            return parsed
    return {}


def _row_properties(row, has_properties_column, has_power_column):
    """Return the feature properties of one GeoDataFrame row as a dict."""
    if has_properties_column:
        props = row['properties']
        if isinstance(props, str):
            props = _parse_properties(props)
        return props if isinstance(props, dict) else {}
    if has_power_column:
        # Properties were flattened into columns; pick the ones the popup shows.
        return {key: row.get(key, '') for key in ('power', 'name', 'operator')}
    return {}


def _popup_text(value, default):
    """Return HTML-escaped display text, or ``default`` for None/NaN/empty values."""
    if value is None or value is pd.NA:
        return default
    if isinstance(value, float) and value != value:  # NaN is the only float unequal to itself
        return default
    text = str(value)
    return html.escape(text) if text else default

# Map the three states at the top of the summary table (most infrastructure first)
top_3_states = summary_by_state.index[:3].tolist()

for state in top_3_states:
    print(f"\n===== Creating map for {state} =====")
    state_map = create_state_map(state)
    if not state_map:
        # Nothing to show or save for this state
        continue

    display(state_map)
    try:
        # Write the map to a standalone HTML file; a failed save is reported, not raised
        output_file = f"{state}_power_infrastructure_map.html"
        state_map.save(output_file)
        print(f"Map saved to {output_file}")
    except Exception as e:
        print(f"Error saving map: {e}")

In [None]:
# Count and analyze operators

# Operators with this many elements or fewer are left out of the bar chart.
MIN_OPERATOR_COUNT = 10
# Number of operators shown in the bar chart.
TOP_N_OPERATORS = 15

# Replace empty/missing values with "Unknown"
combined_df['operator'] = combined_df['operator'].fillna('Unknown').replace('', 'Unknown')

# Count operators
operators = combined_df['operator'].value_counts()

# Calculate percentage of unknown operators
unknown_count = operators.get('Unknown', 0)
total_count = len(combined_df)
unknown_percent = (unknown_count / total_count) * 100 if total_count > 0 else 0
print(f"Unknown operators: {unknown_count} ({unknown_percent:.2f}%)")

# Remove "Unknown" from the operators for the bar chart
known_operators = operators.drop('Unknown', errors='ignore')

# Create DataFrame for plotting
operators_df = pd.DataFrame({'count': known_operators})
operators_df = operators_df[operators_df['count'] > MIN_OPERATOR_COUNT]  # Filter out small operators

# Display top known operators
print(f"Found {len(known_operators)} unique known operators")
print("Top 20 known operators by infrastructure count:")
display(known_operators.head(20))

# Plot top known operators.
# DataFrame.plot() raises on an empty frame, so skip the chart when no
# operator passes the filter.
top_operators = operators_df.head(TOP_N_OPERATORS)
if top_operators.empty:
    print(f"No operator has more than {MIN_OPERATOR_COUNT} elements; skipping bar chart.")
else:
    # Pass the axes explicitly: without `ax`, DataFrame.plot() opens a new
    # figure and the one created here is left behind as a blank output.
    fig, ax = plt.subplots(figsize=(14, 8))
    top_operators.plot(kind='bar', color='skyblue', ax=ax)
    ax.set_title('Top 15 Known Operator Tags by Infrastructure Count', fontsize=16)
    ax.set_xlabel('Operator', fontsize=14)
    ax.set_ylabel('Count', fontsize=14)
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()
    plt.show()

# Create a pie chart showing known vs unknown operators
known_count = total_count - unknown_count
labels = ['Known Operators', 'Unknown Operators']
sizes = [known_count, unknown_count]
colors = ['#66b3ff', '#ff9999']
explode = (0, 0.1)  # explode the unknown slice

# A pie chart needs a non-zero total to compute slice fractions.
if total_count > 0:
    plt.figure(figsize=(10, 10))
    plt.pie(sizes, explode=explode, labels=labels, colors=colors,
            autopct='%1.1f%%', shadow=True, startangle=90)
    plt.axis('equal')
    plt.title('Proportion of Infrastructure with Known vs Unknown Operators', fontsize=16)
    plt.tight_layout()
    plt.show()
else:
    print("No infrastructure elements available for the operator pie chart.")

In [None]:
# Calculate infrastructure density per state
# We need state area data - let's use a simple approximation based on state boundaries
# This requires a shapefile of US states - you may need to download it

# Path to the US states shapefile (you may need to adjust it)
STATES_SHAPEFILE = 'us_states_shapefile/us_states.shp'

# Loading and analysis are handled separately so that the "download a
# shapefile" hint is only printed when reading the shapefile actually failed,
# not for unrelated errors such as a missing column.
try:
    states_gdf = gpd.read_file(STATES_SHAPEFILE)
except Exception as e:
    states_gdf = None
    print(f"Could not analyze density: {e}")
    print("To analyze density, download a US states shapefile and adjust the path above.")

if states_gdf is not None:
    try:
        # Clean state codes to match our format.
        # NOTE(review): assumes the shapefile has a 'STATE_ABBR' column holding
        # two-letter abbreviations -- confirm against the file actually used.
        states_gdf['STATE_CODE'] = 'US-' + states_gdf['STATE_ABBR']

        # Calculate area in square kilometers: reproject to the cylindrical
        # equal-area projection, then convert the resulting area by 10**6.
        states_gdf['area_km2'] = states_gdf.to_crs('+proj=cea').area / 10**6

        # Merge with our summary data (left join keeps every state we have counts for)
        density_df = pd.merge(
            summary_by_state.reset_index(),
            states_gdf[['STATE_CODE', 'area_km2']],
            left_on='state',
            right_on='STATE_CODE',
            how='left'
        )

        # Calculate density
        density_df['density_per_1000km2'] = density_df['total'] / density_df['area_km2'] * 1000
        density_ranked = density_df.sort_values('density_per_1000km2', ascending=False)

        # Display density information
        print("Infrastructure density per 1000 km²:")
        display(density_ranked[['state', 'transformer', 'substation', 'total',
                                'area_km2', 'density_per_1000km2']].head(10))

        # Plot density. Pass the axes explicitly: without `ax`, DataFrame.plot()
        # opens a new figure and leaves this one behind as a blank output.
        fig, ax = plt.subplots(figsize=(14, 8))
        density_ranked.head(15).plot(
            x='state', y='density_per_1000km2', kind='bar', ax=ax
        )
        ax.set_title('Power Infrastructure Density by State (per 1000 km²)', fontsize=16)
        ax.set_xlabel('State', fontsize=14)
        ax.set_ylabel('Density (elements per 1000 km²)', fontsize=14)
        plt.xticks(rotation=45, ha='right')
        plt.tight_layout()
        plt.show()

    except Exception as e:
        print(f"Could not analyze density: {e}")

In [None]:
# Persist the summary tables as CSV files in the working directory for later reuse
for frame, csv_name in (
    (summary_by_state, 'transformer_substation_summary.csv'),
    (tag_df, 'tag_distribution.csv'),
):
    frame.to_csv(csv_name)

print("Summary data saved to CSV files.")
print("Analysis complete!")