<a href="https://colab.research.google.com/github/ElaheTorabi/Masters-Thesis/blob/main/Spatial_Autocorrelation_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Spatial Autocorrelation Analysis of Fuel Consumption Data

Install & import required libraries

In [None]:
!pip install geopandas folium openpyxl
!pip install libpysal esda
import folium
import pandas as pd


Upload the Excel files to Colab

In [None]:
from google.colab import files

# Ask the user for the input files via Colab's upload helper. The result is
# iterated by key further down, where each key is passed to `pd.read_excel`
# as a filename.
uploaded = files.upload()


Read and combine all Excel files

In [None]:
# Columns that describe a site; within one file they are expected to repeat
# on every reading of that site.
SITE_COLUMNS = ['TAG_NAME', 'DEVICE_ID', 'DUTYSTATION_NAME', 'OGI_LAT', 'OGI_LONG']

if not uploaded:
    # Without this guard pd.concat([]) fails with the less helpful
    # "No objects to concatenate".
    raise ValueError("No Excel files were uploaded; run the upload cell first.")

dfs = []

for filename in uploaded.keys():
    df = pd.read_excel(filename)

    # Collapse the readings of this file to its distinct site descriptions
    site_info = df[SITE_COLUMNS].drop_duplicates()

    dfs.append(site_info)

# Combine all files into one table with exactly one row per DEVICE_ID:
# rows without an ID are discarded and, for repeated IDs, the first
# occurrence is kept.
sites_df = (
    pd.concat(dfs, ignore_index=True)
      .dropna(subset=['DEVICE_ID'])
      .drop_duplicates(subset='DEVICE_ID')
)

sites_df

Create an interactive map

In [None]:
# Centre the map on the average site position (mean latitude, mean longitude).
center_lat = sites_df['OGI_LAT'].mean()
center_lon = sites_df['OGI_LONG'].mean()
map_center = [center_lat, center_lon]

# Base map object; the marker cell below adds one marker per site to it.
m = folium.Map(zoom_start=7, location=map_center)

Add site markers with ID labels

In [None]:
# Only DEVICE_ID is null-filtered when sites_df is built, so a site can still
# lack coordinates. folium refuses NaN locations, therefore such sites are
# left off the map instead of aborting the whole cell.
mappable_sites = sites_df.dropna(subset=['OGI_LAT', 'OGI_LONG'])

# NOTE: markers are appended to the existing map `m`; re-running this cell
# without re-creating `m` adds every marker a second time.
for _, row in mappable_sites.iterrows():
    popup_text = f"""
    <b>Duty Station:</b> {row['DUTYSTATION_NAME']}<br>
    <b>Device ID:</b> {row['DEVICE_ID']}<br>
    <b>Tag Name:</b> {row['TAG_NAME']}
    """

    folium.Marker(
        location=[row['OGI_LAT'], row['OGI_LONG']],
        popup=popup_text,
        icon=folium.Icon(color='blue', icon='info-sign')
    ).add_to(m)

# Last expression of the cell: render the map
m


Computing fuel consumption

In [None]:
# Tunable parameters of the cleaning pipeline
SMOOTHING_WINDOW = 3   # number of readings in the centred moving average
IQR_FENCE = 1.5        # multiple of the IQR beyond which a value is an outlier

dfs = []

for filename in uploaded.keys():
    df = pd.read_excel(filename)

    # --- Time handling ---
    df['Date_Hour_Desc'] = pd.to_datetime(df['Date_Hour_Desc'])
    df = df.sort_values('Date_Hour_Desc')

    # --- Observed consumption ---
    # Consumption is the drop in tank level between consecutive readings.
    # NOTE(review): the diff runs over the whole file, so this presumably
    # expects one site per file - confirm; mixed files would difference
    # across devices.
    level_drop = -df['LevelLiters'].diff()

    # A negative value means the level rose (presumably a refuel); it is not
    # consumption, so blank it out. `mask` writes NaN and keeps the float
    # dtype, rather than relying on pd.NA being coerced inside a float column.
    level_drop = level_drop.mask(level_drop < 0)

    # Centred moving average; min_periods=1 keeps the edges and the
    # neighbours of blanked readings defined.
    smoothed = (
        level_drop
        .rolling(window=SMOOTHING_WINDOW, center=True, min_periods=1)
        .mean()
    )

    # --- Outlier cleaning (Observed) ---
    q1, q3 = smoothed.quantile([0.25, 0.75])
    iqr = q3 - q1

    is_outlier = (
        (smoothed < q1 - IQR_FENCE * iqr) |
        (smoothed > q3 + IQR_FENCE * iqr)
    )

    # Remove the outliers, then fill every gap (outliers and blanked
    # readings alike) by linear interpolation in both directions.
    df['Consumption_observed'] = smoothed.mask(is_outlier).interpolate(
        method="linear", limit_direction="both"
    )

    dfs.append(df)

all_df = pd.concat(dfs, ignore_index=True)

Aggregate observed consumption per site per year

In [None]:
# Parse the timestamps once and derive the calendar year used for grouping.
timestamps = pd.to_datetime(all_df['Date_Hour_Desc'])
all_df['Date_Hour_Desc'] = timestamps
all_df['year'] = timestamps.dt.year

In [None]:
# Named aggregations: output column -> (source column, reducer).
# The site attributes take the first value found within each group.
site_year_aggs = {
    'Consumption_observed_mean': ('Consumption_observed', 'mean'),
    'OGI_LAT': ('OGI_LAT', 'first'),
    'OGI_LONG': ('OGI_LONG', 'first'),
    'DUTYSTATION_NAME': ('DUTYSTATION_NAME', 'first'),
}

# Rows without an observed consumption value do not take part in the mean.
observed_rows = all_df.dropna(subset=['Consumption_observed'])

# One row per (site, year) pair.
per_site_year = observed_rows.groupby(['DEVICE_ID', 'year']).agg(**site_year_aggs)
agg_obs = per_site_year.reset_index()

agg_obs.head()

Computing the spatial autocorrelation of observed consumption for each year, using Moran’s I with k-nearest-neighbor spatial weights

In [None]:
import geopandas as gpd
import numpy as np
from libpysal.weights import KNN
from esda.moran import Moran

# Moran's pseudo p-value (p_sim) comes from random permutations; fixing the
# seed makes it repeatable between runs.
# NOTE(review): this relies on esda drawing its permutations from NumPy's
# global RNG - confirm for the installed esda version.
np.random.seed(42)

results = []

for year in agg_obs['year'].unique():
    df_year = agg_obs[agg_obs['year'] == year]

    # Skip years with too few sites
    if len(df_year) < 4:
        continue

    # Build point geometries from lon/lat (WGS84) and reproject to
    # EPSG:32634 before computing neighbours.
    # NOTE(review): presumably chosen so distances are metric in the study
    # area - confirm the zone fits the sites.
    gdf_knn = gpd.GeoDataFrame(
        df_year,
        geometry=gpd.points_from_xy(df_year.OGI_LONG, df_year.OGI_LAT),
        crs="EPSG:4326"
    ).to_crs(epsg=32634)

    # Each site is linked to its 2 nearest sites; weights are row-standardised
    w = KNN.from_dataframe(gdf_knn, k=2)
    w.transform = 'r'

    y = gdf_knn['Consumption_observed_mean'].values
    moran = Moran(y, w)

    results.append({
        'year': year,
        'Moran_I': moran.I,
        'p_value': moran.p_sim
    })

# Explicit columns keep the schema (in particular `year`) even when every
# year was skipped, so the later merge on `year` still works.
moran_yearly_knn = pd.DataFrame(results, columns=['year', 'Moran_I', 'p_value'])
moran_yearly_knn


Aggregate temperature-corrected consumption per site per year

In [None]:
# Ensure datetime
all_df['Date_Hour_Desc'] = pd.to_datetime(all_df['Date_Hour_Desc'])
all_df['year'] = all_df['Date_Hour_Desc'].dt.year

# No cell visible in this notebook creates `Consumption_corrected`; it must
# already be a column of the uploaded data or be added before this point.
# Fail with an actionable message instead of a bare KeyError from dropna.
if 'Consumption_corrected' not in all_df.columns:
    raise KeyError(
        "Column 'Consumption_corrected' is missing from all_df; compute or "
        "load the temperature-corrected consumption before running this cell."
    )

# One row per (site, year): mean corrected consumption plus the site
# attributes (first value found within each group).
agg_corr = (
    all_df
    .dropna(subset=['Consumption_corrected'])
    .groupby(['DEVICE_ID', 'year'])
    .agg(
        Consumption_corrected_mean=('Consumption_corrected', 'mean'),
        OGI_LAT=('OGI_LAT', 'first'),
        OGI_LONG=('OGI_LONG', 'first'),
        DUTYSTATION_NAME=('DUTYSTATION_NAME', 'first')
    )
    .reset_index()
)

agg_corr.head()

Computing the spatial autocorrelation of temperature-corrected consumption for each year, using Moran’s I with k-nearest-neighbor spatial weights

In [None]:
# KNN is what this cell actually uses; importing it here removes the hidden
# dependency on the earlier Moran cell having been run first.
from libpysal.weights import DistanceBand, KNN
from esda.moran import Moran
import geopandas as gpd
import numpy as np

# Moran's pseudo p-value (p_sim) comes from random permutations; fixing the
# seed makes it repeatable between runs.
# NOTE(review): this relies on esda drawing its permutations from NumPy's
# global RNG - confirm for the installed esda version.
np.random.seed(42)

results_corr = []

for year in agg_corr['year'].unique():
    df_year = agg_corr[agg_corr['year'] == year]

    # Skip years with too few sites
    if len(df_year) < 4:
        continue

    # GeoDataFrame: points from lon/lat (WGS84), reprojected to EPSG:32634
    gdf_corr_knn = gpd.GeoDataFrame(
        df_year,
        geometry=gpd.points_from_xy(df_year.OGI_LONG, df_year.OGI_LAT),
        crs="EPSG:4326"
    ).to_crs(epsg=32634)

    # Spatial weights (same as observed!): 2 nearest sites, row-standardised
    w = KNN.from_dataframe(gdf_corr_knn, k=2)
    w.transform = 'r'

    # Moran's I
    y = gdf_corr_knn['Consumption_corrected_mean'].values
    moran = Moran(y, w)

    results_corr.append({
        'year': year,
        'Moran_I_corrected': moran.I,
        'p_value_corrected': moran.p_sim
    })

# Explicit columns keep the schema (in particular `year`) even when every
# year was skipped, so the later merge on `year` still works.
moran_corrected_corr_knn = pd.DataFrame(
    results_corr,
    columns=['year', 'Moran_I_corrected', 'p_value_corrected']
)
moran_corrected_corr_knn


Comparing the results

In [None]:
# Put observed and corrected Moran statistics side by side, keeping only the
# years for which both were computed (inner join on `year`).
moran_compare = pd.merge(
    moran_yearly_knn,
    moran_corrected_corr_knn,
    how='inner',
    on='year',
)

moran_compare