# **Necessary libraries**
### Load the required libraries (run once)

In [None]:
import os
import xarray as xr
import numpy as np
import pandas as pd
from datetime import datetime, timedelta
import matplotlib.pyplot as plt
import netCDF4 as nc

import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

from tqdm import tqdm
from IPython.display import display

from sklearn.metrics import mean_squared_error
from math import sqrt


# **Parameters Set up**

In [None]:
# ---------------------------------------------------------------------------
# Evaluation window and spatial bounding box
# ---------------------------------------------------------------------------

# Time window, written as 'YYYY-MM-DD HH:MM:SS'. load_shift_obs keeps the
# observations whose timestamp satisfies start_time <= timestamp < end_time.
start_time = '2019-07-03 00:00:00'
end_time = '2019-07-17 00:00:00'

# Bounding box in decimal degrees; both edges are inclusive when filtering.
latitude_min = 33.261
latitude_max = 34.75
longitude_min = -119.04
longitude_max = -116.28

# Hour offset. It is not read by any function visible in this section.
# NOTE(review): presumably the UTC-to-local offset for Los Angeles - confirm.
time_difference = 8

In [None]:
# --- Switches ---------------------------------------------------------------
# 1 -> keep only the stations that pass the urban land-use filter
# 0 -> keep every station inside the bounding box
Use_Urban_Land = 1

# --- Inputs -----------------------------------------------------------------
# Raw hourly AQS temperature observations (CSV)
AQS_path = '/project/zhan248_1326/hhao4018/Model Evaluation/hourly_TEMP_2019/hourly_TEMP_2019.csv'

# WRF file read by get_landuse_type for LU_INDEX / XLAT / XLONG
ref_file_path = "/project/zhan248_1326/hhao4018/Model Evaluation/wrfout_d02_2016-08-10_10_00_00"

# Folder holding the wrfout_d02* files to evaluate
# NOTE(review): this path is under /project2 while the others are under
# /project - confirm that is intentional.
wrf_output_folder = '/project2/zhan248_1326/hhao4018/WRFv4.6.1_Modified_UQ/test/WRF_PCE_UQ_07_LA/'

# --- Outputs ----------------------------------------------------------------
# Filtered observations written by load_shift_obs ...
filtered_file_path = '/project/zhan248_1326/hhao4018/Model Evaluation/AQS_obs_T2.csv'

# ... and read back by pairing_obs_model under this alias
file_path_obs = filtered_file_path

# Paired observation/model table
pairing_data_path = '/project/zhan248_1326/hhao4018/Model Evaluation/model_obs_pairs_T2.csv'

# Target for the scatter plot image
# NOTE(review): the file name says "Winter" although the configured time
# window is in July - confirm.
picture_save_path = "1 Benchmark Evaluation Result/Winter_T2_v2.png"

# **Process Start**

### **Step 0: Select urban observation stations using land-use type**

In [None]:

def get_landuse_type(target_lons, target_lats, file_path=ref_file_path):
    """
    Look up the LU_INDEX value of the grid cell nearest to each target point.

    Parameters:
        target_lons (iterable): Longitudes of the target points.
        target_lats (iterable): Latitudes of the target points, paired
                                element-wise with target_lons.
        file_path (str):        NetCDF file providing LU_INDEX, XLAT and XLONG.

    Returns:
        list: One LU_INDEX value per target point, in input order.
    """
    # The context manager closes the NetCDF handle even if reading fails;
    # the arrays are fully read into memory before the file is closed.
    with nc.Dataset(file_path) as dataset:
        landuse = dataset.variables["LU_INDEX"][:]
        lat = dataset.variables["XLAT"][:]
        lon = dataset.variables["XLONG"][:]

    landuse_types = []
    for target_lon, target_lat in zip(target_lons, target_lats):
        # "Nearest" is measured in plain degree space (no great-circle
        # correction), over the full XLAT/XLONG arrays.
        distance = np.sqrt((lat - target_lat)**2 + (lon - target_lon)**2)
        index = np.unravel_index(np.argmin(distance), distance.shape)

        # `index` has one entry per dimension of XLAT, so LU_INDEX is
        # assumed to have the same shape - TODO confirm for other files.
        landuse_types.append(landuse[index])

    return landuse_types

In [None]:
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point

def get_unique_coords(csv_file):
    """
    Return the distinct (longitude, latitude) pairs found in a CSV file.

    Parameters:
        csv_file (str): Path to a CSV file with 'Longitude' and 'Latitude' columns.

    Returns:
        list: Unique (longitude, latitude) tuples, in no particular order.
    """
    # Only the two coordinate columns are needed; skipping the others keeps
    # memory use low when the observation file is large.
    df = pd.read_csv(csv_file, usecols=['Longitude', 'Latitude'])
    coords = list(set(zip(df['Longitude'], df['Latitude'])))
    return coords

def filter_coordinates_by_landuse_and_shp(csv_file, ref_file_path, shp_file):
    """
    Keep the station coordinates that sit on selected land-use classes and
    fall inside the area described by a shapefile.

    Parameters:
        csv_file (str):      CSV file with 'Longitude' and 'Latitude' columns.
        ref_file_path (str): NetCDF file passed to get_landuse_type.
        shp_file (str):      Shapefile whose merged geometry bounds the area.

    Returns:
        list: (longitude, latitude) tuples that satisfy both conditions.
    """
    coords = get_unique_coords(csv_file)
    if not coords:
        # zip(*[]) yields nothing and cannot be unpacked into two names.
        return []
    lons, lats = zip(*coords)

    landuse_types = get_landuse_type(list(lons), list(lats), file_path=ref_file_path)

    gdf = gpd.read_file(shp_file)
    # GeoPandas 1.0 deprecates `unary_union` in favour of `union_all()`;
    # keep the old attribute as a fallback for older installations.
    if hasattr(gdf, 'union_all'):
        shape_union = gdf.union_all()
    else:
        shape_union = gdf.unary_union
    # NOTE(review): points are built from raw lon/lat, so the shapefile is
    # assumed to use geographic coordinates as well - confirm its CRS.

    # Land-use test: class 13, or any class above 50.
    # NOTE(review): presumably 13 is the urban class and >50 the LCZ urban
    # classes of the WRF land-use table - confirm.
    return [
        (lon, lat)
        for (lon, lat), lu in zip(coords, landuse_types)
        if (lu > 50 or lu == 13) and shape_union.contains(Point(lon, lat))
    ]

if __name__ == '__main__':
    # Station coordinates that pass the land-use and shapefile filters;
    # load_shift_obs consumes this list in the next step.
    filtered_coordinates = filter_coordinates_by_landuse_and_shp(
        csv_file=AQS_path,
        ref_file_path=ref_file_path,
        shp_file='southcoastAirB_ExportFeature/southcoastAirB_ExportFeature.shp',
    )
    print(filtered_coordinates)

### **Step 1: Read and filter AQS data**

In [None]:
def load_shift_obs(AQS_path, filtered_file_path, filtered_coordinates, Use_Urban_Land):
    """
    Filter AQS observations by time window, bounding box and (optionally)
    station coordinates, then write the result to a CSV file.

    The input is streamed in chunks. The time window and bounding box are
    read from the module-level names start_time, end_time, longitude_min,
    longitude_max, latitude_min and latitude_max. A 'Date Full' column
    (built from 'Date GMT' + 'Time GMT') is added to the output.

    Parameters:
        AQS_path (str):              Path to the input observation data file (CSV format).
        filtered_file_path (str):    Path to save the filtered observation data.
        filtered_coordinates (list): (longitude, latitude) pairs of the stations to keep;
                                     only used when Use_Urban_Land is not 0.
        Use_Urban_Land (int):        0 keeps every station in the bounding box; any other
                                     value additionally requires an exact match with
                                     filtered_coordinates.
    """
    chunksize = 10000

    # Count the data rows (header excluded) so the progress bar has a total.
    with open(AQS_path, 'r', encoding='utf-8') as f:
        total_lines = sum(1 for _ in f) - 1
    total_chunks = max(1, -(-total_lines // chunksize))  # ceiling division

    # Build the lookup set once: membership is O(1) per row instead of a
    # list scan inside a row-wise apply.
    apply_station_filter = Use_Urban_Land != 0
    allowed_coords = set(map(tuple, filtered_coordinates)) if apply_station_filter else set()

    filtered_chunks = []
    reader = pd.read_csv(AQS_path, chunksize=chunksize)
    for chunk in tqdm(reader, total=total_chunks, desc="Processing data chunks"):
        chunk['Date Full'] = pd.to_datetime(chunk['Date GMT'] + ' ' + chunk['Time GMT'])

        # Time window is half-open [start_time, end_time); the box is inclusive.
        keep = (
            (chunk['Date Full'] >= start_time) &
            (chunk['Date Full'] < end_time) &
            (chunk['Longitude'] >= longitude_min) &
            (chunk['Longitude'] <= longitude_max) &
            (chunk['Latitude'] >= latitude_min) &
            (chunk['Latitude'] <= latitude_max)
        )

        if apply_station_filter:
            # Exact coordinate match, as the stations come from the same file.
            station_pairs = zip(chunk['Longitude'].tolist(), chunk['Latitude'].tolist())
            in_station_list = [pair in allowed_coords for pair in station_pairs]
            keep &= pd.Series(in_station_list, index=chunk.index, dtype=bool)

        filtered_chunks.append(chunk[keep])

    filtered_data = pd.concat(filtered_chunks, ignore_index=True)

    filtered_data.to_csv(filtered_file_path, index=False)

# Example usage
if __name__ == '__main__':
    # Filter the raw observations and write them to filtered_file_path.
    load_shift_obs(
        AQS_path=AQS_path,
        filtered_file_path=filtered_file_path,
        filtered_coordinates=filtered_coordinates,
        Use_Urban_Land=Use_Urban_Land,
    )


### **Step 2: Read WRF-output data**

In [None]:
def read_wrfout(wrf_output_folder):
    """
    Read the T2 field of every 'wrfout_d02*' file in a folder, together with
    the XLAT / XLONG grid of the first file (in sorted file-name order).

    Parameters:
    wrf_output_folder (str): Path to the folder containing WRF output files.

    Returns:
    data_wrf_output (dict): Maps each file name to its T2 DataArray (loaded
        into memory). Two extra keys, 'latitudes' and 'longitudes', hold the
        XLAT and XLONG arrays; both are None when no file matched.
    """
    # Dictionary to store T2 data from each file
    data_wrf_output = {}

    # Grid coordinates are taken once, from the first file processed
    latitudes_model, longitudes_model = None, None

    # Sorted so that "first file" is deterministic across platforms
    files = sorted(f for f in os.listdir(wrf_output_folder) if f.startswith("wrfout_d02"))
    for filename in tqdm(files, desc="Processing WRF output files"):
        sub_hour_path = os.path.join(wrf_output_folder, filename)

        # Close each file as soon as its data is in memory; leaving one
        # handle open per file can exhaust the open-file limit on long runs.
        with xr.open_dataset(sub_hour_path, engine='netcdf4') as wrfout:
            # .load() materialises the values so they survive the close
            data_wrf_output[filename] = wrfout['T2'].load()

            if latitudes_model is None or longitudes_model is None:
                latitudes_model = wrfout['XLAT'].values
                longitudes_model = wrfout['XLONG'].values

    data_wrf_output['latitudes'] = latitudes_model
    data_wrf_output['longitudes'] = longitudes_model

    return data_wrf_output

# Example usage
if __name__ == '__main__':
    # Load T2 plus the model grid from every wrfout_d02 file in the folder.
    data_wrf_output = read_wrfout(wrf_output_folder=wrf_output_folder)


### **Step 3: Match Obs data and model data**

In [None]:
def pairing_obs_model(file_path_obs, data_wrf_output, pairing_data_path):
    """
    Pair each observation with the model T2 value of the nearest grid cell
    at the same timestamp and save the pairs as a CSV file.

    Parameters:
    file_path_obs (str): Path to the CSV file containing observation data. It must
        provide 'Date Full', 'Time GMT', 'Site Num', 'State Name',
        'Sample Measurement', 'Latitude' and 'Longitude' columns.
    data_wrf_output (dict): Model data keyed by file name
        ('wrfout_d02_YYYY-MM-DD_HH:00:00'), plus 'latitudes' and 'longitudes' grids.
    pairing_data_path (str): Path to save the paired data output.

    Raises:
    KeyError: If no model entry exists for an observation's date and hour.

    Notes:
    No time shift is applied: the 'LA Datetime' and 'LA_Hour' output columns
    hold the 'Date Full' and 'Time GMT' values unchanged.
    """
    # Load observation data
    df_obs = pd.read_csv(file_path_obs)
    timestamps = pd.to_datetime(df_obs['Date Full'])

    grid_lat = data_wrf_output['latitudes']
    grid_lon = data_wrf_output['longitudes']

    # Stations repeat for every hour, so the nearest-cell search is cached
    # per (lat, lon) instead of being redone for every row.
    nearest_cell = {}
    results = []

    # NOTE(review): 'State Name' is stored under the 'StationName' output
    # column - confirm this is the intended source column.
    rows = zip(
        df_obs['Site Num'],
        df_obs['State Name'],
        timestamps,
        df_obs['Time GMT'],
        df_obs['Sample Measurement'],
        df_obs['Latitude'],
        df_obs['Longitude'],
    )
    for station_id, station_name, stamp, hour_label, obs_value, target_lat, target_lon in tqdm(
            rows, total=len(df_obs), desc="Pairing observations with model data"):
        location = (target_lat, target_lon)
        if location not in nearest_cell:
            # Nearest grid point in plain degree space
            distance = np.sqrt((grid_lat - target_lat)**2 + (grid_lon - target_lon)**2)
            nearest_cell[location] = np.unravel_index(np.argmin(distance, axis=None), distance.shape)
        min_index = nearest_cell[location]

        # Build the key from the full timestamp so data from any year pairs
        # correctly (the year used to be hard-coded as 2019).
        date_str = stamp.strftime("wrfout_d02_%Y-%m-%d_%H:00:00")

        # Model temperature at the nearest grid point, Kelvin -> Celsius
        t2 = data_wrf_output[date_str].isel(south_north=min_index[1], west_east=min_index[2]).values[0] - 273.15

        # (x - 32) / 1.8 is the Fahrenheit -> Celsius conversion.
        # NOTE(review): assumes 'Sample Measurement' is in degrees F - confirm.
        results.append([station_id, station_name, stamp, hour_label, (obs_value - 32) / 1.8, t2,
                        min_index[1], min_index[2], target_lat, target_lon])

    # Create DataFrame and save as CSV
    df = pd.DataFrame(results, columns=['StationID','StationName', 'LA Datetime', 'LA_Hour', 'T2_Obs', 'T2_model', 'sn', 'we','latitude','longitude'])
    df.to_csv(pairing_data_path, index=False)

# Example usage
if __name__ == '__main__':
    # Match every filtered observation to the model and save the pairs.
    pairing_obs_model(
        file_path_obs=file_path_obs,
        data_wrf_output=data_wrf_output,
        pairing_data_path=pairing_data_path,
    )


# **Model Evaluation**

### **Step 1: common evaluation metrics**

In [None]:
# Define a function to calculate error metrics
def calculate_metrics(true, pred):
    """
    Calculates common error metrics to evaluate model performance.

    Parameters:
    true (array-like): Observed values (list, tuple, Series or ndarray).
    pred (array-like): Model-predicted values, same length as `true`.

    Returns:
    list: [MB, MAE, RMSE, NMB, NME, R_squared], where NMB and NME are in
        percent and R_squared is the squared Pearson correlation coefficient.
    """
    # Coerce to float arrays so plain Python sequences work too;
    # list - list would otherwise raise TypeError.
    true = np.asarray(true, dtype=float)
    pred = np.asarray(pred, dtype=float)

    error = pred - true
    abs_error = np.abs(error)
    # Both normalised metrics divide by the plain sum of the observations
    obs_total = np.sum(true)

    # Mean Bias (MB)
    MB = np.mean(error)
    # Mean Absolute Error (MAE)
    MAE = np.mean(abs_error)
    # Root Mean Square Error (RMSE)
    RMSE = np.sqrt(np.mean(error ** 2))
    # Normalized Mean Bias (NMB), percent
    NMB = (np.sum(error) / obs_total) * 100
    # Normalized Mean Error (NME), percent
    NME = (np.sum(abs_error) / obs_total) * 100
    # Squared Pearson correlation between model and observations
    R_squared = np.corrcoef(pred, true)[0, 1] ** 2

    return [MB, MAE, RMSE, NMB, NME, R_squared]

# Define a function to display error metrics
def display_metrics(pairing_data_path, model_name):
    """
    Read the paired observation/model table, compute the error metrics and
    show them as a one-row DataFrame.

    Parameters:
    pairing_data_path (str): Path to the CSV file containing paired observed and model data.
    model_name (str): Label used as the row index of the displayed table.
    """
    value_columns = ['T2_Obs', 'T2_model']

    # Drop missing pairs, coerce both columns to numbers, then drop whatever
    # the coercion turned into NaN.
    pairs = pd.read_csv(pairing_data_path).dropna(subset=value_columns)
    pairs = pairs.assign(
        **{name: pd.to_numeric(pairs[name], errors='coerce') for name in value_columns}
    )
    cleaned_data = pairs.dropna(subset=value_columns)

    scores = calculate_metrics(cleaned_data['T2_Obs'].values, cleaned_data['T2_model'].values)

    # Column labels, in the order calculate_metrics returns its values
    labels = ['MB', 'MAE', 'RMSE', 'NMB (%)', 'NME (%)', 'R²']
    results_df = pd.DataFrame([scores], columns=labels, index=[model_name])

    display(results_df)

# Example usage
if __name__ == '__main__':
    # Show the metric table for the paired T2 data.
    display_metrics(pairing_data_path=pairing_data_path, model_name='T2')

### **Step 2: Plot scatter between obs and model data**

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from scipy.stats import gaussian_kde
import matplotlib as mpl
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

def plot_scatter(pairing_data_path, picture_save_path):
    """
    Draw a density-coloured scatter plot of model versus observed T2 with a
    1:1 reference line and an annotation box of summary statistics.

    Parameters:
    pairing_data_path (str): Path to the CSV file with 'T2_Obs' and 'T2_model' columns.
    picture_save_path (str): Intended image path. Currently unused: saving is
        disabled and the figure is only shown.

    Side effects:
    Sets matplotlib's global font size and font family via rcParams.
    """
    data = pd.read_csv(pairing_data_path)
    data = data.dropna(subset=['T2_Obs', 'T2_model'])

    # Observed and modelled values
    observed = data['T2_Obs'].values
    model = data['T2_model'].values

    # Summary statistics
    mb = np.mean(model - observed)                            # Mean Bias
    mae = mean_absolute_error(observed, model)                # MAE
    rmse = np.sqrt(mean_squared_error(observed, model))       # RMSE
    # NOTE(review): r2_score is the coefficient of determination, whereas
    # calculate_metrics reports the squared Pearson correlation; the two
    # "R²" values can differ - confirm which one is intended.
    r2 = r2_score(observed, model)

    # Point density, used as colour; sorting draws the densest points last
    xy = np.vstack([observed, model])
    z = gaussian_kde(xy)(xy)
    idx = z.argsort()
    observed, model, z = observed[idx], model[idx], z[idx]

    # Shared limits covering BOTH series, so model values outside the
    # observed range are not clipped and the axes stay square.
    lower = min(observed.min(), model.min())
    upper = max(observed.max(), model.max())

    # Figure styling
    mpl.rcParams['font.size'] = 24
    mpl.rcParams['font.family'] = 'Times New Roman'

    fig, ax = plt.subplots(figsize=(10, 10))

    ax.scatter(observed, model, c=z, s=20, cmap='viridis', edgecolor=None)

    # y = x reference line spanning the plotted range
    ax.plot([lower, upper], [lower, upper], 'r--', label='Simulation Temperature = Observation Temperature', linewidth =3)

    # Annotate the statistics
    stats_text = (
        f"MB   = {mb:.2f} K\n"
        f"MAE  = {mae:.2f} K\n"
        f"RMSE = {rmse:.2f} K\n"
        f"$R^2$   = {r2:.2f}"
    )
    ax.text(0.05, 0.95, stats_text,
            transform=ax.transAxes,
            fontsize=20,
            va='top',
            bbox=dict(boxstyle='round', facecolor='white', alpha=0.7))

    # Axis details
    ax.set_xlabel("Observation Temperature (°C)")
    ax.set_ylabel("Simulation Temperature (°C)")
    ax.set_xlim(lower, upper)
    ax.set_ylim(lower, upper)
    ax.set_title("")
    ax.grid(True, alpha=0.3)

    # Saving is intentionally left disabled; to enable it, call
    # plt.savefig(picture_save_path, format='png', dpi=800) before plt.show().
    plt.show()

# Example usage
if __name__ == '__main__':
    # Guarded like the other entry points so importing this file does not plot.
    plot_scatter(pairing_data_path, picture_save_path)