In [1]:
import os
import joblib
import pandas as pd

In [2]:
# Folder that holds the saved .joblib model files inspected below.
PATH = "./rfr_joblib/"
# Raw directory listing; it can include non-model entries, which the
# loading loop filters out by extension.
files = os.listdir(path=PATH)
files

['rfr_model_2025-06-16_AEZ_6_N.joblib',
 'rfr_model_2025-06-17_AEZ_10_N.joblib',
 'rfr_model_2025-06-11_AEZ_4_N.joblib',
 'rfr_model_2025-06-16_AEZ_7_N.joblib',
 '.ipynb_checkpoints',
 'rfr_model_2025-06-11_AEZ_2_N.joblib',
 'rfr_model_2025-06-11_AEZ_5_N.joblib',
 'rfr_model_2025-06-18_AEZ_18_N.joblib',
 'rfr_model_2025-06-18_AEZ_19_N.joblib',
 'rfr_model_2025-06-11_AEZ_3_N.joblib',
 'rfr_model_2025-06-17_AEZ_12_N.joblib',
 'rfr_model_2025-06-16_AEZ_9_N.joblib',
 'rfr_model_2025-06-18_AEZ_16_N.joblib',
 'rfr_model_2025-06-16_AEZ_8_N.joblib',
 'rfr_model_2025-06-18_AEZ_15_N.joblib',
 'rfr_model_2025-06-17_AEZ_13_N.joblib',
 'rfr_model_2025-06-17_AEZ_11_N.joblib',
 'rfr_model_2025-06-17_AEZ_14_N.joblib',
 'rfr_model_2025-06-18_AEZ_17_N.joblib',
 'rfr_model_2025-06-18_AEZ_20_N.joblib']

In [3]:
# Column-oriented container for the summary table: one AEZ code and one
# list of feature names per model file.
data_dict = dict(AEZ=[], features=[])

In [4]:
# Parallel accumulators: entry i of each list describes the same model file.
aez_list, features_list = [], []

In [6]:
# Rebuild both accumulators from scratch so that re-running this cell never
# appends duplicate rows on top of results left over from an earlier run.
aez_list = []
features_list = []

for file in files:
    # The directory listing also contains non-model entries
    # (e.g. '.ipynb_checkpoints'); only .joblib files are loaded.
    if not file.endswith(".joblib"):
        continue
    # File names look like 'rfr_model_<date>_AEZ_<code>_N.joblib';
    # keep the <code> part (as a string).
    aez = file.split("_AEZ_")[1].split("_")[0]
    # NOTE(review): joblib.load unpickles arbitrary objects -- only point PATH
    # at model files that come from a trusted source.
    rf = joblib.load(os.path.join(PATH, file))
    aez_list.append(aez)
    # Presumably a fitted scikit-learn estimator: feature_names_in_ is read as
    # an array of column names and stored as a plain Python list.
    features_list.append(rf.feature_names_in_.tolist())

In [7]:
# Attach the collected AEZ codes and feature lists to the table container.
data_dict.update(AEZ=aez_list, features=features_list)

In [8]:
# One row per model file: columns 'AEZ' and 'features'.
df = pd.DataFrame.from_dict(data_dict, orient="columns")

In [9]:
# Display the AEZ -> feature-names table.
df

Unnamed: 0,AEZ,features
0,6,"[temp, RI, elevation, precipitation, clay515, ..."
1,10,"[temp, elevation, precipitation, clay05, sand0..."
2,4,"[temp, elevation, precipitation, sand05, silt0..."
3,7,"[temp, NDVI, SI, HI, elevation, precipitation,..."
4,2,"[temp, elevation, precipitation, clay05, sand0..."
5,5,"[temp, elevation, precipitation, clay05, sand0..."
6,18,"[temp, BI, SI, elevation, precipitation, sand0..."
7,19,"[temp, RI, elevation, precipitation, clay05, s..."
8,3,"[temp, elevation, precipitation, clay05, silt0..."
9,12,"[temp, SI, RI, elevation, precipitation, clay0..."


In [10]:
# Save the table without the row index. The 'features' cells are Python
# lists, so they are written as their string representation.
df.to_csv(path_or_buf="aez_features_N.csv", index=False)

In [9]:
import json
import os
import pandas as pd

DATA_DIR = "./shc_data/"
SATELLITE_DIR = "SATELLITE_DATA"
YEAR = "2024"

# Layout read below: <DATA_DIR>/<SATELLITE_DIR>/<YEAR>/<state folder>/<CSV files>
PATH = os.path.join(DATA_DIR, SATELLITE_DIR, YEAR)
states = os.listdir(PATH)

state_dfs = []
for state in states:
    state_dir = os.path.join(PATH, state)
    # PATH can also contain plain files (a later cell writes the combined CSV
    # into this same directory) and hidden folders such as .ipynb_checkpoints.
    # Neither is a state folder, and listing a plain file would raise
    # NotADirectoryError on a re-run, so skip them.
    if state.startswith(".") or not os.path.isdir(state_dir):
        continue

    district_dfs = []
    for file_name in os.listdir(state_dir):
        file_path = os.path.join(state_dir, file_name)
        # Ignore hidden entries and sub-directories inside a state folder.
        if file_name.startswith(".") or not os.path.isfile(file_path):
            continue
        try:
            district_dfs.append(pd.read_csv(file_path))
        except pd.errors.EmptyDataError:
            # Files with no data contribute no rows; skip them.
            continue

    # A state folder without any readable CSV adds nothing to the result.
    if not district_dfs:
        continue
    state_dfs.append(pd.concat(district_dfs, ignore_index=True))

if not state_dfs:
    # pd.concat([]) would fail with a far less helpful message.
    raise ValueError(f"No readable CSV data found under {PATH!r}")

merged_df = pd.concat(state_dfs, ignore_index=True)

# Each '.geo' cell is parsed as a JSON geometry string and its "coordinates"
# entry is taken as [longitude, latitude].
# NOTE(review): assumes every '.geo' value is valid JSON holding a point
# geometry -- confirm against the exported files.
coords = merged_df['.geo'].map(lambda geo: json.loads(geo)["coordinates"])

# Expand the coordinate pairs into two float columns, then drop the raw text.
merged_df[['longitude', 'latitude']] = pd.DataFrame(coords.tolist(), index=merged_df.index).astype(float)
merged_df = merged_df.drop(columns=['.geo'])

print(merged_df.head())

  system:index      B      BLUE    Cu     EC     Fe     GREEN       K     Mn  \
0          0_0  0.647  0.111101  0.42  0.110  11.69  0.127523  138.21   2.48   
1          1_0  0.647  0.124067  0.18  0.071  11.96  0.140537  147.69   1.17   
2          2_0  1.250  0.104526  0.70  0.396  98.46  0.124614  314.36   1.66   
3          4_0  1.310  0.127284  3.66  0.155   7.88  0.144306  182.92   9.06   
4          5_0  0.499  0.128535  3.56  0.090   7.76  0.145185  121.95  10.84   

       N  ...  sand05  sand515  silt05  silt515     slope  start_date  \
0  203.0  ...     381      376     384      383  3.461705  2024-01-01   
1  262.0  ...     376      373     405      410  2.775001  2024-01-01   
2  290.0  ...     339      342     414      413  2.065258  2024-01-01   
3  362.0  ...     339      339     432      436  2.067726  2024-01-01   
4  234.0  ...     385      380     391      393  1.388967  2024-01-01   

        temp   village  longitude   latitude  
0  30.038812     Asura  87.852136

In [10]:
# List the column labels of the merged frame ('.geo' has been dropped and
# 'longitude' / 'latitude' appended by the merge cell).
merged_df.columns

Index(['system:index', 'B', 'BLUE', 'Cu', 'EC', 'Fe', 'GREEN', 'K', 'Mn', 'N',
       'NIR', 'OC', 'P', 'RED', 'S', 'SWIR1', 'SWIR2', 'TWI', 'Zn', 'aspect',
       'clay05', 'clay515', 'date', 'district', 'elevation', 'end_date', 'pH',
       'precipitation', 'sand05', 'sand515', 'silt05', 'silt515', 'slope',
       'start_date', 'temp', 'village', 'longitude', 'latitude'],
      dtype='object')

In [11]:
# Persist the merged frame without the row index. The file name is derived
# from YEAR so it stays in sync with the directory it is written to.
# NOTE: the target is the same per-year directory whose entries the loading
# cell iterates as state folders, so a re-run of that cell will encounter
# this file there.
merged_df.to_csv(os.path.join(DATA_DIR, SATELLITE_DIR, YEAR, f"COMBINED_{YEAR}.csv"), index=False)

In [13]:
from segregate_by_aez import segregate_by_aez

In [14]:
# Run the project helper on the merged frame. Later cells read an
# 'ae_regcode' column from the result and discard NaN codes, so presumably
# the helper assigns an AEZ code to each row where it can -- confirm in
# segregate_by_aez.
aez_df = segregate_by_aez(merged_df)

In [17]:
import numpy as np

# Distinct AEZ codes in order of first appearance, with missing (NaN)
# codes removed before de-duplicating.
unique_aezs = aez_df['ae_regcode'].dropna().unique()
unique_aezs

array([13., 15.,  9., 11.,  5.,  2.,  4., 19.,  6., 16., 17.,  7., 18.,
        3.,  8., 12., 14., 10., 20.])

In [18]:
# Write one CSV per AEZ code into <DATA_DIR>/AEZS/<YEAR>/.
aez_out_dir = os.path.join(DATA_DIR, "AEZS", YEAR)
# to_csv does not create missing folders, so make sure the target exists.
os.makedirs(aez_out_dir, exist_ok=True)

for aez in unique_aezs:
    # Rows whose code equals this AEZ; rows with a NaN code never match.
    aez_rows = aez_df[aez_df['ae_regcode'] == aez]
    print(aez, len(aez_rows))
    # Codes are floats (e.g. 13.0); int() yields the clean 'AEZ_13.csv' name.
    aez_rows.to_csv(os.path.join(aez_out_dir, f"AEZ_{int(aez)}.csv"), index=False)

13.0 223547
15.0 391792
9.0 247913
11.0 144661
5.0 162324
2.0 177533
4.0 498657
19.0 26601
6.0 147349
16.0 22575
17.0 30554
7.0 20348
18.0 60384
3.0 27082
8.0 161273
12.0 122799
14.0 55669
10.0 130745
20.0 214
