In [1]:
import os
import joblib
import pandas as pd

In [2]:
PATH = "./rfr_joblib/"
files = os.listdir(PATH)
files

['rfr_model_2025-06-16_AEZ_6_N.joblib',
 'rfr_model_2025-06-17_AEZ_10_N.joblib',
 'rfr_model_2025-06-11_AEZ_4_N.joblib',
 'rfr_model_2025-06-16_AEZ_7_N.joblib',
 '.ipynb_checkpoints',
 'rfr_model_2025-06-11_AEZ_2_N.joblib',
 'rfr_model_2025-06-11_AEZ_5_N.joblib',
 'rfr_model_2025-06-18_AEZ_18_N.joblib',
 'rfr_model_2025-06-18_AEZ_19_N.joblib',
 'rfr_model_2025-06-11_AEZ_3_N.joblib',
 'rfr_model_2025-06-17_AEZ_12_N.joblib',
 'rfr_model_2025-06-16_AEZ_9_N.joblib',
 'rfr_model_2025-06-18_AEZ_16_N.joblib',
 'rfr_model_2025-06-16_AEZ_8_N.joblib',
 'rfr_model_2025-06-18_AEZ_15_N.joblib',
 'rfr_model_2025-06-17_AEZ_13_N.joblib',
 'rfr_model_2025-06-17_AEZ_11_N.joblib',
 'rfr_model_2025-06-17_AEZ_14_N.joblib',
 'rfr_model_2025-06-18_AEZ_17_N.joblib',
 'rfr_model_2025-06-18_AEZ_20_N.joblib']

In [3]:
data_dict = {"AEZ" : [], "features": []}

In [4]:
aez_list = []
features_list = []

In [6]:
for file in files:
    if file.endswith(".joblib"):
        aez = file.split("_AEZ_")[1].split("_")[0]
        rf = joblib.load(os.path.join(PATH, file))
        aez_list.append(aez)
        features_list.append(rf.feature_names_in_.tolist())

In [7]:
data_dict["AEZ"] = aez_list
data_dict["features"] = features_list

In [8]:
df = pd.DataFrame(data_dict)

In [9]:
df

Unnamed: 0,AEZ,features
0,6,"[temp, RI, elevation, precipitation, clay515, ..."
1,10,"[temp, elevation, precipitation, clay05, sand0..."
2,4,"[temp, elevation, precipitation, sand05, silt0..."
3,7,"[temp, NDVI, SI, HI, elevation, precipitation,..."
4,2,"[temp, elevation, precipitation, clay05, sand0..."
5,5,"[temp, elevation, precipitation, clay05, sand0..."
6,18,"[temp, BI, SI, elevation, precipitation, sand0..."
7,19,"[temp, RI, elevation, precipitation, clay05, s..."
8,3,"[temp, elevation, precipitation, clay05, silt0..."
9,12,"[temp, SI, RI, elevation, precipitation, clay0..."


In [10]:
df.to_csv("aez_features_N.csv", index=False)

In [12]:
import os
import pandas as pd
num = 16

PATH = f"./new_data/processed/AEZ{num}"
files = os.listdir(PATH)
merged_df = pd.concat([pd.read_csv(os.path.join(PATH, file)) for file in files], ignore_index=True)

# Apply your split logic and extract lat/lon
coords = merged_df['.geo'].apply(lambda x: x.split('"coordinates":')[1].split('}')[0].strip('[').strip(']').split(','))

# Convert to DataFrame and assign to new columns
merged_df[['longitude', 'latitude']] = pd.DataFrame(coords.tolist(), index=merged_df.index).astype(float)
merged_df = merged_df.drop(['.geo'], axis=1)

print(merged_df.head())

  system:index     B      BLUE    Cu    EC     Fe     GREEN       K    Mn  \
0          0_0  0.23  0.173895  0.16  0.11  10.54  0.187054  125.00  0.58   
1          1_0  0.74  0.206968  0.29  0.19  33.82  0.202451  342.26  0.83   
2          2_0  0.64  0.175892  0.42  0.41  24.26  0.176174  356.25  0.35   
3          3_0  0.75  0.191287  0.18  0.20  25.58  0.191329  154.75  0.65   
4          4_0  0.98  0.197591  0.28  0.29   0.21  0.203190  269.23  0.65   

         N  ...  precipitation  sand05  sand515  silt05  silt515      slope  \
0  401.680  ...       5.964205     379      380     337      337  14.416960   
1  272.912  ...       5.609920     382      377     356      362   4.544931   
2  339.296  ...       5.906609     395      386     378      379  22.865652   
3  287.664  ...       5.742670     402      398     345      343  24.227806   
4  228.656  ...       5.609920     386      384     337      342  22.350290   

        temp              village  longitude   latitude  
0  2

In [13]:
merged_df.columns

Index(['system:index', 'B', 'BLUE', 'Cu', 'EC', 'Fe', 'GREEN', 'K', 'Mn', 'N',
       'NIR', 'OC', 'P', 'RED', 'S', 'SWIR1', 'SWIR2', 'TWI', 'Zn', 'aspect',
       'clay05', 'clay515', 'district', 'elevation', 'pH', 'precipitation',
       'sand05', 'sand515', 'silt05', 'silt515', 'slope', 'temp', 'village',
       'longitude', 'latitude'],
      dtype='object')

In [14]:
merged_df.to_csv(f"./new_data/processed/AEZ_{num}.csv", index=False)

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import seaborn as sns
from scipy.stats import median_abs_deviation

In [2]:
# Function to calculate Modified Z-Score
def modified_z_score(series):
    median = np.median(series)
    mad = median_abs_deviation(series)
    epsilon = 1e-9 if mad == 0 else 0  # Adjust small value if necessary
    mad += epsilon
    # if mad == 0:  # Use standard deviation if MAD is zero
        # std_dev = np.std(series, ddof=1)
        # return (series - median) / std_dev if std_dev > 0 else series * 0
    # print(median, mad)
    return 0.6745 * (series - median) / mad

def apply_z_filter(df):
    df = df[df['OC'] >= 0.0]
    df = df[df['OC'] <= 1.0]
    df = df[df['pH'] >= 1.0]
    df = df[df['pH'] <= 14.0]
    
    threshold = 3.5
    props = ['B', 'Mn', 'K', 'P', 'Cu', 'S', 'OC', 'Zn', 'EC', 'Fe']
    for prop in props:
        df = df[df[prop] >= 0]
        # print(np.abs(modified_z_score(df[prop])))
        df = df[np.abs(modified_z_score(df[prop])) <= threshold]
        # print(f"After normalizing {prop} : size of df {len(df)}")
        # plot_props(df)

    return df

In [3]:
PATH = f"./new_data/processed"
files = os.listdir(PATH)
dfs_list = []
for file in files:
    if file.endswith(".csv"):
        aez = file.split("AEZ_")[1].split(".csv")[0]
        df = pd.read_csv(os.path.join(PATH, file))
        df['ae_regcode'] = aez
        print(aez)
        dfs_list.append(df)

merged_df = pd.concat(dfs_list, ignore_index=True)
merged_df

18
11
14
10
8
17
19
16
12
5
3
6
20
4
15
9
13
2
7


Unnamed: 0,system:index,B,BLUE,Cu,EC,Fe,GREEN,K,Mn,N,...,sand05,sand515,silt05,silt515,slope,temp,village,longitude,latitude,ae_regcode
0,0_0,1.416667,0.090758,6.816549,0.90,90.363400,0.118589,137.0,15.649901,128.10,...,369,363,315,316,2.948019,30.495243,Badabaranga,84.592329,19.154193,18
1,1_0,0.841667,0.090758,6.468004,0.70,93.809048,0.118589,12.0,17.872237,192.15,...,369,363,315,316,2.948019,30.495243,Badabaranga,84.592329,19.154193,18
2,2_0,1.008333,0.088346,6.583759,0.80,62.696599,0.116399,40.0,15.556399,106.75,...,369,363,315,316,2.170676,30.495243,Badabaranga,84.592329,19.154462,18
3,3_0,1.175000,0.088225,6.503522,1.40,82.161155,0.118448,82.0,14.712748,170.80,...,369,363,315,316,0.927410,30.495243,Badabaranga,84.592329,19.154732,18
4,4_0,1.358333,0.098169,7.498398,0.60,52.097904,0.127258,61.0,18.314672,170.80,...,369,363,315,316,2.948021,30.495243,Badabaranga,84.592329,19.155001,18
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3284625,3995_0,2.780000,0.148664,1.048000,0.36,21.760000,0.179964,597.0,7.752000,141.00,...,415,409,296,294,3.050446,30.356029,Khandrika Seetharamavaram,81.132019,16.964280,7
3284626,3996_0,3.860000,0.148664,0.550000,0.21,23.920000,0.179964,491.0,9.292000,125.00,...,415,409,296,294,3.050446,30.356029,Khandrika Seetharamavaram,81.132019,16.964280,7
3284627,3997_0,3.860000,0.076385,0.550000,0.21,23.920000,0.092395,491.0,9.292500,125.00,...,421,408,308,303,1.940001,30.353268,Khandrika Seetharamavaram,81.081084,17.096602,7
3284628,3998_0,1.000000,0.117673,11.680000,0.11,6.990000,0.139467,145.0,1.164000,119.00,...,423,411,291,291,2.148309,30.305014,Edavalli,81.205860,16.918196,7


In [4]:
merged_df.columns

Index(['system:index', 'B', 'BLUE', 'Cu', 'EC', 'Fe', 'GREEN', 'K', 'Mn', 'N',
       'NIR', 'OC', 'P', 'RED', 'S', 'SWIR1', 'SWIR2', 'TWI', 'Zn', 'aspect',
       'clay05', 'clay515', 'district', 'elevation', 'pH', 'precipitation',
       'sand05', 'sand515', 'silt05', 'silt515', 'slope', 'temp', 'village',
       'longitude', 'latitude', 'ae_regcode'],
      dtype='object')

In [5]:
filtered_df = merged_df.groupby("district", group_keys=False).apply(apply_z_filter)

  filtered_df = merged_df.groupby("district", group_keys=False).apply(apply_z_filter)


In [6]:
unique_aezs = filtered_df['ae_regcode'].unique()

In [9]:
print(f"{unique_aezs}")
for aez in unique_aezs:
    df = filtered_df[filtered_df['ae_regcode'] == aez]
    if aez == '16':
        print(aez)
        df.to_csv(f"./new_data/filtered/AEZ_{aez}.csv", index=False)

['15' '13' '10' '5' '4' '9' '2' '6' '17' '16' '18' '12' '7' '14' '3' '8'
 '11' '19' '20']
16


In [3]:
import os
import glob
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point

# Load AEZ GeoJSON file
aez_gdf = gpd.read_file("../WRIS/Agro Ecological Regions.geojson")

# Ensure AEZ data is in the correct coordinate reference system (CRS)
aez_gdf = aez_gdf.to_crs(epsg=4326)  # Convert to WGS84 if not already

def segregate_by_aez(df):
    # Convert to GeoDataFrame
    gdf = gpd.GeoDataFrame(
        df, 
        geometry=[Point(xy) for xy in zip(df["longitude"], df["latitude"])],
        crs="EPSG:4326"
    )
    
    # Spatial join: Find which AEZ each point belongs to
    joined_gdf = gpd.sjoin(gdf, aez_gdf, how="left", predicate="within")
    
    # Keep only relevant columns
    result_df = joined_gdf.drop(columns=["geometry", "index_right", 'GmlID', 'objectid', 'physio_reg', 'area_sqkm', 'st_area_shape_', 'st_length_shape_'])
    
    return result_df

In [4]:
PATH = f"./new_data/processed"
files = os.listdir(PATH)
dfs_list = []
for file in files:
    if file.endswith(".csv"):
        aez = file.split("AEZ_")[1].split(".csv")[0]
        df = pd.read_csv(os.path.join(PATH, file))
        # df['ae_regcode'] = aez
        print(aez, len(df))
        dfs_list.append(df)

merged_df = pd.concat(dfs_list, ignore_index=True)
merged_df

18 121191
11 147617
14 71671
10 139002
8 268755
17 25810
19 37551
16 30785
12 161622
5 171533
3 47926
6 220008
20 313
4 525859
15 497388
9 299761
13 253637
2 222855
7 41346


Unnamed: 0,system:index,B,BLUE,Cu,EC,Fe,GREEN,K,Mn,N,...,precipitation,sand05,sand515,silt05,silt515,slope,temp,village,longitude,latitude
0,0_0,1.416667,0.090758,6.816549,0.90,90.363400,0.118589,137.0,15.649901,128.10,...,3.054822,369,363,315,316,2.948019,30.495243,Badabaranga,84.592329,19.154193
1,1_0,0.841667,0.090758,6.468004,0.70,93.809048,0.118589,12.0,17.872237,192.15,...,3.054822,369,363,315,316,2.948019,30.495243,Badabaranga,84.592329,19.154193
2,2_0,1.008333,0.088346,6.583759,0.80,62.696599,0.116399,40.0,15.556399,106.75,...,3.054822,369,363,315,316,2.170676,30.495243,Badabaranga,84.592329,19.154462
3,3_0,1.175000,0.088225,6.503522,1.40,82.161155,0.118448,82.0,14.712748,170.80,...,3.054822,369,363,315,316,0.927410,30.495243,Badabaranga,84.592329,19.154732
4,4_0,1.358333,0.098169,7.498398,0.60,52.097904,0.127258,61.0,18.314672,170.80,...,3.054822,369,363,315,316,2.948021,30.495243,Badabaranga,84.592329,19.155001
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3284625,3995_0,2.780000,0.148664,1.048000,0.36,21.760000,0.179964,597.0,7.752000,141.00,...,3.432853,415,409,296,294,3.050446,30.356029,Khandrika Seetharamavaram,81.132019,16.964280
3284626,3996_0,3.860000,0.148664,0.550000,0.21,23.920000,0.179964,491.0,9.292000,125.00,...,3.432853,415,409,296,294,3.050446,30.356029,Khandrika Seetharamavaram,81.132019,16.964280
3284627,3997_0,3.860000,0.076385,0.550000,0.21,23.920000,0.092395,491.0,9.292500,125.00,...,3.471516,421,408,308,303,1.940001,30.353268,Khandrika Seetharamavaram,81.081084,17.096602
3284628,3998_0,1.000000,0.117673,11.680000,0.11,6.990000,0.139467,145.0,1.164000,119.00,...,3.310586,423,411,291,291,2.148309,30.305014,Edavalli,81.205860,16.918196


In [5]:
filtered_df = merged_df.groupby("district", group_keys=False).apply(apply_z_filter)

  filtered_df = merged_df.groupby("district", group_keys=False).apply(apply_z_filter)


In [6]:
aez_df = segregate_by_aez(filtered_df)

In [7]:
unique_aezs = aez_df['ae_regcode'].unique()

In [10]:
for aez in unique_aezs:
    df = aez_df[aez_df['ae_regcode'] == aez]
    print(aez, len(df))
    df.to_csv(f"./new_data/resampled/AEZ_{int(aez)}.csv", index=False)

15.0 399641
13.0 207902
10.0 105589
5.0 129461
4.0 397867
9.0 229618
2.0 157481
6.0 158319
17.0 17813
16.0 23478
18.0 76822
12.0 112686
7.0 26215
14.0 52080
3.0 35566
8.0 190755
11.0 105568
19.0 29097
nan 0


ValueError: cannot convert float NaN to integer