# Process the satellite data downloaded from Google Drive, adding latitude and longitude as columns

In [1]:
import json
import os

import pandas as pd

DATA_DIR = "./shc_data/"
SATELLITE_DIR = "SATELLITE_DATA"
YEAR = "AGRI_2023-24"

# Expected layout: <DATA_DIR>/<SATELLITE_DIR>/<YEAR>/<state>/<one CSV per file>
PATH = os.path.join(DATA_DIR, SATELLITE_DIR, YEAR)
# Sorted so the row order of merged_df is reproducible; os.listdir returns
# entries in arbitrary order.
states = sorted(os.listdir(PATH))

state_dfs = []
for state in states:
    state_path = os.path.join(PATH, state)
    # Skip plain files sitting next to the state folders (for example a
    # combined CSV saved into PATH on an earlier run); calling os.listdir
    # on them would raise NotADirectoryError.
    if not os.path.isdir(state_path):
        continue

    district_dfs = []
    for file in sorted(os.listdir(state_path)):
        try:
            df = pd.read_csv(os.path.join(state_path, file))
        except pd.errors.EmptyDataError:
            # The file has nothing to parse, so it contributes no rows.
            continue
        district_dfs.append(df)

    # A state with no readable files is skipped explicitly instead of relying
    # on pd.concat raising ValueError, which could mask unrelated errors.
    if not district_dfs:
        continue
    state_dfs.append(pd.concat(district_dfs, ignore_index=True))

if not state_dfs:
    raise ValueError(f"No readable CSV data found under {PATH!r}")
merged_df = pd.concat(state_dfs, ignore_index=True)

# '.geo' holds a GeoJSON geometry string; its "coordinates" entry is read as
# [longitude, latitude], matching the column order assigned below.
coords = merged_df['.geo'].apply(lambda geo: json.loads(geo)["coordinates"])

# Expand the coordinate pairs into two float columns, then drop the raw string.
merged_df[['longitude', 'latitude']] = pd.DataFrame(
    coords.tolist(), index=merged_df.index
).astype(float)
merged_df = merged_df.drop(columns=['.geo'])

print(merged_df.head())

  system:index     B      BLUE     Cu     EC      Fe     GREEN      K     Mn  \
0          0_0  0.91  0.112519  1.204  0.718   8.609  0.123572  258.0  1.204   
1          1_0  0.81  0.129469  0.781  0.549  11.275  0.138334  147.0  4.543   
2          2_0  0.82  0.144166  0.977  0.717  19.871  0.155562  370.0  3.941   
3          3_0  0.63  0.193156  1.398  0.636  12.180  0.204409  209.0  4.396   
4          4_0  1.00  0.162538  1.115  0.459  13.693  0.171962  236.0  5.854   

       N  ...  sand05  sand515  silt05  silt515     slope  \
0  202.0  ...     316      317     392      390  4.208206   
1  222.0  ...     286      291     415      400  1.383771   
2  223.0  ...     282      285     414      409  4.739286   
3  181.0  ...     291      294     400      397  1.383168   
4  216.0  ...     312      311     376      373  1.383140   

            start_date       temp      village  longitude   latitude  
0  2023-07-01T00:00:00  30.234000        Pabra  86.113357  25.561966  
1  2023-07

In [11]:
# merged_df.to_csv(os.path.join(DATA_DIR, SATELLITE_DIR, YEAR, "COMBINED_2024.csv"), index=False)

# Split the Satellite data into AEZs

In [10]:
import numpy as np
from utils.segregate_by_aez import segregate_by_aez

# Output folder for the per-AEZ CSV files: <DATA_DIR>/AEZS/<YEAR>.
SAVE_DIR = os.path.join(DATA_DIR, "AEZS", YEAR)
# exist_ok=True keeps a re-run from failing when the folder is already there.
os.makedirs(name=SAVE_DIR, exist_ok=True)

In [11]:
# Run the project helper over the merged satellite rows. The cells below read
# an 'ae_regcode' column from the result.
# NOTE(review): presumably the helper assigns each row to an agro-ecological
# zone from its longitude/latitude - confirm in utils/segregate_by_aez.py.
aez_df = segregate_by_aez(merged_df)

In [12]:
# Distinct AEZ codes present in the data, with NaN entries removed
# (presumably rows that matched no zone - TODO confirm).
aez_codes = aez_df['ae_regcode'].unique()
unique_aezs = aez_codes[np.logical_not(np.isnan(aez_codes))]
unique_aezs

array([13.,  9., 11., 15.,  4.,  2.,  5., 19.,  6., 16., 17., 18.,  7.,
       12.,  3.,  8., 10., 14., 20.])

In [13]:
# Write one CSV per AEZ code into SAVE_DIR and print "<code> <row count>".
for code in unique_aezs:
    zone_id = int(code)  # codes are stored as floats; file names use the integer form
    zone_rows = aez_df.loc[aez_df['ae_regcode'] == code]
    print(zone_id, len(zone_rows))
    zone_rows.to_csv(os.path.join(SAVE_DIR, f"AEZ_{zone_id}.csv"), index=False)

13 205491
9 223225
11 119979
15 356828
4 410215
2 151428
5 145290
19 28087
6 136459
16 21233
17 20879
18 74000
7 24207
12 98214
3 32929
8 193841
10 118962
14 53571
20 203


# Features used for prediction in each AEZ, ranked by importance

In [1]:
import os
import joblib
import pandas as pd

In [8]:
PATH = "./rfr_joblib/"
PRED_VARIABLE = 'N'

# Only model files whose name ends with "<PRED_VARIABLE>.joblib" are considered.
model_suffix = f"{PRED_VARIABLE}.joblib"

aez_list = []
features_list = []
for file in os.listdir(PATH):
    if not file.endswith(model_suffix):
        continue

    # The AEZ number is the token that follows "_AEZ_" in the file name.
    aez_code = file.split("_AEZ_")[1].split("_")[0]
    rf = joblib.load(os.path.join(PATH, file))

    # Rank the model's input features from most to least important.
    ranked_features = (
        pd.DataFrame({
            "Feature": rf.feature_names_in_.tolist(),
            "Importance": rf.feature_importances_
        })
        .sort_values(by="Importance", ascending=False)["Feature"]
        .tolist()
    )

    aez_list.append(int(aez_code))
    features_list.append(ranked_features)

# One row per model: the AEZ number and its ranked feature list.
data_dict = {"AEZ": aez_list, "features": features_list}

df = pd.DataFrame(data_dict).sort_values(by='AEZ', ascending=True)
df.to_csv(f"aez_features_{PRED_VARIABLE}.csv", index=False)