In [None]:
import os
import re
import gc
import numpy as np
import pandas as pd
import random

from sklearn.cluster import KMeans
from sklearn.preprocessing import LabelEncoder
import tensorflow as tf

# Make sure automatic garbage collection is switched on for this kernel.
gc.enable()
import warnings
# Suppress all warnings for the rest of the session. This also hides
# deprecation and convergence warnings emitted by the libraries imported above.
warnings.filterwarnings("ignore")

In [None]:
# Pin every random number source used in this notebook to one shared seed.
seed = 42

# NOTE(review): PYTHONHASHSEED is normally read when the interpreter starts, so
# setting it here presumably only influences child processes - confirm if
# hash-order determinism of the running kernel matters.
os.environ['PYTHONHASHSEED'] = str(seed)

# NumPy, the stdlib `random` module and TensorFlow each keep their own generator.
for _seed_fn in (np.random.seed, random.seed, tf.random.set_seed):
    _seed_fn(seed)

## Climate Features

In [None]:
# Ordered bounding boxes: (zone name, (lat_min, lat_max), (lon_min, lon_max)).
# Bounds are inclusive and the boxes overlap, so the order below is significant:
# the first box containing the point decides the zone.
_CLIMATE_ZONE_BOXES = (
    ("Mediterranean", (31, 36), (-10, -6)),  # coastal
    ("Semi-Arid", (27, 33), (-8, -6)),
    ("Arid", (28, 33), (-10, -5)),           # mainly desert regions
    ("Highland", (31, 34), (-6, -5)),        # Atlas Mountains
)


def assign_climate_zone(lat, lon):
    """Return the climate zone name for a latitude/longitude pair.

    The point is tested against the boxes in ``_CLIMATE_ZONE_BOXES`` from top
    to bottom and the first hit wins. Because earlier boxes shadow later ones,
    "Highland" is only returned for 33 < lat <= 34 with -6 < lon <= -5.

    Parameters
    ----------
    lat, lon : numeric
        Coordinates in degrees.

    Returns
    -------
    str
        One of "Mediterranean", "Semi-Arid", "Arid", "Highland", or "Unknown"
        when no box contains the point (this includes NaN coordinates, since
        every comparison against NaN is false).
    """
    for zone_name, (lat_lo, lat_hi), (lon_lo, lon_hi) in _CLIMATE_ZONE_BOXES:
        inside_lat = lat_lo <= lat <= lat_hi
        inside_lon = lon_lo <= lon <= lon_hi
        if inside_lat and inside_lon:
            return zone_name

    # No box matched.
    return "Unknown"

# Label every row with the climate zone derived from its own coordinates.
def _zone_for_row(row):
    """Look up the climate zone for one DataFrame row."""
    return assign_climate_zone(row['latitude'], row['longitude'])


data['climate_zone'] = data.apply(_zone_for_row, axis=1)

# Swap the zone names for integer codes, overwriting the string column.
label_encoder = LabelEncoder()
data['climate_zone'] = label_encoder.fit_transform(data['climate_zone'])

# Print the name -> code lookup so the encoded column stays interpretable.
_zone_names = label_encoder.classes_
climate_zone_mapping = dict(zip(_zone_names, label_encoder.transform(_zone_names)))
print(climate_zone_mapping)

## Soil Features

In [None]:
# --- Soil Health Index: plain average of pH and organic matter percentage ---
data['soil_health_index'] = data['soil_ph'].add(data['organic_matter_percent']).div(2)

# --- Soil Fertility Index (SFI) ---
# Fixed CEC term plus organic matter, penalised by the distance of pH from 6.5.
# NOTE(review): the origin of the 9.166667 constant is not shown in this
# section - presumably a precomputed average; confirm.
cec_average = 9.166667
_ph_offset = data['soil_ph'].sub(6.5).abs()
data['soil_fertility_index'] = cec_average + data['organic_matter_percent'] - _ph_offset

# --- Crop Rotation Index: 1 when the current crop differs from the previous one ---
data['crop_rotation_cycle'] = data['crop'].ne(data['previous_crop']).astype(int)

# --- Nutrient Availability Index: pH x phosphorus x potassium ---
data['nutrient_availability_index'] = (
    data['soil_ph'].mul(data['phosphorus_ppm']).mul(data['potassium_ppm'])
)

# --- Categorize Soil Properties ---
# pH is bucketed with the edges below, then the bucket names are integer-encoded.
# NOTE(review): LabelEncoder numbers classes in sorted label order, so the codes
# are expected to be alphabetical rather than following the pH scale - confirm
# that is acceptable downstream.
bins = [0, 6.5, 7.5, 14]
_ph_bucket = pd.cut(data['soil_ph'], bins=bins, labels=['acidic', 'neutral', 'alkaline'])

label_encoder = LabelEncoder()
data['soil_ph_category'] = label_encoder.fit_transform(_ph_bucket)

# --- Stress Indicators: binary flags from fixed thresholds ---
data['low_phosphorus_stress'] = data['phosphorus_ppm'].lt(20).astype(int)
data['high_conductivity_stress'] = data['electrical_conductivity'].gt(1.5).astype(int)

# --- Crop-Soil-NPK Interactions: simple product terms ---
data['crop_soil_interaction'] = data['organic_matter_percent'].mul(data['npk_potassium_k2o'])
data['npk_interaction'] = (
    data['npk_nitrogen'].mul(data['npk_phosphorus_p2o5']).mul(data['npk_potassium_k2o'])
)

In [None]:
# Soil Neighborhood Aggregates
# Split the samples into two spatial groups by running k-means on the raw
# longitude/latitude pairs; the cluster id is stored per row.
kmeans = KMeans(n_clusters=2, random_state=42)
data['geo_cluster'] = kmeans.fit_predict(data[['longitude', 'latitude']])

# Median soil chemistry per cluster. The two-level column index produced by
# `agg` is flattened to '<column>_median'.
cluster_aggregates = data.groupby('geo_cluster').agg({
    'soil_ph': ['median'],
    'npk_nitrogen': ['median'],
    'npk_phosphorus_p2o5': ['median'],
    'npk_potassium_k2o': ['median'],
})
cluster_aggregates.columns = [f'{col[0]}_{col[1]}' for col in cluster_aggregates.columns]

# Re-running this cell used to merge onto a frame that already carried the
# '*_median' columns, yielding '_x'/'_y' suffixed duplicates. Remove any copies
# left by a previous run first; on a fresh kernel nothing is dropped and the
# result is the same as a plain merge.
_stale_aggregate_cols = data.columns.intersection(cluster_aggregates.columns)
if len(_stale_aggregate_cols) > 0:
    data = data.drop(columns=_stale_aggregate_cols)

# Broadcast the per-cluster medians back onto every row of that cluster.
data = data.merge(cluster_aggregates, on='geo_cluster', how='left')

In [None]:
# Spatial Features: cosine of each coordinate after converting degrees to radians.
# Creates 'longitude_cos' and 'latitude_cos', in that order.
for _coord in ('longitude', 'latitude'):
    data[f'{_coord}_cos'] = np.cos(np.radians(data[_coord]))
# The matching sine features are currently disabled:
# data['longitude_sin'] = np.sin(np.radians(data['longitude']))
# data['latitude_sin'] = np.sin(np.radians(data['latitude']))

In [None]:
# Encoding Categorical Variables
# One-hot encode the listed columns. `drop_first=True` omits the first level of
# each variable. The original categorical columns stay in `data`.
categorical_cols = ['previous_crop', 'region', 'province', 'sub_program']
ohe = pd.get_dummies(data[categorical_cols], columns=categorical_cols, drop_first=True)

# Re-running this cell used to append a second copy of every dummy column,
# leaving duplicate column names. Remove dummies from a previous run first; on
# a fresh kernel nothing is dropped and the result is the same as a plain concat.
_stale_dummy_cols = data.columns.intersection(ohe.columns)
if len(_stale_dummy_cols) > 0:
    data = data.drop(columns=_stale_dummy_cols)

data = pd.concat([data, ohe], axis=1)