In [1]:
import numpy as np
import pandas as pd

import lightgbm as lgb
import xgboost as xgb
import catboost as cb
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import make_scorer, average_precision_score

import gc

In [2]:
class configs:
    """Notebook-wide settings: input file locations and cross-validation constants."""

    # Reproducibility / cross-validation constants.
    seed = 42
    n_folds = 5

    # Name of the label column in the training CSV.
    target = "Fertilizer Name"

    # Input files, relative to the notebook's working directory.
    sample_sub_path = "playground-series-s5e6/sample_submission.csv"
    original_path = "playground-series-s5e6/Fertilizer Prediction.csv"
    test_path = "playground-series-s5e6/test.csv"
    train_path = "playground-series-s5e6/train.csv"

## 1. Data Loading & Inspection

In [6]:
# Load data
# Primary source: the paths declared on `configs`. If any of them is missing,
# fall back to the Kaggle input directory below.
KAGGLE_INPUT_DIR = '/kaggle/input/predicting-optimal-fertilizers'

try:
    train_df = pd.read_csv(configs.train_path)
    test_df = pd.read_csv(configs.test_path)
    sample_submission = pd.read_csv(configs.sample_sub_path)
except FileNotFoundError as missing:
    # Report the paths that were actually tried (the files are not expected
    # next to the notebook) and make the fallback explicit.
    print(
        f"Could not read '{missing.filename}'. Expected data files at "
        f"'{configs.train_path}', '{configs.test_path}' and '{configs.sample_sub_path}'. "
        f"Falling back to '{KAGGLE_INPUT_DIR}'."
    )
    train_df = pd.read_csv(f'{KAGGLE_INPUT_DIR}/train.csv')
    test_df = pd.read_csv(f'{KAGGLE_INPUT_DIR}/test.csv')
    sample_submission = pd.read_csv(f'{KAGGLE_INPUT_DIR}/sample_submission.csv')

# Normalise the misspelled 'Temparature' column name when it is present, so
# later cells can refer to 'Temperature' in both frames.
if 'Temparature' in train_df.columns:
    train_df = train_df.rename(columns={'Temparature': 'Temperature'})
if 'Temparature' in test_df.columns:
    test_df = test_df.rename(columns={'Temparature': 'Temperature'})

# Quick structural overview of both splits.
print("Train shape:", train_df.shape)
print("Test shape:", test_df.shape)
print("\nTrain columns:", train_df.columns.tolist())
print("Test columns:", test_df.columns.tolist())
print("\nTrain info:")
train_df.info()
print("\nTest info:")
test_df.info()

Train shape: (750000, 10)
Test shape: (250000, 9)

Train columns: ['id', 'Temperature', 'Humidity', 'Moisture', 'Soil Type', 'Crop Type', 'Nitrogen', 'Potassium', 'Phosphorous', 'Fertilizer Name']
Test columns: ['id', 'Temperature', 'Humidity', 'Moisture', 'Soil Type', 'Crop Type', 'Nitrogen', 'Potassium', 'Phosphorous']

Train info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 750000 entries, 0 to 749999
Data columns (total 10 columns):
 #   Column           Non-Null Count   Dtype 
---  ------           --------------   ----- 
 0   id               750000 non-null  int64 
 1   Temperature      750000 non-null  int64 
 2   Humidity         750000 non-null  int64 
 3   Moisture         750000 non-null  int64 
 4   Soil Type        750000 non-null  object
 5   Crop Type        750000 non-null  object
 6   Nitrogen         750000 non-null  int64 
 7   Potassium        750000 non-null  int64 
 8   Phosphorous      750000 non-null  int64 
 9   Fertilizer Name  750000 non-null  object
d

## 2. Feature Engineering

In [7]:
def feature_engineer(df):
    """Return a copy of ``df`` with the engineered feature columns appended.

    Reads the columns 'Nitrogen', 'Phosphorous', 'Potassium', 'Temperature',
    'Humidity', 'Moisture', 'Soil Type' and 'Crop Type'. The input frame is
    left untouched, so the cell can be re-run safely on any frame that still
    has those columns.

    Added columns:
        N_P_ratio, N_K_ratio, P_K_ratio  -- pairwise nutrient ratios
        N_P_K_sum, N_P_K_prod            -- nutrient sum and product
        Soil_Crop_Type                   -- "<Soil Type>_<Crop Type>" string
        Temp_Hum_Index, Temp_Moist_Index, Hum_Moist_Index
                                         -- pairwise climate products / 100

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the columns listed above. 'Soil Type' and
        'Crop Type' may be plain string columns or ``category`` dtype.

    Returns
    -------
    pandas.DataFrame
        A new frame: the original columns followed by the engineered ones.
    """
    # Work on a copy so the caller's frame is never mutated.
    df = df.copy()

    eps = 1e-6  # added to denominators to avoid division by zero

    # Nutrient ratios and aggregates
    df['N_P_ratio'] = df['Nitrogen'] / (df['Phosphorous'] + eps)
    df['N_K_ratio'] = df['Nitrogen'] / (df['Potassium'] + eps)
    df['P_K_ratio'] = df['Phosphorous'] / (df['Potassium'] + eps)
    df['N_P_K_sum'] = df['Nitrogen'] + df['Phosphorous'] + df['Potassium']
    df['N_P_K_prod'] = df['Nitrogen'] * df['Phosphorous'] * df['Potassium']

    # Interaction between Soil Type and Crop Type.
    # Categorical columns do not support `+`, so they are viewed as plain
    # objects first; non-categorical columns are used as they are.
    soil, crop = (
        df[col].astype(object) if isinstance(df[col].dtype, pd.CategoricalDtype) else df[col]
        for col in ('Soil Type', 'Crop Type')
    )
    df['Soil_Crop_Type'] = soil + '_' + crop

    # Climate indices
    df['Temp_Hum_Index'] = df['Temperature'] * df['Humidity'] / 100
    df['Temp_Moist_Index'] = df['Temperature'] * df['Moisture'] / 100
    df['Hum_Moist_Index'] = df['Humidity'] * df['Moisture'] / 100

    # Ideas not enabled yet:
    # Polynomial features for nutrients (consider if models need more non-linearity)
    # df['N_sq'] = df['Nitrogen']**2
    # df['P_sq'] = df['Phosphorous']**2
    # df['K_sq'] = df['Potassium']**2

    # Binning (e.g., for nutrient levels - Low/Med/High) - might be useful for tree splits
    # This requires defining sensible bins based on EDA
    # bins_N = [0, 10, 30, np.inf]
    # labels_N = ['N_Low', 'N_Medium', 'N_High']
    # df['Nitrogen_bin'] = pd.cut(df['Nitrogen'], bins=bins_N, labels=labels_N, right=False)

    return df

# Build the same engineered columns on both splits, then list what the
# training frame now contains.
train_df, test_df = feature_engineer(train_df), feature_engineer(test_df)

print("\nFeatures after engineering:")
print(list(train_df.columns))


Features after engineering:
['id', 'Temperature', 'Humidity', 'Moisture', 'Soil Type', 'Crop Type', 'Nitrogen', 'Potassium', 'Phosphorous', 'Fertilizer Name', 'N_P_ratio', 'N_K_ratio', 'P_K_ratio', 'N_P_K_sum', 'N_P_K_prod', 'Soil_Crop_Type', 'Temp_Hum_Index', 'Temp_Moist_Index', 'Hum_Moist_Index']


## 3. Encoding Categorical Features

In [8]:
# Label Encode the target variable
le = LabelEncoder()
train_df['Fertilizer_Encoded'] = le.fit_transform(train_df['Fertilizer Name'])
num_classes = len(le.classes_)
print(f"\nNumber of target classes: {num_classes}")
print("Target classes mapping:", dict(zip(le.classes_, range(num_classes))))

# Identify categorical features (including the engineered one)
categorical_features = ['Soil Type', 'Crop Type', 'Soil_Crop_Type'] # Add other binned features if created

# Convert categorical features to 'category' dtype for LightGBM/CatBoost/XGBoost.
# Train and test share ONE dtype per column (sorted union of the values seen
# in either frame) so that a given level maps to the same integer code in both
# frames. Converting each frame independently would give different codes
# whenever the two value sets differ.
for col in categorical_features:
    levels = sorted(
        set(train_df[col].dropna().unique()) | set(test_df[col].dropna().unique())
    )
    shared_dtype = pd.CategoricalDtype(categories=levels)
    train_df[col] = train_df[col].astype(shared_dtype)
    test_df[col] = test_df[col].astype(shared_dtype)

# For XGBoost (if native support isn't fully utilized, or for older versions):
# One-Hot Encoding (for models that don't support native categoricals or for specific cases)
# However, for LightGBM/CatBoost, native handling is generally preferred.
# train_df = pd.get_dummies(train_df, columns=categorical_features, drop_first=True)
# test_df = pd.get_dummies(test_df, columns=categorical_features, drop_first=True)
# Align columns after one-hot encoding
# train_cols = set(train_df.columns)
# test_cols = set(test_df.columns)
# common_cols = list(train_cols.intersection(test_cols))
# train_df = train_df[common_cols + ['Fertilizer_Encoded', 'Fertilizer Name']] # Keep target
# test_df = test_df[common_cols]


Number of target classes: 7
Target classes mapping: {'10-26-26': 0, '14-35-14': 1, '17-17-17': 2, '20-20': 3, '28-28': 4, 'DAP': 5, 'Urea': 6}


## 4. MAP@3 Metric Function 

In [10]:
def map_at_k(predictions, actuals, k=3):
    """
    Calculates Mean Average Precision @ k for single-label targets.

    Each row scores 1 / rank of the first position (1-based, within the top
    ``k``) at which the predicted label equals the actual label, or 0 if the
    actual label is not among the top ``k``. The result is the mean over rows.

    predictions: Sequence of sequences; each inner sequence holds predicted
                 labels sorted by decreasing confidence.
    actuals:     Sequence of actual labels, one per row. Labels must be of the
                 same kind as the predicted ones (they are compared with ==).
    k:           Number of leading predictions considered per row.

    Returns the score as a float; 0.0 when there are no rows.
    Raises ValueError if ``predictions`` and ``actuals`` differ in length.
    """
    n_rows = len(predictions)
    if n_rows != len(actuals):
        raise ValueError("Predictions and actuals must have the same length.")
    if n_rows == 0:
        # Nothing to score; avoid dividing by zero.
        return 0.0

    total_precision = 0.0
    for ranked_labels, actual_label in zip(predictions, actuals):
        for rank, pred_label in enumerate(ranked_labels[:k], start=1):
            if pred_label == actual_label:
                # Only the first correct prediction counts (one true label per row).
                total_precision += 1 / rank
                break

    return total_precision / n_rows

## 5. Model Training (Gradient Boosting Ensemble)

In [13]:
# Define features and target
TARGET = 'Fertilizer_Encoded'
# Every column except the row id, the raw label and its encoded version is a feature.
FEATURES = [col for col in train_df.columns if col not in ['id', configs.target, TARGET]]

X = train_df[FEATURES]
y = train_df[TARGET]
X_test = test_df[FEATURES]

# Ensure categorical features are correctly recognized by LightGBM/CatBoost:
# names for LightGBM, positional indices (within FEATURES) for CatBoost.
lgbm_categorical_features = [col for col in FEATURES if X[col].dtype.name == 'category']
cat_categorical_features_indices = [X.columns.get_loc(col) for col in lgbm_categorical_features]

# Fold count and base seed come from the shared config rather than local literals.
NFOLDS = configs.n_folds
skf = StratifiedKFold(n_splits=NFOLDS, shuffle=True, random_state=configs.seed)

# Out-of-fold class probabilities: each training row is filled once, by the
# fold in which it is part of the validation split.
oof_preds_lgb = np.zeros((len(X), num_classes))
oof_preds_xgb = np.zeros((len(X), num_classes))
oof_preds_cat = np.zeros((len(X), num_classes))

# Test class probabilities, accumulated as the mean over the fold models.
test_preds_lgb = np.zeros((len(X_test), num_classes))
test_preds_xgb = np.zeros((len(X_test), num_classes))
test_preds_cat = np.zeros((len(X_test), num_classes))

# Fitted fold models, kept per library.
models = {
    'lgb': [],
    'xgb': [],
    'cat': []
}

print("\nStarting K-Fold Training...")

# Train one LightGBM, one XGBoost and one CatBoost model per fold. Each model
# writes its validation probabilities into the matching OOF array and adds
# 1/NFOLDS of its test probabilities to the matching test array.
for fold, (train_idx, val_idx) in enumerate(skf.split(X, y)):
    print(f"--- Fold {fold+1}/{NFOLDS} ---")
    X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
    X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]

    # A different but reproducible seed per fold, derived from the shared config seed.
    fold_seed = configs.seed + fold

    # LightGBM
    print("Training LightGBM...")
    lgb_params = {
        'objective': 'multiclass',
        'num_class': num_classes,
        'metric': 'multi_logloss', # MAP@3 not directly optimized by default in LGBM
        'boosting_type': 'gbdt',
        'n_estimators': 2000, # Upper bound; early stopping decides the actual count
        'learning_rate': 0.02,
        'num_leaves': 31,
        'max_depth': -1,
        'seed': fold_seed,
        'n_jobs': -1,
        'verbose': -1, # Suppress verbose output
        'colsample_bytree': 0.7,
        # NOTE(review): per the LightGBM parameter docs, row subsampling only
        # takes effect when subsample_freq (bagging_freq) > 0, which is not set
        # here -- confirm whether bagging was intended.
        'subsample': 0.7,
        'reg_alpha': 0.1,
        'reg_lambda': 0.1,
        'min_child_samples': 20,
        'early_stopping_round': 100 # Stop after 100 rounds without validation improvement
    }
    lgb_model = lgb.LGBMClassifier(**lgb_params)
    lgb_model.fit(X_train, y_train,
                  eval_set=[(X_val, y_val)],
                  eval_metric='multi_logloss', # or use 'map' if custom metric is implemented in LGBM
                  callbacks=[lgb.log_evaluation(period=0)], # Suppress period logging
                  categorical_feature=lgbm_categorical_features)
    oof_preds_lgb[val_idx] = lgb_model.predict_proba(X_val)
    test_preds_lgb += lgb_model.predict_proba(X_test) / NFOLDS
    models['lgb'].append(lgb_model)
    gc.collect()

    # XGBoost
    print("Training XGBoost...")
    xgb_params = {
        'objective': 'multi:softprob',
        'num_class': num_classes,
        'eval_metric': 'mlogloss',
        'tree_method': 'hist', # Still use 'hist' for general efficiency
        'n_estimators': 2000,
        'learning_rate': 0.02,
        'max_depth': 6,
        'subsample': 0.7,
        'colsample_bytree': 0.7,
        'gamma': 0.1,
        'lambda': 0.1,
        'alpha': 0.1,
        'random_state': fold_seed,
        'n_jobs': -1,
        'early_stopping_rounds': 100,
        'enable_categorical': True, # Needed because X contains pandas 'category' columns
    }
    xgb_model = xgb.XGBClassifier(**xgb_params)
    xgb_model.fit(X_train, y_train,
                eval_set=[(X_val, y_val)],
                verbose=False)
    oof_preds_xgb[val_idx] = xgb_model.predict_proba(X_val)
    test_preds_xgb += xgb_model.predict_proba(X_test) / NFOLDS
    models['xgb'].append(xgb_model)
    gc.collect()

    # CatBoost
    print("Training CatBoost...")
    cat_params = {
        # 'objective' is an alias of 'loss_function' in CatBoost, so it is set only once.
        'loss_function': 'MultiClass',
        'iterations': 2000,
        'learning_rate': 0.02,
        'depth': 6,
        'l2_leaf_reg': 3,
        'eval_metric': 'MultiClass', # MAP@3 not directly optimized by default in CatBoost
        'random_seed': fold_seed,
        'verbose': 0, # Suppress verbose output
        'early_stopping_rounds': 100, # Stop after 100 rounds without validation improvement
        'allow_writing_files': False, # Important for Kaggle notebooks to prevent I/O errors
        'cat_features': cat_categorical_features_indices,
    }
    cat_model = cb.CatBoostClassifier(**cat_params)
    cat_model.fit(X_train, y_train,
                  eval_set=[(X_val, y_val)],
                  verbose=False)
    oof_preds_cat[val_idx] = cat_model.predict_proba(X_val)
    test_preds_cat += cat_model.predict_proba(X_test) / NFOLDS
    models['cat'].append(cat_model)
    gc.collect()

print("\nTraining Complete.")


Starting K-Fold Training...
--- Fold 1/5 ---
Training LightGBM...
Training XGBoost...
Training CatBoost...


KeyboardInterrupt: 

## 6. Ensemble and MAP@3 Evaluation

In [None]:
# Evaluate OOF predictions for each model type
# Ground truth as fertilizer *names*, so it can be compared with the ranked
# names below. (`y` holds the integer class codes, which never equal a name
# string and would make every MAP@3 score 0.)
oof_actuals = le.inverse_transform(y).tolist()


def rank_class_names(proba):
    """Return, for each row of ``proba``, all class names ordered from most to least probable.

    proba: 2-D array of class probabilities whose columns follow ``le.classes_``.
    Returns a list (one entry per row) of lists of class names.
    """
    # Ascending argsort per row, reversed -> descending probability.
    order = np.argsort(proba, axis=1)[:, ::-1]
    # Indexing classes_ with the index matrix maps every code to its name in one step.
    return le.classes_[order].tolist()


oof_lgb_ranked_preds = rank_class_names(oof_preds_lgb)
map3_lgb = map_at_k(oof_lgb_ranked_preds, oof_actuals, k=3)
print(f"LightGBM OOF MAP@3: {map3_lgb:.5f}")

oof_xgb_ranked_preds = rank_class_names(oof_preds_xgb)
map3_xgb = map_at_k(oof_xgb_ranked_preds, oof_actuals, k=3)
print(f"XGBoost OOF MAP@3: {map3_xgb:.5f}")

oof_cat_ranked_preds = rank_class_names(oof_preds_cat)
map3_cat = map_at_k(oof_cat_ranked_preds, oof_actuals, k=3)
print(f"CatBoost OOF MAP@3: {map3_cat:.5f}")

# Simple Averaging Ensemble of OOF predictions
oof_ensemble_preds = (oof_preds_lgb + oof_preds_xgb + oof_preds_cat) / 3

oof_ensemble_ranked_preds = rank_class_names(oof_ensemble_preds)
map3_ensemble = map_at_k(oof_ensemble_ranked_preds, oof_actuals, k=3)
print(f"Ensemble (Average) OOF MAP@3: {map3_ensemble:.5f}")

# More advanced ensembling: Stacking
# You could train a meta-model (e.g., Logistic Regression) on oof_preds_lgb, oof_preds_xgb, oof_preds_cat
# to predict the target. This often yields better results.
# Example:
# from sklearn.linear_model import LogisticRegression
# meta_model = LogisticRegression(solver='lbfgs', multi_class='multinomial', random_state=42)
# meta_model.fit(oof_ensemble_preds, y) # Train meta-model on combined OOF predictions
# Then use meta_model.predict_proba on the combined test_preds to get final predictions.

## 7. Submission File Generation

In [None]:
# Equal-weight blend of the three models' test probabilities.
final_test_preds_proba = (test_preds_lgb + test_preds_xgb + test_preds_cat) / 3

# One space-separated string per test row: the three most probable
# fertilizer names, most probable first.
predictions_for_submission = [
    " ".join(le.inverse_transform(np.argsort(class_proba)[::-1][:3]))
    for class_proba in final_test_preds_proba
]

# Assemble the submission table (row id + ranked names) and write it to disk.
submission_df = pd.DataFrame(
    {
        'id': test_df['id'],
        'Fertilizer Name': predictions_for_submission,
    }
)
submission_df.to_csv('submission.csv', index=False)

print("\nSubmission file created: submission.csv")
print(submission_df.head())