In [1]:
# Libraries
import pandas as pd
import numpy as np
import re

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import lightgbm as lgb

import warnings
warnings.filterwarnings("ignore")

print('Libaires imported')

Libaires imported


In [2]:
# Read the three competition CSVs in a fixed order: train, test, sample submission.
train, test, sample_submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/playground-series-s5e6/train.csv',
        '/kaggle/input/playground-series-s5e6/test.csv',
        '/kaggle/input/playground-series-s5e6/sample_submission.csv',
    )
)

print('Data imported')

Data imported


In [3]:
# Report the dimensions of the two modelling frames.
for shape_label, frame in (("Train shape:", train), ("Test shape:", test)):
    print(shape_label, frame.shape)

# Preview the data: first rows of each loaded table, in load order.
for frame in (train, test, sample_submission):
    display(frame.head())

Train shape: (750000, 10)
Test shape: (250000, 9)


Unnamed: 0,id,Temparature,Humidity,Moisture,Soil Type,Crop Type,Nitrogen,Potassium,Phosphorous,Fertilizer Name
0,0,37,70,36,Clayey,Sugarcane,36,4,5,28-28
1,1,27,69,65,Sandy,Millets,30,6,18,28-28
2,2,29,63,32,Sandy,Millets,24,12,16,17-17-17
3,3,35,62,54,Sandy,Barley,39,12,4,10-26-26
4,4,35,58,43,Red,Paddy,37,2,16,DAP


Unnamed: 0,id,Temparature,Humidity,Moisture,Soil Type,Crop Type,Nitrogen,Potassium,Phosphorous
0,750000,31,70,52,Sandy,Wheat,34,11,24
1,750001,27,62,45,Red,Sugarcane,30,14,15
2,750002,28,72,28,Clayey,Ground Nuts,14,15,4
3,750003,37,53,57,Black,Ground Nuts,18,17,36
4,750004,31,55,32,Red,Pulses,13,19,14


Unnamed: 0,id,Fertilizer Name
0,750000,14-35-14 10-26-26 Urea
1,750001,14-35-14 10-26-26 Urea
2,750002,14-35-14 10-26-26 Urea
3,750003,14-35-14 10-26-26 Urea
4,750004,14-35-14 10-26-26 Urea


# Feature Engineering

### Decoupling NPK components from fertilizer labels

In [4]:
# Hand-entered (N, P, K) triples for fertilizer labels that are not written as
# three dash-separated numbers (approximate real-world values).
manual_npk = dict(
    [
        ("Urea", (46, 0, 0)),
        ("DAP", (18, 46, 0)),
        ("28-28", (28, 28, 0)),
        ("20-20", (20, 20, 0)),
    ]
)

# Function to extract NPK from fertilizer name
def extract_npk(fert_name):
    """Return the (N, P, K) triple encoded by a fertilizer label.

    Resolution order:
      1. A label that is exactly three dash-separated integers ("10-26-26")
         is parsed directly.
      2. Any other label is looked up in the module-level ``manual_npk`` table.
      3. Anything else, including non-string input such as NaN or None,
         yields the placeholder ``(0, 0, 0)``.

    Parameters
    ----------
    fert_name : str or any
        Fertilizer label. Leading/trailing whitespace is ignored.

    Returns
    -------
    tuple of int
        ``(n, p, k)``.
    """
    # Missing labels reach this function as float NaN / None; passing those to
    # `re` raises TypeError, so they take the placeholder path explicitly.
    if not isinstance(fert_name, str):
        return (0, 0, 0)

    label = fert_name.strip()

    # fullmatch rather than match: a string that merely *starts* with an N-P-K
    # triple (for example several labels joined by spaces) is not a single
    # fertilizer and must not be parsed as its first token.
    match = re.fullmatch(r"(\d+)-(\d+)-(\d+)", label)
    if match:
        return tuple(map(int, match.groups()))

    # Named / two-number fertilizers come from the hand-maintained table;
    # unknown formats fall back to the placeholder.
    return manual_npk.get(label, (0, 0, 0))

# Apply to training set.
# extract_npk is evaluated once per distinct label and the result is broadcast
# to all rows with Series.map. Returning a pd.Series from a row-wise apply
# constructs one Series object per training row for the same values.
fert_names = train['Fertilizer Name']
npk_by_name = {name: extract_npk(name) for name in fert_names.unique()}
for position, npk_col in enumerate(['fert_n', 'fert_p', 'fert_k']):
    train[npk_col] = fert_names.map(
        {name: npk[position] for name, npk in npk_by_name.items()}
    )

# Label lookup: one row per distinct fertilizer label with its NPK triple
fert_npk_df = train[['Fertilizer Name', 'fert_n', 'fert_p', 'fert_k']].drop_duplicates()

# NPK delta metrics: fertilizer content minus the soil reading in the same row.
# These are computed from the target column, so they exist on train only.
train['delta_n'] = train['fert_n'] - train['Nitrogen']
train['delta_p'] = train['fert_p'] - train['Phosphorous']
train['delta_k'] = train['fert_k'] - train['Potassium']

### Crop Metadata

In [5]:
# Per-crop attributes, keyed by the 'Crop Type' value: a "feed" level and a
# "part" label. Built from (crop, feed, part) rows to keep the table compact.
crop_meta = {
    crop: {"feed": feed, "part": part}
    for crop, feed, part in (
        ("Paddy", "heavy", "grain"),
        ("Wheat", "heavy", "grain"),
        ("Maize", "heavy", "grain"),
        ("Barley", "moderate", "grain"),
        ("Cotton", "moderate", "fiber"),
        ("Tobacco", "moderate", "leaf"),
        ("Sugarcane", "heavy", "stem"),
        ("Millets", "light", "grain"),
        ("Pulses", "light", "grain"),
        ("Oil seeds", "moderate", "seed"),
        ("Ground Nuts", "moderate", "seed"),
    )
}

# Create mapping DataFrame: one row per crop with its feed and part attributes
crop_df = pd.DataFrame.from_dict(crop_meta, orient='index').reset_index()
crop_df.columns = ['Crop Type', 'feed_type', 'crop_part']

# Merge into train/test.
# Columns left behind by a previous run of this cell are dropped first.
# Without that, re-running the cell merges a second time, pandas suffixes the
# clashing columns (feed_type_x / feed_type_y), and the later lookups of
# 'feed_type' and 'crop_part' fail.
crop_meta_cols = ['feed_type', 'crop_part']
train = train.drop(columns=crop_meta_cols, errors='ignore').merge(
    crop_df, on='Crop Type', how='left'
)
test = test.drop(columns=crop_meta_cols, errors='ignore').merge(
    crop_df, on='Crop Type', how='left'
)

In [6]:
# Encode target (Fertilizer Name) as integer class ids
le_fert = LabelEncoder()
train['Fert_Label'] = le_fert.fit_transform(train['Fertilizer Name'])

# Encode categorical features: Soil Type, Crop Type, Crop Feed, Crop Part.
# Each encoder is fitted on train only and then reused to transform test.
le_soil, le_crop, le_feed, le_part = (LabelEncoder() for _ in range(4))

for encoder, source_col, encoded_col in (
    (le_soil, 'Soil Type', 'Soil_Type_enc'),
    (le_crop, 'Crop Type', 'Crop_Type_enc'),
):
    train[encoded_col] = encoder.fit_transform(train[source_col])
    test[encoded_col] = encoder.transform(test[source_col])

# Crop metadata columns can be NaN for crops missing from the lookup table,
# so missing values are replaced by an explicit "unknown" category first.
for encoder, source_col, encoded_col in (
    (le_feed, 'feed_type', 'feed_type_enc'),
    (le_part, 'crop_part', 'crop_part_enc'),
):
    train[encoded_col] = encoder.fit_transform(train[source_col].fillna("unknown"))
    test[encoded_col] = encoder.transform(test[source_col].fillna("unknown"))

# Feature columns, grouped by origin; the concatenation order is the column
# order handed to the model.
features = (
    ['Temparature', 'Humidity', 'Moisture']                 # weather / soil moisture readings
    + ['Nitrogen', 'Phosphorous', 'Potassium']              # soil nutrient readings
    + ['Soil_Type_enc', 'Crop_Type_enc']                    # label-encoded raw categoricals
    + ['feed_type_enc', 'crop_part_enc']                    # label-encoded crop metadata
)

# Split data for local validation: 80% train / 20% hold-out, fixed seed
feature_frame = train[features]
target_series = train['Fert_Label']
X_train, X_val, y_train, y_val = train_test_split(
    feature_frame,
    target_series,
    test_size=0.2,
    random_state=42,
)

# Convert to LightGBM datasets; the validation set references the training set
dtrain = lgb.Dataset(X_train, label=y_train)
dval = lgb.Dataset(X_val, label=y_val, reference=dtrain)

# Define params
params = {
    'objective': 'multiclass',
    # Size the output layer from the encoder's full label space, not from the
    # labels that happen to land in the training split: the predicted
    # probability columns are later decoded through le_fert, so there must be
    # exactly one column per encoded class even if a split misses one.
    'num_class': len(le_fert.classes_),
    'learning_rate': 0.05,
    'num_leaves': 64,
    'max_depth': 7,
    'verbosity': -1,
    'metric': 'multi_logloss',
    'random_state': 42
}

# Train with early stopping: stop after 50 rounds without improvement on the
# validation set, logging the metric every 50 rounds.
training_callbacks = [
    lgb.early_stopping(stopping_rounds=50),
    lgb.log_evaluation(period=50),
]
model = lgb.train(
    params,
    dtrain,
    num_boost_round=500,
    valid_sets=[dval],
    callbacks=training_callbacks,
)

# Predict probabilities for validation, then keep the three most probable
# class indices per row (descending probability).
val_probs = model.predict(X_val, num_iteration=model.best_iteration)
val_ranking = np.argsort(-val_probs, axis=1)
top3_val_preds = val_ranking[:, :3]

# Custom MAP@3 function
def mapk(actual, predicted, k=3):
    """Mean average precision at k for single-label targets.

    Each row scores ``1 / rank`` when its true label appears among the first
    ``k`` predictions (rank is 1-based) and ``0`` otherwise; the result is the
    sum of row scores divided by ``len(actual)``.

    Parameters
    ----------
    actual : sequence
        True label per row.
    predicted : sequence of sequences
        Ranked predictions per row, best first. Rows may be numpy arrays or
        plain lists/tuples.
    k : int, default 3
        Number of leading predictions considered per row.

    Returns
    -------
    float
        MAP@k, or ``0.0`` when ``actual`` is empty.
    """
    n_rows = len(actual)
    if n_rows == 0:
        # Nothing to score; avoid dividing by zero.
        return 0.0

    total = 0.0
    for truth, ranked in zip(actual, predicted):
        # list(...) works for numpy rows and plain sequences alike, so callers
        # are not forced to pass arrays.
        top_k = list(ranked[:k])
        if truth in top_k:
            total += 1.0 / (top_k.index(truth) + 1)
    return total / n_rows

# Evaluate MAP@3 on validation set
val_truth = y_val.to_numpy()
map3_val = mapk(val_truth, top3_val_preds, k=3)
print(f"Validation MAP@3: {map3_val:.4f}")

# Predict probabilities for test set and keep the three most probable class
# indices per row (descending probability).
test_probs = model.predict(test[features], num_iteration=model.best_iteration)
top3_test_preds = np.argsort(-test_probs, axis=1)[:, :3]

# Convert label indices to fertilizer names.
# LabelEncoder.classes_ stores the original label at the position of its
# encoded integer, so indexing it with the whole index array decodes every
# row at once instead of paying one inverse_transform call per test row.
top3_labels = le_fert.classes_[top3_test_preds]

Training until validation scores don't improve for 50 rounds
[50]	valid_0's multi_logloss: 1.93151
[100]	valid_0's multi_logloss: 1.92675
[150]	valid_0's multi_logloss: 1.92402
[200]	valid_0's multi_logloss: 1.92248
[250]	valid_0's multi_logloss: 1.92136
[300]	valid_0's multi_logloss: 1.92055
[350]	valid_0's multi_logloss: 1.92003
[400]	valid_0's multi_logloss: 1.9195
[450]	valid_0's multi_logloss: 1.91914
[500]	valid_0's multi_logloss: 1.91883
Did not meet early stopping. Best iteration is:
[499]	valid_0's multi_logloss: 1.91882
Validation MAP@3: 0.3323


In [7]:
# Format the submission: each row's three labels joined by single spaces
fertilizer_strings = list(map(" ".join, top3_labels))
submission = pd.DataFrame({
    "id": test["id"],
    "Fertilizer Name": fertilizer_strings,
})

# Save to CSV
submission.to_csv("submission.csv", index=False)