In [1]:
from sklearn.preprocessing import LabelEncoder
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd

In [2]:
df1 = pd.read_csv("train.csv")
df2 = pd.read_csv("test.csv")
if 'Fertilizer Name' not in df2.columns:
    df2['Fertilizer Name'] = 0

data = pd.concat([df1, df2], axis=0)
data = data.set_index("id")

In [3]:
df2 = pd.read_csv("test.csv")
test_ids = df2['id'].copy() 

In [4]:
soil_mapping = {'Black': 1, 'Clayey': 2, 'Loamy': 3, 'Red': 4, 'Sandy': 5}
crop_mapping = {'Barley': 1, 'Cotton': 2, 'Ground Nuts': 3, 'Maize': 4, 'Millets': 5,
                'Oil seeds': 6, 'Paddy': 7, 'Pulses': 8, 'Sugarcane': 9, 'Tobacco': 10, 'Wheat': 11}

data['Soil Type'] = data['Soil Type'].map(soil_mapping)
data['Crop Type'] = data['Crop Type'].map(crop_mapping)

In [5]:
#normalising nitrogen, phosphorous and pottasium levels
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
data[['Nitrogen', 'Phosphorous', 'Potassium']] = scaler.fit_transform(
    data[['Nitrogen', 'Phosphorous', 'Potassium']]
)

In [6]:
data["N_to_P"] = data["Nitrogen"] / (data["Phosphorous"] + 1)
data["N_to_K"] = data["Nitrogen"] / (data["Potassium"] + 1)
data["P_to_K"] = data["Phosphorous"] / (data["Potassium"] + 1)
data["total_NPK"] = data["Nitrogen"] + data["Phosphorous"] + data["Potassium"]

In [7]:
data["Phosphorous_log"] = np.log1p(data["Phosphorous"])

In [8]:
# Re-split
df1 = data.loc[df1.index]
df2 = data.loc[df2.index]

In [22]:
le = LabelEncoder()
y = le.fit_transform(df1['Fertilizer Name'])

orig = df1.copy()

X = df1.drop('Fertilizer Name', axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [30]:
X_train['sample_weight'] = 1
orig['sample_weight'] = 5 

X_full = pd.concat([
    X_train.drop('sample_weight', axis=1),
    orig.drop(['sample_weight', 'Fertilizer Name'], axis=1)  #drop target
])

y_full = pd.concat([
    pd.Series(y_train, index=X_train.index),
    pd.Series(le.transform(orig['Fertilizer Name']), index=orig.index)
])

In [31]:
from sklearn.utils import compute_class_weight
weights = pd.concat([X_train['sample_weight'], orig['sample_weight']])
class_weights_dict = dict(zip(np.unique(y), weights))

categorical_features = ['Soil Type', 'Crop Type']

model = CatBoostClassifier(
    iterations=1000,
    learning_rate=0.03,
    depth=8,
    random_strength=2,
    l2_leaf_reg=5,
    loss_function='MultiClass',
    eval_metric='TotalF1',
    cat_features=categorical_features,
    verbose=100,
    random_state=42
)

In [32]:
model.fit(X_full, y_full, sample_weight=weights)

0:	learn: 0.0901999	total: 6.17s	remaining: 1h 42m 46s
100:	learn: 0.1137053	total: 11m 1s	remaining: 1h 38m 4s
200:	learn: 0.1332871	total: 25m 24s	remaining: 1h 40m 59s
300:	learn: 0.1421879	total: 39m 34s	remaining: 1h 31m 53s
400:	learn: 0.1480330	total: 53m	remaining: 1h 19m 11s
500:	learn: 0.1561354	total: 1h 7m 31s	remaining: 1h 7m 15s
600:	learn: 0.1728593	total: 1h 25m 52s	remaining: 57m
700:	learn: 0.1852220	total: 1h 43m 3s	remaining: 43m 57s
800:	learn: 0.1945601	total: 1h 56m 57s	remaining: 29m 3s
900:	learn: 0.2022866	total: 2h 10m 9s	remaining: 14m 18s
999:	learn: 0.2091652	total: 2h 24m 9s	remaining: 0us


<catboost.core.CatBoostClassifier at 0x14f020e4d70>

In [33]:
y_pred_proba = model.predict_proba(X_test)

def map_at_3(y_true, y_pred_proba):
    top_3 = np.argsort(y_pred_proba, axis=1)[:, -3:][:, ::-1]
    map_score = 0.0
    for i in range(len(y_true)):
        true_label = y_true[i]
        predicted = top_3[i]
        if true_label in predicted:
            rank = np.where(predicted == true_label)[0][0] + 1
            map_score += 1 / rank
    return map_score / len(y_true)

map3_score = map_at_3(y_test, y_pred_proba)
print(f"MAP@3 Score (CatBoost): {map3_score:.4f}")

MAP@3 Score (CatBoost): 0.3513


In [34]:
test_features = df2.drop('Fertilizer Name', axis=1)
pred_proba = model.predict_proba(test_features)
top_3_indices = np.argsort(pred_proba, axis=1)[:, -3:][:, ::-1]
top_3_labels = model.classes_[top_3_indices]  # these are encoded ints

decoded_labels = le.inverse_transform(top_3_labels.flatten()).reshape(top_3_labels.shape)

submission_preds = [' '.join(row) for row in decoded_labels]

submission = pd.DataFrame({
    'id': test_ids,mn  
    'Fertilizer Name': submission_preds
})

submission.to_csv("submission.csv", index=False)

In [None]:
# Plot confusion matrix

from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

y_pred_raw = model.predict(X_test)
y_pred = le.transform(y_pred_raw)  

cm = confusion_matrix(y_test, y_pred, labels=np.unique(y_test))
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=le.classes_)
disp.plot(xticks_rotation=45)

In [None]:
#model predicts well for 14-35-14, 17-17-17, 28-28
#Urea and DAP are harder to distinguish

In [None]:
if hasattr(y_test, 'values'):
    y_test_array = y_test.values.ravel()
else:
    y_test_array = np.ravel(y_test)

y_pred_flat = np.ravel(y_pred_raw)
y_test_decoded_flat = np.ravel(y_test_decoded)

misclassified = y_pred_flat != y_test_decoded_flat
for i in np.where(misclassified)[0][:10]:
    true_label = y_test_decoded[i]
    pred_label = y_pred_raw[i]
    print(f"Row {i} | True: {true_label} | Pred: {pred_label} | Features: {X_test.iloc[i].to_dict()}")


In [None]:
#Many misclassifications are between fertilizers with similar NPK profiles, model is confusing similar formulas
#Model isnt handling extremes well either. Changes added : addition of derived ratios; transforming highly skewed features; using class weights parameter