In [1]:
from sklearn.preprocessing import LabelEncoder
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd

In [2]:
df1 = pd.read_csv("train.csv")
df2 = pd.read_csv("test.csv")
if 'Fertilizer Name' not in df2.columns:
    df2['Fertilizer Name'] = 0

data = pd.concat([df1, df2], axis=0)
data = data.set_index("id")

In [3]:
df2 = pd.read_csv("test.csv")
test_ids = df2['id'].copy() 

In [4]:
soil_mapping = {'Black': 1, 'Clayey': 2, 'Loamy': 3, 'Red': 4, 'Sandy': 5}
crop_mapping = {'Barley': 1, 'Cotton': 2, 'Ground Nuts': 3, 'Maize': 4, 'Millets': 5,
                'Oil seeds': 6, 'Paddy': 7, 'Pulses': 8, 'Sugarcane': 9, 'Tobacco': 10, 'Wheat': 11}

data['Soil Type'] = data['Soil Type'].map(soil_mapping)
data['Crop Type'] = data['Crop Type'].map(crop_mapping)

# Re-split
df1 = data.loc[df1.index]
df2 = data.loc[df2.index]

In [5]:
le = LabelEncoder()
y = le.fit_transform(df1['Fertilizer Name'])

X = df1.drop('Fertilizer Name', axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [8]:
categorical_features = ['Soil Type', 'Crop Type']

model = CatBoostClassifier(
    iterations=500,
    learning_rate=0.1,
    depth=6,
    loss_function='MultiClass',
    eval_metric='TotalF1',
    cat_features=categorical_features,
    verbose=100,
    random_state=42
)


In [9]:
model.fit(df1.drop('Fertilizer Name', axis=1), df1['Fertilizer Name'])

0:	learn: 0.1022067	total: 2.34s	remaining: 19m 29s
100:	learn: 0.1575617	total: 5m 31s	remaining: 21m 51s
200:	learn: 0.1752424	total: 12m 23s	remaining: 18m 26s
300:	learn: 0.1868602	total: 19m 24s	remaining: 12m 49s
400:	learn: 0.1936800	total: 25m 13s	remaining: 6m 13s
499:	learn: 0.1995091	total: 31m 7s	remaining: 0us


<catboost.core.CatBoostClassifier at 0x1c48a630860>

In [11]:
def map_at_3(y_true, y_pred_proba):
    top_3 = np.argsort(y_pred_proba, axis=1)[:, -3:][:, ::-1]
    map_score = 0.0
    for i in range(len(y_true)):
        true_label = y_true[i]
        predicted = top_3[i]
        if true_label in predicted:
            rank = np.where(predicted == true_label)[0][0] + 1
            map_score += 1 / rank
    return map_score / len(y_true)

# Evaluate
map3_score = map_at_3(y_test_indices, y_pred_proba)
print(f"MAP@3 Score (CatBoost): {map3_score:.4f}")

NameError: name 'y_test_indices' is not defined

In [13]:
test_features = df2.drop('Fertilizer Name', axis=1)
pred_proba = model.predict_proba(test_features)
top_3_indices = np.argsort(pred_proba, axis=1)[:, -3:][:, ::-1]
top_3_labels = model.classes_[top_3_indices]

submission_preds = [' '.join(row) for row in top_3_labels]
submission = pd.DataFrame({
    'id': test_ids,  # or test_ids if you saved them earlier
    'Fertilizer Name': submission_preds
})

submission.to_csv("submission.csv", index=False)