In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import top_k_accuracy_score
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

In [2]:
df1 = pd.read_csv("train.csv")
df2 = pd.read_csv("test.csv")

df2["Fertilizer Name"] = 0

data = pd.concat([df1, df2], axis=0)
data = data.set_index("id")

In [3]:
soil_mapping = {'Black': 1, 'Clayey': 2, 'Loamy': 3, 'Red': 4, 'Sandy': 5}
crop_mapping = {
    'Barley': 1, 'Cotton': 2, 'Ground Nuts': 3, 'Maize': 4, 'Millets': 5,
    'Oil seeds': 6, 'Paddy': 7, 'Pulses': 8, 'Sugarcane': 9, 'Tobacco': 10, 'Wheat': 11
}
data['Soil Type'] = data['Soil Type'].map(soil_mapping)
data['Crop Type'] = data['Crop Type'].map(crop_mapping)

df1 = data.loc[df1.index]
df2 = data.loc[df2.index]

In [25]:
X_train = df1.drop('Fertilizer Name', axis=1)
y_train = df1['Fertilizer Name']
X_test = df2.drop('Fertilizer Name', axis=1)

le = LabelEncoder()
y_train_enc = le.fit_transform(y_train)

In [28]:
test_ids = pd.read_csv("sample_submission.csv")["id"]
print(test_ids)

0         750000
1         750001
2         750002
3         750003
4         750004
           ...  
249995    999995
249996    999996
249997    999997
249998    999998
249999    999999
Name: id, Length: 250000, dtype: int64


In [7]:
xgb_model = XGBClassifier(eval_metric="mlogloss", random_state=42)
xgb_model.fit(X_train, y_train_enc)
xgb_proba = xgb_model.predict_proba(X_test)

Parameters: { "use_label_encoder" } are not used.



In [8]:
lgb_model = LGBMClassifier(random_state=42)
lgb_model.fit(X_train, y_train_enc)
lgb_proba = lgb_model.predict_proba(X_test)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.041589 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 202
[LightGBM] [Info] Number of data points in the train set: 750000, number of used features: 8
[LightGBM] [Info] Start training from score -1.884866
[LightGBM] [Info] Start training from score -1.880057
[LightGBM] [Info] Start training from score -1.897538
[LightGBM] [Info] Start training from score -1.911544
[LightGBM] [Info] Start training from score -1.909121
[LightGBM] [Info] Start training from score -2.067671
[LightGBM] [Info] Start training from score -2.094845


In [9]:
cat_model = CatBoostClassifier(verbose=0, random_state=42)
cat_model.fit(X_train, y_train)
cat_proba = cat_model.predict_proba(X_test)

In [15]:
best_score = 0
best_weights = (1/3, 1/3, 1/3)

for w1 in np.arange(0, 1.1, 0.1):
    for w2 in np.arange(0, 1.1 - w1, 0.1):
        w3 = 1.0 - w1 - w2
        blended = w1 * xgb_proba + w2 * lgb_proba + w3 * cat_proba
        top3 = np.argsort(blended, axis=1)[:, -3:][:, ::-1]
        score = top_k_accuracy_score(
            y_train_enc[:len(blended)],
            blended,
            k=3,
            labels=np.arange(blended.shape[1])
        )
        if score > best_score:
            best_score = score
            best_weights = (w1, w2, w3)

print(f"\n best weights: XGB={best_weights[0]:.2f}, LGB={best_weights[1]:.2f}, CAT={best_weights[2]:.2f}")
print(f"best map@3 score: {best_score:.4f}")


 best weights: XGB=0.50, LGB=0.00, CAT=0.50
best map@3 score: 0.5835


In [18]:
print("Max index in top_3_indices:", top_3_indices.max())
print("Number of classes in label encoder:", len(le.classes_))

print("xgb_proba:", xgb_proba.shape)
print("lgb_proba:", lgb_proba.shape)
print("cat_proba:", cat_proba.shape)

Max index in top_3_indices: 6
Number of classes in label encoder: 7
xgb_proba: (250000, 7)
lgb_proba: (250000, 7)
cat_proba: (250000, 7)


In [21]:
print(test_ids)

Index([     0,      1,      2,      3,      4,      5,      6,      7,      8,
            9,
       ...
       249990, 249991, 249992, 249993, 249994, 249995, 249996, 249997, 249998,
       249999],
      dtype='int64', length=250000)


In [29]:
final_blend = (
    best_weights[0] * xgb_proba +
    best_weights[1] * lgb_proba +
    best_weights[2] * cat_proba
)

top_3_flat = top_3_indices.ravel()  
top_3_labels_flat = le.inverse_transform(top_3_flat)  
top_3_labels = top_3_labels_flat.reshape(top_3_indices.shape)

submission = pd.DataFrame({
    'id': test_ids,
    'Fertilizer Name': [' '.join(row) for row in top_3_labels]
})
submission.to_csv("submission_blended.csv", index=False)