In [12]:
from sklearn.preprocessing import LabelEncoder
from catboost import CatBoostClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd

In [8]:
df1 = pd.read_csv("train.csv")
df2 = pd.read_csv("test.csv")
if 'Fertilizer Name' not in df2.columns:
    df2['Fertilizer Name'] = 0

data = pd.concat([df1, df2], axis=0)
data = data.set_index("id")

In [9]:
df2 = pd.read_csv("test.csv")
test_ids = df2['id'].copy() 

In [10]:
soil_mapping = {'Black': 1, 'Clayey': 2, 'Loamy': 3, 'Red': 4, 'Sandy': 5}
crop_mapping = {'Barley': 1, 'Cotton': 2, 'Ground Nuts': 3, 'Maize': 4, 'Millets': 5,
                'Oil seeds': 6, 'Paddy': 7, 'Pulses': 8, 'Sugarcane': 9, 'Tobacco': 10, 'Wheat': 11}

data['Soil Type'] = data['Soil Type'].map(soil_mapping)
data['Crop Type'] = data['Crop Type'].map(crop_mapping)

# Re-split
df1 = data.loc[df1.index]
df2 = data.loc[df2.index]

In [11]:
le = LabelEncoder()
y = le.fit_transform(df1['Fertilizer Name'])

X = df1.drop('Fertilizer Name', axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [13]:
param_grid = {
    'depth': [4, 6, 8, 10],
    'learning_rate': [0.01, 0.05, 0.1],
    'iterations': [200, 300, 500],
    'l2_leaf_reg': [1, 3, 5, 7]
}

In [14]:
cbc = CatBoostClassifier(verbose=0, random_state=42)

random_search = RandomizedSearchCV(
    estimator=cbc,
    param_distributions=param_grid,
    n_iter=10,  
    scoring='accuracy',
    cv=3,
    random_state=42,
    n_jobs=-1
)

random_search.fit(X_train, y_train)
best_model = random_search.best_estimator_

print("Best Params:", random_search.best_params_)

Best Params: {'learning_rate': 0.1, 'l2_leaf_reg': 3, 'iterations': 500, 'depth': 6}


In [21]:
def map_at_3(y_true, y_pred_proba):
    top_3 = np.argsort(y_pred_proba, axis=1)[:, -3:][:, ::-1]
    map_score = 0.0
    for i in range(len(y_true)):
        true_label = y_true[i] 
        predicted = top_3[i]
        if true_label in predicted:
            rank = np.where(predicted == true_label)[0][0] + 1
            map_score += 1 / rank
    return map_score / len(y_true)

In [16]:
y_pred_proba = best_model.predict_proba(X_test)
map3_score = map_at_3(y_test, y_pred_proba)
print(f"Tuned MAP@3 Score: {map3_score:.4f}")

Tuned MAP@3 Score: 0.3206


In [18]:
pip install xgboost lightgbm


Collecting lightgbm
  Downloading lightgbm-4.6.0-py3-none-win_amd64.whl.metadata (17 kB)
Downloading lightgbm-4.6.0-py3-none-win_amd64.whl (1.5 MB)
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
    --------------------------------------- 0.0/1.5 MB 262.6 kB/s eta 0:00:06
   - -------------------------------------- 0.0/1.5 MB 245.8 kB/s eta 0:00:06
   -- ------------------------------------- 0.1/1.5 MB 476.3 kB/s eta 0:00:03
   --- ------------------------------------ 0.1/1.5 MB 554.9 kB/s eta 0:00:03
   ----- ---------------------------------- 0.2/1.5 MB 731.4 kB/s eta 0:00:02
   ------- -------------------------------- 0.3/1.5 MB 827.5 kB/s eta 0:00:02
   -------- ------------------------------- 0.3/1.5 MB 893.0 kB/s eta 0:00:02
   ---------- ----------------------------- 0.4/1.5 MB 955.7 kB/s eta 0:00:02
   --------------- -

In [22]:
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

from sklearn.metrics import log_loss

models = {
    "Logistic Regression": LogisticRegression(max_iter=1000, random_state=42),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42),
    "LightGBM": LGBMClassifier(random_state=42)
}

results = {}

for name, model in models.items():
    print(f"Training {name}...")
    model.fit(X_train, y_train)
    y_pred_proba = model.predict_proba(X_test)
    score = map_at_3(y_test, y_pred_proba)
    results[name] = score
    print(f"{name} MAP@3 Score: {score:.4f}\n")

print("Model Comp")
for model_name, score in results.items():
    print(f"{model_name}: {score:.4f}")


Training Logistic Regression...
Logistic Regression MAP@3 Score: 0.2839

Training XGBoost...


Parameters: { "use_label_encoder" } are not used.



XGBoost MAP@3 Score: 0.3299

Training LightGBM...
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.030379 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 202
[LightGBM] [Info] Number of data points in the train set: 600000, number of used features: 8
[LightGBM] [Info] Start training from score -1.885565
[LightGBM] [Info] Start training from score -1.877350
[LightGBM] [Info] Start training from score -1.900582
[LightGBM] [Info] Start training from score -1.909654
[LightGBM] [Info] Start training from score -1.910836
[LightGBM] [Info] Start training from score -2.069993
[LightGBM] [Info] Start training from score -2.091474
LightGBM MAP@3 Score: 0.3207

==== Model Comparison ====
Logistic Regression: 0.2839
XGBoost: 0.3299
LightGBM: 0.3207


In [24]:
#tuning XGBoost

In [23]:
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV

xgb = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42)

param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [4, 6, 8],
    'learning_rate': [0.05, 0.1, 0.2],
    'subsample': [0.8, 1.0]
}

grid_search = GridSearchCV(
    estimator=xgb,
    param_grid=param_grid,
    cv=3,
    scoring='neg_log_loss', #closest to MAP@3
    verbose=1,
    n_jobs=-1
)

grid_search.fit(X_train, y_train)

print("Best parameters found:")
print(grid_search.best_params_)

best_model = grid_search.best_estimator_
y_pred_proba = best_model.predict_proba(X_test)
map3_score = map_at_3(y_test, y_pred_proba)
print(f"Tuned XGBoost MAP@3 Score: {map3_score:.4f}")

Fitting 3 folds for each of 36 candidates, totalling 108 fits


Parameters: { "use_label_encoder" } are not used.



Best parameters found:
{'learning_rate': 0.2, 'max_depth': 6, 'n_estimators': 200, 'subsample': 1.0}
Tuned XGBoost MAP@3 Score: 0.3329


In [27]:
from sklearn.preprocessing import LabelEncoder

# Encode labels
le = LabelEncoder()
y_encoded = le.fit_transform(df1['Fertilizer Name'])

# Fit the model
best_model.fit(df1.drop('Fertilizer Name', axis=1), y_encoded)

Parameters: { "use_label_encoder" } are not used.



In [29]:
test_features = df2.drop('Fertilizer Name', axis=1)
pred_proba = best_model.predict_proba(test_features)

top_3_indices = np.argsort(pred_proba, axis=1)[:, -3:][:, ::-1]
top_3_class_indices = best_model.classes_[top_3_indices]

top_3_labels = np.vectorize(lambda x: le.inverse_transform([x])[0])(top_3_class_indices)

submission_preds = [' '.join(row) for row in top_3_labels]

submission = pd.DataFrame({
    'id': test_ids,
    'Fertilizer Name': submission_preds
})

submission.to_csv("submission.csv", index=False)