In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
cwd = os.getcwd()
print("Current working directory is {}".format(cwd))

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_validate, cross_val_score
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import accuracy_score

from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.compose import make_column_selector as selector

from datetime import datetime
import joblib

import lightgbm as lgb

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/playground-series-s5e6/sample_submission.csv
/kaggle/input/playground-series-s5e6/train.csv
/kaggle/input/playground-series-s5e6/test.csv
Current working directory is /kaggle/working


## Data preprocessing

In [2]:
def sort_columns(df):
    return df.sort_index(axis=1)


def describe_data(df):
    ## Basic statistics
    describe = df.describe(include='all')
    info = df.info()  # Return None, print df.info() directly to console.
    null_count = df.isnull().sum()
    ## Unique values
    unique_count = df.nunique()
    sample_size = df.shape[0]
    unique_ratio = unique_count / sample_size
    ## print data descriptions
    print("\n====== df:\n")
    print(df)
    print("\n====== describe:\n")
    print(describe)
    print("\n======info: \n")
    print(df.info())
    print("\n====== null_count: \n")
    print(null_count)
    print("\n====== unique_count: \n")
    print(unique_count)
    print("\n====== unique_ratio: \n")
    print(unique_ratio)

    data_description = {
        "describe": describe,
        "info": info,
        "null_count": null_count,
        "unique_count": unique_count,
        "sample_size": sample_size,
        "unique_ratio": unique_ratio
    }
    return data_description


def drop_columns(df, cols_to_drop):
    df = df.drop(columns=[col for col in cols_to_drop if col in df.columns])
    return df   

def intersect_train_test_columns(train_df, test_df):
    ## Find common columns between train and test
    common_cols = train_df.columns.intersection(test_df.columns)
    ## Keep only those columns
    train_aligned = train_df[common_cols].copy()
    test_aligned = test_df[common_cols].copy()
    return train_aligned, test_aligned

######
## Don't ever use dummies for one-hot encoding. Big issue when doing online prediction with new data.
## pd.get_dummies() will mess up the one-hot positions.
## Use OneHotEncoder from scikit-learn instead.
######
# def category_to_onehot(df, **kwargs):
#     return pd.get_dummies(df, **kwargs)

def build_onehot_encoder(train_df):
    cat_cols = train_df.select_dtypes(include=['object', 'category', 'bool']).columns.tolist()

    onehot_encoder = ColumnTransformer(
        transformers=[
            ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), cat_cols)
        ],
        remainder='passthrough'
    )
    onehot_encoder.fit(train_df)
    return onehot_encoder

In [3]:
# Parameter settings
labels = ["Fertilizer Name"]
timestamp = datetime.now().strftime('%Y%m%d')
output_pred_num = 3

# Read inputs
train_df = pd.read_csv("../input/playground-series-s5e6/train.csv")
train_df_src = train_df.copy()
test_df = pd.read_csv("../input/playground-series-s5e6/test.csv")
test_df_src = test_df.copy()
sample_df = pd.read_csv("../input/playground-series-s5e6/sample_submission.csv")


In [4]:
# train_df
# test_df_src
sample_df.iloc[0, 1]


'14-35-14 10-26-26 Urea'

In [5]:
train_df_src

Unnamed: 0,id,Temparature,Humidity,Moisture,Soil Type,Crop Type,Nitrogen,Potassium,Phosphorous,Fertilizer Name
0,0,37,70,36,Clayey,Sugarcane,36,4,5,28-28
1,1,27,69,65,Sandy,Millets,30,6,18,28-28
2,2,29,63,32,Sandy,Millets,24,12,16,17-17-17
3,3,35,62,54,Sandy,Barley,39,12,4,10-26-26
4,4,35,58,43,Red,Paddy,37,2,16,DAP
...,...,...,...,...,...,...,...,...,...,...
749995,749995,25,69,30,Clayey,Maize,8,16,6,28-28
749996,749996,37,64,58,Loamy,Sugarcane,38,8,20,17-17-17
749997,749997,35,68,59,Sandy,Ground Nuts,6,11,29,10-26-26
749998,749998,31,68,29,Red,Cotton,9,11,12,20-20


In [6]:
######
## Data preprocessing
######


## Describe data
train_df_description = describe_data(train_df)

## Sort data (Ensuring pd.get_dummies() gives consistent orders on train and test data.)
train_df = sort_columns(train_df)
test_df = sort_columns(test_df)

## Drop columns
cols_to_drop = ["id"]
train_df = drop_columns(train_df, cols_to_drop)
train_df

## Split labels from training data
train_feature_df = train_df.drop(columns=labels)
train_label_df = train_df[labels]

## Take the intersection of train and test features.
train_feature_df, test_df = intersect_train_test_columns(train_feature_df, test_df)

## Transform categorical features into one-hot encoding.
oh_encoder = build_onehot_encoder(train_feature_df)

train_encoded = pd.DataFrame(
    oh_encoder.transform(train_df),
    columns=oh_encoder.get_feature_names_out(),
    index=train_df.index
)
test_encoded = pd.DataFrame(
    oh_encoder.transform(test_df),
    columns=oh_encoder.get_feature_names_out(),
    index=test_df.index
)


# train_encoded = train_encoded.iloc[:200,]
# train_label_df = train_label_df.iloc[:200]

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 750000 entries, 0 to 749999
Data columns (total 10 columns):
 #   Column           Non-Null Count   Dtype 
---  ------           --------------   ----- 
 0   id               750000 non-null  int64 
 1   Temparature      750000 non-null  int64 
 2   Humidity         750000 non-null  int64 
 3   Moisture         750000 non-null  int64 
 4   Soil Type        750000 non-null  object
 5   Crop Type        750000 non-null  object
 6   Nitrogen         750000 non-null  int64 
 7   Potassium        750000 non-null  int64 
 8   Phosphorous      750000 non-null  int64 
 9   Fertilizer Name  750000 non-null  object
dtypes: int64(7), object(3)
memory usage: 57.2+ MB


            id  Temparature  Humidity  Moisture Soil Type    Crop Type  \
0            0           37        70        36    Clayey    Sugarcane   
1            1           27        69        65     Sandy      Millets   
2            2           29        63        32     Sandy     

  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


In [7]:
train_label_df.values.ravel()

array(['28-28', '28-28', '17-17-17', ..., '10-26-26', '20-20', 'Urea'],
      dtype=object)

In [8]:
test_encoded

Unnamed: 0,cat__Crop Type_Barley,cat__Crop Type_Cotton,cat__Crop Type_Ground Nuts,cat__Crop Type_Maize,cat__Crop Type_Millets,cat__Crop Type_Oil seeds,cat__Crop Type_Paddy,cat__Crop Type_Pulses,cat__Crop Type_Sugarcane,cat__Crop Type_Tobacco,...,cat__Soil Type_Clayey,cat__Soil Type_Loamy,cat__Soil Type_Red,cat__Soil Type_Sandy,remainder__Humidity,remainder__Moisture,remainder__Nitrogen,remainder__Phosphorous,remainder__Potassium,remainder__Temparature
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,70.0,52.0,34.0,24.0,11.0,31.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,62.0,45.0,30.0,15.0,14.0,27.0
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,72.0,28.0,14.0,4.0,15.0,28.0
3,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,53.0,57.0,18.0,36.0,17.0,37.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,55.0,32.0,13.0,14.0,19.0,31.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
249995,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,66.0,30.0,14.0,18.0,7.0,26.0
249996,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,62.0,55.0,28.0,7.0,14.0,33.0
249997,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,53.0,64.0,28.0,27.0,11.0,36.0
249998,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,67.0,26.0,33.0,10.0,0.0,36.0


## Training and Testing

In [9]:
######
## Train models with cross_val_score()
######

## Set parameters
rf_model_args = {
    'n_estimators': 200,
    'criterion': "gini",
    'max_depth': 10,
    'min_samples_split': 10,
    'min_samples_leaf': 5,
    'random_state': 42,
    'max_features': "sqrt",
    'n_jobs': -1,
    'oob_score':True,
    'max_samples': 0.85,
    'verbose': 0,
}

rf_fit_cv_args = dict()

rf_cv_args = {
    'cv': 10,
    'scoring': ["neg_log_loss", 'accuracy'],
    'n_jobs': -1,
    'verbose': 1,
    'fit_params': rf_fit_cv_args,
    'return_train_score': True,
}

x_rf = train_encoded
y_rf = train_label_df.values.ravel()

## Training and validation
rf = RandomForestClassifier(**rf_model_args)
rf_cv_scores = cross_validate(rf, x_rf, y_rf, **rf_cv_args)
rf_cv_avg_scores = dict()
for k, v in rf_cv_scores.items():
    mean_val = np.mean(v)
    rf_cv_avg_scores[k] = mean_val
print(f"\nMean CV Score:\n{rf_cv_avg_scores}")
print(f"\nAll Fold Scores:\n{rf_cv_scores}")

rf.fit(x_rf, y_rf) # Train on the whole training set as the final model.
joblib.dump(rf, f'/kaggle/working/RandomForest_{timestamp}.joblib')

## Predict on training data
train_preds = rf.predict(x_rf)

## Model analytics
importances = dict(zip(rf.feature_names_in_, rf.feature_importances_))
sorted_importances = dict(sorted(
    zip(rf.feature_names_in_, rf.feature_importances_),
    key=lambda x: x[1],
    reverse=True
))


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:  4.7min finished



Mean CV Score:
{'fit_time': 72.41558847427368, 'score_time': 2.279756689071655, 'test_neg_log_loss': -1.9396636825425357, 'train_neg_log_loss': -1.9391663561869308, 'test_accuracy': 0.16327333333333333, 'train_accuracy': 0.1666005925925926}

All Fold Scores:
{'fit_time': array([81.09264517, 80.88508201, 81.46665573, 83.51454663, 80.0211246 ,
       79.81647182, 79.58602118, 79.29652071, 39.3388133 , 39.13800359]), 'score_time': array([2.21620202, 2.32041001, 2.37028599, 3.09396482, 2.57072282,
       2.61001396, 2.51556611, 2.35897899, 1.43033981, 1.31108236]), 'test_neg_log_loss': array([-1.9396403 , -1.93977012, -1.93959948, -1.93973159, -1.93964984,
       -1.93950187, -1.93965294, -1.93972195, -1.93955651, -1.93981221]), 'train_neg_log_loss': array([-1.93918988, -1.93915915, -1.93919019, -1.93913599, -1.93919091,
       -1.93918835, -1.93916817, -1.93914725, -1.93917582, -1.93911786]), 'test_accuracy': array([0.16434667, 0.16298667, 0.1622    , 0.16314667, 0.16221333,
       0.163

### Load and use pretrained LightGBM

In [10]:
# rf = joblib.load('RandomForest_20240624T170312.joblib')

In [11]:
dict(sorted(
    zip(rf.feature_names_in_, rf.feature_importances_),
    key=lambda x: x[1],
    reverse=True
))

{'cat__Crop Type_Pulses': 0.17779040530501947,
 'cat__Crop Type_Sugarcane': 0.1366278106664231,
 'remainder__Moisture': 0.11566569753157066,
 'remainder__Potassium': 0.09248350390495474,
 'remainder__Phosphorous': 0.07017433345129986,
 'remainder__Nitrogen': 0.05601204186120258,
 'cat__Soil Type_Clayey': 0.05503052457496341,
 'cat__Soil Type_Sandy': 0.03813332280450628,
 'remainder__Humidity': 0.03777147522848737,
 'cat__Crop Type_Ground Nuts': 0.03775144940933523,
 'cat__Crop Type_Wheat': 0.0327002370393265,
 'remainder__Temparature': 0.029123377839360972,
 'cat__Soil Type_Black': 0.02819710950321532,
 'cat__Soil Type_Loamy': 0.021115818516187813,
 'cat__Crop Type_Paddy': 0.017047397593737575,
 'cat__Soil Type_Red': 0.013577136054016158,
 'cat__Crop Type_Tobacco': 0.012154139869452109,
 'cat__Crop Type_Cotton': 0.012089009415033988,
 'cat__Crop Type_Maize': 0.006667637738677946,
 'cat__Crop Type_Millets': 0.0038708766437301986,
 'cat__Crop Type_Oil seeds': 0.003326084180338276,
 'cat_

In [12]:
# ######
# ## Train models with KFold()
# ######

# kf = KFold(n_splits=10, shuffle=True, random_state=42)
# rf = RandomForestClassifier(n_estimators=100, max_features='sqrt', random_state=42)

# fold = 1
# for train_index, val_index in kf.split(train_encoded):
#     X_train, X_val = train_encoded.iloc[train_index], train_encoded.iloc[val_index]
#     y_train, y_val = train_label_df.values.ravel()[train_index], train_label_df.values.ravel()[val_index]
    
#     rf.fit(X_train, y_train)
#     preds = rf.predict(X_val)
#     acc = accuracy_score(y_val, preds)
    
#     print(f"Fold {fold} Accuracy: {acc:.4f}")
#     fold += 1

## Prediction

### Single Label Prediction

In [13]:
# test_predict = rf.predict(test_encoded)
# print(test_predict)

### MAP@5 Prediction

In [14]:
# probs = rf.predict_proba(test_encoded)

# top_predict = np.argsort(probs, axis=1)[:, -output_pred_num:][:, ::-1]
# submission = pd.DataFrame({
#     'id': test_df_src['id'].values,
# })
# submission["Fertilizer Name"] = [
#     " ".join(rf.classes_[row]) for row in top_predict
# ]

# submission.to_csv('submission.csv', index=False)
# print(submission)


### Meta-learner Prediction

#### Meta-learning feature creation

In [15]:
train_probs = rf.predict_proba(train_encoded)
test_probs = rf.predict_proba(test_encoded)

train_meta_features = pd.DataFrame(
    data=train_probs,
    columns=rf.classes_
)

test_meta_features = pd.DataFrame(
    data=test_probs,
    columns=rf.classes_
)

top_num = None  # Natural numbers or None to take all.
top_features = list(sorted_importances.keys()) if top_num is None else list(sorted_importances.keys())[:top_num]


extended_train_meta_features = pd.merge(train_encoded[top_features], train_meta_features, left_index=True, right_index=True, how="left")
extended_test_meta_features = pd.merge(test_encoded[top_features], test_meta_features, left_index=True, right_index=True, how="left")

extended_train_meta_features = sort_columns(extended_train_meta_features)
extended_test_meta_features = sort_columns(extended_test_meta_features)

In [16]:
extended_train_meta_features

Unnamed: 0,10-26-26,14-35-14,17-17-17,20-20,28-28,DAP,Urea,cat__Crop Type_Barley,cat__Crop Type_Cotton,cat__Crop Type_Ground Nuts,...,cat__Soil Type_Clayey,cat__Soil Type_Loamy,cat__Soil Type_Red,cat__Soil Type_Sandy,remainder__Humidity,remainder__Moisture,remainder__Nitrogen,remainder__Phosphorous,remainder__Potassium,remainder__Temparature
0,0.159204,0.158863,0.151964,0.154572,0.153531,0.114018,0.107848,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,70.0,36.0,36.0,5.0,4.0,37.0
1,0.145917,0.147457,0.142657,0.147728,0.155278,0.132168,0.128794,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,69.0,65.0,30.0,18.0,6.0,27.0
2,0.147967,0.151658,0.148097,0.143557,0.147866,0.131191,0.129664,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,63.0,32.0,24.0,16.0,12.0,29.0
3,0.148405,0.150594,0.147143,0.144256,0.148029,0.131960,0.129612,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,62.0,54.0,39.0,4.0,12.0,35.0
4,0.149339,0.151829,0.148547,0.141879,0.148118,0.131815,0.128473,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,58.0,43.0,37.0,16.0,2.0,35.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
749995,0.153988,0.151301,0.148711,0.148100,0.150858,0.123469,0.123573,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,69.0,30.0,8.0,6.0,16.0,25.0
749996,0.158788,0.158529,0.149669,0.157518,0.154573,0.113935,0.106989,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,64.0,58.0,38.0,20.0,8.0,37.0
749997,0.146216,0.146084,0.143780,0.154160,0.162822,0.122485,0.124453,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,68.0,59.0,6.0,29.0,11.0,35.0
749998,0.149178,0.151247,0.147885,0.143590,0.146420,0.131860,0.129820,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,68.0,29.0,9.0,12.0,11.0,31.0


#### Define inputs

In [17]:
x_meta = extended_train_meta_features
y_meta = train_label_df.values.ravel()
print(y_meta)

lgbm_model_args = {
    'objective': "multiclass",
    'num_class': 7,
    'boosting_type': "gbdt",
    'n_estimators': 3000,
    "early_stopping_round": 100,  # No effect, only for compatibility. Call early_stopping in fit directly.
    'learning_rate': 0.03,
    "num_leaves": 31,
    'max_depth': 8,
    'random_state': 42,
    'reg_alpha': 1.0, 
    'reg_lambda': 1.0,
    'device_type': "gpu",
    'verbosity': -1,
}

lgbm_fit_cv_args = {
    # 'eval_set': [(x_valid, y_valid)],
    # 'callbacks': [lgb.early_stopping(stopping_rounds=50)],  # Early stopping not supported in sklearn cross_validate().
    'eval_metric': ['multi_logloss', 'multi_error'],  
}

lgbm_fit_custom_args = {
    'eval_set': None,
    'eval_metric': ['multi_logloss', 'multi_error'],
    'callbacks':[
        lgb.early_stopping(stopping_rounds=50),
        lgb.log_evaluation(period=3)
    ],
}

lgbm_cv_args = {
    'cv': 10,
    'n_jobs': -1,
    'verbose': 2,
    'fit_params': lgbm_fit_cv_args
}


['28-28' '28-28' '17-17-17' ... '10-26-26' '20-20' 'Urea']


#### Model with cross_validate

In [18]:
# lgbm = lgb.LGBMClassifier(**lgbm_model_args)
# lgbm_cv_scores = cross_validate(lgbm, x_meta, y_meta, **lgbm_cv_args)
# lgbm_cv_avg_scores = dict()
# for k, v in lgbm_cv_scores.items():
#     mean_val = np.mean(v)
#     lgbm_cv_avg_scores[k] = mean_val
# print(f"\nMean CV Score:\n{lgbm_cv_avg_scores}")
# print(f"\nAll Fold Scores:\n{lgbm_cv_scores}")

# lgbm.fit(x_meta, y_meta, **lgbm_fit_cv_args)
# lgbm.booster_.save_model('/kaggle/working/LightGBM_{}.txt'.format(timestamp))

# # Predict on training data
# train_preds = lgbm.predict(x_meta)

# # # Method 1: Using .score()
# # train_accuracy = lgbm.score(x_meta, y_meta)

# # Method 2: Using accuracy_score
# train_accuracy = accuracy_score(y_meta, train_preds)
# print(f"Training Accuracy: {train_accuracy:.4f}")

#### Model with custom flow

In [19]:
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
test_proba_pred_all_folds = list()
best_iters = dict()

for fold, (train_idx, val_idx) in enumerate(kf.split(x_meta, y_meta)):
    print(f'train_idx is {train_idx}, length is {len(train_idx)}')
    print(f'val_idx is {val_idx}, length is {len(val_idx)}')
    lgbm = lgb.LGBMClassifier(**lgbm_model_args)
    x_train, x_val = x_meta.iloc[train_idx], x_meta.iloc[val_idx]
    y_train, y_val = y_meta[train_idx], y_meta[val_idx]
    lgbm_fit_custom_args['eval_set'] = [
        (x_train, y_train),
        (x_val, y_val)
    ]
    # lgbm_fit_custom_args['eval_set'] = [(x_val, y_val)]
    lgbm.fit(x_train, y_train, **lgbm_fit_custom_args)
    lgbm_fit_custom_args['eval_set'] = None
    lgbm.booster_.save_model('/kaggle/working/LightGBM_fold{}_{}.txt'.format(fold, timestamp))
    # Model evaluation
    best_iter = lgbm.best_iteration_
    best_iters{fold} = best_iter
    eval_result = lgbm.evals_result_
    final_logloss = eval_result['valid_0']['multi_logloss'][best_iter - 1]
    final_error = eval_result['valid_0']['multi_error'][best_iter - 1]
    print(f"[Fold {fold+1}] Final log loss: {final_logloss:.5f}, Final error: {final_error:.5f}")
    # Make prediction
    test_proba_pred_probs = lgbm.predict_proba(extended_test_meta_features, num_iteration=best_iter)
    test_proba_pred_all_folds.append(test_proba_pred_probs)

# Shape: (n_folds, n_samples, n_classes)
stacked_preds = np.stack(test_proba_pred_all_folds, axis=0)

# Average across folds
kfold_avg_probs = np.mean(stacked_preds, axis=0)

train_idx is [     0      3      4 ... 749997 749998 749999], length is 675000
val_idx is [     1      2      8 ... 749958 749959 749991], length is 75000




Training until validation scores don't improve for 50 rounds
[50]	valid_0's multi_logloss: 1.93279	valid_0's multi_error: 0.82112
[100]	valid_0's multi_logloss: 1.92959	valid_0's multi_error: 0.816453
[150]	valid_0's multi_logloss: 1.92742	valid_0's multi_error: 0.812467
[200]	valid_0's multi_logloss: 1.9261	valid_0's multi_error: 0.811413
[250]	valid_0's multi_logloss: 1.92512	valid_0's multi_error: 0.810133
[300]	valid_0's multi_logloss: 1.92438	valid_0's multi_error: 0.80948
[350]	valid_0's multi_logloss: 1.92387	valid_0's multi_error: 0.80864
[400]	valid_0's multi_logloss: 1.92338	valid_0's multi_error: 0.80776
train_idx is [     0      1      2 ... 749996 749997 749999], length is 675000
val_idx is [     5      6     30 ... 749993 749995 749998], length is 75000
Training until validation scores don't improve for 50 rounds
[50]	valid_0's multi_logloss: 1.93284	valid_0's multi_error: 0.822013
[100]	valid_0's multi_logloss: 1.92944	valid_0's multi_error: 0.817547
[150]	valid_0's mult

In [26]:
    best_iter = lgbm.best_iteration_
    eval_result = lgbm.evals_result_
    final_logloss = eval_result['valid_0']['multi_logloss'][best_iter - 1]
    final_error = eval_result['valid_0']['multi_error'][best_iter - 1]
    print(f"[Fold {fold+1}] Final log loss: {final_logloss:.5f}, Final error: {final_error:.5f}")

[Fold 10] Final log loss: 1.92307, Final error: 0.80896


In [28]:
eval_result

{'valid_0': OrderedDict([('multi_logloss',
               [1.9420364852561705,
                1.9415576604327858,
                1.941118904742254,
                1.9406877476156985,
                1.9402813336074818,
                1.9398898985109074,
                1.9395169622268589,
                1.9391706914334341,
                1.938835562779004,
                1.938525083804345,
                1.9382391570373345,
                1.9379466615472556,
                1.9376906897247248,
                1.937427732126346,
                1.9371821223938985,
                1.936956391387741,
                1.936743492362174,
                1.9365380786282553,
                1.9363332596044358,
                1.9361457442026173,
                1.9359608295479878,
                1.9357878639674917,
                1.9356251259940669,
                1.9354616525825616,
                1.935307097360657,
                1.9351528873701072,
                1.9350176595

### Load and use pretrained LightGBM

In [20]:
# lgbm = lightgbm.Booster(model_file='LightGBM_xxxx.txt')

### Create submission file

In [21]:
# probs = lgbm.predict_proba(extended_test_meta_features)

probs = kfold_avg_probs
top_predict = np.argsort(probs, axis=1)[:, -output_pred_num:][:, ::-1]

submission = pd.DataFrame({
    'id': test_df_src['id'].values,
})
submission["Fertilizer Name"] = [
    " ".join(rf.classes_[row]) for row in top_predict
]

submission.to_csv('submission.csv', index=False)
print(submission)

            id             Fertilizer Name
0       750000          10-26-26 28-28 DAP
1       750001     17-17-17 20-20 14-35-14
2       750002        20-20 28-28 14-35-14
3       750003  14-35-14 17-17-17 10-26-26
4       750004        20-20 17-17-17 28-28
...        ...                         ...
249995  999995  17-17-17 10-26-26 14-35-14
249996  999996     14-35-14 10-26-26 20-20
249997  999997         14-35-14 Urea 28-28
249998  999998          17-17-17 DAP 28-28
249999  999999  10-26-26 17-17-17 14-35-14

[250000 rows x 2 columns]


In [29]:
submission_timestamp = os.path.getmtime('submission.csv')
print("Last modified:", datetime.fromtimestamp(submission_timestamp).strftime('%Y-%m-%d %H:%M:%S'))


Last modified: 2025-06-25 20:04:06


In [23]:
rf.classes_

array(['10-26-26', '14-35-14', '17-17-17', '20-20', '28-28', 'DAP',
       'Urea'], dtype=object)

In [24]:
rf.feature_names_in_

array(['cat__Crop Type_Barley', 'cat__Crop Type_Cotton',
       'cat__Crop Type_Ground Nuts', 'cat__Crop Type_Maize',
       'cat__Crop Type_Millets', 'cat__Crop Type_Oil seeds',
       'cat__Crop Type_Paddy', 'cat__Crop Type_Pulses',
       'cat__Crop Type_Sugarcane', 'cat__Crop Type_Tobacco',
       'cat__Crop Type_Wheat', 'cat__Soil Type_Black',
       'cat__Soil Type_Clayey', 'cat__Soil Type_Loamy',
       'cat__Soil Type_Red', 'cat__Soil Type_Sandy',
       'remainder__Humidity', 'remainder__Moisture',
       'remainder__Nitrogen', 'remainder__Phosphorous',
       'remainder__Potassium', 'remainder__Temparature'], dtype=object)

In [25]:
submission

Unnamed: 0,id,Fertilizer Name
0,750000,10-26-26 28-28 DAP
1,750001,17-17-17 20-20 14-35-14
2,750002,20-20 28-28 14-35-14
3,750003,14-35-14 17-17-17 10-26-26
4,750004,20-20 17-17-17 28-28
...,...,...
249995,999995,17-17-17 10-26-26 14-35-14
249996,999996,14-35-14 10-26-26 20-20
249997,999997,14-35-14 Urea 28-28
249998,999998,17-17-17 DAP 28-28
