In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
cwd = os.getcwd()
print("Current working directory is {}".format(cwd))

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_validate, cross_val_score
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import accuracy_score, log_loss,

from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.compose import make_column_selector as selector

from datetime import datetime
import joblib

import lightgbm as lgb

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/playground-series-s5e6/sample_submission.csv
/kaggle/input/playground-series-s5e6/train.csv
/kaggle/input/playground-series-s5e6/test.csv
Current working directory is /kaggle/working


## Data preprocessing

In [40]:
def sort_columns(df):
    return df.sort_index(axis=1)

def describe_data(df):
    ## Basic statistics
    describe = df.describe(include='all')
    info = df.info()  # Return None, print df.info() directly to console.
    null_count = df.isnull().sum()
    ## Unique values
    unique_count = df.nunique()
    sample_size = df.shape[0]
    unique_ratio = unique_count / sample_size
    ## print data descriptions
    print("\n====== df:\n")
    print(df)
    print("\n====== describe:\n")
    print(describe)
    print("\n======info: \n")
    print(df.info())
    print("\n====== null_count: \n")
    print(null_count)
    print("\n====== unique_count: \n")
    print(unique_count)
    print("\n====== unique_ratio: \n")
    print(unique_ratio)

    data_description = {
        "describe": describe,
        "info": info,
        "null_count": null_count,
        "unique_count": unique_count,
        "sample_size": sample_size,
        "unique_ratio": unique_ratio
    }
    return data_description

def drop_columns(df, cols_to_drop):
    df = df.drop(columns=[col for col in cols_to_drop if col in df.columns])
    return df   

def intersect_train_test_columns(train_df, test_df):
    ## Find common columns between train and test
    common_cols = train_df.columns.intersection(test_df.columns)
    ## Keep only those columns
    train_aligned = train_df[common_cols].copy()
    test_aligned = test_df[common_cols].copy()
    return train_aligned, test_aligned

######
## Don't ever use dummies for one-hot encoding. Big issue when doing online prediction with new data.
## pd.get_dummies() will mess up the one-hot positions.
## Use OneHotEncoder from scikit-learn instead.
######
# def category_to_onehot(df, **kwargs):
#     return pd.get_dummies(df, **kwargs)

def transform_numeric_to_category(df):
    for col in df.select_dtypes(include=['int64', 'float64']).columns:
        df[col] = df[col].astype('category')
    return df

def build_label_encoders(train_df):
    '''
    LabelEncoder is strictly meant for 1D arrays (i.e., one column at a time). 
    It doesn’t support fitting across a 2D DataFrame of multiple categorical columns
    '''
    cat_cols = train_df.select_dtypes(include=['object', 'category', 'bool']).columns.tolist()
    encoders = dict()
    for col in cat_cols:
        le = LabelEncoder()
        train_df[col] = train_df[col].astype(str)  # Ensure consistent type
        le.fit(train_df[col])
        encoders[col] = le
    return encoders

def build_onehot_encoder(train_df):
    cat_cols = train_df.select_dtypes(include=['object', 'category', 'bool']).columns.tolist()
    encoder = ColumnTransformer(
        transformers=[
            ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), cat_cols)
        ],
        remainder='passthrough'
    )
    encoder.fit(train_df)
    return encoder

def mapk(y_true, y_pred_probs, k=3, labels=None):
    """
    Compute Mean Average Precision at K (MAP@k).

    Parameters:
    - y_true: array-like of shape (n_samples,) – true labels (can be str or int)
    - y_pred_probs: array-like of shape (n_samples, n_classes) – predicted class probabilities
    - k: int – number of top predictions to consider
    - labels: list – ordered list of label names corresponding to columns in y_pred_probs

    Returns:
    - map_k: float – MAP@k score
    """
    if labels is None:
        raise ValueError("You must provide a list of label names in `labels` to map predicted indices to class names.")

    top_k_preds = np.argsort(-y_pred_probs, axis=1)[:, :k]  # Get top-k predicted indices
    y_true = np.asarray(y_true)

    score = 0.0
    for i in range(len(y_true)):
        true_label = y_true[i]
        predicted_labels = [labels[idx] for idx in top_k_preds[i]]

        if true_label in predicted_labels:
            rank = predicted_labels.index(true_label)
            score += 1.0 / (rank + 1)

    return score / len(y_true)

In [3]:
# Parameter settings
labels = ["Fertilizer Name"]
timestamp = datetime.now().strftime('%Y%m%d')
output_pred_num = 3

# Read inputs
train_df = pd.read_csv("../input/playground-series-s5e6/train.csv")
train_df_src = train_df.copy()
test_df = pd.read_csv("../input/playground-series-s5e6/test.csv")
test_df_src = test_df.copy()
sample_df = pd.read_csv("../input/playground-series-s5e6/sample_submission.csv")


In [4]:
# train_df
# test_df_src
sample_df.iloc[0, 1]


'14-35-14 10-26-26 Urea'

In [5]:
train_df_src

Unnamed: 0,id,Temparature,Humidity,Moisture,Soil Type,Crop Type,Nitrogen,Potassium,Phosphorous,Fertilizer Name
0,0,37,70,36,Clayey,Sugarcane,36,4,5,28-28
1,1,27,69,65,Sandy,Millets,30,6,18,28-28
2,2,29,63,32,Sandy,Millets,24,12,16,17-17-17
3,3,35,62,54,Sandy,Barley,39,12,4,10-26-26
4,4,35,58,43,Red,Paddy,37,2,16,DAP
...,...,...,...,...,...,...,...,...,...,...
749995,749995,25,69,30,Clayey,Maize,8,16,6,28-28
749996,749996,37,64,58,Loamy,Sugarcane,38,8,20,17-17-17
749997,749997,35,68,59,Sandy,Ground Nuts,6,11,29,10-26-26
749998,749998,31,68,29,Red,Cotton,9,11,12,20-20


In [6]:
############
## Data Cleaning
############

## Describe data
train_df_description = describe_data(train_df)

## Sort data (Ensuring pd.get_dummies() gives consistent orders on train and test data.)
train_df = sort_columns(train_df)
test_df = sort_columns(test_df)

## Drop columns
cols_to_drop = ["id"]
train_df = drop_columns(train_df, cols_to_drop)
test_df = drop_columns(test_df, cols_to_drop)

## Split labels from training data
train_feature_df = train_df.drop(columns=labels)
train_label_df = train_df[labels]

## Take the intersection of train and test features.
train_feature_df, test_df = intersect_train_test_columns(train_feature_df, test_df)

############
## Feature Engineering
############

## Transform categorical features into label encoding.
label_encoders = build_label_encoders(train_df.copy())
train_encoded = train_feature_df.copy()
for col, le in label_encoders.items():
    if col in train_encoded.columns:
        train_encoded[col] = le.transform(train_encoded[col].astype(str))
    else:
        print(f"Warning: Column '{col}' not found in train_encoded. Skipping...")
        
test_encoded = test_df.copy()
for col, le in label_encoders.items():
    if col in test_encoded.columns:
        test_encoded[col] = le.transform(test_encoded[col].astype(str))
    else:
        print(f"Warning: Column '{col}' not found in test_encoded. Skipping...")
        
train_label_encoded = train_label_df.copy()
for col in train_label_encoded.columns:
    if col in label_encoders.keys():
        train_label_encoded[col] = train_label_encoded[col].transform(train_label_encoded[col].astype(str))
        print(f"Train label column '{col}' is encoded with LabelEncoder.")

## Transform numerical features into categorical features.
train_encoded = transform_numeric_to_category(train_encoded)
test_encoded = transform_numeric_to_category(test_encoded)   

# ## Transform categorical features into one-hot encoding.
# oh_encoder = build_onehot_encoder(train_feature_df)

# train_encoded = pd.DataFrame(
#     oh_encoder.transform(train_df),
#     columns=oh_encoder.get_feature_names_out(),
#     index=train_df.index
# )
# test_encoded = pd.DataFrame(
#     oh_encoder.transform(test_df),
#     columns=oh_encoder.get_feature_names_out(),
#     index=test_df.index
# )


# train_encoded = train_encoded.iloc[:200,]
# train_label_df = train_label_df.iloc[:200]

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 750000 entries, 0 to 749999
Data columns (total 10 columns):
 #   Column           Non-Null Count   Dtype 
---  ------           --------------   ----- 
 0   id               750000 non-null  int64 
 1   Temparature      750000 non-null  int64 
 2   Humidity         750000 non-null  int64 
 3   Moisture         750000 non-null  int64 
 4   Soil Type        750000 non-null  object
 5   Crop Type        750000 non-null  object
 6   Nitrogen         750000 non-null  int64 
 7   Potassium        750000 non-null  int64 
 8   Phosphorous      750000 non-null  int64 
 9   Fertilizer Name  750000 non-null  object
dtypes: int64(7), object(3)
memory usage: 57.2+ MB


            id  Temparature  Humidity  Moisture Soil Type    Crop Type  \
0            0           37        70        36    Clayey    Sugarcane   
1            1           27        69        65     Sandy      Millets   
2            2           29        63        32     Sandy     

  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()




In [7]:
train_label_df.values.ravel()

array(['28-28', '28-28', '17-17-17', ..., '10-26-26', '20-20', 'Urea'],
      dtype=object)

In [8]:
print(f'train_encoded.info():\n{train_encoded.info()}')
print(f'{train_encoded}')
print(f'test_encoded.info():{test_encoded.info()}')
print(f'{test_encoded}')
print(f'train_label_df.info():\n{train_label_df.info()}')
print(f'{train_label_df}')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 750000 entries, 0 to 749999
Data columns (total 8 columns):
 #   Column       Non-Null Count   Dtype   
---  ------       --------------   -----   
 0   Crop Type    750000 non-null  category
 1   Humidity     750000 non-null  category
 2   Moisture     750000 non-null  category
 3   Nitrogen     750000 non-null  category
 4   Phosphorous  750000 non-null  category
 5   Potassium    750000 non-null  category
 6   Soil Type    750000 non-null  category
 7   Temparature  750000 non-null  category
dtypes: category(8)
memory usage: 5.7 MB
train_encoded.info():
None
       Crop Type Humidity Moisture Nitrogen Phosphorous Potassium Soil Type  \
0              8       70       36       36           5         4         1   
1              4       69       65       30          18         6         4   
2              4       63       32       24          16        12         4   
3              0       62       54       39           4        12 

## Training and Testing

In [9]:
######
## Train models with cross_val_score()
######

## Set parameters
rf_model_args = {
    'n_estimators': 100,
    'criterion': "gini",
    'max_depth': 10,
    'min_samples_split': 8,
    'min_samples_leaf': 4,
    'random_state': 42,
    'max_features': "sqrt",
    'n_jobs': -1,
    'oob_score':True,
    'max_samples': 0.85,
    'verbose': 2,
}

rf_fit_cv_args = dict()

rf_cv_args = {
    'cv': 10,
    'scoring': ["neg_log_loss", 'accuracy'],
    'n_jobs': -1,
    'verbose': 2,
    'fit_params': rf_fit_cv_args,
    'return_train_score': True,
}

x_rf = train_encoded
y_rf = train_label_df.values.ravel()

## Training and validation
rf = RandomForestClassifier(**rf_model_args)
rf_cv_scores = cross_validate(rf, x_rf, y_rf, **rf_cv_args)
rf_cv_avg_scores = dict()
for k, v in rf_cv_scores.items():
    mean_val = np.mean(v)
    rf_cv_avg_scores[k] = mean_val
print(f"\nMean CV Score:\n{rf_cv_avg_scores}")
print(f"\nAll Fold Scores:\n{rf_cv_scores}")

rf.fit(x_rf, y_rf) # Train on the whole training set as the final model.
joblib.dump(rf, f'/kaggle/working/RandomForest_{timestamp}.joblib')

## Predict on training data
train_preds = rf.predict(x_rf)

## Model analytics
importances = dict(zip(rf.feature_names_in_, rf.feature_importances_))
sorted_importances = dict(sorted(
    zip(rf.feature_names_in_, rf.feature_importances_),
    key=lambda x: x[1],
    reverse=True
))


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   47.8s
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   47.8s
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   48.2s
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   50.1s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  1.8min finished
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  1.8min finished
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  1.8min finished
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  1.9min finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurre


Mean CV Score:
{'fit_time': 109.50842010974884, 'score_time': 2.817884993553162, 'test_neg_log_loss': -1.9357043948059833, 'train_neg_log_loss': -1.9217093618758052, 'test_accuracy': 0.17346666666666666, 'train_accuracy': 0.22282992592592593}

All Fold Scores:
{'fit_time': array([125.1990664 , 124.48820472, 123.94623995, 124.82291913,
       121.74275017, 122.21556401, 118.60752463, 119.53266811,
        58.68365645,  55.84560752]), 'score_time': array([3.18464446, 3.48298573, 3.3856802 , 3.07216692, 3.37313533,
       3.85301304, 2.24164939, 2.57098413, 1.71317601, 1.30141473]), 'test_neg_log_loss': array([-1.93555524, -1.93582262, -1.93560292, -1.93560309, -1.93572535,
       -1.93569881, -1.9359214 , -1.93562044, -1.9355303 , -1.93596377]), 'train_neg_log_loss': array([-1.92161437, -1.9217307 , -1.9214957 , -1.92178497, -1.92149377,
       -1.92200741, -1.92193681, -1.92147098, -1.92186604, -1.92169287]), 'test_accuracy': array([0.17405333, 0.17458667, 0.17525333, 0.17438667, 0.172

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    9.3s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   21.5s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    1.7s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    3.9s finished


### Load and use pretrained LightGBM

In [10]:
# rf = joblib.load('RandomForest_20240624T170312.joblib')

In [11]:
dict(sorted(
    zip(rf.feature_names_in_, rf.feature_importances_),
    key=lambda x: x[1],
    reverse=True
))

{'Crop Type': 0.14961496745194205,
 'Phosphorous': 0.1486025009510934,
 'Moisture': 0.14742730481610244,
 'Nitrogen': 0.13814684319411713,
 'Humidity': 0.11975940240806966,
 'Potassium': 0.11575224589356092,
 'Temparature': 0.10116056100084132,
 'Soil Type': 0.07953617428427315}

In [12]:
# ######
# ## Train models with KFold()
# ######

# kf = KFold(n_splits=10, shuffle=True, random_state=42)
# rf = RandomForestClassifier(n_estimators=100, max_features='sqrt', random_state=42)

# fold = 1
# for train_index, val_index in kf.split(train_encoded):
#     X_train, X_val = train_encoded.iloc[train_index], train_encoded.iloc[val_index]
#     y_train, y_val = train_label_df.values.ravel()[train_index], train_label_df.values.ravel()[val_index]
    
#     rf.fit(X_train, y_train)
#     preds = rf.predict(X_val)
#     acc = accuracy_score(y_val, preds)
    
#     print(f"Fold {fold} Accuracy: {acc:.4f}")
#     fold += 1

## Prediction

### Single Label Prediction

In [13]:
# test_predict = rf.predict(test_encoded)
# print(test_predict)

### MAP@5 Prediction

In [14]:
# probs = rf.predict_proba(test_encoded)

# top_predict = np.argsort(probs, axis=1)[:, -output_pred_num:][:, ::-1]
# submission = pd.DataFrame({
#     'id': test_df_src['id'].values,
# })
# submission["Fertilizer Name"] = [
#     " ".join(rf.classes_[row]) for row in top_predict
# ]

# submission.to_csv('submission.csv', index=False)
# print(submission)


### Meta-learner Prediction

#### Meta-learning feature creation

In [20]:
train_probs = rf.predict_proba(train_encoded)
test_probs = rf.predict_proba(test_encoded)

train_meta_features = pd.DataFrame(
    data=train_probs,
    columns=rf.classes_
)

test_meta_features = pd.DataFrame(
    data=test_probs,
    columns=rf.classes_
)

top_num = None  # Natural numbers or None to take all.
top_features = list(sorted_importances.keys()) if top_num is None else list(sorted_importances.keys())[:top_num]


extended_train_meta_features = pd.merge(train_encoded[top_features], train_meta_features, left_index=True, right_index=True, how="left")
extended_test_meta_features = pd.merge(test_encoded[top_features], test_meta_features, left_index=True, right_index=True, how="left")

# extended_train_meta_features = train_encoded
# extended_test_meta_features = test_encoded

extended_train_meta_features = sort_columns(extended_train_meta_features)
extended_test_meta_features = sort_columns(extended_test_meta_features)

[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    1.8s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    4.0s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.5s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    1.4s finished


In [21]:
extended_train_meta_features

Unnamed: 0,10-26-26,14-35-14,17-17-17,20-20,28-28,Crop Type,DAP,Humidity,Moisture,Nitrogen,Phosphorous,Potassium,Soil Type,Temparature,Urea
0,0.160397,0.156644,0.151622,0.156898,0.154453,8,0.113856,70,36,36,5,4,1,37,0.106130
1,0.133386,0.138343,0.139770,0.151205,0.164035,4,0.138931,69,65,30,18,6,4,27,0.134331
2,0.143776,0.155079,0.149232,0.135966,0.146565,4,0.132108,63,32,24,16,12,4,29,0.137275
3,0.141191,0.150828,0.145048,0.140930,0.146502,0,0.135560,62,54,39,4,12,4,35,0.139942
4,0.145118,0.152140,0.142522,0.133137,0.148489,6,0.144581,58,43,37,16,2,3,35,0.134015
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
749995,0.158621,0.152051,0.147911,0.145701,0.152967,3,0.123612,69,30,8,6,16,1,25,0.119139
749996,0.158347,0.161379,0.153631,0.158694,0.156533,8,0.110015,64,58,38,20,8,2,37,0.101400
749997,0.147204,0.136814,0.145586,0.144795,0.171357,2,0.118215,68,59,6,29,11,4,35,0.136029
749998,0.139154,0.145708,0.148389,0.144861,0.152640,1,0.130135,68,29,9,12,11,3,31,0.139113


#### Define inputs

In [41]:
x_meta = extended_train_meta_features
y_meta = train_label_df.values.ravel()
print(y_meta)

lgbm_model_args = {
    'objective': 'multiclass',
    'num_class': 7,
    'boosting_type': 'gbdt',
    'n_estimators': 10,
    'early_stopping_round': 100,  # No effect, only for compatibility. Call early_stopping in fit directly.
    'learning_rate': 0.03,
    "num_leaves": 31,
    'max_depth': 8,
    'random_state': 42,
    'reg_alpha': 1.0, 
    'reg_lambda': 1.0,
    'device_type': 'gpu',
    'verbosity': -1,
}

lgbm_fit_cv_args = {
    # 'eval_set': [(x_valid, y_valid)],
    # 'callbacks': [lgb.early_stopping(stopping_rounds=50)],  # Early stopping not supported in sklearn cross_validate().
    'eval_metric': ['multi_logloss', 'multi_error'],  
}

lgbm_fit_custom_args = {
    'eval_set': None,
    'eval_metric': ['multi_logloss', 'multi_error'],
    'callbacks':[
        lgb.early_stopping(stopping_rounds=lgbm_model_args['early_stopping_round']),
        lgb.log_evaluation(period=40)
    ],
}

lgbm_cv_args = {
    'cv': 10,
    'n_jobs': -1,
    'verbose': 2,
    'fit_params': lgbm_fit_cv_args
}


['28-28' '28-28' '17-17-17' ... '10-26-26' '20-20' 'Urea']


#### Model with cross_validate

In [18]:
# lgbm = lgb.LGBMClassifier(**lgbm_model_args)
# lgbm_cv_scores = cross_validate(lgbm, x_meta, y_meta, **lgbm_cv_args)
# lgbm_cv_avg_scores = dict()
# for k, v in lgbm_cv_scores.items():
#     mean_val = np.mean(v)
#     lgbm_cv_avg_scores[k] = mean_val
# print(f"\nMean CV Score:\n{lgbm_cv_avg_scores}")
# print(f"\nAll Fold Scores:\n{lgbm_cv_scores}")

# lgbm.fit(x_meta, y_meta, **lgbm_fit_cv_args)
# lgbm.booster_.save_model('/kaggle/working/LightGBM_{}.txt'.format(timestamp))

# # Predict on training data
# train_preds = lgbm.predict(x_meta)

# # # Method 1: Using .score()
# # train_accuracy = lgbm.score(x_meta, y_meta)

# # Method 2: Using accuracy_score
# train_accuracy = accuracy_score(y_meta, train_preds)
# print(f"Training Accuracy: {train_accuracy:.4f}")

In [49]:
#### Model with custom flow

kfold_num = lgbm_cv_args['cv']
kf = StratifiedKFold(n_splits=kfold_num, shuffle=True, random_state=42)
test_proba_pred_all_folds = list()
best_iters = dict()

for fold, (train_idx, val_idx) in enumerate(kf.split(x_meta, y_meta)):
    print(f'train_idx is {train_idx}, length is {len(train_idx)}')
    print(f'val_idx is {val_idx}, length is {len(val_idx)}')
    lgbm = lgb.LGBMClassifier(**lgbm_model_args)
    x_train, x_val = x_meta.iloc[train_idx], x_meta.iloc[val_idx]
    y_train, y_val = y_meta[train_idx], y_meta[val_idx]
    lgbm_fit_custom_args['eval_set'] = [
        (x_val, y_val),
        (x_train, y_train)
    ]
    # lgbm_fit_custom_args['eval_set'] = [(x_val, y_val)]
    lgbm.fit(x_train, y_train, **lgbm_fit_custom_args)
    lgbm_fit_custom_args['eval_set'] = None
    lgbm.booster_.save_model('/kaggle/working/LightGBM_fold{}_{}.txt'.format(fold, timestamp))
    # Model evaluation
    best_iter = lgbm.best_iteration_
    best_iters[fold] = best_iter
    num_iteration = best_iter + 0
    labels = lgbm._classes
    
    eval_result = lgbm.evals_result_
    final_logloss = eval_result['valid_0']['multi_logloss'][best_iter - 1]
    final_error = eval_result['valid_0']['multi_error'][best_iter - 1]
    print(f"[Fold {fold}] Final log loss: {final_logloss:.5f}, Final error: {final_error:.5f}")
    
    pred_prob_train = lgbm.predict_proba(x_train, num_iteration=num_iteration)
    pred_prob_val = lgbm.predict_proba(x_val, num_iteration=num_iteration)
    pred_train = lgbm.predict(x_train, num_iteration=num_iteration)
    pred_val = lgbm.predict(x_val, num_iteration=num_iteration)
    print("LightGBM Train Accuracy:", accuracy_score(y_train, pred_train))  
    print("LightGBM Validation Accuracy:", accuracy_score(y_val, pred_val)) 
    
    mapk_train = mapk(y_train, pred_prob_train, k=3, labels=labels)
    mapk_val = mapk(y_val, pred_prob_val, k=3, labels=labels)
    print("LightGBM Train MAP3:", mapk_train)
    print("LightGBM Validation MAP3:", mapk_val)
    
    # Make prediction
    test_proba_pred_probs = lgbm.predict_proba(extended_test_meta_features, num_iteration=num_iteration)
    test_proba_pred_all_folds.append(test_proba_pred_probs)
    print(f"\n[Fold {fold} is finished.]\n\n\n")
    break
    
# Shape: (n_folds, n_samples, n_classes)
stacked_preds = np.stack(test_proba_pred_all_folds, axis=0)

# Average across folds
kfold_avg_probs = np.mean(stacked_preds, axis=0)

train_idx is [     0      3      4 ... 749997 749998 749999], length is 675000
val_idx is [     1      2      8 ... 749958 749959 749991], length is 75000
Training until validation scores don't improve for 100 rounds
[Fold 0] Final log loss: 1.90452, Final error: 0.75708
LightGBM Train Accuracy: 0.24458962962962963
LightGBM Validation Accuracy: 0.24292
LightGBM Train MAP3: 0.3810022222227678
LightGBM Validation MAP3: 0.3788799999999455

[Fold 0 is finished.]





In [29]:
test_proba_pred_probs

array([[0.14134949, 0.16374991, 0.12428031, ..., 0.10007338, 0.23513533,
        0.14074599],
       [0.14240171, 0.06863644, 0.2940959 , ..., 0.10314605, 0.08213463,
        0.14685274],
       [0.18569046, 0.22137025, 0.08704059, ..., 0.12311314, 0.13092342,
        0.0655512 ],
       ...,
       [0.11258372, 0.11757071, 0.11960376, ..., 0.0936471 , 0.24701549,
        0.21005927],
       [0.24928079, 0.11994384, 0.16542677, ..., 0.13664993, 0.12722959,
        0.13514867],
       [0.16923341, 0.21961188, 0.20213568, ..., 0.12810482, 0.06981856,
        0.08115627]])

### Load and use pretrained LightGBM

In [None]:
# lgbm = lightgbm.Booster(model_file='LightGBM_xxxx.txt')

### Create submission file

In [50]:
# probs = lgbm.predict_proba(extended_test_meta_features)

probs = kfold_avg_probs
top_predict = np.argsort(probs, axis=1)[:, -output_pred_num:][:, ::-1]

submission = pd.DataFrame({
    'id': test_df_src['id'].values,
})
submission["Fertilizer Name"] = [
    " ".join(rf.classes_[row]) for row in top_predict
]

submission.to_csv('submission.csv', index=False)
print(submission)

            id             Fertilizer Name
0       750000          DAP 28-28 14-35-14
1       750001     17-17-17 10-26-26 20-20
2       750002     20-20 14-35-14 10-26-26
3       750003  14-35-14 17-17-17 10-26-26
4       750004     20-20 10-26-26 17-17-17
...        ...                         ...
249995  999995  17-17-17 14-35-14 10-26-26
249996  999996        10-26-26 28-28 20-20
249997  999997       14-35-14 17-17-17 DAP
249998  999998  17-17-17 10-26-26 14-35-14
249999  999999  14-35-14 17-17-17 10-26-26

[250000 rows x 2 columns]


In [51]:
submission_timestamp = os.path.getmtime('submission.csv')
print("Last modified:", datetime.fromtimestamp(submission_timestamp).strftime('%Y-%m-%d %H:%M:%S'))

Last modified: 2025-06-26 18:20:54


In [None]:
rf.classes_

In [None]:
rf.feature_names_in_

In [None]:
submission