In [41]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Show every file available under the Kaggle input directory.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# Report where relative paths will resolve from.
cwd = os.getcwd()
print("Current working directory is {}".format(cwd))

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_validate, cross_val_score
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import accuracy_score

from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.compose import make_column_selector as selector

from datetime import datetime
import joblib

import lightgbm as lgb

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/playground-series-s5e6/sample_submission.csv
/kaggle/input/playground-series-s5e6/train.csv
/kaggle/input/playground-series-s5e6/test.csv
Current working directory is /kaggle/working


## Data preprocessing

In [19]:
def sort_columns(df):
    """Return a copy of ``df`` whose columns are sorted by column label."""
    ordered = df.sort_index(axis="columns")
    return ordered


def describe_data(df):
    """Print and return a set of summary statistics for ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to summarise.

    Returns
    -------
    dict
        "describe"     -> ``df.describe(include='all')``
        "info"         -> text report produced by ``df.info()`` (a str)
        "null_count"   -> per-column count of null values
        "unique_count" -> per-column count of distinct values
        "sample_size"  -> number of rows
        "unique_ratio" -> ``unique_count / sample_size``
    """
    import io  # local import: keeps this helper independent of the import cell

    ## Basic statistics
    describe = df.describe(include='all')
    # df.info() writes its report to a stream and returns None, so capture the
    # text in a buffer; otherwise "info" would always be None and the report
    # would be printed a second time followed by a stray "None".
    info_buffer = io.StringIO()
    df.info(buf=info_buffer)
    info = info_buffer.getvalue()
    null_count = df.isnull().sum()
    ## Unique values
    unique_count = df.nunique()
    sample_size = df.shape[0]
    unique_ratio = unique_count / sample_size
    ## print data descriptions
    print("\n====== df:\n")
    print(df)
    print("\n====== describe:\n")
    print(describe)
    print("\n======info: \n")
    print(info)
    print("\n====== null_count: \n")
    print(null_count)
    print("\n====== unique_count: \n")
    print(unique_count)
    print("\n====== unique_ratio: \n")
    print(unique_ratio)

    data_description = {
        "describe": describe,
        "info": info,
        "null_count": null_count,
        "unique_count": unique_count,
        "sample_size": sample_size,
        "unique_ratio": unique_ratio
    }
    return data_description


def drop_columns(df, cols_to_drop):
    """Return ``df`` without the listed columns; names absent from ``df`` are skipped."""
    removable = []
    for name in cols_to_drop:
        if name in df.columns:
            removable.append(name)
    return df.drop(columns=removable)

def intersect_train_test_columns(train_df, test_df):
    """Restrict both frames to the columns they have in common.

    Returns a ``(train_aligned, test_aligned)`` pair of independent copies,
    both selected with the same column index.
    """
    shared = train_df.columns.intersection(test_df.columns)
    train_aligned, test_aligned = (frame[shared].copy() for frame in (train_df, test_df))
    return train_aligned, test_aligned

######
## Don't ever use dummies for one-hot encoding. Big issue when doing online prediction with new data.
## pd.get_dummies() will mess up the one-hot positions.
## Use OneHotEncoder from scikit-learn instead.
######
# def category_to_onehot(df, **kwargs):
#     return pd.get_dummies(df, **kwargs)

def build_onehot_encoder(train_df):
    """Fit and return a ColumnTransformer that one-hot encodes ``train_df``.

    Columns of dtype object, category or bool go through a OneHotEncoder
    (unknown categories are ignored at transform time, dense output); every
    other column is passed through unchanged.
    """
    categorical_part = train_df.select_dtypes(include=['object', 'category', 'bool'])
    cat_cols = categorical_part.columns.tolist()

    onehot_step = ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), cat_cols)
    onehot_encoder = ColumnTransformer(transformers=[onehot_step], remainder='passthrough')

    # fit() returns the fitted transformer itself.
    return onehot_encoder.fit(train_df)

In [20]:
# Run configuration
labels = ["Fertilizer Name"]                   # target column(s)
timestamp = datetime.now().strftime('%Y%m%d')  # date stamp used in saved model file names
output_pred_num = 3                            # how many ranked predictions to emit per row

# Load the competition files. For train and test keep a pristine frame (*_src)
# alongside an independent working copy that later cells are free to modify.
train_df_src = pd.read_csv("../input/playground-series-s5e6/train.csv")
train_df = train_df_src.copy()
test_df_src = pd.read_csv("../input/playground-series-s5e6/test.csv")
test_df = test_df_src.copy()
sample_df = pd.read_csv("../input/playground-series-s5e6/sample_submission.csv")


In [21]:
# train_df
# test_df_src
# Show the first row's second column of the sample submission file.
sample_df.iloc[0, 1]


'14-35-14 10-26-26 Urea'

In [22]:
# Display the untouched copy of the training data taken right after loading.
train_df_src

Unnamed: 0,id,Temparature,Humidity,Moisture,Soil Type,Crop Type,Nitrogen,Potassium,Phosphorous,Fertilizer Name
0,0,37,70,36,Clayey,Sugarcane,36,4,5,28-28
1,1,27,69,65,Sandy,Millets,30,6,18,28-28
2,2,29,63,32,Sandy,Millets,24,12,16,17-17-17
3,3,35,62,54,Sandy,Barley,39,12,4,10-26-26
4,4,35,58,43,Red,Paddy,37,2,16,DAP
...,...,...,...,...,...,...,...,...,...,...
749995,749995,25,69,30,Clayey,Maize,8,16,6,28-28
749996,749996,37,64,58,Loamy,Sugarcane,38,8,20,17-17-17
749997,749997,35,68,59,Sandy,Ground Nuts,6,11,29,10-26-26
749998,749998,31,68,29,Red,Cotton,9,11,12,20-20


In [23]:
######
## Data preprocessing
######


## Describe data
train_df_description = describe_data(train_df)

## Sort columns by label so train and test share one deterministic column order.
train_df = sort_columns(train_df)
test_df = sort_columns(test_df)

## Drop columns
cols_to_drop = ["id"]
train_df = drop_columns(train_df, cols_to_drop)

## Split labels from training data
train_feature_df = train_df.drop(columns=labels)
train_label_df = train_df[labels]

## Take the intersection of train and test features.
train_feature_df, test_df = intersect_train_test_columns(train_feature_df, test_df)

## Transform categorical features into one-hot encoding.
oh_encoder = build_onehot_encoder(train_feature_df)
encoded_columns = oh_encoder.get_feature_names_out()

## Encode exactly the frame the encoder was fitted on. Passing train_df here
## would hand the label column to the transformer and only work because extra
## columns happen to be ignored.
train_encoded = pd.DataFrame(
    oh_encoder.transform(train_feature_df),
    columns=encoded_columns,
    index=train_feature_df.index
)
test_encoded = pd.DataFrame(
    oh_encoder.transform(test_df),
    columns=encoded_columns,
    index=test_df.index
)


# train_encoded = train_encoded.iloc[:200,]
# train_label_df = train_label_df.iloc[:200]

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 750000 entries, 0 to 749999
Data columns (total 10 columns):
 #   Column           Non-Null Count   Dtype 
---  ------           --------------   ----- 
 0   id               750000 non-null  int64 
 1   Temparature      750000 non-null  int64 
 2   Humidity         750000 non-null  int64 
 3   Moisture         750000 non-null  int64 
 4   Soil Type        750000 non-null  object
 5   Crop Type        750000 non-null  object
 6   Nitrogen         750000 non-null  int64 
 7   Potassium        750000 non-null  int64 
 8   Phosphorous      750000 non-null  int64 
 9   Fertilizer Name  750000 non-null  object
dtypes: int64(7), object(3)
memory usage: 57.2+ MB


            id  Temparature  Humidity  Moisture Soil Type    Crop Type  \
0            0           37        70        36    Clayey    Sugarcane   
1            1           27        69        65     Sandy      Millets   
2            2           29        63        32     Sandy     

  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


In [24]:
# Labels flattened to a 1-D array.
train_label_df.values.ravel()

array(['28-28', '28-28', '17-17-17', ..., '10-26-26', '20-20', 'Urea'],
      dtype=object)

In [25]:
# Display the one-hot encoded test features.
test_encoded

Unnamed: 0,cat__Crop Type_Barley,cat__Crop Type_Cotton,cat__Crop Type_Ground Nuts,cat__Crop Type_Maize,cat__Crop Type_Millets,cat__Crop Type_Oil seeds,cat__Crop Type_Paddy,cat__Crop Type_Pulses,cat__Crop Type_Sugarcane,cat__Crop Type_Tobacco,...,cat__Soil Type_Clayey,cat__Soil Type_Loamy,cat__Soil Type_Red,cat__Soil Type_Sandy,remainder__Humidity,remainder__Moisture,remainder__Nitrogen,remainder__Phosphorous,remainder__Potassium,remainder__Temparature
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,70.0,52.0,34.0,24.0,11.0,31.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,62.0,45.0,30.0,15.0,14.0,27.0
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,72.0,28.0,14.0,4.0,15.0,28.0
3,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,53.0,57.0,18.0,36.0,17.0,37.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,55.0,32.0,13.0,14.0,19.0,31.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
249995,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,66.0,30.0,14.0,18.0,7.0,26.0
249996,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,62.0,55.0,28.0,7.0,14.0,33.0
249997,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,53.0,64.0,28.0,27.0,11.0,36.0
249998,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,67.0,26.0,33.0,10.0,0.0,36.0


## Training and Testing

In [26]:
######
## Train models with cross_validate()
######

## Set parameters
rf_model_args = {
    'n_estimators': 100,
    'criterion': "gini",
    'max_depth': 5,
    'min_samples_split': 20,
    'min_samples_leaf': 10,
    'random_state': 42,
    'max_features': "sqrt",
    'n_jobs': -1,
    'oob_score':True,
    'max_samples': 0.85,
    'verbose': 0,
}

## Extra keyword arguments for rf.fit() during cross-validation (none by default).
rf_fit_cv_args = dict()

rf_cv_args = {
    'cv': 10,
    'scoring': ["neg_log_loss", 'accuracy'],
    'n_jobs': -1,
    'verbose': 1,
    'return_train_score': True,
}
## `fit_params` is deprecated in scikit-learn 1.4 and removed in 1.6, so only
## forward it when there is actually something to pass. On newer versions
## rename the key to 'params' if fit arguments are ever added.
if rf_fit_cv_args:
    rf_cv_args['fit_params'] = rf_fit_cv_args

x_rf = train_encoded
y_rf = train_label_df.values.ravel()

## Training and validation
rf = RandomForestClassifier(**rf_model_args)
rf_cv_scores = cross_validate(rf, x_rf, y_rf, **rf_cv_args)
## Average every timing / score array returned by cross_validate over the folds.
rf_cv_avg_scores = {name: np.mean(values) for name, values in rf_cv_scores.items()}
print(f"\nMean CV Score:\n{rf_cv_avg_scores}")
print(f"\nAll Fold Scores:\n{rf_cv_scores}")

rf.fit(x_rf, y_rf) # Train on the whole training set as the final model.
joblib.dump(rf, f'/kaggle/working/RandomForest_{timestamp}.joblib')

## Predict on training data
train_preds = rf.predict(x_rf)

## Model analytics: feature importances, plus the same mapping ordered from
## most to least important (later cells read `sorted_importances`).
importances = dict(zip(rf.feature_names_in_, rf.feature_importances_))
sorted_importances = dict(sorted(
    importances.items(),
    key=lambda item: item[1],
    reverse=True
))


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:   33.9s finished



Mean CV Score:
{'fit_time': 2.816225600242615, 'score_time': 0.7864110231399536, 'test_neg_log_loss': -1.9415196911414903, 'train_neg_log_loss': -1.941007610719435, 'test_accuracy': 0.15682266666666667, 'train_accuracy': 0.1578808888888889}

All Fold Scores:
{'fit_time': array([2.99765849, 3.00321817, 3.04978013, 3.0532558 , 3.16797757,
       3.22343755, 3.2009306 , 3.17317915, 1.67957783, 1.61324072]), 'score_time': array([0.73977518, 0.79020858, 0.77044463, 0.76907039, 1.02518487,
       1.22016931, 1.02519011, 0.73321033, 0.38478708, 0.40606976]), 'test_neg_log_loss': array([-1.94121974, -1.94110919, -1.94271843, -1.94160742, -1.9422434 ,
       -1.94202865, -1.94088563, -1.94144685, -1.9404875 , -1.9414501 ]), 'train_neg_log_loss': array([-1.94084165, -1.94088589, -1.94155452, -1.94119955, -1.94139402,
       -1.94143001, -1.94072069, -1.94066716, -1.94073858, -1.94064402]), 'test_accuracy': array([0.1566    , 0.15754667, 0.15597333, 0.156     , 0.15724   ,
       0.15746667, 0.1

  warn(


### Load and use pretrained RandomForest

In [27]:
# rf = joblib.load('RandomForest_20240624T170312.joblib')

In [28]:
# Feature importances of the fitted forest, highest first.
ranked_importances = sorted(
    zip(rf.feature_names_in_, rf.feature_importances_),
    key=lambda pair: pair[1],
    reverse=True,
)
dict(ranked_importances)

{'cat__Crop Type_Sugarcane': 0.1842105341779941,
 'cat__Crop Type_Wheat': 0.14115710521528846,
 'cat__Soil Type_Loamy': 0.10871842517476277,
 'cat__Soil Type_Clayey': 0.10501057421106356,
 'remainder__Moisture': 0.0728118845650263,
 'remainder__Potassium': 0.06951586542680697,
 'cat__Crop Type_Pulses': 0.06170291631495272,
 'remainder__Nitrogen': 0.05001861022566251,
 'cat__Crop Type_Tobacco': 0.04618713977479941,
 'remainder__Humidity': 0.03872015647781372,
 'remainder__Temparature': 0.03580547555284086,
 'remainder__Phosphorous': 0.03264050600888853,
 'cat__Crop Type_Cotton': 0.030566632906469337,
 'cat__Soil Type_Red': 0.02293417396763075,
 'cat__Crop Type_Barley': 0.0,
 'cat__Crop Type_Ground Nuts': 0.0,
 'cat__Crop Type_Maize': 0.0,
 'cat__Crop Type_Millets': 0.0,
 'cat__Crop Type_Oil seeds': 0.0,
 'cat__Crop Type_Paddy': 0.0,
 'cat__Soil Type_Black': 0.0,
 'cat__Soil Type_Sandy': 0.0}

In [29]:
# ######
# ## Train models with KFold()
# ######

# kf = KFold(n_splits=10, shuffle=True, random_state=42)
# rf = RandomForestClassifier(n_estimators=100, max_features='sqrt', random_state=42)

# fold = 1
# for train_index, val_index in kf.split(train_encoded):
#     X_train, X_val = train_encoded.iloc[train_index], train_encoded.iloc[val_index]
#     y_train, y_val = train_label_df.values.ravel()[train_index], train_label_df.values.ravel()[val_index]
    
#     rf.fit(X_train, y_train)
#     preds = rf.predict(X_val)
#     acc = accuracy_score(y_val, preds)
    
#     print(f"Fold {fold} Accuracy: {acc:.4f}")
#     fold += 1

## Prediction

### Single Label Prediction

In [30]:
# test_predict = rf.predict(test_encoded)
# print(test_predict)

### MAP@3 Prediction

In [31]:
# probs = rf.predict_proba(test_encoded)

# top_predict = np.argsort(probs, axis=1)[:, -output_pred_num:][:, ::-1]
# submission = pd.DataFrame({
#     'id': test_df_src['id'].values,
# })
# submission["Fertilizer Name"] = [
#     " ".join(rf.classes_[row]) for row in top_predict
# ]

# submission.to_csv('submission.csv', index=False)
# print(submission)


### Meta-learner Prediction

#### Meta-learning feature creation

In [32]:
# Class probabilities predicted by the random forest become meta-features.
# NOTE(review): train_probs are in-sample predictions (rf was fitted on
# train_encoded) while test_probs are out-of-sample; consider out-of-fold
# probabilities for the training rows so the meta-learner sees comparable inputs.
train_probs = rf.predict_proba(train_encoded)
test_probs = rf.predict_proba(test_encoded)

# Carry over the index of the encoded frames: the merges below align on index,
# and with how="left" a mismatch would silently yield NaN meta-features.
train_meta_features = pd.DataFrame(
    data=train_probs,
    columns=rf.classes_,
    index=train_encoded.index
)

test_meta_features = pd.DataFrame(
    data=test_probs,
    columns=rf.classes_,
    index=test_encoded.index
)

top_num = None  # Natural numbers or None to take all.
top_features = list(sorted_importances.keys()) if top_num is None else list(sorted_importances.keys())[:top_num]


# Original (selected) features plus the forest's class probabilities, side by side.
extended_train_meta_features = pd.merge(train_encoded[top_features], train_meta_features, left_index=True, right_index=True, how="left")
extended_test_meta_features = pd.merge(test_encoded[top_features], test_meta_features, left_index=True, right_index=True, how="left")

# Same column order for train and test.
extended_train_meta_features = sort_columns(extended_train_meta_features)
extended_test_meta_features = sort_columns(extended_test_meta_features)

In [33]:
# Display the meta-learner training inputs (class probabilities plus encoded features).
extended_train_meta_features

Unnamed: 0,10-26-26,14-35-14,17-17-17,20-20,28-28,DAP,Urea,cat__Crop Type_Barley,cat__Crop Type_Cotton,cat__Crop Type_Ground Nuts,...,cat__Soil Type_Clayey,cat__Soil Type_Loamy,cat__Soil Type_Red,cat__Soil Type_Sandy,remainder__Humidity,remainder__Moisture,remainder__Nitrogen,remainder__Phosphorous,remainder__Potassium,remainder__Temparature
0,0.160917,0.159250,0.158417,0.147965,0.154938,0.107385,0.111127,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,70.0,36.0,36.0,5.0,4.0,37.0
1,0.143634,0.145609,0.137639,0.150401,0.160094,0.133421,0.129202,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,69.0,65.0,30.0,18.0,6.0,27.0
2,0.146248,0.154408,0.149506,0.142778,0.144130,0.133284,0.129647,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,63.0,32.0,24.0,16.0,12.0,29.0
3,0.146248,0.154408,0.149506,0.142778,0.144130,0.133284,0.129647,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,62.0,54.0,39.0,4.0,12.0,35.0
4,0.146248,0.154408,0.149506,0.142778,0.144130,0.133284,0.129647,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,58.0,43.0,37.0,16.0,2.0,35.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
749995,0.199105,0.140940,0.117450,0.167785,0.148770,0.128635,0.097315,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,69.0,30.0,8.0,6.0,16.0,25.0
749996,0.168814,0.166428,0.153354,0.173204,0.161370,0.085123,0.091707,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,64.0,58.0,38.0,20.0,8.0,37.0
749997,0.143634,0.145609,0.137639,0.150401,0.160094,0.133421,0.129202,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,68.0,59.0,6.0,29.0,11.0,35.0
749998,0.146248,0.154408,0.149506,0.142778,0.144130,0.133284,0.129647,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,68.0,29.0,9.0,12.0,11.0,31.0


#### Define inputs

In [61]:
# Inputs of the meta-learner: extended meta-features and the flattened labels.
x_meta = extended_train_meta_features
y_meta = train_label_df.values.ravel()
print(y_meta)

# Constructor arguments for lgb.LGBMClassifier.
lgbm_model_args = dict(
    objective="multiclass",
    num_class=7,
    boosting_type="gbdt",
    n_estimators=5000,
    # Original author's note: no effect, kept only for compatibility; early
    # stopping is requested through the fit callbacks instead.
    # NOTE(review): confirm the installed LightGBM really ignores this parameter.
    early_stopping_round=30,
    learning_rate=0.03,
    num_leaves=31,
    max_depth=8,
    random_state=42,
    reg_alpha=1.0,
    reg_lambda=1.0,
    device_type="gpu",
    verbosity=-1,
)

# fit() arguments used with sklearn's cross_validate(). No eval_set or
# early-stopping callback here (original note: early stopping is not supported
# in sklearn cross_validate()).
lgbm_fit_cv_args = dict(
    eval_metric=['multi_logloss', 'multi_error'],
)

# fit() arguments for the hand-written fold loop; eval_set is filled in per fold.
lgbm_fit_custom_args = dict(
    eval_set=None,
    eval_metric=['multi_logloss', 'multi_error'],
    callbacks=[
        lgb.early_stopping(stopping_rounds=50),
        lgb.log_evaluation(period=50),
    ],
)

# Arguments for sklearn's cross_validate().
lgbm_cv_args = dict(
    cv=10,
    n_jobs=-1,
    verbose=2,
    fit_params=lgbm_fit_cv_args,
)


['28-28' '28-28' '17-17-17' ... '10-26-26' '20-20' 'Urea']


#### Model with cross_validate

In [38]:
# lgbm = lgb.LGBMClassifier(**lgbm_model_args)
# lgbm_cv_scores = cross_validate(lgbm, x_meta, y_meta, **lgbm_cv_args)
# lgbm_cv_avg_scores = dict()
# for k, v in lgbm_cv_scores.items():
#     mean_val = np.mean(v)
#     lgbm_cv_avg_scores[k] = mean_val
# print(f"\nMean CV Score:\n{lgbm_cv_avg_scores}")
# print(f"\nAll Fold Scores:\n{lgbm_cv_scores}")

# lgbm.fit(x_meta, y_meta, **lgbm_fit_cv_args)
# lgbm.booster_.save_model('/kaggle/working/LightGBM_{}.txt'.format(timestamp))

# # Predict on training data
# train_preds = lgbm.predict(x_meta)

# # # Method 1: Using .score()
# # train_accuracy = lgbm.score(x_meta, y_meta)

# # Method 2: Using accuracy_score
# train_accuracy = accuracy_score(y_meta, train_preds)
# print(f"Training Accuracy: {train_accuracy:.4f}")

[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 445
[LightGBM] [Info] Number of data points in the train set: 750000, number of used features: 29
[LightGBM] [Info] Using GPU Device: Tesla T4, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 64 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 15 dense feature groups (11.44 MB) transferred to GPU in 0.017303 secs. 0 sparse feature groups
[LightGBM] [Info] Start training from score -1.884866
[LightGBM] [Info] Start training from score -1.880057
[LightGBM] [Info] Start training from score -1.897538
[LightGBM] [Info] Start training from score -1.911544
[LightGBM] [Info] Start training from score -1.909121
[LightGBM] [Info] Start training from score -2.067671
[LightGBM] [Info] Start training from score -2.094845


<lightgbm.basic.Booster at 0x7aaa6b9d0f90>

#### Model with custom flow

In [69]:
# Stratified 10-fold training of the meta-learner. Each fold's model predicts
# the test set, and the per-fold probabilities are averaged at the end.
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
test_proba_pred_all_folds = list()

for fold, (train_idx, val_idx) in enumerate(kf.split(x_meta, y_meta)):
    print(f'train_idx is {train_idx}, length is {len(train_idx)}')
    print(f'val_idx is {val_idx}, length is {len(val_idx)}')
    lgbm = lgb.LGBMClassifier(**lgbm_model_args)
    x_train, x_val = x_meta.iloc[train_idx], x_meta.iloc[val_idx]
    y_train, y_val = y_meta[train_idx], y_meta[val_idx]
    # Build this fold's fit arguments from the shared template instead of
    # mutating it, so the template never holds a fold's validation data
    # (not even when fit() raises).
    # NOTE(review): the callback objects in the template are still shared by
    # all folds; confirm the installed LightGBM resets early-stopping state on
    # every fit.
    fold_fit_args = {**lgbm_fit_custom_args, 'eval_set': [(x_val, y_val)]}
    lgbm.fit(x_train, y_train, **fold_fit_args)
    lgbm.booster_.save_model('/kaggle/working/LightGBM_fold{}_{}.txt'.format(fold, timestamp))
    # Make prediction
    test_proba_pred_probs = lgbm.predict_proba(extended_test_meta_features)
    test_proba_pred_all_folds.append(test_proba_pred_probs)

# Shape: (n_folds, n_samples, n_classes)
stacked_preds = np.stack(test_proba_pred_all_folds, axis=0)

# Average across folds
final_probs = np.mean(stacked_preds, axis=0)

train_idx is [     0      3      4 ... 749997 749998 749999], length is 675000
val_idx is [     1      2      8 ... 749958 749959 749991], length is 75000
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[2]	valid_0's multi_logloss: 1.94195	valid_0's multi_error: 0.84052
train_idx is [     0      1      2 ... 749996 749997 749999], length is 675000
val_idx is [     5      6     30 ... 749993 749995 749998], length is 75000
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[2]	valid_0's multi_logloss: 1.94197	valid_0's multi_error: 0.84192
train_idx is [     0      1      2 ... 749997 749998 749999], length is 675000
val_idx is [    22     24     26 ... 749967 749976 749985], length is 75000
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[2]	valid_0's multi_logloss: 1.94195	valid_0's multi_error: 0.84076
trai

### Load and use pretrained LightGBM

In [None]:
# lgbm = lgb.Booster(model_file='LightGBM_xxxx.txt')

### Create submission file

In [71]:
# probs = lgbm.predict_proba(extended_test_meta_features)

probs = final_probs

# Decode with the class order of the model that produced the probabilities
# (the LightGBM meta-learner), not the random forest's.
class_names = lgbm.classes_
assert probs.shape[1] == len(class_names), "probability columns do not match the class list"

# Indices of the output_pred_num most probable classes per row, best first.
top_predict = np.argsort(probs, axis=1)[:, -output_pred_num:][:, ::-1]

submission = pd.DataFrame({
    'id': test_df_src['id'].values,
})
# One space-separated string of class names per test row.
submission["Fertilizer Name"] = [
    " ".join(class_names[row]) for row in top_predict
]

submission.to_csv('submission.csv', index=False)
print(submission)

            id             Fertilizer Name
0       750000  14-35-14 10-26-26 17-17-17
1       750001  10-26-26 14-35-14 17-17-17
2       750002  10-26-26 14-35-14 17-17-17
3       750003  14-35-14 10-26-26 17-17-17
4       750004  14-35-14 10-26-26 17-17-17
...        ...                         ...
249995  999995  14-35-14 10-26-26 17-17-17
249996  999996  14-35-14 10-26-26 17-17-17
249997  999997  14-35-14 10-26-26 17-17-17
249998  999998  14-35-14 10-26-26 17-17-17
249999  999999  14-35-14 10-26-26 17-17-17

[250000 rows x 2 columns]


In [None]:
# Check when submission.csv was last written.
submission_timestamp = os.path.getmtime('submission.csv')
last_modified = datetime.fromtimestamp(submission_timestamp)
print("Last modified:", last_modified.strftime('%Y-%m-%d %H:%M:%S'))


In [None]:
# Class labels known to the random forest.
rf.classes_

In [None]:
# Feature names the random forest was fitted with.
rf.feature_names_in_

In [None]:
# Display the submission frame.
submission