In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
cwd = os.getcwd()
print("Current working directory is {}".format(cwd))

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_validate, cross_val_score
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import accuracy_score, log_loss

from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.compose import make_column_selector as selector

from datetime import datetime
import joblib

import lightgbm as lgb

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/playground-series-s5e6/sample_submission.csv
/kaggle/input/playground-series-s5e6/train.csv
/kaggle/input/playground-series-s5e6/test.csv
Current working directory is /kaggle/working


## Data preprocessing

In [2]:
def sort_columns(df):
    return df.sort_index(axis=1)

def describe_data(df):
    ## Basic statistics
    describe = df.describe(include='all')
    info = df.info()  # Return None, print df.info() directly to console.
    null_count = df.isnull().sum()
    ## Unique values
    unique_count = df.nunique()
    sample_size = df.shape[0]
    unique_ratio = unique_count / sample_size
    ## print data descriptions
    print("\n====== df:\n")
    print(df)
    print("\n====== describe:\n")
    print(describe)
    print("\n======info: \n")
    print(df.info())
    print("\n====== null_count: \n")
    print(null_count)
    print("\n====== unique_count: \n")
    print(unique_count)
    print("\n====== unique_ratio: \n")
    print(unique_ratio)

    data_description = {
        "describe": describe,
        "info": info,
        "null_count": null_count,
        "unique_count": unique_count,
        "sample_size": sample_size,
        "unique_ratio": unique_ratio
    }
    return data_description

def drop_columns(df, cols_to_drop):
    df = df.drop(columns=[col for col in cols_to_drop if col in df.columns])
    return df   

def intersect_train_test_columns(train_df, test_df):
    ## Find common columns between train and test
    common_cols = train_df.columns.intersection(test_df.columns)
    ## Keep only those columns
    train_aligned = train_df[common_cols].copy()
    test_aligned = test_df[common_cols].copy()
    return train_aligned, test_aligned

######
## Don't ever use dummies for one-hot encoding. Big issue when doing online prediction with new data.
## pd.get_dummies() will mess up the one-hot positions.
## Use OneHotEncoder from scikit-learn instead.
######
# def category_to_onehot(df, **kwargs):
#     return pd.get_dummies(df, **kwargs)

def transform_numeric_to_category(df):
    for col in df.select_dtypes(include=['int64', 'float64']).columns:
        df[col] = df[col].astype('category')
    return df

def build_label_encoders(train_df):
    '''
    LabelEncoder is strictly meant for 1D arrays (i.e., one column at a time). 
    It doesn’t support fitting across a 2D DataFrame of multiple categorical columns
    '''
    cat_cols = train_df.select_dtypes(include=['object', 'category', 'bool']).columns.tolist()
    encoders = dict()
    for col in cat_cols:
        le = LabelEncoder()
        train_df[col] = train_df[col].astype(str)  # Ensure consistent type
        le.fit(train_df[col])
        encoders[col] = le
    return encoders

def build_onehot_encoder(train_df):
    cat_cols = train_df.select_dtypes(include=['object', 'category', 'bool']).columns.tolist()
    encoder = ColumnTransformer(
        transformers=[
            ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), cat_cols)
        ],
        remainder='passthrough'
    )
    encoder.fit(train_df)
    return encoder

def mapk(y_true, y_pred_probs, k=3, labels=None):
    """
    Compute Mean Average Precision at K (MAP@k).

    Parameters:
    - y_true: array-like of shape (n_samples,) – true labels (can be str or int)
    - y_pred_probs: array-like of shape (n_samples, n_classes) – predicted class probabilities
    - k: int – number of top predictions to consider
    - labels: list – ordered list of label names corresponding to columns in y_pred_probs

    Returns:
    - map_k: float – MAP@k score
    """
    if labels is None:
        raise ValueError("You must provide a list of label names in `labels` to map predicted indices to class names.")

    top_k_preds = np.argsort(-y_pred_probs, axis=1)[:, :k]  # Get top-k predicted indices
    y_true = np.asarray(y_true)

    score = 0.0
    for i in range(len(y_true)):
        true_label = y_true[i]
        predicted_labels = [labels[idx] for idx in top_k_preds[i]]

        if true_label in predicted_labels:
            rank = predicted_labels.index(true_label)
            score += 1.0 / (rank + 1)

    return score / len(y_true)

In [3]:
# Parameter settings
labels = ["Fertilizer Name"]
timestamp = datetime.now().strftime('%Y%m%d')
output_pred_num = 3

# Read inputs
train_df = pd.read_csv("../input/playground-series-s5e6/train.csv")
train_df_src = train_df.copy()
test_df = pd.read_csv("../input/playground-series-s5e6/test.csv")
test_df_src = test_df.copy()
sample_df = pd.read_csv("../input/playground-series-s5e6/sample_submission.csv")


In [4]:
# train_df
# test_df_src
sample_df.iloc[0, 1]


'14-35-14 10-26-26 Urea'

In [5]:
train_df_src

Unnamed: 0,id,Temparature,Humidity,Moisture,Soil Type,Crop Type,Nitrogen,Potassium,Phosphorous,Fertilizer Name
0,0,37,70,36,Clayey,Sugarcane,36,4,5,28-28
1,1,27,69,65,Sandy,Millets,30,6,18,28-28
2,2,29,63,32,Sandy,Millets,24,12,16,17-17-17
3,3,35,62,54,Sandy,Barley,39,12,4,10-26-26
4,4,35,58,43,Red,Paddy,37,2,16,DAP
...,...,...,...,...,...,...,...,...,...,...
749995,749995,25,69,30,Clayey,Maize,8,16,6,28-28
749996,749996,37,64,58,Loamy,Sugarcane,38,8,20,17-17-17
749997,749997,35,68,59,Sandy,Ground Nuts,6,11,29,10-26-26
749998,749998,31,68,29,Red,Cotton,9,11,12,20-20


In [6]:
############
## Data Cleaning
############

## Describe data
train_df_description = describe_data(train_df)

## Sort data (Ensuring pd.get_dummies() gives consistent orders on train and test data.)
train_df = sort_columns(train_df)
test_df = sort_columns(test_df)

## Drop columns
cols_to_drop = ["id"]
train_df = drop_columns(train_df, cols_to_drop)
test_df = drop_columns(test_df, cols_to_drop)

## Split labels from training data
train_feature_df = train_df.drop(columns=labels)
train_label_df = train_df[labels]

## Take the intersection of train and test features.
train_feature_df, test_df = intersect_train_test_columns(train_feature_df, test_df)

############
## Feature Engineering
############

## Transform categorical features into label encoding.
label_encoders = build_label_encoders(train_df.copy())

train_encoded = train_feature_df.copy()
for col, le in label_encoders.items():
    if col in train_encoded.columns:
        train_encoded[col] = le.transform(train_encoded[col].astype(str))
    else:
        print(f"Warning: Column '{col}' not found in train_encoded. Skipping...")
        
test_encoded = test_df.copy()
for col, le in label_encoders.items():
    if col in test_encoded.columns:
        test_encoded[col] = le.transform(test_encoded[col].astype(str))
    else:
        print(f"Warning: Column '{col}' not found in test_encoded. Skipping...")
        
train_label_encoded = train_label_df.copy()
for col in train_label_encoded.columns:
    if col in label_encoders.keys():
        train_label_encoded[col] = label_encoders[col].transform(train_label_encoded[col].astype(str))
        print(f"Train label column '{col}' is encoded with LabelEncoder.")

## Transform numerical features into categorical features.
train_encoded = transform_numeric_to_category(train_encoded)
test_encoded = transform_numeric_to_category(test_encoded)   

# ## Transform categorical features into one-hot encoding.
# oh_encoder = build_onehot_encoder(train_feature_df)

# train_encoded = pd.DataFrame(
#     oh_encoder.transform(train_df),
#     columns=oh_encoder.get_feature_names_out(),
#     index=train_df.index
# )
# test_encoded = pd.DataFrame(
#     oh_encoder.transform(test_df),
#     columns=oh_encoder.get_feature_names_out(),
#     index=test_df.index
# )


# train_encoded = train_encoded.iloc[:200,]
# train_label_df = train_label_df.iloc[:200]

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 750000 entries, 0 to 749999
Data columns (total 10 columns):
 #   Column           Non-Null Count   Dtype 
---  ------           --------------   ----- 
 0   id               750000 non-null  int64 
 1   Temparature      750000 non-null  int64 
 2   Humidity         750000 non-null  int64 
 3   Moisture         750000 non-null  int64 
 4   Soil Type        750000 non-null  object
 5   Crop Type        750000 non-null  object
 6   Nitrogen         750000 non-null  int64 
 7   Potassium        750000 non-null  int64 
 8   Phosphorous      750000 non-null  int64 
 9   Fertilizer Name  750000 non-null  object
dtypes: int64(7), object(3)
memory usage: 57.2+ MB


            id  Temparature  Humidity  Moisture Soil Type    Crop Type  \
0            0           37        70        36    Clayey    Sugarcane   
1            1           27        69        65     Sandy      Millets   
2            2           29        63        32     Sandy     

  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


Train label column 'Fertilizer Name' is encoded with LabelEncoder.


In [7]:
train_label_df.values.ravel()

array(['28-28', '28-28', '17-17-17', ..., '10-26-26', '20-20', 'Urea'],
      dtype=object)

In [8]:
print(f'train_encoded.info():\n{train_encoded.info()}')
print(f'{train_encoded}')
print(f'test_encoded.info():{test_encoded.info()}')
print(f'{test_encoded}')
print(f'train_label_df.info():\n{train_label_df.info()}')
print(f'{train_label_df}')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 750000 entries, 0 to 749999
Data columns (total 8 columns):
 #   Column       Non-Null Count   Dtype   
---  ------       --------------   -----   
 0   Crop Type    750000 non-null  category
 1   Humidity     750000 non-null  category
 2   Moisture     750000 non-null  category
 3   Nitrogen     750000 non-null  category
 4   Phosphorous  750000 non-null  category
 5   Potassium    750000 non-null  category
 6   Soil Type    750000 non-null  category
 7   Temparature  750000 non-null  category
dtypes: category(8)
memory usage: 5.7 MB
train_encoded.info():
None
       Crop Type Humidity Moisture Nitrogen Phosphorous Potassium Soil Type  \
0              8       70       36       36           5         4         1   
1              4       69       65       30          18         6         4   
2              4       63       32       24          16        12         4   
3              0       62       54       39           4        12 

## Training and Testing

In [9]:
######
## Train models with cross_val_score()
######

## Set parameters
rf_model_args = {
    'n_estimators': 150,
    'criterion': "gini",
    'max_depth': 8,
    'min_samples_split': 20,
    'min_samples_leaf': 10,
    'random_state': 42,
    'max_features': "sqrt",
    'n_jobs': -1,
    'oob_score':True,
    'max_samples': 0.85,
    'verbose': 1,
}

rf_fit_cv_args = dict()

rf_cv_args = {
    'cv': 10,
    'scoring': ["neg_log_loss", 'accuracy'],
    'n_jobs': -1,
    'verbose': 1,
    'fit_params': rf_fit_cv_args,
    'return_train_score': True,
}

x_rf = train_encoded
y_rf = train_label_df.values.ravel()

## Training and validation
rf = RandomForestClassifier(**rf_model_args)
rf_cv_scores = cross_validate(rf, x_rf, y_rf, **rf_cv_args)
rf_cv_avg_scores = dict()
for k, v in rf_cv_scores.items():
    mean_val = np.mean(v)
    rf_cv_avg_scores[k] = mean_val
print(f"\nMean CV Score:\n{rf_cv_avg_scores}")
print(f"\nAll Fold Scores:\n{rf_cv_scores}")

rf.fit(x_rf, y_rf) # Train on the whole training set as the final model.
joblib.dump(rf, f'/kaggle/working/RandomForest_{timestamp}.joblib')

## Predict on training data
train_preds = rf.predict(x_rf)

## Model analytics
importances = dict(zip(rf.feature_names_in_, rf.feature_importances_))
sorted_importances = dict(sorted(
    zip(rf.feature_names_in_, rf.feature_importances_),
    key=lambda x: x[1],
    reverse=True
))


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   37.5s
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   37.5s
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   41.4s
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   41.8s
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed:  2.2min finished
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed:  2.2min finished
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed:  2.2min finished
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed:  2.3min finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurre


Mean CV Score:
{'fit_time': 138.74856708049774, 'score_time': 3.6191795110702514, 'test_neg_log_loss': -1.9373605065290511, 'train_neg_log_loss': -1.933298287942608, 'test_accuracy': 0.170136, 'train_accuracy': 0.18679807407407406}

All Fold Scores:
{'fit_time': array([152.3513062 , 152.14783621, 153.96237516, 156.45775366,
       156.30473351, 158.56351399, 153.5927701 , 156.72993779,
        73.12319875,  74.25224543]), 'score_time': array([3.56960082, 3.53205323, 3.80842757, 5.11538768, 3.45546198,
       4.10704637, 3.52704144, 5.24216151, 1.51676798, 2.31784654]), 'test_neg_log_loss': array([-1.93728084, -1.93758748, -1.93716961, -1.93732193, -1.93742376,
       -1.93726957, -1.93736338, -1.9372968 , -1.93728867, -1.93760304]), 'train_neg_log_loss': array([-1.93325869, -1.93336416, -1.93318005, -1.93332031, -1.93329667,
       -1.93339119, -1.93324778, -1.93325085, -1.93338112, -1.93329206]), 'test_accuracy': array([0.17054667, 0.17202667, 0.17064   , 0.17008   , 0.16953333,
    

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    8.4s
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed:   29.3s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    1.5s
[Parallel(n_jobs=4)]: Done 150 out of 150 | elapsed:    5.0s finished


### Load and use pretrained LightGBM

In [10]:
# rf = joblib.load('RandomForest_20240624T170312.joblib')

In [11]:
dict(sorted(
    zip(rf.feature_names_in_, rf.feature_importances_),
    key=lambda x: x[1],
    reverse=True
))

{'Crop Type': 0.21406853129943484,
 'Moisture': 0.15385746711740494,
 'Phosphorous': 0.1339453088750805,
 'Nitrogen': 0.11841742561659152,
 'Potassium': 0.11514333864576094,
 'Soil Type': 0.09469581073801026,
 'Humidity': 0.09260922365239543,
 'Temparature': 0.07726289405532161}

In [12]:
# ######
# ## Train models with KFold()
# ######

# kf = KFold(n_splits=10, shuffle=True, random_state=42)
# rf = RandomForestClassifier(n_estimators=100, max_features='sqrt', random_state=42)

# fold = 1
# for train_index, val_index in kf.split(train_encoded):
#     X_train, X_val = train_encoded.iloc[train_index], train_encoded.iloc[val_index]
#     y_train, y_val = train_label_df.values.ravel()[train_index], train_label_df.values.ravel()[val_index]
    
#     rf.fit(X_train, y_train)
#     preds = rf.predict(X_val)
#     acc = accuracy_score(y_val, preds)
    
#     print(f"Fold {fold} Accuracy: {acc:.4f}")
#     fold += 1

## Prediction

### Single Label Prediction

In [13]:
# test_predict = rf.predict(test_encoded)
# print(test_predict)

### MAP@5 Prediction

In [14]:
# probs = rf.predict_proba(test_encoded)

# top_predict = np.argsort(probs, axis=1)[:, -output_pred_num:][:, ::-1]
# submission = pd.DataFrame({
#     'id': test_df_src['id'].values,
# })
# submission["Fertilizer Name"] = [
#     " ".join(rf.classes_[row]) for row in top_predict
# ]

# submission.to_csv('submission.csv', index=False)
# print(submission)


### Meta-learner Prediction

#### Meta-learning feature creation

In [15]:
train_probs = rf.predict_proba(train_encoded)
test_probs = rf.predict_proba(test_encoded)

train_meta_features = pd.DataFrame(
    data=train_probs,
    columns=rf.classes_
)

test_meta_features = pd.DataFrame(
    data=test_probs,
    columns=rf.classes_
)

top_num = None  # Natural numbers or None to take all.
top_features = list(sorted_importances.keys()) if top_num is None else list(sorted_importances.keys())[:top_num]


extended_train_meta_features = pd.merge(train_encoded[top_features], train_meta_features, left_index=True, right_index=True, how="left")
extended_test_meta_features = pd.merge(test_encoded[top_features], test_meta_features, left_index=True, right_index=True, how="left")

# extended_train_meta_features = train_encoded
# extended_test_meta_features = test_encoded

extended_train_meta_features = sort_columns(extended_train_meta_features)
extended_test_meta_features = sort_columns(extended_test_meta_features)

[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    1.9s
[Parallel(n_jobs=4)]: Done 150 out of 150 | elapsed:    5.4s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.4s
[Parallel(n_jobs=4)]: Done 150 out of 150 | elapsed:    1.5s finished


In [16]:
extended_train_meta_features

Unnamed: 0,10-26-26,14-35-14,17-17-17,20-20,28-28,Crop Type,DAP,Humidity,Moisture,Nitrogen,Phosphorous,Potassium,Soil Type,Temparature,Urea
0,0.159948,0.158043,0.155687,0.154859,0.148813,8,0.114922,70,36,36,5,4,1,37,0.107728
1,0.137220,0.142944,0.139023,0.148303,0.161374,4,0.137869,69,65,30,18,6,4,27,0.133266
2,0.145215,0.153883,0.148962,0.138722,0.147859,4,0.130642,63,32,24,16,12,4,29,0.134716
3,0.144518,0.150372,0.146424,0.141160,0.146278,0,0.133150,62,54,39,4,12,4,35,0.138100
4,0.144485,0.151341,0.144961,0.135617,0.150718,6,0.138343,58,43,37,16,2,3,35,0.134535
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
749995,0.156966,0.152761,0.149556,0.147131,0.149932,3,0.122287,69,30,8,6,16,1,25,0.121367
749996,0.157519,0.159591,0.156122,0.155435,0.152308,8,0.113480,64,58,38,20,8,2,37,0.105545
749997,0.140197,0.138979,0.141141,0.146626,0.173830,2,0.123507,68,59,6,29,11,4,35,0.135720
749998,0.141648,0.147496,0.148567,0.143930,0.151930,1,0.128996,68,29,9,12,11,3,31,0.137432


#### Define inputs

In [17]:
x_meta = extended_train_meta_features
y_meta = train_label_df.values.ravel()
print(y_meta)

lgbm_model_args = {
    'objective': 'multiclass',
    'num_class': 7,
    'boosting_type': 'gbdt',
    'n_estimators': 1000,
    'early_stopping_round': 100,  # No effect, only for compatibility. Call early_stopping in fit directly.
    'learning_rate': 0.03,
    "num_leaves": 31,
    'max_depth': 8,
    'random_state': 42,
    'reg_alpha': 1.0, 
    'reg_lambda': 1.0,
    'device_type': 'gpu',
    'verbosity': -1,
}

lgbm_fit_cv_args = {
    # 'eval_set': [(x_valid, y_valid)],
    # 'callbacks': [lgb.early_stopping(stopping_rounds=50)],  # Early stopping not supported in sklearn cross_validate().
    'eval_metric': ['multi_logloss', 'multi_error'],  
}

lgbm_fit_custom_args = {
    'eval_set': None,
    'eval_metric': ['multi_logloss', 'multi_error'],
    'callbacks':[
        lgb.early_stopping(stopping_rounds=lgbm_model_args['early_stopping_round']),
        lgb.log_evaluation(period=40)
    ],
}

lgbm_cv_args = {
    'cv': 10,
    'n_jobs': -1,
    'verbose': 2,
    'fit_params': lgbm_fit_cv_args
}


['28-28' '28-28' '17-17-17' ... '10-26-26' '20-20' 'Urea']


#### Model with cross_validate

In [18]:
# lgbm = lgb.LGBMClassifier(**lgbm_model_args)
# lgbm_cv_scores = cross_validate(lgbm, x_meta, y_meta, **lgbm_cv_args)
# lgbm_cv_avg_scores = dict()
# for k, v in lgbm_cv_scores.items():
#     mean_val = np.mean(v)
#     lgbm_cv_avg_scores[k] = mean_val
# print(f"\nMean CV Score:\n{lgbm_cv_avg_scores}")
# print(f"\nAll Fold Scores:\n{lgbm_cv_scores}")

# lgbm.fit(x_meta, y_meta, **lgbm_fit_cv_args)
# lgbm.booster_.save_model('/kaggle/working/LightGBM_{}.txt'.format(timestamp))

# # Predict on training data
# train_preds = lgbm.predict(x_meta)

# # # Method 1: Using .score()
# # train_accuracy = lgbm.score(x_meta, y_meta)

# # Method 2: Using accuracy_score
# train_accuracy = accuracy_score(y_meta, train_preds)
# print(f"Training Accuracy: {train_accuracy:.4f}")

In [19]:
#### Model with custom flow

kfold_num = lgbm_cv_args['cv']
kf = StratifiedKFold(n_splits=kfold_num, shuffle=True, random_state=42)
test_proba_pred_all_folds = list()
best_iters = dict()

for fold, (train_idx, val_idx) in enumerate(kf.split(x_meta, y_meta)):
    print(f'train_idx is {train_idx}, length is {len(train_idx)}')
    print(f'val_idx is {val_idx}, length is {len(val_idx)}')
    lgbm = lgb.LGBMClassifier(**lgbm_model_args)
    x_train, x_val = x_meta.iloc[train_idx], x_meta.iloc[val_idx]
    y_train, y_val = y_meta[train_idx], y_meta[val_idx]
    lgbm_fit_custom_args['eval_set'] = [
        (x_val, y_val),
        (x_train, y_train)
    ]
    # lgbm_fit_custom_args['eval_set'] = [(x_val, y_val)]
    lgbm.fit(x_train, y_train, **lgbm_fit_custom_args)
    lgbm_fit_custom_args['eval_set'] = None
    lgbm.booster_.save_model('/kaggle/working/LightGBM_fold{}_{}.txt'.format(fold, timestamp))
    # Model evaluation
    best_iter = lgbm.best_iteration_
    best_iters[fold] = best_iter
    num_iteration = best_iter + 0
    labels = lgbm._classes
    
    eval_result = lgbm.evals_result_
    final_logloss = eval_result['valid_0']['multi_logloss'][best_iter - 1]
    final_error = eval_result['valid_0']['multi_error'][best_iter - 1]
    print(f"[Fold {fold}] Final log loss: {final_logloss:.5f}, Final error: {final_error:.5f}")
    
    pred_prob_train = lgbm.predict_proba(x_train, num_iteration=num_iteration)
    pred_prob_val = lgbm.predict_proba(x_val, num_iteration=num_iteration)
    pred_train = lgbm.predict(x_train, num_iteration=num_iteration)
    pred_val = lgbm.predict(x_val, num_iteration=num_iteration)
    # print("LightGBM Train Accuracy:", accuracy_score(y_train, pred_train))  
    # print("LightGBM Validation Accuracy:", accuracy_score(y_val, pred_val)) 
    
    mapk_train = mapk(y_train, pred_prob_train, k=3, labels=labels)
    mapk_val = mapk(y_val, pred_prob_val, k=3, labels=labels)
    print("LightGBM Train MAP3:", mapk_train)
    print("LightGBM Validation MAP3:", mapk_val)
    
    # Make prediction
    test_proba_pred_probs = lgbm.predict_proba(extended_test_meta_features, num_iteration=num_iteration)
    test_proba_pred_all_folds.append(test_proba_pred_probs)
    print(f"\n[Fold {fold} is finished.]\n\n\n")
    break
    
# Shape: (n_folds, n_samples, n_classes)
stacked_preds = np.stack(test_proba_pred_all_folds, axis=0)

# Average across folds
kfold_avg_probs = np.mean(stacked_preds, axis=0)

train_idx is [     0      3      4 ... 749997 749998 749999], length is 675000
val_idx is [     1      2      8 ... 749958 749959 749991], length is 75000




Training until validation scores don't improve for 100 rounds
[40]	training's multi_logloss: 1.90653	training's multi_error: 0.780631	valid_0's multi_logloss: 1.91358	valid_0's multi_error: 0.794107
[80]	training's multi_logloss: 1.89192	training's multi_error: 0.769145	valid_0's multi_logloss: 1.90599	valid_0's multi_error: 0.789067
[120]	training's multi_logloss: 1.8811	training's multi_error: 0.759738	valid_0's multi_logloss: 1.90201	valid_0's multi_error: 0.78676
[160]	training's multi_logloss: 1.87182	training's multi_error: 0.751676	valid_0's multi_logloss: 1.89917	valid_0's multi_error: 0.784587
[200]	training's multi_logloss: 1.86354	training's multi_error: 0.744578	valid_0's multi_logloss: 1.89732	valid_0's multi_error: 0.78272
[240]	training's multi_logloss: 1.85595	training's multi_error: 0.738039	valid_0's multi_logloss: 1.89593	valid_0's multi_error: 0.781987
[280]	training's multi_logloss: 1.84893	training's multi_error: 0.732062	valid_0's multi_logloss: 1.89492	valid_0's

In [20]:
test_proba_pred_probs

array([[0.16223715, 0.13701501, 0.14260705, ..., 0.150895  , 0.19916726,
        0.10224164],
       [0.13477861, 0.11544437, 0.24305222, ..., 0.13326829, 0.09253877,
        0.10188432],
       [0.20112987, 0.2069908 , 0.12394007, ..., 0.15094759, 0.0947308 ,
        0.08566567],
       ...,
       [0.14504583, 0.15518899, 0.10366532, ..., 0.09501075, 0.20311401,
        0.17912145],
       [0.1927835 , 0.11264773, 0.19600557, ..., 0.17714676, 0.13133042,
        0.11825347],
       [0.16647117, 0.20749408, 0.18831078, ..., 0.12450297, 0.07670708,
        0.09069463]])

### Load and use pretrained LightGBM

In [21]:
# lgbm = lightgbm.Booster(model_file='LightGBM_xxxx.txt')

### Create submission file

In [22]:
# probs = lgbm.predict_proba(extended_test_meta_features)

probs = kfold_avg_probs
top_predict = np.argsort(probs, axis=1)[:, -output_pred_num:][:, ::-1]

submission = pd.DataFrame({
    'id': test_df_src['id'].values,
})
submission["Fertilizer Name"] = [
    " ".join(rf.classes_[row]) for row in top_predict
]

submission.to_csv('submission.csv', index=False)
print(submission)

            id             Fertilizer Name
0       750000          DAP 10-26-26 28-28
1       750001     17-17-17 20-20 10-26-26
2       750002     14-35-14 10-26-26 28-28
3       750003  14-35-14 17-17-17 10-26-26
4       750004         20-20 10-26-26 Urea
...        ...                         ...
249995  999995     14-35-14 17-17-17 28-28
249996  999996         Urea 20-20 10-26-26
249997  999997           DAP Urea 14-35-14
249998  999998     17-17-17 10-26-26 28-28
249999  999999  14-35-14 17-17-17 10-26-26

[250000 rows x 2 columns]


In [23]:
submission_timestamp = os.path.getmtime('submission.csv')
print("Last modified:", datetime.fromtimestamp(submission_timestamp).strftime('%Y-%m-%d %H:%M:%S'))

Last modified: 2025-06-26 19:30:01


In [24]:
rf.classes_

array(['10-26-26', '14-35-14', '17-17-17', '20-20', '28-28', 'DAP',
       'Urea'], dtype=object)

In [25]:
rf.feature_names_in_

array(['Crop Type', 'Humidity', 'Moisture', 'Nitrogen', 'Phosphorous',
       'Potassium', 'Soil Type', 'Temparature'], dtype=object)

In [26]:
submission

Unnamed: 0,id,Fertilizer Name
0,750000,DAP 10-26-26 28-28
1,750001,17-17-17 20-20 10-26-26
2,750002,14-35-14 10-26-26 28-28
3,750003,14-35-14 17-17-17 10-26-26
4,750004,20-20 10-26-26 Urea
...,...,...
249995,999995,14-35-14 17-17-17 28-28
249996,999996,Urea 20-20 10-26-26
249997,999997,DAP Urea 14-35-14
249998,999998,17-17-17 10-26-26 28-28
