In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only Kaggle input tree and print the full path of every file in it.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# Report the directory the notebook is running in.
cwd = os.getcwd()
print("Current working directory is {}".format(cwd))

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.model_selection import cross_validate, cross_val_score
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import accuracy_score, log_loss

from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.compose import make_column_selector as selector

import lightgbm as lgb
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

from datetime import datetime
import joblib



# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/playground-series-s5e6/sample_submission.csv
/kaggle/input/playground-series-s5e6/train.csv
/kaggle/input/playground-series-s5e6/test.csv
/kaggle/input/predicting-optimal-fertilizers-1/sample_submission.csv
/kaggle/input/predicting-optimal-fertilizers-1/train.csv
/kaggle/input/predicting-optimal-fertilizers-1/test.csv
Current working directory is /kaggle/working


## Data preprocessing

In [2]:
def sort_columns(df):
    """Return `df` with its columns reordered by sorted column label.

    Only the column order changes; the result comes from `sort_index`,
    so the input frame itself is not reordered.
    """
    column_sorted = df.sort_index(axis="columns")
    return column_sorted

def describe_data(df):
    """Print a summary of `df` and return the computed statistics.

    Parameters:
    - df: pandas DataFrame to summarise.

    Returns:
    - dict with the keys:
        "describe"     – result of df.describe(include='all')
        "info"         – text report produced by df.info(), as a string
        "null_count"   – number of null values per column
        "unique_count" – number of distinct values per column
        "sample_size"  – number of rows
        "unique_ratio" – unique_count divided by sample_size
    """
    import io  # local import: only needed to capture the df.info() report

    ## Basic statistics
    describe = df.describe(include='all')
    # df.info() writes its report to a buffer and returns None, so capture the
    # text explicitly instead of storing (and later printing) that None.
    info_buffer = io.StringIO()
    df.info(buf=info_buffer)
    info = info_buffer.getvalue()
    null_count = df.isnull().sum()
    ## Unique values
    unique_count = df.nunique()
    sample_size = df.shape[0]
    unique_ratio = unique_count / sample_size
    ## print data descriptions
    print("\n====== df:\n")
    print(df)
    print("\n====== describe:\n")
    print(describe)
    print("\n======info: \n")
    print(info)
    print("\n====== null_count: \n")
    print(null_count)
    print("\n====== unique_count: \n")
    print(unique_count)
    print("\n====== unique_ratio: \n")
    print(unique_ratio)

    data_description = {
        "describe": describe,
        "info": info,
        "null_count": null_count,
        "unique_count": unique_count,
        "sample_size": sample_size,
        "unique_ratio": unique_ratio
    }
    return data_description

def drop_columns(df, cols_to_drop):
    """Return `df` without the columns named in `cols_to_drop`.

    Names that are not columns of `df` are skipped rather than raising.
    """
    present = [name for name in cols_to_drop if name in df.columns]
    return df.drop(columns=present)

def intersect_train_test_columns(train_df, test_df):
    """Restrict both frames to the columns they have in common.

    Returns a `(train, test)` tuple of copies, each holding only the shared
    columns, selected with the same column index for both frames.
    """
    shared = train_df.columns.intersection(test_df.columns)
    return train_df[shared].copy(), test_df[shared].copy()

######
## Avoid pd.get_dummies() for one-hot encoding. It derives the output columns from whatever
## data it is given, so the one-hot positions can differ between training data and new data
## at prediction time. Use scikit-learn's OneHotEncoder (fitted once on the training data) instead.
######
# def category_to_onehot(df, **kwargs):
#     return pd.get_dummies(df, **kwargs)

def transform_numeric_to_category(df):
    """Cast every int64/float64 column of `df` to the pandas `category` dtype.

    The conversion is done in place on `df`, and the same frame is returned
    for convenience. Columns of any other dtype are left untouched.
    """
    numeric_cols = df.select_dtypes(include=['int64', 'float64']).columns
    for name in numeric_cols:
        df[name] = df[name].astype('category')
    return df

def build_label_encoders(train_df):
    '''
    Fit one LabelEncoder per categorical column of `train_df`.

    LabelEncoder is strictly meant for 1D arrays (i.e., one column at a time),
    so each column of dtype object, category or bool gets its own encoder,
    fitted on the column's values cast to str.

    Parameters:
    - train_df: DataFrame whose categorical columns define the label vocabularies.
      It is NOT modified.

    Returns:
    - encoders: dict mapping column name -> fitted LabelEncoder
    '''
    cat_cols = train_df.select_dtypes(include=['object', 'category', 'bool']).columns.tolist()
    encoders = dict()
    for col in cat_cols:
        le = LabelEncoder()
        # Cast to str for a consistent type, but on a temporary Series so the
        # caller's frame keeps its original dtypes.
        le.fit(train_df[col].astype(str))
        encoders[col] = le
    return encoders

def build_onehot_encoder(train_df):
    """Fit and return a ColumnTransformer that one-hot encodes categorical columns.

    Columns of dtype object, category or bool are routed to a OneHotEncoder
    configured with handle_unknown='ignore' and sparse_output=False; every
    other column is passed through. The transformer is fitted on `train_df`.
    """
    categorical = train_df.select_dtypes(include=['object', 'category', 'bool']).columns.tolist()
    one_hot = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
    transformer = ColumnTransformer(
        transformers=[('cat', one_hot, categorical)],
        remainder='passthrough',
    )
    transformer.fit(train_df)
    return transformer

def mapk(y_true, y_pred_probs, k=3, labels=None):
    """
    Compute Mean Average Precision at K (MAP@k).

    Each sample has exactly one true label, so its score is 1 / rank of the
    true label within the top-k predictions (0 if it is not among them).

    Parameters:
    - y_true: array-like of shape (n_samples,) – true labels (can be str or int)
    - y_pred_probs: array-like of shape (n_samples, n_classes) – predicted class probabilities
      (nested lists are accepted as well as NumPy arrays)
    - k: int – number of top predictions to consider
    - labels: list – ordered list of label names corresponding to columns in y_pred_probs

    Returns:
    - map_k: float – MAP@k score; 0.0 when there are no samples

    Raises:
    - ValueError: if `labels` is not provided
    """
    if labels is None:
        raise ValueError("You must provide a list of label names in `labels` to map predicted indices to class names.")

    y_true = np.asarray(y_true)
    n_samples = len(y_true)
    if n_samples == 0:
        # Nothing to score; avoid dividing by zero below.
        return 0.0

    # Convert first so plain (nested) lists can be negated for a descending sort.
    probs = np.asarray(y_pred_probs)
    top_k_preds = np.argsort(-probs, axis=1)[:, :k]  # Get top-k predicted indices

    score = 0.0
    for i in range(n_samples):
        true_label = y_true[i]
        predicted_labels = [labels[idx] for idx in top_k_preds[i]]

        if true_label in predicted_labels:
            rank = predicted_labels.index(true_label)
            score += 1.0 / (rank + 1)

    return score / n_samples

In [3]:
# Parameter settings
labels = ["Fertilizer Name"]                   # target column(s)
timestamp = datetime.now().strftime('%Y%m%d')  # date stamp (YYYYMMDD)
output_pred_num = 3                            # number of ranked predictions per test row
TRAIN_REPEATS = 4                              # copies of the competition train set to stack

# Read inputs (the *_src copies keep the data exactly as loaded)
train_df = pd.read_csv("/kaggle/input/playground-series-s5e6/train.csv")
train_df_src = train_df.copy()
test_df = pd.read_csv("/kaggle/input/playground-series-s5e6/test.csv")
test_df_src = test_df.copy()
sample_df = pd.read_csv("/kaggle/input/playground-series-s5e6/sample_submission.csv")

# Data extension
external_df = pd.read_csv("/kaggle/input/predicting-optimal-fertilizers-1/train.csv")

# Stack TRAIN_REPEATS copies of train_df (4 copies in total, not original + 4)
train_df_repeated = pd.concat([train_df] * TRAIN_REPEATS, ignore_index=True)

# Append external_df, renumber the rows, then shuffle them reproducibly
train_df = (
    pd.concat([train_df_repeated, external_df])
    .reset_index(drop=True)
    .sample(frac=1, random_state=42)
)

In [4]:
# Display the training frame built in the previous cell
# (repeated competition data plus external data, shuffled).
train_df

Unnamed: 0,id,Temparature,Humidity,Moisture,Soil Type,Crop Type,Nitrogen,Potassium,Phosphorous,Fertilizer Name
1494458,744458,32,50,35,Red,Sugarcane,25,12,14,14-35-14
3649276,649276,27,72,38,Black,Pulses,5,14,5,DAP
1885555,385555,35,67,33,Red,Wheat,26,1,18,20-20
613255,613255,30,68,45,Clayey,Tobacco,20,12,41,10-26-26
3639795,639795,35,67,34,Black,Oil seeds,35,5,35,10-26-26
...,...,...,...,...,...,...,...,...,...,...
2356330,106330,28,56,27,Red,Maize,36,18,17,17-17-17
3511566,511566,34,58,59,Clayey,Pulses,13,10,9,20-20
2229084,729084,28,50,63,Sandy,Millets,30,15,28,28-28
2768307,518307,33,58,26,Clayey,Maize,4,8,20,20-20


In [5]:
# train_df
# test_df_src
# Look at the prediction cell of the first sample-submission row (row 0, column 1)
# to see the expected submission format.
sample_df.iloc[0, 1]


'14-35-14 10-26-26 Urea'

In [6]:
# Display the copy of the competition training data taken right after loading.
train_df_src

Unnamed: 0,id,Temparature,Humidity,Moisture,Soil Type,Crop Type,Nitrogen,Potassium,Phosphorous,Fertilizer Name
0,0,37,70,36,Clayey,Sugarcane,36,4,5,28-28
1,1,27,69,65,Sandy,Millets,30,6,18,28-28
2,2,29,63,32,Sandy,Millets,24,12,16,17-17-17
3,3,35,62,54,Sandy,Barley,39,12,4,10-26-26
4,4,35,58,43,Red,Paddy,37,2,16,DAP
...,...,...,...,...,...,...,...,...,...,...
749995,749995,25,69,30,Clayey,Maize,8,16,6,28-28
749996,749996,37,64,58,Loamy,Sugarcane,38,8,20,17-17-17
749997,749997,35,68,59,Sandy,Ground Nuts,6,11,29,10-26-26
749998,749998,31,68,29,Red,Cotton,9,11,12,20-20


In [7]:
############
## Data Cleaning
############

## Describe data (prints a summary and returns the statistics as a dict)
train_df_description = describe_data(train_df)

## Sort columns by name so train and test features come in the same order.
train_df = sort_columns(train_df)
test_df = sort_columns(test_df)

## Drop columns
cols_to_drop = ["id"]
train_df = drop_columns(train_df, cols_to_drop)
test_df = drop_columns(test_df, cols_to_drop)

## Split labels from training data
train_feature_df = train_df.drop(columns=labels)
train_label_df = train_df[labels]

## Take the intersection of train and test features.
train_feature_df, test_df = intersect_train_test_columns(train_feature_df, test_df)

############
## Feature Engineering
############

## Transform categorical features into label encoding.
## (A copy is passed so the encoders are fitted without touching train_df.)
label_encoders = build_label_encoders(train_df.copy())

train_encoded = train_feature_df.copy()
for col, le in label_encoders.items():
    if col in train_encoded.columns:
        train_encoded[col] = le.transform(train_encoded[col].astype(str))
    else:
        print(f"Warning: Column '{col}' not found in train_encoded. Skipping...")

test_encoded = test_df.copy()
for col, le in label_encoders.items():
    if col in test_encoded.columns:
        test_encoded[col] = le.transform(test_encoded[col].astype(str))
    else:
        print(f"Warning: Column '{col}' not found in test_encoded. Skipping...")

train_label_encoded = train_label_df.copy()
for col in train_label_encoded.columns:
    if col in label_encoders.keys():
        train_label_encoded[col] = label_encoders[col].transform(train_label_encoded[col].astype(str))
        print(f"Train label column '{col}' is encoded with LabelEncoder.")

## Transform numerical features into categorical features.
train_encoded = transform_numeric_to_category(train_encoded)
test_encoded = transform_numeric_to_category(test_encoded)

## Give every test column exactly the categorical dtype of its train column.
## Casting train and test to 'category' independently lets each frame build its own
## category list, so the same value can get a different integer code in train and test
## whenever the observed value sets differ.
for col in train_encoded.columns:
    train_dtype = train_encoded[col].dtype
    if not isinstance(train_dtype, pd.CategoricalDtype):
        continue
    unseen_values = set(test_encoded[col].dropna().unique()) - set(train_dtype.categories)
    if unseen_values:
        print(f"Warning: Column '{col}' has {len(unseen_values)} test value(s) unseen in training; they become missing.")
    test_encoded[col] = test_encoded[col].astype(train_dtype)

# ## Transform categorical features into one-hot encoding.
# oh_encoder = build_onehot_encoder(train_feature_df)

# train_encoded = pd.DataFrame(
#     oh_encoder.transform(train_df),
#     columns=oh_encoder.get_feature_names_out(),
#     index=train_df.index
# )
# test_encoded = pd.DataFrame(
#     oh_encoder.transform(test_df),
#     columns=oh_encoder.get_feature_names_out(),
#     index=test_df.index
# )


# train_encoded = train_encoded.iloc[:200,]
# train_label_df = train_label_df.iloc[:200]

<class 'pandas.core.frame.DataFrame'>
Index: 3750000 entries, 1494458 to 2219110
Data columns (total 10 columns):
 #   Column           Dtype 
---  ------           ----- 
 0   id               int64 
 1   Temparature      int64 
 2   Humidity         int64 
 3   Moisture         int64 
 4   Soil Type        object
 5   Crop Type        object
 6   Nitrogen         int64 
 7   Potassium        int64 
 8   Phosphorous      int64 
 9   Fertilizer Name  object
dtypes: int64(7), object(3)
memory usage: 314.7+ MB


             id  Temparature  Humidity  Moisture Soil Type  Crop Type  \
1494458  744458           32        50        35       Red  Sugarcane   
3649276  649276           27        72        38     Black     Pulses   
1885555  385555           35        67        33       Red      Wheat   
613255   613255           30        68        45    Clayey    Tobacco   
3639795  639795           35        67        34     Black  Oil seeds   
...         ...          ...       ...       .

  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


Train label column 'Fertilizer Name' is encoded with LabelEncoder.


In [8]:
# Flatten the single-column label frame into a 1-D array of label values.
train_label_df.values.ravel()

array(['14-35-14', 'DAP', '20-20', ..., '28-28', '20-20', '28-28'],
      dtype=object)

In [9]:
# DataFrame.info() prints its report itself and returns None, so call it directly
# instead of interpolating its return value (which would print "None").
print('train_encoded.info():')
train_encoded.info()
print(f'{train_encoded}')
print('test_encoded.info():')
test_encoded.info()
print(f'{test_encoded}')
print('train_label_df.info():')
train_label_df.info()
print(f'{train_label_df}')

<class 'pandas.core.frame.DataFrame'>
Index: 3750000 entries, 1494458 to 2219110
Data columns (total 8 columns):
 #   Column       Dtype   
---  ------       -----   
 0   Crop Type    category
 1   Humidity     category
 2   Moisture     category
 3   Nitrogen     category
 4   Phosphorous  category
 5   Potassium    category
 6   Soil Type    category
 7   Temparature  category
dtypes: category(8)
memory usage: 57.2 MB
train_encoded.info():
None
        Crop Type Humidity Moisture Nitrogen Phosphorous Potassium Soil Type  \
1494458         8       50       35       25          14        12         3   
3649276         7       72       38        5           5        14         0   
1885555        10       67       33       26          18         1         3   
613255          9       68       45       20          41        12         1   
3639795         5       67       34       35          35         5         0   
...           ...      ...      ...      ...         ...       ...  

## Training and Testing

#### Stacking Method

In [10]:
# Random Forest core hyperparameters
rf_model_args = dict(
    n_estimators=200,
    max_depth=10,
    min_samples_split=5,
    min_samples_leaf=2,
    max_features='sqrt',        # Number of features considered at each split
    bootstrap=True,
    random_state=42,
    n_jobs=-1,                  # Use all available CPU cores
    verbose=1,
)

# Logistic Regression core hyperparameters
lr_model_args = dict(
    penalty='l2',
    C=1.0,                      # Inverse of regularization strength; lower = stronger regularization
    solver='lbfgs',
    max_iter=1000,
    random_state=42,
    # n_jobs=-1,                # Left out on purpose (noted as not supported with 'lbfgs')
    verbose=1,
)

# XGBoost core hyperparameters (with categorical support)
xgb_model_args = dict(
    n_estimators=300,
    tree_method='hist',         # Used together with enable_categorical
    enable_categorical=True,    # Native handling of categorical features (pandas dtype must be 'category')
    learning_rate=0.05,         # Step size shrinkage
    max_depth=6,
    subsample=0.8,              # Row subsampling per tree
    colsample_bytree=0.8,       # Feature subsampling per tree
    objective='multi:softprob', # Multi-class classification with probability output
    eval_metric='mlogloss',
    use_label_encoder=False,    # NOTE(review): deprecated in recent XGBoost releases - confirm it is still needed
    random_state=42,
    n_jobs=-1,
    verbosity=1,
)

# LightGBM core hyperparameters (meta-model)
lgbm_model_args = dict(
    objective='multiclass',     # Multiclass classification
    num_class=7,                # Must match the number of target classes
    boosting_type='gbdt',
    n_estimators=1000,
    # early_stopping_round=100, # No effect here; handled via .fit() with eval_set
    learning_rate=0.03,         # Smaller values = slower but potentially better learning
    num_leaves=31,              # Max leaves per tree (controls complexity)
    max_depth=7,
    random_state=42,
    reg_alpha=5.0,
    reg_lambda=5.0,
    device_type='gpu',          # Requires a GPU runtime; switch to 'cpu' otherwise
    n_jobs=-1,
    verbosity=1,
)

# Training inputs: encoded features and the raw (string) labels as a 1-D array.
x_train_stack = train_encoded
y_train_stack = train_label_df.values.ravel()

# Level-0 learners, instantiated in the same order they are listed.
base_models = [
    ('lr', LogisticRegression(**lr_model_args)),
    ('xgb', XGBClassifier(**xgb_model_args)),
    ('rf', RandomForestClassifier(**rf_model_args)),
]

# Level-1 learner (or another meta-model).
final_estimator = LGBMClassifier(**lgbm_model_args)

stack = StackingClassifier(
    estimators=base_models,
    final_estimator=final_estimator,
    stack_method='predict_proba',
    cv=5,
    passthrough=True,
    n_jobs=-1,
    verbose=1,
)

# scores = cross_val_score(stack, x_train_stack, y_train_stack, cv=5, scoring='neg_log_loss')
# print("Mean Log Loss:", -scores.mean())


stack.fit(x_train_stack, y_train_stack)
# y_pred_proba = stack.predict_proba(x_val)
# loss = log_loss(y_val, y_pred_proba)
# print("Validation Log Loss:", loss)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  1.7min
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  4.2min finished
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  7.3min
[Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed:  7.5min finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  5.9min finished
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  6.6min finished
[Parallel(n_jobs=-1)]: Done  49 tasks      | elapsed: 13.3min
[Parallel(n_jobs=-1)]: Done  49 tasks      | elapsed: 13.8min
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed: 15.9min finished
[Parallel(n_jobs=-1)]: Done  49 tasks      | elapsed: 16.4min
[Parallel(n_job

[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 5555
[LightGBM] [Info] Number of data points in the train set: 3750000, number of used features: 29
[LightGBM] [Info] Using GPU Device: Tesla T4, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...




[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 29 dense feature groups (114.44 MB) transferred to GPU in 0.172087 secs. 0 sparse feature groups
[LightGBM] [Info] Start training from score -1.884866
[LightGBM] [Info] Start training from score -1.880057
[LightGBM] [Info] Start training from score -1.897538
[LightGBM] [Info] Start training from score -1.911544
[LightGBM] [Info] Start training from score -1.909121
[LightGBM] [Info] Start training from score -2.067671
[LightGBM] [Info] Start training from score -2.094845


In [11]:
# ######
# ## Train models with cross_val_score()
# ######

# ## Set parameters
# rf_model_args = {
#     'n_estimators': 150,
#     'criterion': "gini",
#     'max_depth': 8,
#     'min_samples_split': 20,
#     'min_samples_leaf': 10,
#     'random_state': 42,
#     'max_features': "sqrt",
#     'n_jobs': -1,
#     'oob_score':True,
#     'max_samples': 0.85,
#     'verbose': 1,
# }

# rf_fit_cv_args = dict()

# rf_cv_args = {
#     'cv': 10,
#     'scoring': ["neg_log_loss", 'accuracy'],
#     'n_jobs': -1,
#     'verbose': 1,
#     'fit_params': rf_fit_cv_args,
#     'return_train_score': True,
# }

# x_rf = train_encoded
# y_rf = train_label_df.values.ravel()

# ## Training and validation
# rf = RandomForestClassifier(**rf_model_args)
# rf_cv_scores = cross_validate(rf, x_rf, y_rf, **rf_cv_args)
# rf_cv_avg_scores = dict()
# for k, v in rf_cv_scores.items():
#     mean_val = np.mean(v)
#     rf_cv_avg_scores[k] = mean_val
# print(f"\nMean CV Score:\n{rf_cv_avg_scores}")
# print(f"\nAll Fold Scores:\n{rf_cv_scores}")

# rf.fit(x_rf, y_rf) # Train on the whole training set as the final model.
# joblib.dump(rf, f'/kaggle/working/RandomForest_{timestamp}.joblib')

# ## Predict on training data
# train_preds = rf.predict(x_rf)

# ## Model analytics
# importances = dict(zip(rf.feature_names_in_, rf.feature_importances_))
# sorted_importances = dict(sorted(
#     zip(rf.feature_names_in_, rf.feature_importances_),
#     key=lambda x: x[1],
#     reverse=True
# ))


### Load and use pretrained Random Forest

In [12]:
# rf = joblib.load('RandomForest_20240624T170312.joblib')

In [13]:
# dict(sorted(
#     zip(rf.feature_names_in_, rf.feature_importances_),
#     key=lambda x: x[1],
#     reverse=True
# ))

In [14]:
# ######
# ## Train models with KFold()
# ######

# kf = KFold(n_splits=10, shuffle=True, random_state=42)
# rf = RandomForestClassifier(n_estimators=100, max_features='sqrt', random_state=42)

# fold = 1
# for train_index, val_index in kf.split(train_encoded):
#     X_train, X_val = train_encoded.iloc[train_index], train_encoded.iloc[val_index]
#     y_train, y_val = train_label_df.values.ravel()[train_index], train_label_df.values.ravel()[val_index]
    
#     rf.fit(X_train, y_train)
#     preds = rf.predict(X_val)
#     acc = accuracy_score(y_val, preds)
    
#     print(f"Fold {fold} Accuracy: {acc:.4f}")
#     fold += 1

## Prediction

### Single Label Prediction

In [15]:
# test_predict = rf.predict(test_encoded)
# print(test_predict)

### MAP@k Prediction

In [16]:
# probs = rf.predict_proba(test_encoded)

# top_predict = np.argsort(probs, axis=1)[:, -output_pred_num:][:, ::-1]
# submission = pd.DataFrame({
#     'id': test_df_src['id'].values,
# })
# submission["Fertilizer Name"] = [
#     " ".join(rf.classes_[row]) for row in top_predict
# ]

# submission.to_csv('submission.csv', index=False)
# print(submission)


### Meta-learner Prediction

#### Meta-learning feature creation

In [17]:
# train_probs = rf.predict_proba(train_encoded)
# test_probs = rf.predict_proba(test_encoded)

# train_meta_features = pd.DataFrame(
#     data=train_probs,
#     columns=rf.classes_
# )

# test_meta_features = pd.DataFrame(
#     data=test_probs,
#     columns=rf.classes_
# )

# top_num = None  # Natural numbers or None to take all.
# top_features = list(sorted_importances.keys()) if top_num is None else list(sorted_importances.keys())[:top_num]


# extended_train_meta_features = pd.merge(train_encoded[top_features], train_meta_features, left_index=True, right_index=True, how="left")
# extended_test_meta_features = pd.merge(test_encoded[top_features], test_meta_features, left_index=True, right_index=True, how="left")

# # extended_train_meta_features = train_encoded
# # extended_test_meta_features = test_encoded

# extended_train_meta_features = sort_columns(extended_train_meta_features)
# extended_test_meta_features = sort_columns(extended_test_meta_features)

In [18]:
# extended_train_meta_features

#### Define inputs

In [19]:
# x_meta = extended_train_meta_features
# y_meta = train_label_df.values.ravel()
# print(y_meta)

# lgbm_model_args = {
#     'objective': 'multiclass',
#     'num_class': 7,
#     'boosting_type': 'gbdt',
#     'n_estimators': 3000,
#     'early_stopping_round': 200,  # No effect, only for compatibility. Call early_stopping in fit directly.
#     'learning_rate': 0.03,
#     "num_leaves": 31,
#     'max_depth': 7,
#     'random_state': 42,
#     'reg_alpha': 5.0, 
#     'reg_lambda': 5.0,
#     'device_type': 'gpu',
#     'verbosity': -1,
# }

# lgbm_fit_cv_args = {
#     # 'eval_set': [(x_valid, y_valid)],
#     # 'callbacks': [lgb.early_stopping(stopping_rounds=50)],  # Early stopping not supported in sklearn cross_validate().
#     'eval_metric': ['multi_logloss', 'multi_error'],  
# }

# lgbm_fit_custom_args = {
#     'eval_set': None,
#     'eval_metric': ['multi_logloss', 'multi_error'],
#     'callbacks':[
#         lgb.early_stopping(stopping_rounds=lgbm_model_args['early_stopping_round']),
#         lgb.log_evaluation(period=40)
#     ],
# }

# lgbm_cv_args = {
#     'cv': 5,
#     'n_jobs': -1,
#     'verbose': 2,
#     'fit_params': lgbm_fit_cv_args
# }


#### Model with cross_validate

In [20]:
# lgbm = LGBMClassifier(**lgbm_model_args)
# lgbm_cv_scores = cross_validate(lgbm, x_meta, y_meta, **lgbm_cv_args)
# lgbm_cv_avg_scores = dict()
# for k, v in lgbm_cv_scores.items():
#     mean_val = np.mean(v)
#     lgbm_cv_avg_scores[k] = mean_val
# print(f"\nMean CV Score:\n{lgbm_cv_avg_scores}")
# print(f"\nAll Fold Scores:\n{lgbm_cv_scores}")

# lgbm.fit(x_meta, y_meta, **lgbm_fit_cv_args)
# lgbm.booster_.save_model('/kaggle/working/LightGBM_{}.txt'.format(timestamp))

# # Predict on training data
# train_preds = lgbm.predict(x_meta)

# # # Method 1: Using .score()
# # train_accuracy = lgbm.score(x_meta, y_meta)

# # Method 2: Using accuracy_score
# train_accuracy = accuracy_score(y_meta, train_preds)
# print(f"Training Accuracy: {train_accuracy:.4f}")

In [21]:
# #### Model with custom flow

# kfold_num = lgbm_cv_args['cv']
# kf = StratifiedKFold(n_splits=kfold_num, shuffle=True, random_state=42)
# test_pred_prob_all_folds = list()
# best_iters = dict()

# for fold, (train_idx, val_idx) in enumerate(kf.split(x_meta, y_meta)):
#     print(f'train_idx is {train_idx}, length is {len(train_idx)}')
#     print(f'val_idx is {val_idx}, length is {len(val_idx)}')
#     lgbm = LGBMClassifier(**lgbm_model_args)
#     x_train, x_val = x_meta.iloc[train_idx], x_meta.iloc[val_idx]
#     y_train, y_val = y_meta[train_idx], y_meta[val_idx]
#     lgbm_fit_custom_args['eval_set'] = [
#         (x_val, y_val),
#         (x_train, y_train)
#     ]
#     # lgbm_fit_custom_args['eval_set'] = [(x_val, y_val)]
#     lgbm.fit(x_train, y_train, **lgbm_fit_custom_args)
#     lgbm_fit_custom_args['eval_set'] = None
#     lgbm.booster_.save_model('/kaggle/working/LightGBM_fold{}_{}.txt'.format(fold, timestamp))
#     # Model evaluation
#     best_iter = lgbm.best_iteration_
#     best_iters[fold] = best_iter
#     num_iteration = best_iter
#     labels = lgbm._classes
    
#     eval_result = lgbm.evals_result_
#     final_logloss = eval_result['valid_0']['multi_logloss'][best_iter - 1]
#     final_error = eval_result['valid_0']['multi_error'][best_iter - 1]
#     print(f"[Fold {fold}] Final log loss: {final_logloss:.5f}, Final error: {final_error:.5f}")
    
#     pred_prob_train = lgbm.predict_proba(x_train, num_iteration=num_iteration)
#     pred_prob_val = lgbm.predict_proba(x_val, num_iteration=num_iteration)
#     pred_train = lgbm.predict(x_train, num_iteration=num_iteration)
#     pred_val = lgbm.predict(x_val, num_iteration=num_iteration)
#     # print("LightGBM Train Accuracy:", accuracy_score(y_train, pred_train))  
#     # print("LightGBM Validation Accuracy:", accuracy_score(y_val, pred_val)) 
    
#     mapk_train = mapk(y_train, pred_prob_train, k=3, labels=labels)
#     mapk_val = mapk(y_val, pred_prob_val, k=3, labels=labels)
#     print("LightGBM Train MAP3:", mapk_train)
#     print("LightGBM Validation MAP3:", mapk_val)
    
#     # Make prediction
#     test_pred_prob = lgbm.predict_proba(extended_test_meta_features, num_iteration=num_iteration)
#     test_pred_prob_all_folds.append(test_pred_prob)
#     print(f"\n[Fold {fold} is finished.]\n\n\n")
#     break
    
# # Shape: (n_folds, n_samples, n_classes)
# stacked_preds = np.stack(test_pred_prob_all_folds, axis=0)

# # Average across folds
# kfold_avg_probs = np.mean(stacked_preds, axis=0)

In [22]:
# test_pred_prob_all_folds

### Load and use pretrained LightGBM

In [23]:
# lgbm = lightgbm.Booster(model_file='LightGBM_xxxx.txt')

### Create submission file

In [24]:
# probs = lgbm.predict_proba(extended_test_meta_features)
# probs = kfold_avg_probs
# Model used for the submission; swap this single name to submit another model.
pred_model = stack
# Take the probabilities from the same object whose classes_ label the columns,
# so the column indices below always map to the right class names.
probs = pred_model.predict_proba(test_encoded)

# Column indices of the `output_pred_num` most probable classes per row, best first.
top_predict = np.argsort(probs, axis=1)[:, -output_pred_num:][:, ::-1]

submission = pd.DataFrame({
    'id': test_df_src['id'].values,
})
# One space-separated string of class names per test row.
submission["Fertilizer Name"] = [
    " ".join(pred_model.classes_[row]) for row in top_predict
]

submission.to_csv('submission.csv', index=False)
print(submission)

[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.6s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    2.5s
[Parallel(n_jobs=4)]: Done 200 out of 200 | elapsed:    2.6s finished


            id          Fertilizer Name
0       750000       DAP 10-26-26 28-28
1       750001      Urea 17-17-17 20-20
2       750002     20-20 14-35-14 28-28
3       750003    14-35-14 DAP 10-26-26
4       750004     28-28 10-26-26 20-20
...        ...                      ...
249995  999995   Urea 14-35-14 17-17-17
249996  999996      Urea 17-17-17 28-28
249997  999997        Urea DAP 14-35-14
249998  999998  28-28 10-26-26 17-17-17
249999  999999  14-35-14 17-17-17 20-20

[250000 rows x 2 columns]


In [25]:
# Show when submission.csv was last written, as a sanity check that it is fresh.
submission_timestamp = os.path.getmtime('submission.csv')
modified_at = datetime.fromtimestamp(submission_timestamp)
print("Last modified:", modified_at.strftime('%Y-%m-%d %H:%M:%S'))

Last modified: 2025-07-01 03:27:50
