In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
cwd = os.getcwd()
print("Current working directory is {}".format(cwd))

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.compose import make_column_selector as selector

from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score

from datetime import datetime
import joblib

import lightgbm

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/playground-series-s5e6/sample_submission.csv
/kaggle/input/playground-series-s5e6/train.csv
/kaggle/input/playground-series-s5e6/test.csv
Current working directory is /kaggle/working


## Data preprocessing

In [2]:
def sort_columns(df):
    return df.sort_index(axis=1)


def describe_data(df):
    ## Basic statistics
    describe = df.describe(include='all')
    info = df.info()  # Return None, print df.info() directly to console.
    null_count = df.isnull().sum()
    ## Unique values
    unique_count = df.nunique()
    sample_size = df.shape[0]
    unique_ratio = unique_count / sample_size
    ## print data descriptions
    print("\n====== df:\n")
    print(df)
    print("\n====== describe:\n")
    print(describe)
    print("\n======info: \n")
    print(df.info())
    print("\n====== null_count: \n")
    print(null_count)
    print("\n====== unique_count: \n")
    print(unique_count)
    print("\n====== unique_ratio: \n")
    print(unique_ratio)

    data_description = {
        "describe": describe,
        "info": info,
        "null_count": null_count,
        "unique_count": unique_count,
        "sample_size": sample_size,
        "unique_ratio": unique_ratio
    }
    return data_description


def drop_columns(df, cols_to_drop):
    df = df.drop(columns=[col for col in cols_to_drop if col in df.columns])
    return df   

def intersect_train_test_columns(train_df, test_df):
    ## Find common columns between train and test
    common_cols = train_df.columns.intersection(test_df.columns)
    ## Keep only those columns
    train_aligned = train_df[common_cols].copy()
    test_aligned = test_df[common_cols].copy()
    return train_aligned, test_aligned

######
## Don't ever use dummies for one-hot encoding. Big issue when doing online prediction with new data.
## pd.get_dummies() will mess up the one-hot positions.
## Use OneHotEncoder from scikit-learn instead.
######
# def category_to_onehot(df, **kwargs):
#     return pd.get_dummies(df, **kwargs)

def build_onehot_encoder(train_df):
    cat_cols = train_df.select_dtypes(include=['object', 'category', 'bool']).columns.tolist()

    onehot_encoder = ColumnTransformer(
        transformers=[
            ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), cat_cols)
        ],
        remainder='passthrough'
    )
    onehot_encoder.fit(train_df)
    return onehot_encoder

In [3]:
# Parameter settings
labels = ["Fertilizer Name"]
timestamp = datetime.now().strftime('%Y%m%d')

# Read inputs
train_df = pd.read_csv("../input/playground-series-s5e6/train.csv")
train_df_src = train_df.copy()
test_df = pd.read_csv("../input/playground-series-s5e6/test.csv")
test_df_src = test_df.copy()
sample_df = pd.read_csv("../input/playground-series-s5e6/sample_submission.csv")


In [4]:
# train_df
# test_df_src
sample_df.iloc[0, 1]


'14-35-14 10-26-26 Urea'

In [5]:
train_df_src

Unnamed: 0,id,Temparature,Humidity,Moisture,Soil Type,Crop Type,Nitrogen,Potassium,Phosphorous,Fertilizer Name
0,0,37,70,36,Clayey,Sugarcane,36,4,5,28-28
1,1,27,69,65,Sandy,Millets,30,6,18,28-28
2,2,29,63,32,Sandy,Millets,24,12,16,17-17-17
3,3,35,62,54,Sandy,Barley,39,12,4,10-26-26
4,4,35,58,43,Red,Paddy,37,2,16,DAP
...,...,...,...,...,...,...,...,...,...,...
749995,749995,25,69,30,Clayey,Maize,8,16,6,28-28
749996,749996,37,64,58,Loamy,Sugarcane,38,8,20,17-17-17
749997,749997,35,68,59,Sandy,Ground Nuts,6,11,29,10-26-26
749998,749998,31,68,29,Red,Cotton,9,11,12,20-20


In [6]:
######
## Data preprocessing
######


## Describe data
train_df_description = describe_data(train_df)

## Sort data (Ensuring pd.get_dummies() gives consistent orders on train and test data.)
train_df = sort_columns(train_df)
test_df = sort_columns(test_df)

## Drop columns
cols_to_drop = ["id"]
train_df = drop_columns(train_df, cols_to_drop)
train_df

## Split labels from training data
train_feature_df = train_df.drop(columns=labels)
train_label_df = train_df[labels]

## Take the intersection of train and test features.
train_feature_df, test_df = intersect_train_test_columns(train_feature_df, test_df)

## Transform categorical features into one-hot encoding.
oh_encoder = build_onehot_encoder(train_feature_df)

train_encoded = pd.DataFrame(
    oh_encoder.transform(train_df),
    columns=oh_encoder.get_feature_names_out(),
    index=train_df.index
)
test_encoded = pd.DataFrame(
    oh_encoder.transform(test_df),
    columns=oh_encoder.get_feature_names_out(),
    index=test_df.index
)


# train_encoded = train_encoded.iloc[:200,]
# train_label_df = train_label_df.iloc[:200]

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 750000 entries, 0 to 749999
Data columns (total 10 columns):
 #   Column           Non-Null Count   Dtype 
---  ------           --------------   ----- 
 0   id               750000 non-null  int64 
 1   Temparature      750000 non-null  int64 
 2   Humidity         750000 non-null  int64 
 3   Moisture         750000 non-null  int64 
 4   Soil Type        750000 non-null  object
 5   Crop Type        750000 non-null  object
 6   Nitrogen         750000 non-null  int64 
 7   Potassium        750000 non-null  int64 
 8   Phosphorous      750000 non-null  int64 
 9   Fertilizer Name  750000 non-null  object
dtypes: int64(7), object(3)
memory usage: 57.2+ MB


            id  Temparature  Humidity  Moisture Soil Type    Crop Type  \
0            0           37        70        36    Clayey    Sugarcane   
1            1           27        69        65     Sandy      Millets   
2            2           29        63        32     Sandy     

  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


In [7]:
train_label_df.values.ravel()

array(['28-28', '28-28', '17-17-17', ..., '10-26-26', '20-20', 'Urea'],
      dtype=object)

In [8]:
test_encoded

Unnamed: 0,cat__Crop Type_Barley,cat__Crop Type_Cotton,cat__Crop Type_Ground Nuts,cat__Crop Type_Maize,cat__Crop Type_Millets,cat__Crop Type_Oil seeds,cat__Crop Type_Paddy,cat__Crop Type_Pulses,cat__Crop Type_Sugarcane,cat__Crop Type_Tobacco,...,cat__Soil Type_Clayey,cat__Soil Type_Loamy,cat__Soil Type_Red,cat__Soil Type_Sandy,remainder__Humidity,remainder__Moisture,remainder__Nitrogen,remainder__Phosphorous,remainder__Potassium,remainder__Temparature
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,70.0,52.0,34.0,24.0,11.0,31.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,62.0,45.0,30.0,15.0,14.0,27.0
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,72.0,28.0,14.0,4.0,15.0,28.0
3,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,53.0,57.0,18.0,36.0,17.0,37.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,55.0,32.0,13.0,14.0,19.0,31.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
249995,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,66.0,30.0,14.0,18.0,7.0,26.0
249996,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,62.0,55.0,28.0,7.0,14.0,33.0
249997,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,53.0,64.0,28.0,27.0,11.0,36.0
249998,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,67.0,26.0,33.0,10.0,0.0,36.0


## Training and Testing

In [9]:
######
## Train models with cross_val_score()
######
rf_args = {
    "n_estimators": 200,
    "criterion": "gini",
    "max_depth": 15,
    "min_samples_split": 5,
    "min_samples_leaf": 6,
    "random_state": 42,
    "max_features": "sqrt",
    "n_jobs": -1
}
x_rf = train_encoded
y_rf = train_label_df.values.ravel()

rf = RandomForestClassifier(**rf_args)
rf_cv_scores = cross_val_score(rf, x_rf, y_rf, cv=10, scoring='accuracy') # Train cv times of model.

print(f"Mean CV Score: {rf_cv_scores.mean():.4f}")
print(f"All Fold Scores: {rf_cv_scores}")
rf.fit(x_rf, y_rf)  # Train on the whole training set as the final model.
joblib.dump(rf, f'/kaggle/working/RandomForest_{timestamp}.joblib')

# Predict on training data
train_preds = rf.predict(x_rf)

# Calculate training accuracy
train_accuracy = accuracy_score(y_rf, train_preds)
print(f"Training Accuracy: {train_accuracy:.4f}")

importances = dict(zip(rf.feature_names_in_, rf.feature_importances_))
sorted_importances = dict(sorted(
    zip(rf.feature_names_in_, rf.feature_importances_),
    key=lambda x: x[1],
    reverse=True
))


Mean CV Score: 0.1790
All Fold Scores: [0.18026667 0.17882667 0.17936    0.18025333 0.17794667 0.17844
 0.17948    0.17694667 0.17994667 0.17806667]
Training Accuracy: 0.4167


### Load and use pretrained LightGBM

In [10]:
# rf = joblib.load('RandomForest_20240624T170312.joblib')

In [11]:
dict(sorted(
    zip(rf.feature_names_in_, rf.feature_importances_),
    key=lambda x: x[1],
    reverse=True
))

{'remainder__Phosphorous': 0.16612198040519235,
 'remainder__Nitrogen': 0.16390325575246817,
 'remainder__Moisture': 0.15701484353631912,
 'remainder__Humidity': 0.14469667802249642,
 'remainder__Potassium': 0.13224116199386102,
 'remainder__Temparature': 0.12776323939299894,
 'cat__Soil Type_Red': 0.010808283898609265,
 'cat__Soil Type_Sandy': 0.010663529371173796,
 'cat__Soil Type_Loamy': 0.010401377703307239,
 'cat__Soil Type_Black': 0.010273183017325854,
 'cat__Soil Type_Clayey': 0.009660031058630026,
 'cat__Crop Type_Pulses': 0.008473578408478987,
 'cat__Crop Type_Sugarcane': 0.00678911648951804,
 'cat__Crop Type_Paddy': 0.0050114902923918125,
 'cat__Crop Type_Ground Nuts': 0.004983513459886015,
 'cat__Crop Type_Tobacco': 0.004728395069272474,
 'cat__Crop Type_Wheat': 0.004548604514969221,
 'cat__Crop Type_Oil seeds': 0.004508697959165533,
 'cat__Crop Type_Cotton': 0.0044650377949471964,
 'cat__Crop Type_Millets': 0.00434344069020597,
 'cat__Crop Type_Maize': 0.004328324715317063,

In [12]:
# ######
# ## Train models with KFold()
# ######

# kf = KFold(n_splits=10, shuffle=True, random_state=42)
# rf = RandomForestClassifier(n_estimators=100, max_features='sqrt', random_state=42)

# fold = 1
# for train_index, val_index in kf.split(train_encoded):
#     X_train, X_val = train_encoded.iloc[train_index], train_encoded.iloc[val_index]
#     y_train, y_val = train_label_df.values.ravel()[train_index], train_label_df.values.ravel()[val_index]
    
#     rf.fit(X_train, y_train)
#     preds = rf.predict(X_val)
#     acc = accuracy_score(y_val, preds)
    
#     print(f"Fold {fold} Accuracy: {acc:.4f}")
#     fold += 1

## Prediction

### Single Label Prediction

In [13]:
# test_predict = rf.predict(test_encoded)
# print(test_predict)

### MAP@5 Prediction

In [14]:
# probs = rf.predict_proba(test_encoded)

# top5_predict = np.argsort(probs, axis=1)[:, -5:][:, ::-1]
# submission = pd.DataFrame({
#     'id': test_df_src['id'].values,
# })
# submission["Fertilizer Name"] = [
#     " ".join(rf.classes_[row]) for row in top5_predict
# ]

# submission.to_csv('submission.csv', index=False)
# print(submission)


### Meta-learner Prediction

In [15]:
train_probs = rf.predict_proba(train_encoded)
test_probs = rf.predict_proba(test_encoded)

train_meta_features = pd.DataFrame(
    data=train_probs,
    columns=rf.classes_
)

test_meta_features = pd.DataFrame(
    data=test_probs,
    columns=rf.classes_
)

top_num = 6  # Natural numbers or None to take all.
top_features = list(sorted_importances.keys()) if top_num is None else list(sorted_importances.keys())[:top_num]


extended_train_meta_features = pd.merge(train_encoded[top_features], train_meta_features, left_index=True, right_index=True, how="left")
extended_test_meta_features = pd.merge(test_encoded[top_features], test_meta_features, left_index=True, right_index=True, how="left")

extended_train_meta_features = sort_columns(extended_train_meta_features)
extended_test_meta_features = sort_columns(extended_test_meta_features)

In [16]:
extended_train_meta_features

Unnamed: 0,10-26-26,14-35-14,17-17-17,20-20,28-28,DAP,Urea,remainder__Humidity,remainder__Moisture,remainder__Nitrogen,remainder__Phosphorous,remainder__Potassium,remainder__Temparature
0,0.146179,0.144740,0.126802,0.159793,0.238030,0.099825,0.084631,70.0,36.0,36.0,5.0,4.0,37.0
1,0.125272,0.122152,0.130744,0.153646,0.192992,0.140754,0.134439,69.0,65.0,30.0,18.0,6.0,27.0
2,0.143289,0.154393,0.155372,0.137251,0.142627,0.127461,0.139607,63.0,32.0,24.0,16.0,12.0,29.0
3,0.142098,0.147571,0.140663,0.139917,0.146068,0.140869,0.142814,62.0,54.0,39.0,4.0,12.0,35.0
4,0.141351,0.146645,0.147174,0.133775,0.146558,0.150665,0.133831,58.0,43.0,37.0,16.0,2.0,35.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
749995,0.147809,0.143444,0.151686,0.154886,0.152940,0.131520,0.117715,69.0,30.0,8.0,6.0,16.0,25.0
749996,0.139655,0.186867,0.147523,0.168364,0.172923,0.104941,0.079727,64.0,58.0,38.0,20.0,8.0,37.0
749997,0.164382,0.126302,0.168625,0.151373,0.169560,0.093764,0.125994,68.0,59.0,6.0,29.0,11.0,35.0
749998,0.143736,0.151344,0.144530,0.150737,0.144895,0.128937,0.135822,68.0,29.0,9.0,12.0,11.0,31.0


In [17]:
# x and y
x_meta = extended_train_meta_features
y_meta = train_label_df.values.ravel()
print(y_meta)

lgbm_args = {
    "boosting_type": 'gbdt',
    "n_estimators": 10,
    "learning_rate": 0.05,
    "num_leaves": 31,
    "max_depth": 6,
    "random_state": 42,
    "reg_alpha": 1.0, 
    "reg_lambda": 1.0
}
lgbm = lightgbm.LGBMClassifier(**lgbm_args)

lgbm_cv_scores = cross_val_score(lgbm, x_meta, y_meta, cv=10, scoring='accuracy')  # or use a custom MAP@5 scorer if you have one

print(f"Mean CV Score: {lgbm_cv_scores.mean():.4f}")
print(f"All Fold Scores: {lgbm_cv_scores}")

lgbm.fit(x_meta, y_meta)
lgbm.booster_.save_model('/kaggle/working/LightGBM_{}.txt'.format(timestamp))


# Predict on training data
train_preds = lgbm.predict(x_meta)

# # Method 1: Using .score()
# train_accuracy = lgbm.score(x_meta, y_meta)

# Method 2: Using accuracy_score
train_accuracy = accuracy_score(y_meta, train_preds)
print(f"Training Accuracy: {train_accuracy:.4f}")


['28-28' '28-28' '17-17-17' ... '10-26-26' '20-20' 'Urea']
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.014973 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1969
[LightGBM] [Info] Number of data points in the train set: 675000, number of used features: 13
[LightGBM] [Info] Start training from score -1.884860
[LightGBM] [Info] Start training from score -1.880061
[LightGBM] [Info] Start training from score -1.897545
[LightGBM] [Info] Start training from score -1.911545
[LightGBM] [Info] Start training from score -1.909123
[LightGBM] [Info] Start training from score -2.067671
[LightGBM] [Info] Start training from score -2.094836
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.016546 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force

In [18]:
print(len(y_meta))

750000


### Load and use pretrained LightGBM

In [19]:
# lgbm = lightgbm.Booster(model_file='LightGBM_xxxx.txt')


### Create submission file

In [20]:
probs = lgbm.predict_proba(extended_test_meta_features)

top5_predict = np.argsort(probs, axis=1)[:, -5:][:, ::-1]
submission = pd.DataFrame({
    'id': test_df_src['id'].values,
})
submission["Fertilizer Name"] = [
    " ".join(rf.classes_[row]) for row in top5_predict
]

submission.to_csv('submission.csv', index=False)
print(submission)

            id                         Fertilizer Name
0       750000           20-20 Urea 17-17-17 28-28 DAP
1       750001   17-17-17 20-20 10-26-26 Urea 14-35-14
2       750002       28-28 DAP 20-20 10-26-26 17-17-17
3       750003   14-35-14 17-17-17 20-20 Urea 10-26-26
4       750004  20-20 10-26-26 17-17-17 14-35-14 28-28
...        ...                                     ...
249995  999995  17-17-17 10-26-26 28-28 14-35-14 20-20
249996  999996   10-26-26 14-35-14 Urea 20-20 17-17-17
249997  999997     DAP 10-26-26 14-35-14 Urea 17-17-17
249998  999998       17-17-17 10-26-26 28-28 DAP 20-20
249999  999999  10-26-26 14-35-14 28-28 20-20 17-17-17

[250000 rows x 2 columns]


In [21]:
submission_timestamp = os.path.getmtime('submission.csv')
print("Last modified:", datetime.fromtimestamp(submission_timestamp).strftime('%Y-%m-%d %H:%M:%S'))


Last modified: 2025-06-24 22:51:29


In [22]:
rf.classes_

array(['10-26-26', '14-35-14', '17-17-17', '20-20', '28-28', 'DAP',
       'Urea'], dtype=object)

In [23]:
rf.feature_names_in_

array(['cat__Crop Type_Barley', 'cat__Crop Type_Cotton',
       'cat__Crop Type_Ground Nuts', 'cat__Crop Type_Maize',
       'cat__Crop Type_Millets', 'cat__Crop Type_Oil seeds',
       'cat__Crop Type_Paddy', 'cat__Crop Type_Pulses',
       'cat__Crop Type_Sugarcane', 'cat__Crop Type_Tobacco',
       'cat__Crop Type_Wheat', 'cat__Soil Type_Black',
       'cat__Soil Type_Clayey', 'cat__Soil Type_Loamy',
       'cat__Soil Type_Red', 'cat__Soil Type_Sandy',
       'remainder__Humidity', 'remainder__Moisture',
       'remainder__Nitrogen', 'remainder__Phosphorous',
       'remainder__Potassium', 'remainder__Temparature'], dtype=object)

In [24]:
submission

Unnamed: 0,id,Fertilizer Name
0,750000,20-20 Urea 17-17-17 28-28 DAP
1,750001,17-17-17 20-20 10-26-26 Urea 14-35-14
2,750002,28-28 DAP 20-20 10-26-26 17-17-17
3,750003,14-35-14 17-17-17 20-20 Urea 10-26-26
4,750004,20-20 10-26-26 17-17-17 14-35-14 28-28
...,...,...
249995,999995,17-17-17 10-26-26 28-28 14-35-14 20-20
249996,999996,10-26-26 14-35-14 Urea 20-20 17-17-17
249997,999997,DAP 10-26-26 14-35-14 Urea 17-17-17
249998,999998,17-17-17 10-26-26 28-28 DAP 20-20
