# Data Processing

## Explore data

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.preprocessing import LabelEncoder,StandardScaler, OneHotEncoder
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import StratifiedKFold,cross_val_predict
from sklearn.metrics import accuracy_score, f1_score,precision_score,mean_squared_error, precision_score, recall_score, classification_report
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.pipeline import make_pipeline

In [None]:
# Locations of the competition's training and test splits
train_path = '/kaggle/input/simulates-real-world-agricultural-data/train.csv'
test_path = '/kaggle/input/simulates-real-world-agricultural-data/test.csv'

# Read both splits into DataFrames
train, test = pd.read_csv(train_path), pd.read_csv(test_path)

# Preview the first rows of the training split
train.head()

In [None]:
# Preview the first rows of the test split
test.head(n=5)

In [None]:
# Size of the test split relative to the training split
test_to_train_ratio = len(test) / len(train)
print(f'{test_to_train_ratio:.2f}')

In [None]:
# Count the missing values in every column of the training split
null_counts = train.isnull().sum()
print(null_counts)

In [None]:
# Column names, dtypes and non-null counts of the training split.
# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line.
train.info()

The training set has 10 columns and 750,000 rows, with no null values. 'Soil Type', 'Crop Type' and 'Fertilizer Name' are categorical; the other columns are numeric.

In [None]:
# Summary statistics of the numeric columns.
# Left as the cell's last expression so the notebook renders it as a table
# instead of a plain-text dump.
train.describe()

Based on the summary statistics, the value ranges are reasonable, so no normalization is needed (the tree-based model used below is also insensitive to feature scale).

# Data Preprocessing

### Encode Categorical Variables

In [None]:
# Label-encode the categorical columns in place.
# Each encoder is fitted on the training split and reused for the test split,
# so both splits share the same integer codes. le_fert is kept so that
# predictions can later be mapped back to fertilizer names.
#
# Guard: this cell overwrites the original string columns. Running it a second
# time would re-fit the encoders on the integer codes, after which le_fert
# could no longer recover the fertilizer names. Once the target is numeric the
# encoding has already been applied, so it is skipped.
if pd.api.types.is_numeric_dtype(train['Fertilizer Name']):
    print("Categorical columns are already encoded; skipping.")
else:
    le_soil = LabelEncoder()
    le_crop = LabelEncoder()
    le_fert = LabelEncoder()

    train['Soil Type'] = le_soil.fit_transform(train['Soil Type'])
    train['Crop Type'] = le_crop.fit_transform(train['Crop Type'])
    train['Fertilizer Name'] = le_fert.fit_transform(train['Fertilizer Name'])

    # The test split has no target column; only the two features are encoded
    test['Soil Type'] = le_soil.transform(test['Soil Type'])
    test['Crop Type'] = le_crop.transform(test['Crop Type'])

In [None]:
# The submission file needs the row identifiers, so keep them aside
test_ids = test['id']

# The identifier is not a feature: leave it out of the model inputs
X_test = test.drop('id', axis=1)

In [None]:
# Split the training frame into the prediction target and the model inputs
target_col = 'Fertilizer Name'
y_train = train[target_col]
X_train = train.drop(columns=['id', target_col])

### Check data balance

In [None]:
# Share of each fertilizer class in the training target
class_share = train['Fertilizer Name'].value_counts(normalize=True)

# The target column holds integer codes at this point; label the bars with the
# original fertilizer names so the chart can be read on its own
class_share.index = le_fert.inverse_transform(class_share.index)

ax = class_share.plot(kind='bar')
ax.set(
    title='Class distribution of Fertilizer Name (train)',
    xlabel='Fertilizer Name',
    ylabel='Share of training rows',
);

The class distribution for the target variable 'Fertilizer Name' is fairly balanced.

# Model Training

In [None]:
def evaluate_metrics(y_true, y_pred):
    """Print overall classification metrics followed by a per-class report.

    Accuracy is printed first, then support-weighted F1, precision and
    recall, then scikit-learn's classification report.

    Mean squared error is deliberately not reported: in this notebook the
    labels are LabelEncoder codes of fertilizer names, so the squared
    distance between two codes only reflects their alphabetical order.

    Parameters
    ----------
    y_true : array-like
        Ground-truth class labels.
    y_pred : array-like
        Predicted class labels, aligned with ``y_true``.

    Returns
    -------
    None
        All results are written to standard output.
    """
    # zero_division=0 scores a class with no predicted samples as 0 without
    # emitting a warning, consistent with the classification report below
    weighted = {"average": "weighted", "zero_division": 0}

    print("Accuracy:", round(accuracy_score(y_true, y_pred), 4))
    print("F1-score:", round(f1_score(y_true, y_pred, **weighted), 4))
    print("Precision:", round(precision_score(y_true, y_pred, **weighted), 4))
    print("Recall (Sensitivity):", round(recall_score(y_true, y_pred, **weighted), 4))
    print("\nClassification Report:")
    print(classification_report(y_true, y_pred, zero_division=0))

In [None]:
# Stratified 5-fold splitter: each fold keeps the class proportions of the
# target, and shuffling with a fixed seed makes the folds reproducible
cv = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=42,
)

### Model 1: XGBoost Model

In [None]:
# Number of target classes, read from the labels instead of being hard-coded
n_classes = y_train.nunique()

# Initialize XGB model.
# 'multi:softprob' is used instead of 'multi:softmax': the classifier's
# predict() still returns class labels, and predict_proba() (needed later for
# the top-3 ranking) returns per-class probabilities. With 'multi:softmax'
# some XGBoost releases do not return probabilities from predict_proba().
xgb = XGBClassifier(
    objective='multi:softprob',
    num_class=n_classes,
    random_state=42,
    n_estimators=150,
    learning_rate=0.1,
    max_depth=5
)

# Out-of-fold predictions for every training row, using the stratified `cv`
# splitter defined in the previous cell
cv_preds = cross_val_predict(xgb, X_train, y_train, cv=cv)

# Evaluate metrics on the out-of-fold predictions
print("Cross-Validation Results:")
evaluate_metrics(y_train, cv_preds)

# Confusion matrix, labelled with fertilizer names rather than integer codes
cm = confusion_matrix(y_train, cv_preds)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=le_fert.classes_)
disp.plot(cmap=plt.cm.Blues, values_format='d', xticks_rotation=45)
plt.title('XGBoost Confusion Matrix (Cross-Validation)')
plt.show()


In [None]:
# Refit the model on the complete training split before predicting on test
xgb.fit(X=X_train, y=y_train)

In [None]:
# Per-class probabilities for every test row
probs = xgb.predict_proba(X_test)

# argsort orders the class indices by ascending probability; reading the last
# three columns backwards gives the three most likely classes, best first
ranked_classes = np.argsort(probs, axis=1)
top3_preds = ranked_classes[:, :-4:-1]

In [None]:
# Map the encoded class indices back to fertilizer names.
# le_fert.classes_ holds the original labels ordered by their code, so indexing
# it with the whole (n_rows, 3) index array decodes every prediction at once
# instead of calling inverse_transform once per test row.
top3_names = le_fert.classes_[top3_preds]

In [None]:
# One submission string per test row: its three fertilizer names, space-separated
fertilizer_strings = list(map(' '.join, top3_names))

# Assemble the two-column submission table
submission = pd.DataFrame({'id': test_ids, 'Fertilizer Name': fertilizer_strings})

# Write it out with a header row and without the DataFrame index
output_path = 'submission.csv'
submission.to_csv(output_path, index=False)
print("Submission saved as 'submission.csv'")

In [None]:
# Sanity-check the first rows of the submission table
submission.head(n=5)