In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load the training data
train = pd.read_csv('../data/playground-series-s5e6/train.csv')
# test  = pd.read_csv('../data/playground-series-s5e6/test.csv')

# Overview
print(train.shape) # , test.shape
# DataFrame.info() prints its report and returns None, so it is called as a plain
# statement; passing its return value to display() would render a stray "None".
train.info()
display(train.head())


(750000, 10)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 750000 entries, 0 to 749999
Data columns (total 10 columns):
 #   Column           Non-Null Count   Dtype 
---  ------           --------------   ----- 
 0   id               750000 non-null  int64 
 1   Temparature      750000 non-null  int64 
 2   Humidity         750000 non-null  int64 
 3   Moisture         750000 non-null  int64 
 4   Soil Type        750000 non-null  object
 5   Crop Type        750000 non-null  object
 6   Nitrogen         750000 non-null  int64 
 7   Potassium        750000 non-null  int64 
 8   Phosphorous      750000 non-null  int64 
 9   Fertilizer Name  750000 non-null  object
dtypes: int64(7), object(3)
memory usage: 57.2+ MB


Unnamed: 0,id,Temparature,Humidity,Moisture,Soil Type,Crop Type,Nitrogen,Potassium,Phosphorous,Fertilizer Name
0,0,37,70,36,Clayey,Sugarcane,36,4,5,28-28
1,1,27,69,65,Sandy,Millets,30,6,18,28-28
2,2,29,63,32,Sandy,Millets,24,12,16,17-17-17
3,3,35,62,54,Sandy,Barley,39,12,4,10-26-26
4,4,35,58,43,Red,Paddy,37,2,16,DAP


None

In [2]:
# Print the name of every column in the training frame
print(list(train.columns))

# Print only the columns stored with the 'object' dtype
print(train.select_dtypes(include=['object']).columns.to_list())

['id', 'Temparature', 'Humidity', 'Moisture', 'Soil Type', 'Crop Type', 'Nitrogen', 'Potassium', 'Phosphorous', 'Fertilizer Name']
['Soil Type', 'Crop Type', 'Fertilizer Name']


In [3]:
# Per-column count of missing values; only columns with at least one are printed
missing_train = train.isna().sum()
print("NaN dans train :\n", missing_train.loc[missing_train.gt(0)])


NaN dans train :
 Series([], dtype: int64)


In [4]:
from sklearn.impute import SimpleImputer

# Columns that must not be imputed: the row identifier and the target label.
# Filling a missing label with the most frequent class would fabricate training
# targets, and the identifier is not a feature.
excluded_cols = ['id', 'Fertilizer Name']

# Column selection (feature columns only)
num_cols = [c for c in train.select_dtypes(include='number').columns if c not in excluded_cols]
cat_cols = [c for c in train.select_dtypes(include='object').columns if c not in excluded_cols]

# Imputers
num_imputer = SimpleImputer(strategy='median')
cat_imputer = SimpleImputer(strategy='most_frequent')

# Fit on train only; the fitted imputers are meant to be reused with transform()
# on test so that no test statistics leak into the fill values.
# The median imputer returns a float array, so the original dtypes are restored
# afterwards. This is lossless: a column that was integer had no NaN to fill,
# and a column that had NaN was already float.
num_dtypes = train[num_cols].dtypes
train[num_cols] = pd.DataFrame(
    num_imputer.fit_transform(train[num_cols]),
    columns=num_cols,
    index=train.index,
).astype(num_dtypes)
# test[num_cols]  = num_imputer.transform(test[num_cols])

train[cat_cols] = cat_imputer.fit_transform(train[cat_cols])
# test[cat_cols]  = cat_imputer.transform(test[cat_cols])

# Final check
print("NaN restants dans train :", train.isnull().sum().sum())
# print("NaN restants dans test  :", test.isnull().sum().sum())

NaN restants dans train : 0


In [5]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier

# Feature columns: everything except the row identifier and the target
feature_cols = train.columns[~train.columns.isin(['id', 'Fertilizer Name'])].tolist()

# Split into feature matrix and target vector
X_train = train.loc[:, feature_cols]
y_train = train.loc[:, 'Fertilizer Name']

# Recompute the numeric / categorical column lists from X_train itself
num_cols = list(X_train.select_dtypes(include=['number']).columns)
cat_cols = list(X_train.select_dtypes(include=['object']).columns)

# Numeric branch: median imputation followed by standardisation
numeric_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical branch: most-frequent imputation followed by dense one-hot encoding;
# categories unseen during fit are ignored at transform time
categorical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
])

# Route each column group to its own branch
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_pipeline, num_cols),
    ('cat', categorical_pipeline, cat_cols),
])

# Full pipeline: preprocessing, then a random forest classifier
model_pipeline = Pipeline(steps=[
    ('preproc', preprocessor),
    ('clf', RandomForestClassifier(n_estimators=100, random_state=42)),
])

# Fit; assigning the result keeps the cell from echoing the estimator repr
model_pipeline = model_pipeline.fit(X_train, y_train)


In [6]:
# Top-1 accuracy computed on X_train / y_train, i.e. a training-set score
acc_train = model_pipeline.score(X=X_train, y=y_train)
print(
    f"Accuracy (top-1) sur le train : {acc_train:.3f}"
)

Accuracy (top-1) sur le train : 1.000


In [7]:
import numpy as np

# Raw class probabilities: shape (n_samples, n_classes)
proba_train = model_pipeline.predict_proba(X_train)

# Column indices of the 3 most probable classes, most probable first:
# ascending argsort -> keep the last 3 columns -> reverse their order
top3_idxs = np.argsort(proba_train, axis=1)[:, -3:][:, ::-1]

# Fertilizer names in the column order used by the classifier
classes = model_pipeline.named_steps['clf'].classes_

# Readable top-3: a single vectorized lookup instead of a Python-level loop over
# every row; .tolist() keeps the list-of-lists structure used below
top3_preds = classes[top3_idxs].tolist()

# Display
# NOTE(review): the value printed after "id=" is the DataFrame index, not the 'id'
# column; presumably the two coincide for this dataset - confirm.
print("Top-3 prédites pour les 5 premiers exemples :")
for i, preds in enumerate(top3_preds[:5]):
    true = y_train.iloc[i]
    print(f"  id={X_train.index[i]}  ⇒  vrai : {true:15s}  prédit : {preds}")


Top-3 prédites pour les 5 premiers exemples :
  id=0  ⇒  vrai : 28-28            prédit : ['28-28', '20-20', '14-35-14']
  id=1  ⇒  vrai : 28-28            prédit : ['28-28', '10-26-26', 'DAP']
  id=2  ⇒  vrai : 17-17-17         prédit : ['17-17-17', '14-35-14', '28-28']
  id=3  ⇒  vrai : 10-26-26         prédit : ['10-26-26', '17-17-17', 'Urea']
  id=4  ⇒  vrai : DAP              prédit : ['DAP', '28-28', '14-35-14']
