In [1]:
import pandas as pd
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

___
### analyse

In [2]:
# Load the "penguins" example dataset bundled with seaborn into a DataFrame.
df = sns.load_dataset(name='penguins')

# Preview the first five rows to get a feel for the columns and missing values.
display(df.head(n=5))

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,Male
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,Female
3,Adelie,Torgersen,,,,,
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,Female


In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()

Unnamed: 0,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g
count,342.0,342.0,342.0,342.0
mean,43.92193,17.15117,200.915205,4201.754386
std,5.459584,1.974793,14.061714,801.954536
min,32.1,13.1,172.0,2700.0
25%,39.225,15.6,190.0,3550.0
50%,44.45,17.3,197.0,4050.0
75%,48.5,18.7,213.0,4750.0
max,59.6,21.5,231.0,6300.0


___
### machine learning

In [4]:
# --- Data preparation ---------------------------------------------------------
# Discard every row that has at least one missing value.
df = df.dropna(how='any')

# Target is the species label; predictors are all remaining columns, with the
# categorical ones one-hot encoded (first level of each category dropped).
y = df['species']
X = pd.get_dummies(df.drop(columns=['species']), drop_first=True)

# --- Train / validation / test split ------------------------------------------
# First hold out 30% of the rows, then divide that hold-out into validation
# (20% of it) and test (80% of it). Both splits use the same fixed seed.
# NOTE(review): validation ends up being only ~6% of the data — confirm that the
# 0.8 test share of the hold-out is intentional.
X_train, X_temp, y_train, y_temp = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)
X_valid, X_test, y_valid, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.8,
    random_state=42,
)

# --- Feature scaling ----------------------------------------------------------
# Keep an unscaled copy of the validation features for later manual inspection.
X_valid_not_transformed = X_valid.copy()

# Fit the scaler on the training features only, then apply it to every split.
scaler = StandardScaler().fit(X_train)
X_train, X_valid, X_test = (
    scaler.transform(features) for features in (X_train, X_valid, X_test)
)

# --- Model training -----------------------------------------------------------
model = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# --- Model evaluation ---------------------------------------------------------
y_pred_valid = model.predict(X_valid)
y_pred_test = model.predict(X_test)

print(
    "Validation Set Classification Report:",
    classification_report(y_valid, y_pred_valid),
    sep="\n",
)
print(
    "Test Set Classification Report:",
    classification_report(y_test, y_pred_test),
    sep="\n",
)

Validation Set Classification Report:
              precision    recall  f1-score   support

      Adelie       0.88      1.00      0.93         7
   Chinstrap       1.00      0.50      0.67         2
      Gentoo       1.00      1.00      1.00        11

    accuracy                           0.95        20
   macro avg       0.96      0.83      0.87        20
weighted avg       0.96      0.95      0.94        20

Test Set Classification Report:
              precision    recall  f1-score   support

      Adelie       1.00      1.00      1.00        41
   Chinstrap       1.00      1.00      1.00        16
      Gentoo       1.00      1.00      1.00        23

    accuracy                           1.00        80
   macro avg       1.00      1.00      1.00        80
weighted avg       1.00      1.00      1.00        80



___
### tests manuels

In [5]:
# Class distribution of the target in the train, test and validation splits
# (printed in that order, separated by a single space).
print(*(labels.value_counts() for labels in (y_train, y_test, y_valid)))

species
Adelie       98
Gentoo       85
Chinstrap    50
Name: count, dtype: int64 species
Adelie       41
Gentoo       23
Chinstrap    16
Name: count, dtype: int64 species
Gentoo       11
Adelie        7
Chinstrap     2
Name: count, dtype: int64


In [6]:
# Show the first ten unscaled validation rows next to their true species labels.
display(
    X_valid_not_transformed.head(n=10),
    y_valid.head(n=10),
)

Unnamed: 0,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,island_Dream,island_Torgersen,sex_Male
4,36.7,19.3,193.0,3450.0,False,True,False
321,55.9,17.0,228.0,5600.0,False,False,True
260,42.7,13.7,208.0,3950.0,False,False,False
331,49.8,15.9,229.0,5950.0,False,False,True
317,46.9,14.6,222.0,4875.0,False,False,False
226,45.4,14.6,211.0,4800.0,False,False,False
143,40.7,17.0,190.0,3725.0,True,False,True
79,42.1,19.1,195.0,4000.0,False,True,True
205,50.7,19.7,203.0,4050.0,True,False,True
308,47.5,14.0,212.0,4875.0,False,False,False


4         Adelie
321       Gentoo
260       Gentoo
331       Gentoo
317       Gentoo
226       Gentoo
143       Adelie
79        Adelie
205    Chinstrap
308       Gentoo
Name: species, dtype: object

In [8]:
# Position (0-based) of the validation row to check by hand.
rang = 8

# Scale the selected raw validation row with the already-fitted scaler.
# The double brackets in .iloc[[rang]] keep a one-row DataFrame (column names and
# per-column dtypes intact). Wrapping a single Series in a list instead yields an
# unnamed object-dtype array, which makes scikit-learn warn that X lacks the
# feature names the scaler was fitted with.
X_valid_sample_transformed = scaler.transform(X_valid_not_transformed.iloc[[rang]])

# Predict the species for that single scaled row.
y_pred_sample = model.predict(X_valid_sample_transformed)

# Horizontal rule used to frame the printed report.
separator = '____________________________________________________________________________________________________'

print(separator)
print(f"Prédiction pour X_valid_not_transformed[rang={rang}]:", y_pred_sample[0])
print(separator)
print('')
# True label of the same row, for comparison with the prediction above.
print(y_valid.iloc[rang])
print('')
# Raw (unscaled) feature values of the inspected row.
print(X_valid_not_transformed.iloc[rang])
print('')
print(separator)
print('')

____________________________________________________________________________________________________
Prédiction pour X_valid_not_transformed[rang=8]: Chinstrap
____________________________________________________________________________________________________

Chinstrap

bill_length_mm         50.7
bill_depth_mm          19.7
flipper_length_mm     203.0
body_mass_g          4050.0
island_Dream           True
island_Torgersen      False
sex_Male               True
Name: 205, dtype: object

____________________________________________________________________________________________________



