In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.ensemble import BaggingRegressor, BaggingClassifier, RandomForestRegressor, RandomForestClassifier
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer
from sklearn.discriminant_analysis import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score, precision_score, roc_auc_score, f1_score, r2_score, mean_squared_error, classification_report

In [2]:
def evaluate_model(model, X_test, y_test, pos_label='good'):
    """Print hold-out metrics for a fitted estimator.

    The task type is inferred from ``y_test``: exactly two distinct values are
    treated as binary classification, anything else as regression.

    Parameters
    ----------
    model : fitted estimator or pipeline
        Must expose ``predict``. If it also exposes ``predict_proba``, the
        Gini coefficient is reported for binary targets.
    X_test : feature matrix accepted by ``model.predict``.
    y_test : true targets aligned with ``X_test``.
    pos_label : label treated as the positive class for precision and
        ROC AUC. Defaults to ``'good'``.

    Returns
    -------
    None. The metrics are printed.
    """
    y_pred = model.predict(X_test)

    if len(set(y_test)) == 2:  # Binary classification
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred, pos_label=pos_label)

        # ROC AUC is only computed on the binary path, so a multiclass model
        # that happens to expose predict_proba no longer fails before the
        # task type has been decided.
        gini = None
        if hasattr(model, 'predict_proba'):
            # Pick the probability column that belongs to pos_label instead
            # of assuming it is column 1; fall back to column 1 when the
            # model does not list pos_label among its classes.
            classes = list(getattr(model, 'classes_', []))
            pos_col = classes.index(pos_label) if pos_label in classes else 1
            y_proba = model.predict_proba(X_test)[:, pos_col]

            # Binarise the truth against pos_label so the scores and the
            # labels agree on which class is positive.
            y_true = (np.asarray(y_test) == pos_label).astype(int)
            roc_auc = roc_auc_score(y_true, y_proba)
            gini = 2 * roc_auc - 1

        print(f"Accuracy: {accuracy}")
        print(f"Precision: {precision}")
        print(f"Gini: {gini}")
    else:  # Regression
        mse = mean_squared_error(y_test, y_pred)
        rmse = mse ** 0.5
        r2 = r2_score(y_test, y_pred)

        print(f"Mean Squared Error: {mse}")
        print(f"Root Mean Squared Error: {rmse}")
        print(f"R2 Score: {r2}")

In [3]:
# Load the data
df = pd.read_csv("apple_quality.csv")

# Fraction of missing values per column. isna().mean() divides by the total
# number of rows; isna().sum() / count() would divide by the number of
# non-missing values, which overstates the ratio and is undefined for a
# column that is entirely missing.
ratio_missing_values = df.isna().mean()

# Show the result
print(ratio_missing_values)

# Data preprocessing: drop incomplete rows first, then exact duplicates.
# Duplicates are removed while A_id is still present, so rows that differ
# only by A_id are kept.
df = df.dropna()
df = df.drop_duplicates()

# A_id is removed so it is not part of the feature matrix built from df.
df = df.drop('A_id', axis=1)
df.head()

A_id           0.0
Size           0.0
Weight         0.0
Sweetness      0.0
Crunchiness    0.0
Juiciness      0.0
Ripeness       0.0
Acidity        0.0
Quality        0.0
dtype: float64


Unnamed: 0,Size,Weight,Sweetness,Crunchiness,Juiciness,Ripeness,Acidity,Quality
0,-3.970049,-2.512336,5.34633,-1.012009,1.8449,0.32984,-0.49159,good
1,-1.195217,-2.839257,3.664059,1.588232,0.853286,0.86753,-0.722809,good
2,-0.292024,-1.351282,-1.738429,-0.342616,2.838636,-0.038033,2.621636,bad
3,-0.657196,-2.271627,1.324874,-0.097875,3.63797,-3.413761,0.790723,good
4,1.364217,-1.296612,-0.384658,-0.553006,3.030874,-1.303849,0.501984,good


In [4]:
# Separate the feature columns from the 'Quality' target.
X = df.drop(columns='Quality')
y = df['Quality']

# 80/20 hold-out split with a fixed seed, stratified on the target so both
# parts keep the same class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [5]:
# Column labels grouped by dtype: object columns are handled as categorical,
# every other column as numerical.
categorical_columns_selector = X.select_dtypes(include='object').columns
numerical_columns_selector = X.select_dtypes(exclude='object').columns

# Sub-frames holding each group of columns.
categorical_columns = X[categorical_columns_selector]
numerical_columns = X[numerical_columns_selector]

# One transformer per group: scaling for numerical columns, one-hot encoding
# for categorical columns (categories unseen during fit are ignored at
# transform time).
numerical_preprocessor = StandardScaler()
categorical_preprocessor = OneHotEncoder(handle_unknown="ignore")

# Route each group of columns to its transformer.
column_steps = [
    ("nums", numerical_preprocessor, numerical_columns_selector),
    ("cat", categorical_preprocessor, categorical_columns_selector),
]
preprocessor = ColumnTransformer(transformers=column_steps)

In [6]:
# Preprocessing followed by a bagging ensemble whose members are random
# forests; each member is fit on a sample of 90% of the training rows
# (bootstrap is left at its default).
#
# The member estimator is passed positionally on purpose: the keyword was
# renamed from `base_estimator` to `estimator` in scikit-learn 1.2 and the old
# name was removed in 1.4, while the first positional slot is the same in
# every version.
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', BaggingClassifier(RandomForestClassifier(), max_samples=0.9, random_state=42))
])

pipeline.fit(X_train, y_train)

# Hold-out predictions kept at module level for later inspection.
y_pred = pipeline.predict(X_test)



In [7]:

# Print hold-out metrics for the pipeline currently bound to `pipeline`.
evaluate_model(model=pipeline, X_test=X_test, y_test=y_test)

Accuracy: 0.8675
Precision: 0.8470588235294118
Gini: 0.8893055581597384


In [8]:
# Same preprocessing + bagged random forests, but with bootstrap=False:
# each member is fit on a 90% sample of the training rows drawn without
# replacement. Note that this rebinds `pipeline`.
#
# The member estimator is passed positionally on purpose: the keyword was
# renamed from `base_estimator` to `estimator` in scikit-learn 1.2 and the old
# name was removed in 1.4, while the first positional slot is the same in
# every version.
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', BaggingClassifier(RandomForestClassifier(), bootstrap=False, max_samples=0.9, random_state=42))
])

pipeline.fit(X_train, y_train)

# Hold-out predictions kept at module level for later inspection.
y_pred = pipeline.predict(X_test)



In [9]:
# Print hold-out metrics for the pipeline currently bound to `pipeline`.
evaluate_model(model=pipeline, X_test=X_test, y_test=y_test)

Accuracy: 0.87875
Precision: 0.8584905660377359
Gini: 0.8976306101913138
