# Introduction

I downloaded an apple-quality dataset from Kaggle: https://www.kaggle.com/datasets/nelgiriyewithana/apple-quality

The goal is to predict if an apple is good or bad

# Import libraries

In [None]:
import os
import sys
import numpy as np
import pandas as pd
import matplotlib as mp
import matplotlib.pyplot as plt

In [None]:
# Build the path to the CSV inside the local "apple_quality" folder, then load it.
csv_path = os.path.join("apple_quality", "apple_quality.csv")
apple = pd.read_csv(csv_path)

In [None]:
# Preview the first rows of the raw dataset.
apple.head()

In [None]:
# Print the column dtypes and non-null counts of the raw dataset.
apple.info()

Here I can see that the Acidity column has dtype object even though it contains numerical values.

In [None]:
# Count how often each distinct Acidity value occurs.
apple["Acidity"].value_counts()

In [None]:
# Descriptive statistics of the dataset.
apple.describe()

In [None]:
# Draw 50-bin histograms of the columns on a 15x15 inch figure.
apple.hist(bins = 50, figsize=(15,15))

In [None]:
# Count the missing values in each column and print the totals.
missing_values = apple.isnull().sum()
print(missing_values)

In [None]:
# Drop every row that contains at least one missing value. An explicit copy is
# taken so that later column assignments on apple_cleaned work on an
# independent DataFrame instead of triggering pandas' SettingWithCopyWarning.
apple_cleaned = apple.dropna().copy()

# Report how many rows remain after the cleanup.
print("Number of line :", len(apple_cleaned))

In [None]:
# Check dtypes and non-null counts after dropping the incomplete rows.
apple_cleaned.info()

In [None]:
# Inspect the Acidity column before converting it.
apple_cleaned["Acidity"]

In [None]:
# Convert Acidity to a numeric dtype. The values are read from apple_cleaned
# itself (not from the raw `apple` frame), and the result is rebound with
# assign() so no chained assignment into a possible view takes place.
# errors='coerce' turns any value that cannot be parsed into NaN.
apple_cleaned = apple_cleaned.assign(
    Acidity=pd.to_numeric(apple_cleaned['Acidity'], errors='coerce')
)

In [None]:
apple_cleaned.info()  # Acidity should now be reported as float64

In [None]:
# Inspect the label column.
apple_cleaned["Quality"]

# Test and train set

In [None]:
from sklearn.model_selection import StratifiedShuffleSplit

# One stratified 80/20 split, stratified on the Quality label.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

# split() yields *positional* row indices, so they must be used with .iloc.
# With .loc they would be interpreted as index labels, which selects the wrong
# rows (or raises KeyError) as soon as the index has gaps, e.g. after dropna().
for train_index, test_index in split.split(apple_cleaned, apple_cleaned["Quality"]):
    train_set = apple_cleaned.iloc[train_index].copy()
    test_set = apple_cleaned.iloc[test_index].copy()

In [None]:
# Number of rows in the training set.
len(train_set)

In [None]:
# Number of rows in the test set.
len(test_set)

In [None]:
# Share of each Quality class in the test set.
test_set["Quality"].value_counts() / len(test_set)

In [None]:
# Share of each Quality class in the training set.
train_set["Quality"].value_counts() / len(train_set)

In [None]:
# Share of each Quality class in the full cleaned dataset.
apple_cleaned["Quality"].value_counts() / len(apple_cleaned)

The class distribution is well preserved in both sets.

In [None]:
def income_cat_proportions(data):
    """Return the share of rows of *data* that fall into each Quality value.

    The counts of each distinct value in the "Quality" column are divided by
    the total number of rows in *data*.
    """
    quality_counts = data["Quality"].value_counts()
    row_total = len(data)
    return quality_counts / row_total

from sklearn.model_selection import train_test_split

# Purely random 80/20 split used only as the baseline of the comparison.
# Without it the "Random" column would just repeat the stratified training set
# and the two error columns would always be identical.
random_train_set, random_test_set = train_test_split(
    apple_cleaned, test_size=0.2, random_state=42
)

# Class proportions of the full data, the stratified and the random training set.
compare_props = pd.DataFrame({
    "Overall": income_cat_proportions(apple_cleaned),
    "Stratified": income_cat_proportions(train_set),
    "Random": income_cat_proportions(random_train_set),
}).sort_index()

# Relative deviation (in percent) of each sampling scheme from the overall proportions.
compare_props["Rand. %error"] = 100 * compare_props["Random"] / compare_props["Overall"] - 100
compare_props["Strat. %error"] = 100 * compare_props["Stratified"] / compare_props["Overall"] - 100

In [None]:
# Display the comparison table.
compare_props

# Let's explore our data!

In [None]:
# Work on a copy so the exploration below does not modify train_set.
apple_train = train_set.copy()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Mapping from the textual label to a numeric one.
quality_mapping = {'good': 1, 'bad': 0}

# Apply the mapping to the Quality column.
# NOTE: running this cell a second time maps the already-numeric values again;
# values not found in the mapping become NaN.
apple_train['Quality'] = apple_train['Quality'].map(quality_mapping)

# Compute the pairwise correlation matrix of the columns.
correlation_matrix = apple_train.corr()

# Draw the annotated heatmap.
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm')
plt.title('Matrice de corrélation')
plt.show()



In [None]:
# Correlation of every column with Quality, strongest positive first.
correlation_matrix["Quality"].sort_values(ascending = False)

Here we can see which features can help us predict the quality of an apple. We could have guessed it, but juiciness, sweetness, size and ripeness are the features correlated with quality. I would have expected acidity and crunchiness to play a bigger role.

In [None]:
from pandas.plotting import scatter_matrix

# Columns to plot against each other: the label plus four selected features.
attributes = ["Quality", "Juiciness", "Sweetness",
              "Size", "Ripeness" ]
# Pairwise scatter plots of the selected columns.
scatter_matrix(apple_train[attributes], figsize=(12, 8))

In [None]:
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

# One figure that holds a 3D scatter plot per column triple.
fig = plt.figure(figsize=(12, 8))
fig.subplots_adjust(hspace=0.5, wspace=0.5)

# Column triples to plot, given as (x, y, z).
combinations = [
    ("Ripeness", "Juiciness", "Sweetness"),
    ("Quality", "Size", "Ripeness"),
]

# Each triple is drawn in its own cell of a 2x2 subplot grid.
for i, (x_col, y_col, z_col) in enumerate(combinations, start=1):
    ax = fig.add_subplot(2, 2, i, projection='3d')
    ax.scatter(apple_train[x_col], apple_train[y_col], apple_train[z_col])
    ax.set_xlabel(x_col)
    ax.set_ylabel(y_col)
    ax.set_zlabel(z_col)

plt.show()


# Test attribute combinations

In [None]:
# Candidate ratio features as (new column, numerator column, denominator column).
ratio_definitions = [
    ("Ratio Juice Sweet", "Juiciness", "Sweetness"),
    ("Ratio Juice size", "Juiciness", "Size"),
    ("Ratio Juice Ripeness", "Juiciness", "Ripeness"),
    ("Ratio Sweet Size", "Sweetness", "Size"),
    ("Ratio Sweet Ripeness", "Sweetness", "Ripeness"),
    ("Ratio Size Ripeness", "Size", "Ripeness"),
]

# Add each ratio as a new column of apple_train, in the order listed above.
for ratio_name, numerator, denominator in ratio_definitions:
    apple_train[ratio_name] = apple_train[numerator] / apple_train[denominator]

In [None]:
# Recompute the correlations, now including the ratio columns,
# and list them against Quality from highest to lowest.
corr_matrix = apple_train.corr()
corr_matrix["Quality"].sort_values(ascending=False)

The new ratio attributes do not help, so there is no need to keep them.

In [None]:
# Remove the experimental ratio columns from apple_train again.
attributes_to_delete = [
    "Ratio Juice Sweet",
    "Ratio Juice size",
    "Ratio Juice Ripeness",
    "Ratio Sweet Size",
    "Ratio Sweet Ripeness",
    "Ratio Size Ripeness",
]

apple_train.drop(columns=attributes_to_delete, inplace=True)


# Prepare the data for machine learning

In [None]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# NOTE: from here on `apple` is rebound to the training features and no longer
# refers to the raw DataFrame loaded from the CSV.
# NOTE(review): every column except Quality is kept; if the dataset contains an
# identifier column it is still a feature here - confirm that this is intended.
apple = apple_train.drop("Quality", axis=1)  # features: everything but the label
apple_label = apple_train["Quality"].copy()  # label, kept as an independent copy

There is no need to clean the data further: it comes from Kaggle, and I have already removed the one row that had missing attributes.

In [None]:
# Select the rows that still contain a missing value (at most the first five);
# an empty result means nothing is left to impute.
sample_incomplete_rows = apple[apple.isnull().any(axis=1)].head()
sample_incomplete_rows

Cool !

In [None]:
# Column dtypes and non-null counts of the training features.
apple.info()

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler

# The float64 columns are the ones handed to the scaler.
numeric_features = apple.select_dtypes(include=['float64']).columns

# Single-step pipeline: min-max scaling.
numeric_transformer = Pipeline([('scaler', MinMaxScaler())])

# Route the selected columns through the numeric pipeline.
preprocessor = ColumnTransformer([('num', numeric_transformer, numeric_features)])

# Fit the preprocessor on the training features and transform them.
apple_prepared = preprocessor.fit_transform(apple)

In [None]:
# The scaler keeps one output column per input column, so the names of the
# selected numeric columns are reused as the transformed feature names.
transformed_numeric_feature_names = numeric_features
print(transformed_numeric_feature_names)


In [None]:
apple_prepared  # the transformed features returned by the preprocessor

In [None]:
apple_prepared.shape  # check the dimensions before training

Good !

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

# Candidate models, all with (mostly) default hyper-parameters.
classifiers = {
    'Logistic Regression' : LogisticRegression(penalty='l2', solver='lbfgs', C=1.0, max_iter=1000),
    'SVM': SVC(),
    'Random Forest': RandomForestClassifier(),
    'Gradient Boosting': GradientBoostingClassifier(),
    'KNN': KNeighborsClassifier(),
    'Naive Bayes': GaussianNB(),
    'XGBoost': XGBClassifier(),
    'LightGBM': LGBMClassifier(),
    'CatBoost': CatBoostClassifier()
}

# Metrics reported for every model.
scoring = ['accuracy', 'precision', 'recall', 'f1']

# 5-fold cross-validation. cross_validate computes all metrics from the same
# five fits, instead of re-training every model once per metric; this is four
# times cheaper and makes the metrics of one model refer to the same fits.
for name, clf in classifiers.items():
    cv_results = cross_validate(clf, apple, apple_label, cv=5, scoring=scoring)
    scores = cv_results['test_accuracy']
    precision_scores = cv_results['test_precision']
    recall_scores = cv_results['test_recall']
    f1_scores = cv_results['test_f1']

    print(f"{name}: Accuracy: {scores.mean():.4f} (+/- {scores.std():.4f})")
    print(f"   Precision: {precision_scores.mean():.4f} (+/- {precision_scores.std():.4f})")
    print(f"   Recall: {recall_scores.mean():.4f} (+/- {recall_scores.std():.4f})")
    print(f"   F1-score: {f1_scores.mean():.4f} (+/- {f1_scores.std():.4f})")

I'm going to choose CatBoost and Random Forest, tune them, and then I'll be able to see which one is really the best.

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

# Candidate values per hyper-parameter; the grid search tries every combination.
param_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

rf = RandomForestClassifier()

# 5-fold cross-validated search scored on accuracy.
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
    verbose=2,
    n_jobs=-1,
)
grid_search.fit(apple, apple_label)

print("Best params : ", grid_search.best_params_)

# Keep the estimator found by the search for the evaluation that follows.
best_rf = grid_search.best_estimator_


In [None]:
from sklearn.model_selection import cross_validate

# Evaluate the random forest selected by the grid search. All four metrics are
# taken from a single 5-fold cross-validation run instead of four separate ones.
clf = best_rf
cv_results = cross_validate(
    clf, apple, apple_label, cv=5,
    scoring=['accuracy', 'precision', 'recall', 'f1'],
)
scores = cv_results['test_accuracy']
precision_scores = cv_results['test_precision']
recall_scores = cv_results['test_recall']
f1_scores = cv_results['test_f1']

print(f"Accuracy: {scores.mean():.4f} (+/- {scores.std():.4f})")
print(f"Precision: {precision_scores.mean():.4f} (+/- {precision_scores.std():.4f})")
print(f"Recall: {recall_scores.mean():.4f} (+/- {recall_scores.std():.4f})")
print(f"F1-score: {f1_scores.mean():.4f} (+/- {f1_scores.std():.4f})")


In [None]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

# Distributions / value lists the randomized search samples from.
param_dist = dict(
    n_estimators=randint(100, 1000),
    max_depth=[None] + list(range(1, 51, 5)),
    min_samples_split=randint(2, 20),
    min_samples_leaf=randint(1, 20),
)

rf = RandomForestClassifier()

# 100 sampled configurations, each scored on accuracy with 5-fold cross-validation.
random_search = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_dist,
    n_iter=100,
    cv=5,
    scoring='accuracy',
    verbose=2,
    n_jobs=-1,
    random_state=42,
)
random_search.fit(apple, apple_label)

print("Meilleurs hyperparamètres : ", random_search.best_params_)

# Replace the grid-search forest with the one found by the randomized search.
best_rf = random_search.best_estimator_


In [None]:
from sklearn.model_selection import cross_validate

# Evaluate the random forest selected by the randomized search. All four
# metrics come from one 5-fold cross-validation run instead of four.
clf = best_rf
cv_results = cross_validate(
    clf, apple, apple_label, cv=5,
    scoring=['accuracy', 'precision', 'recall', 'f1'],
)
scores = cv_results['test_accuracy']
precision_scores = cv_results['test_precision']
recall_scores = cv_results['test_recall']
f1_scores = cv_results['test_f1']

print(f"Accuracy: {scores.mean():.4f} (+/- {scores.std():.4f})")
print(f"Precision: {precision_scores.mean():.4f} (+/- {precision_scores.std():.4f})")
print(f"Recall: {recall_scores.mean():.4f} (+/- {recall_scores.std():.4f})")
print(f"F1-score: {f1_scores.mean():.4f} (+/- {f1_scores.std():.4f})")

In [None]:
from sklearn.model_selection import GridSearchCV
from catboost import CatBoostClassifier

# Candidate values per CatBoost hyper-parameter; every combination is tried.
param_grid = dict(
    iterations=[100, 200, 300],
    learning_rate=[0.01, 0.05, 0.1],
    depth=[6, 8, 10],
    l2_leaf_reg=[1, 3, 5],
)

catboost = CatBoostClassifier()

# 5-fold cross-validated search scored on accuracy.
grid_search = GridSearchCV(
    estimator=catboost,
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
    verbose=2,
)
grid_search.fit(apple, apple_label)

print("Best params : ", grid_search.best_params_)

# Keep the estimator found by the search.
best_catboost = grid_search.best_estimator_


In [None]:
from sklearn.model_selection import cross_validate

# Cross-validate CatBoost with fixed hyper-parameters. All four metrics come
# from one 5-fold run instead of four separate ones.
# NOTE(review): the parameters are hard-coded; presumably they were copied from
# the grid-search output - confirm they still match grid_search.best_params_.
clf = CatBoostClassifier(depth=8, iterations=300, l2_leaf_reg=5, learning_rate=0.1)
cv_results = cross_validate(
    clf, apple, apple_label, cv=5,
    scoring=['accuracy', 'precision', 'recall', 'f1'],
)
scores = cv_results['test_accuracy']
precision_scores = cv_results['test_precision']
recall_scores = cv_results['test_recall']
f1_scores = cv_results['test_f1']

print(f"Accuracy: {scores.mean():.4f} (+/- {scores.std():.4f})")
print(f"Precision: {precision_scores.mean():.4f} (+/- {precision_scores.std():.4f})")
print(f"Recall: {recall_scores.mean():.4f} (+/- {recall_scores.std():.4f})")
print(f"F1-score: {f1_scores.mean():.4f} (+/- {f1_scores.std():.4f})")

I'll keep this model

In [None]:
# Display the held-out test set.
test_set

In [None]:
# apple_test_label is only created further down, so referencing it here raises
# a NameError when the notebook is run top to bottom. Show the label column of
# the test set directly instead.
test_set["Quality"]

In [None]:
quality_mapping = {'good': 1, 'bad': 0}

# Encode the test labels without writing back into test_set. Assigning into
# test_set would trigger pandas' chained-assignment warning, and re-running the
# cell would map the already-encoded values a second time, turning them into NaN.
apple_test_label = test_set["Quality"].map(quality_mapping)

# Test features: every column except the label.
apple_test = test_set.drop(columns=["Quality"])

In [None]:
# Display the encoded test labels.
apple_test_label

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

final_model = best_catboost

# The model was fitted on the unscaled feature frame `apple`, so the test
# features have to be passed in that same representation. Calling
# preprocessor.fit_transform on the test set would re-fit the scaler on test
# data (leakage) and feed scaled values to a model trained on raw ones.
# Selecting apple.columns keeps the column order identical to training.
apple_test_prepared = apple_test[apple.columns]

final_predictions = final_model.predict(apple_test_prepared)

# Score the predictions against the true test labels.
accuracy = accuracy_score(apple_test_label, final_predictions)
precision = precision_score(apple_test_label, final_predictions)
recall = recall_score(apple_test_label, final_predictions)
f1 = f1_score(apple_test_label, final_predictions)  # f1_score is now imported above

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-score: {f1:.4f}")

# Save the model

In [None]:
import pickle

# Persist the fitted final model. `clf` must not be pickled here: it is only
# the estimator handed to cross-validation, which fits copies of it, so `clf`
# itself was never trained.
with open('Apple_quality.pkl', 'wb') as file:
    pickle.dump(final_model, file)

# Load the model back to check the round trip.
# NOTE: unpickling can execute arbitrary code - only load files you created yourself.
with open('Apple_quality.pkl', 'rb') as file:
    loaded_model = pickle.load(file)


# I need to put more doc