<div style="background-color: #333; padding: 40px; border: 2px solid #ffd700; border-radius: 10px; color: #ffd700; text-align: center; box-shadow: 0 4px 8px rgba(0, 0, 0, 0.2);">

<h1 style="font-size: 48px; font-weight: bold; color: #ffd700;">Apple quality</h1>

<img src="https://res.cloudinary.com/hv9ssmzrz/image/fetch/c_fill,f_auto,h_630,q_auto,w_1200/https://images-ca-1-0-1-eu.s3-eu-west-1.amazonaws.com/photos/original/795/varietes-de-pommes_flickr_14574971754_edc3a455f4_b.jpg" alt="Assorted apple varieties" style="width: 500px; margin: 20px auto; border-radius: 10px; box-shadow: 0 4px 8px rgba(0, 0, 0, 0.2);">
    
</div>

## <div style="border-radius: 10px; border: 2px solid #ffd700; padding: 15px; background-color: #333; font-size: 120%; text-align: center; color: #ffd700; font-weight: bold;">1. Import Libraries</div>

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from sklearn.metrics import accuracy_score,make_scorer, mean_squared_error, mean_absolute_error
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

## <div style="border-radius: 10px; border: 2px solid #ffd700; padding: 15px; background-color: #333; font-size: 120%; text-align: center; color: #ffd700; font-weight: bold;">2. Load data</div>

In [None]:
# Read the raw CSV, discard the identifier column, drop every row that has a
# missing value, and only then cast Acidity to float64.
df = (
    pd.read_csv("/kaggle/input/apple-quality/apple_quality.csv")
    .drop(columns=['A_id'])
    .dropna()
    .astype({'Acidity': 'float64'})
)
df.head()

In [None]:
# Print a summary of the cleaned frame: column dtypes, non-null counts and memory usage.
df.info()

In [None]:
# Show descriptive statistics for the columns of df.
df.describe()

## <div style="border-radius: 10px; border: 2px solid #ffd700; padding: 15px; background-color: #333; font-size: 120%; text-align: center; color: #ffd700; font-weight: bold;">3. Exploratory Data Analysis</div>

### I | Check duplicates

In [None]:
# Flag each row that exactly repeats an earlier row, then count the flags.
duplicate_mask = df.duplicated()
duplicates = duplicate_mask.sum()
print(duplicates)

### II | Check null and missing values

In [None]:
# Missing-value count per column, plus the overall share of empty cells.
missing_values = df.isnull().sum()
total_missing_values = missing_values.sum()

# df.size is rows * columns. It replaces np.product(df.shape): np.product was a
# deprecated alias that NumPy 2.0 removed, so the old call raises AttributeError there.
total_cells = df.size

percent_missing_values = (total_missing_values / total_cells) * 100
print("Percent of data that is missing", percent_missing_values)
print(missing_values)

### III | Check unique values in each column

In [None]:
# Report how many distinct values each column holds.
for column, column_values in df.items():
    num_distinct_values = len(column_values.unique())
    print(f"{column}: {num_distinct_values} distinct values")

### IV | Correlation Analysis

In [None]:
# Keep only the numeric columns and compute their pairwise Pearson correlations.
numeric_columns = df.select_dtypes(include='number')
correlation_matrix = numeric_columns.corr(method='pearson')
correlation_matrix

In [None]:
# Annotated heatmap of the correlation matrix; the colour scale is capped at 0.8.
fig, ax = plt.subplots(figsize=(15, 10))
sns.heatmap(
    correlation_matrix,
    vmax=.8,
    square=True,
    annot=True,
    cmap='YlGn',
    ax=ax,
)
ax.set_title('Correlation Matrix', fontsize=15);

In [None]:
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(16, 8))

# One entry per panel, in row-major order: (x column, y column, colour, title).
scatter_panels = [
    ('Size', 'Weight', 'blue', "Scatter plot of weight by size"),
    ('Size', 'Crunchiness', 'red', "Scatter plot of crunchiness by size"),
    ('Crunchiness', 'Weight', 'green', "Scatter plot of weight by crunchiness"),
    ('Size', 'Sweetness', 'purple', "Scatter plot of sweetness by size"),
]

for panel_ax, (x_col, y_col, colour, title) in zip(axes.flat, scatter_panels):
    panel_ax.set_title(title)
    sns.scatterplot(x=x_col, y=y_col, ax=panel_ax, data=df, color=colour)

plt.tight_layout()
plt.show()

In [None]:
# Bar chart of the number of rows in each Quality class.
ax = sns.countplot(x='Quality', data=df)

# Label each bar with its count. The height is formatted with no decimals so a
# float-valued bar height is shown as "2004" rather than "2004.0".
for bar in ax.patches:
    bar_top = bar.get_height()
    ax.annotate(
        f'{bar_top:.0f}',
        (bar.get_x() + bar.get_width() / 2., bar_top),
        ha='center',
        va='center',
        xytext=(0, 10),  # nudge the label 10 points above the bar top
        textcoords='offset points',
    )

ax.set_title('Distribution of Quality')
ax.set_xlabel('Quality')
ax.set_ylabel('Count')
plt.show()

In [None]:
# Columns that are standardised in place on df.
numerical_features = [
    'Size',
    'Weight',
    'Sweetness',
    'Crunchiness',
    'Juiciness',
    'Ripeness',
    'Acidity',
]

# Fit the scaler on these columns, then overwrite them with the transformed values.
scaler = StandardScaler()
scaler.fit(df[numerical_features])
df[numerical_features] = scaler.transform(df[numerical_features])

## <div style="border-radius: 10px; border: 2px solid #ffd700; padding: 15px; background-color: #333; font-size: 120%; text-align: center; color: #ffd700; font-weight: bold;">4. Predictive Analysis</div>

In [None]:
# Separate the feature matrix from the target column.
X = df.drop(['Quality'], axis=1)
y = df['Quality']

# Encode the Quality labels as integers for the classifiers and metrics.
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

print(y_encoded)

# Split BEFORE fitting any preprocessing. The earlier ordering fitted the scaler
# and SMOTE on the whole dataset, so test-set statistics (and synthetic samples
# interpolated from test rows) leaked into training and inflated the scores.
# stratify keeps the class ratio the same in both folds.
X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded
)

# Learn the min/max from the training fold only, then apply it to both folds.
scaler = MinMaxScaler(feature_range=(0, 1))
X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=X.columns)
X_test = pd.DataFrame(scaler.transform(X_test), columns=X.columns)

# All rows passed through the train-fitted scaler (no refit); retained because
# this cell has always exposed the name X_scaled.
X_scaled = pd.DataFrame(scaler.transform(X), columns=X.columns)

# Oversample the minority class in the training fold only; the test fold keeps
# its real class distribution so the reported metrics reflect unseen data.
smote = SMOTE(random_state=42)
X_train, y_train = smote.fit_resample(X_train, y_train)

print(X_train.shape, X_test.shape)

In [None]:
# Candidate classifiers paired with a short display name. The tree ensembles are
# seeded so that re-running the notebook reproduces the same results.
models = [
    (XGBClassifier(random_state=42), "XGBoost"),
    (RandomForestClassifier(random_state=42), "RF"),
    (SVC(), "SVC")
]

# Hyperparameter grids, in the same order as `models` (they are zipped together below).
param_grids = [
    {
        'n_estimators': [50, 100, 200, 300],
        'learning_rate': [0.01, 0.1, 0.2],
        'max_depth': [3, 5, 7, 10]
    },
    {
        'n_estimators': [50, 100, 150],
        'max_depth': [None, 10, 20],
        'min_samples_split': [2, 5, 10]
    },
    {
        'C': [0.1, 1, 10],
        'kernel': ['linear', 'rbf'],
        'gamma': ['scale', 'auto']
    }
]

# Define scoring functions for classification models
classification_scoring = {
    'Accuracy': 'accuracy',
    'Precision': 'precision',
    'Recall': 'recall',
    'F1': 'f1',
    'AUC-ROC': 'roc_auc'
}

# Perform GridSearchCV for each model
results = []
for (model, model_name), param_grid in zip(models, param_grids):
    grid_search = GridSearchCV(
        model,
        param_grid,
        scoring=classification_scoring,
        refit='Accuracy',  # Choose the metric to be used for refitting the best model
        cv=5,
        verbose=1
    )

    grid_search.fit(X_train, y_train)

    # Get the best model (refit on the full training set using the best accuracy)
    best_model = grid_search.best_estimator_

    # Hard class predictions for the threshold-based metrics.
    y_pred = best_model.predict(X_test)

    # ROC-AUC is a ranking metric, so it needs continuous scores rather than 0/1
    # labels. Use the positive-class probability when the estimator offers it,
    # otherwise fall back to the decision function (SVC without probability=True).
    if hasattr(best_model, "predict_proba"):
        y_score = best_model.predict_proba(X_test)[:, 1]
    else:
        y_score = best_model.decision_function(X_test)

    # Calculate metrics on the held-out test set
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    auc_roc = roc_auc_score(y_test, y_score)

    results.append({
        'Model': model_name,
        'Best Hyperparameters': grid_search.best_params_,
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1': f1,
        'AUC-ROC': auc_roc
    })

# Display the results in a DataFrame
df_results = pd.DataFrame(results)
display(df_results)


In [None]:
df_results = pd.DataFrame(results)

# One bar-chart panel per metric, left to right: (results column, bar colour).
fig, axes = plt.subplots(1, 3, figsize=(15, 5))
metric_panels = [
    ('Accuracy', 'lightgreen'),
    ('Precision', 'blue'),
    ('AUC-ROC', 'purple'),
]

for panel, (metric, bar_colour) in zip(axes, metric_panels):
    # Compare the models on this metric.
    panel.bar(df_results['Model'], df_results[metric], color=bar_colour)
    panel.set_title(f'{metric} Comparison for Different Models')
    panel.set_ylabel(f'{metric} Value')

plt.tight_layout()
plt.show()