In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from seaborn import sns
from phik import phik_matrix

In [26]:
df = pd.read_csv('dataSets/avocado_ripeness_dataset.csv')

In [9]:
df.head()

Unnamed: 0,firmness,hue,saturation,brightness,color_category,sound_db,weight_g,size_cm3,ripeness
0,14.5,19,40,26,black,34,175,261,ripe
1,71.7,53,69,75,green,69,206,185,pre-conditioned
2,88.5,60,94,46,dark green,79,220,143,hard
3,93.8,105,87,41,dark green,75,299,140,hard
4,42.5,303,58,32,purple,63,200,227,breaking


In [38]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 250 entries, 0 to 249
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   firmness        250 non-null    float64
 1   hue             250 non-null    float64
 2   saturation      250 non-null    float64
 3   brightness      250 non-null    float64
 4   color_category  250 non-null    float64
 5   sound_db        250 non-null    float64
 6   weight_g        250 non-null    float64
 7   size_cm3        250 non-null    float64
 8   ripeness        250 non-null    object 
dtypes: float64(8), object(1)
memory usage: 17.7+ KB


In [None]:
df.isna().sum()

In [None]:
df[df.duplicated()]

In [None]:
df.skew()

In [11]:
df['color_category'].unique()

array(['black', 'green', 'dark green', 'purple'], dtype=object)

In [27]:
from sklearn.calibration import LabelEncoder

le = LabelEncoder()
df['color_category'] = le.fit_transform(df['color_category'])

In [13]:
df.head()

Unnamed: 0,firmness,hue,saturation,brightness,color_category,sound_db,weight_g,size_cm3,ripeness
0,14.5,19,40,26,0,34,175,261,ripe
1,71.7,53,69,75,2,69,206,185,pre-conditioned
2,88.5,60,94,46,1,79,220,143,hard
3,93.8,105,87,41,1,75,299,140,hard
4,42.5,303,58,32,3,63,200,227,breaking


In [30]:
df.drop(columns=['ripeness']).skew()

firmness          0.111704
hue              -0.656329
saturation       -0.120372
brightness       -0.089933
color_category    0.000000
sound_db         -0.330793
weight_g          0.145087
size_cm3         -0.137845
dtype: float64

In [29]:
df['hue'] = np.log1p(df['hue'])

In [31]:
from sklearn.preprocessing import StandardScaler
cols_to_scale = df.drop(columns=['ripeness']).columns

df[cols_to_scale] = StandardScaler().fit_transform(df[cols_to_scale])

In [32]:
df[cols_to_scale].skew()

firmness          0.111704
hue              -0.656329
saturation       -0.120372
brightness       -0.089933
color_category    0.000000
sound_db         -0.330793
weight_g          0.145087
size_cm3         -0.137845
dtype: float64

In [33]:
from sklearn.model_selection import train_test_split

X = df.drop(columns=['ripeness'])
y = df['ripeness']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [35]:
from sklearn.neural_network import MLPClassifier

baselineMLPClassifier = MLPClassifier(hidden_layer_sizes=(10,), activation='relu', solver='adam', max_iter=1000, random_state=42)
baselineMLPClassifier.fit(X_train, y_train)



In [37]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

y_pred = baselineMLPClassifier.predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
print(f"Classification Report: {classification_report(y_test, y_pred)}")

Accuracy: 1.0
Classification Report:                  precision    recall  f1-score   support

       breaking       1.00      1.00      1.00        13
      firm-ripe       1.00      1.00      1.00         9
           hard       1.00      1.00      1.00         8
pre-conditioned       1.00      1.00      1.00         7
           ripe       1.00      1.00      1.00        13

       accuracy                           1.00        50
      macro avg       1.00      1.00      1.00        50
   weighted avg       1.00      1.00      1.00        50



In [39]:
# Perform 5-fold cross-validation
from sklearn.model_selection import cross_val_score

scores = cross_val_score(baselineMLPClassifier, X, y, cv=5, scoring='accuracy')

print("Cross-validation accuracy scores:", scores)
print("Mean accuracy:", np.mean(scores))
print("Std deviation:", np.std(scores))



Cross-validation accuracy scores: [1.   0.98 0.96 1.   0.96]
Mean accuracy: 0.9800000000000001
Std deviation: 0.017888543819998333


