# Orange Quality Classification

## Data Loading

In [1]:
# Download the dataset archive from Kaggle with the `kaggle` command-line tool.
# No target directory is passed, so the zip lands in the current working directory.
# NOTE(review): presumably requires Kaggle API credentials to be configured — confirm.

!kaggle datasets download -d shruthiiiee/orange-quality

Dataset URL: https://www.kaggle.com/datasets/shruthiiiee/orange-quality
License(s): apache-2.0
Downloading orange-quality.zip to /home/bhxveshhh/ML/Orange Quality Classification
  0%|                                               | 0.00/3.73k [00:00<?, ?B/s]
100%|██████████████████████████████████████| 3.73k/3.73k [00:00<00:00, 10.6MB/s]


In [2]:
# Extracting dataset from zipfile
#
# The archive is opened in a `with` block so its file handle is closed as soon
# as extraction finishes. Paths are relative to the notebook's working
# directory, which is where the Kaggle download places the zip and where the
# CSV is read from, so the notebook is not tied to one user's home directory.

import zipfile

with zipfile.ZipFile('orange-quality.zip', 'r') as zip_ref:
    zip_ref.extractall('.')

## Exploratory Data Analysis

In [44]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import warnings
# Silence every warning category for the rest of the session to keep cell output tidy.
# NOTE(review): this also hides deprecation and any model-fitting warnings — confirm that is wanted.
warnings.filterwarnings('ignore')

In [45]:
# Load the CSV (path relative to the working directory) into a DataFrame.
# Presumably this is the file extracted from the Kaggle archive — verify the file name matches.
df = pd.read_csv('Orange Quality Data.csv')

In [46]:
# Preview the first five rows to check column names and value formats.
df.head()

Unnamed: 0,Size (cm),Weight (g),Brix (Sweetness),pH (Acidity),Softness (1-5),HarvestTime (days),Ripeness (1-5),Color,Variety,Blemishes (Y/N),Quality (1-5)
0,7.5,180,12.0,3.2,2.0,10,4.0,Orange,Valencia,N,4.0
1,8.2,220,10.5,3.4,3.0,14,4.5,Deep Orange,Navel,N,4.5
2,6.8,150,14.0,3.0,1.0,7,5.0,Light Orange,Cara Cara,N,5.0
3,9.0,250,8.5,3.8,4.0,21,3.5,Orange-Red,Blood Orange,N,3.5
4,8.5,210,11.5,3.3,2.5,12,5.0,Orange,Hamlin,Y (Minor),4.5


In [47]:
# (number of rows, number of columns)
df.shape

(241, 11)

In [48]:
# Total number of cells, i.e. rows * columns
df.size

2651

In [49]:
# Per-column dtype and non-null count, plus overall memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 241 entries, 0 to 240
Data columns (total 11 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Size (cm)           241 non-null    float64
 1   Weight (g)          241 non-null    int64  
 2   Brix (Sweetness)    241 non-null    float64
 3   pH (Acidity)        241 non-null    float64
 4   Softness (1-5)      241 non-null    float64
 5   HarvestTime (days)  241 non-null    int64  
 6   Ripeness (1-5)      241 non-null    float64
 7   Color               241 non-null    object 
 8   Variety             241 non-null    object 
 9   Blemishes (Y/N)     241 non-null    object 
 10  Quality (1-5)       241 non-null    float64
dtypes: float64(6), int64(2), object(3)
memory usage: 20.8+ KB


In [50]:
# Summary statistics of the numeric columns, transposed so each feature is one row
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Size (cm),241.0,7.844813,1.086002,6.0,6.9,7.8,8.7,10.0
Weight (g),241.0,205.128631,56.461012,100.0,155.0,205.0,252.0,300.0
Brix (Sweetness),241.0,10.907884,2.760446,5.5,8.5,11.0,13.4,16.0
pH (Acidity),241.0,3.4739,0.421007,2.8,3.2,3.4,3.8,4.4
Softness (1-5),241.0,3.072614,1.32363,1.0,2.0,3.0,4.0,5.0
HarvestTime (days),241.0,15.344398,5.323852,4.0,11.0,15.0,20.0,25.0
Ripeness (1-5),241.0,3.599585,1.205214,1.0,3.0,4.0,4.5,5.0
Quality (1-5),241.0,3.817427,1.01441,1.0,3.0,4.0,4.5,5.0


In [51]:
# Number of missing values in each column
df.isnull().sum()

Size (cm)             0
Weight (g)            0
Brix (Sweetness)      0
pH (Acidity)          0
Softness (1-5)        0
HarvestTime (days)    0
Ripeness (1-5)        0
Color                 0
Variety               0
Blemishes (Y/N)       0
Quality (1-5)         0
dtype: int64

In [52]:
# Number of rows that exactly repeat an earlier row
df.duplicated().sum()

np.int64(0)

In [53]:
# Number of distinct values in each column
# NOTE(review): the saved output lists 12 distinct labels for 'Blemishes (Y/N)'
# (the preview shows values such as 'Y (Minor)'), so it is not a plain Y/N flag.
df.nunique()

Size (cm)              41
Weight (g)            131
Brix (Sweetness)       82
pH (Acidity)           44
Softness (1-5)          9
HarvestTime (days)     22
Ripeness (1-5)          8
Color                   5
Variety                24
Blemishes (Y/N)        12
Quality (1-5)           8
dtype: int64

## Data Preprocessing

In [54]:
# Label encoding categorical columns
#
# Each column gets its own fitted LabelEncoder, kept in `encoders`, so the
# integer codes can be mapped back to the original labels later via
# encoders[col].inverse_transform(...). Refitting one shared encoder would
# retain only the mapping of the last column it was fitted on.

from sklearn.preprocessing import LabelEncoder

categorical_cols = ['Color', 'Variety', 'Blemishes (Y/N)']

encoders = {}
for col in categorical_cols:
    # After the loop `le` refers to the 'Blemishes (Y/N)' encoder.
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    encoders[col] = le

In [55]:
# Type conversion for dependent variable
# NOTE(review): casting float -> int truncates, so half-point ratings
# (e.g. the 4.5 and 3.5 values visible in the preview) drop to the lower
# whole number. Confirm truncation, rather than rounding, is intended.

df['Quality (1-5)'] = df['Quality (1-5)'].astype('int')

In [56]:
# Separate the target series from the remaining predictor columns.
y = df['Quality (1-5)']
X = df.drop(columns='Quality (1-5)')

In [57]:
# Bin edges placed halfway between consecutive quality scores. np.digitize
# returns, for each value, the index of the bin it falls into (0 for values
# below the first edge, len(thresholds) for values at or above the last one).
thresholds = [1.5, 2.5, 3.5, 4.5]
y_discrete = np.digitize(y, thresholds, right=False)

In [58]:
# Train-Test Split

from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_discrete,
    test_size=0.2,
    random_state=42,
)


In [None]:
# Standardize features

from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training rows only, then apply the
# same transformation to both the training and the test features.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

## Model Training

In [60]:
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import r2_score

In [61]:
# Seed the tree-based and ensemble estimators so the accuracy scores can be
# reproduced from run to run; 42 matches the seed used for the train/test
# split. The linear models and the SVC are left at their defaults.
MODEL_SEED = 42

logistic_clf = LogisticRegression()
ridge_clf = RidgeClassifier()
xgboost_clf = XGBClassifier(random_state=MODEL_SEED)
random_forest_clf = RandomForestClassifier(random_state=MODEL_SEED)
ada_boost_clf = AdaBoostClassifier(random_state=MODEL_SEED)
grad_boost_clf = GradientBoostingClassifier(random_state=MODEL_SEED)
bagging_clf = BaggingClassifier(random_state=MODEL_SEED)
decision_tree_clf = DecisionTreeClassifier(random_state=MODEL_SEED)
svm_clf = SVC()

In [63]:
# Candidate classifiers, evaluated in this order.
model_li = [
    logistic_clf,
    ridge_clf,
    xgboost_clf,
    random_forest_clf,
    ada_boost_clf,
    grad_boost_clf,
    bagging_clf,
    decision_tree_clf,
    svm_clf,
]

# Fit each model on the scaled training data and record its accuracy on the
# scaled test data; scores[i] belongs to model_li[i].
scores = []
for model in model_li:
    model.fit(X_train_scaled, y_train)
    test_predictions = model.predict(X_test_scaled)
    scores.append(accuracy_score(y_test, test_predictions))

print(scores)

[0.6326530612244898, 0.5918367346938775, 0.6326530612244898, 0.6326530612244898, 0.5306122448979592, 0.673469387755102, 0.5714285714285714, 0.673469387755102, 0.6122448979591837]
