# **Importing required modules**

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
banana=pd.read_csv("/content/banana_quality_dataset.csv")

In [4]:
banana.shape

(1000, 16)

In [5]:
banana.head()

Unnamed: 0,sample_id,variety,region,quality_score,quality_category,ripeness_index,ripeness_category,sugar_content_brix,firmness_kgf,length_cm,weight_g,harvest_date,tree_age_years,altitude_m,rainfall_mm,soil_nitrogen_ppm
0,1,Manzano,Colombia,1.88,Processing,2.11,Turning,16.83,3.53,21.44,146.92,2023-10-16,13.7,58.2,2440.5,183.6
1,2,Plantain,Guatemala,2.42,Processing,4.25,Ripe,16.73,4.09,26.11,160.48,2023-10-14,5.1,280.2,2374.6,109.8
2,3,Burro,Ecuador,3.57,Premium,6.24,Overripe,21.34,1.63,25.2,225.27,2023-09-08,17.7,1246.9,1191.5,147.7
3,4,Manzano,Ecuador,2.21,Processing,5.39,Ripe,16.75,3.31,13.08,137.8,2023-10-07,13.0,1150.2,2845.1,92.8
4,5,Red Dacca,Ecuador,2.35,Processing,5.84,Ripe,16.9,3.07,12.98,227.84,2023-10-02,4.8,526.0,2136.9,129.7


# **Describe stats numerical columns**

In [6]:
banana.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
sample_id,1000.0,500.5,288.819436,1.0,250.75,500.5,750.25,1000.0
quality_score,1000.0,2.46516,0.540909,0.92,2.09,2.44,2.85,3.89
ripeness_index,1000.0,4.04217,1.753211,1.02,2.4475,4.11,5.5625,7.0
sugar_content_brix,1000.0,18.51683,2.034812,15.0,16.8275,18.5,20.3125,21.98
firmness_kgf,1000.0,2.70684,1.289252,0.5,1.59,2.68,3.82,5.0
length_cm,1000.0,19.8821,5.736318,10.0,14.9025,19.89,24.8,29.95
weight_g,1000.0,164.73893,49.163073,81.05,122.3975,163.365,205.775,249.89
tree_age_years,1000.0,10.8796,5.215554,2.0,6.3,10.7,15.6,20.0
altitude_m,1000.0,723.5567,427.372855,0.4,353.95,726.0,1071.725,1498.4
rainfall_mm,1000.0,1972.9928,564.467912,1000.5,1498.75,1957.4,2431.8,2992.0


# **Describing categorical columns**

In [7]:
banana.describe(include='object').T

Unnamed: 0,count,unique,top,freq
variety,1000,8,Plantain,146
region,1000,8,Ecuador,137
quality_category,1000,4,Processing,506
ripeness_category,1000,4,Ripe,349
harvest_date,1000,61,2023-10-30,28


# **Checking null values or duplicates values**

In [8]:
banana.isnull().sum()

Unnamed: 0,0
sample_id,0
variety,0
region,0
quality_score,0
quality_category,0
ripeness_index,0
ripeness_category,0
sugar_content_brix,0
firmness_kgf,0
length_cm,0


In [9]:
banana.duplicated().sum()

0

# **Selecting input data columns**

In [10]:
X=banana[['variety', 'region','ripeness_index','sugar_content_brix','firmness_kgf','length_cm', 'weight_g']]
banana.shape

(1000, 16)

# **Selecting output data columns**

In [11]:
y=banana['quality_score']
y.shape

(1000,)

# **Spliting the data X and y**

In [12]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=42)
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)


(800, 7)
(800,)
(200, 7)
(200,)


# **Selecting numerical and categorical data column wise**

In [13]:
X_train_numerical=X_train.select_dtypes(include='number')
X_test_numerical=X_test.select_dtypes(include='number')
X_train_categorical=X_train.select_dtypes(include='object')
X_test_categorical=X_test.select_dtypes(include='object')
print(X_train_numerical.shape)
print(X_test_numerical.shape)
print(X_train_categorical.shape)
print(X_test_categorical.shape)

(800, 5)
(200, 5)
(800, 2)
(200, 2)


# **Transforming Numerical data using Standard scaler method**

In [14]:
from sklearn.preprocessing import StandardScaler
scaler=StandardScaler()
X_train_numerical_transform=scaler.fit_transform(X_train_numerical)
X_test_numerical_transform=scaler.fit_transform(X_test_numerical)
print(X_train_numerical_transform.shape)
print(X_test_numerical_transform.shape)

(800, 5)
(200, 5)


# **Transforming Categorical data using OrdinalEncoder**

In [15]:
from sklearn.preprocessing import OrdinalEncoder
coder=OrdinalEncoder()
X_train_categorical_transform=coder.fit_transform(X_train_categorical)
X_test_categorical_transform=coder.fit_transform(X_test_categorical)
print(X_train_categorical_transform.shape)
print(X_test_categorical_transform.shape)

(800, 2)
(200, 2)


# **Concatenate**

In [16]:
X_train_transform=np.concatenate([X_train_numerical_transform,X_train_categorical_transform],axis=1)
X_test_transform=np.concatenate([X_test_numerical_transform,X_test_categorical_transform],axis=1)
print(X_train_transform.shape)
print(X_test_transform.shape)

(800, 7)
(200, 7)


# **Building Ml Models**

## **k-nearest neighbor**

In [34]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import root_mean_squared_error
from sklearn.metrics import r2_score
knn_model=KNeighborsRegressor()
knn_model.fit(X_train_transform,y_train)
y_predict=knn_model.predict(X_test_transform)
score1=root_mean_squared_error(y_test,y_predict)
score2=r2_score(y_test,y_predict)
print(f'The accuracy score of knn algorithm for acuracy_score validation is : {score1}')
print('\n')
print(f'The accuracy score of knn algorithm for re_score validation is :{score2}')

The accuracy score of knn algorithm for acuracy_score validation is : 0.2161361607875924


The accuracy score of knn algorithm for re_score validation is :0.8448262300274747


# **Decision Tree Classifier**

In [35]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import root_mean_squared_error
from sklearn.metrics import r2_score
decision_model=DecisionTreeRegressor()
decision_model.fit(X_train_transform,y_train)
y_predict=decision_model.predict(X_test_transform)
score1=root_mean_squared_error(y_test,y_predict)
score2=r2_score(y_test,y_predict)
print(f'The accuracy score of Decision Tree classsifier using MSE validation  is :{score1}')
print('\n')
print(f'The accuracy score of Decision Tree classifier using r2 score is : {score2}')

The accuracy score of Decision Tree classsifier using MSE validation  is :0.1668652150689292


The accuracy score of Decision Tree classifier using r2 score is : 0.9075099379316082


# **Logistic Regression**

In [40]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import root_mean_squared_error
from sklearn.metrics import r2_score
regression_model=LinearRegression()
regression_model.fit(X_train_transform,y_train)
y_predict=regression_model.predict(X_test_transform)
score1=root_mean_squared_error(y_test,y_predict)
score2=r2_score(y_test,y_predict)
print(f'The accuracy score of Logistic Regression is :{score1}')
print('\n')
print(f'The accuracy score of Logistic Regression for r2_score validation metrics is :{score2}')

The accuracy score of Logistic Regression is :0.0856107764499139


The accuracy score of Logistic Regression for r2_score validation metrics is :0.975654409227677


# **Random Forest**

In [44]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import root_mean_squared_error
from sklearn.metrics import r2_score
forest_model=RandomForestRegressor()
forest_model.fit(X_train_transform,y_train)
y_predict=forest_model.predict(X_test_transform)
score1=root_mean_squared_error(y_test,y_predict)
score2=r2_score(y_test,y_predict)
print(f'The accuracy score of Random Forest aglorithm is : {score1}')
print('\n')
print(f'The accuracy score of Random Forest algorithm is :{score2}')

The accuracy score of Random Forest aglorithm is : 0.11011502940107692


The accuracu score of Random Forest algorithm is :0.9597230436843809


# **Ada Boost**

In [49]:
from sklearn.ensemble import AdaBoostRegressor
from sklearn.metrics import root_mean_squared_error
from sklearn.metrics import r2_score
ada_model=AdaBoostRegressor()
ada_model.fit(X_train_transform,y_train)
y_predict=ada_model.predict(X_test_transform)
score1=root_mean_squared_error(y_test,y_predict)
score2=r2_score(y_test,y_predict)
print(f'The accuracy score of Ada boost algorithm is : {score1}')
print('\n')
print(f'The accuracy score of Ada boost algorithm is : {score2}')

The accuracy score of Ada boost algorithm is : 0.16042270765460118


The accuracy score of Ada boost algorithm is : 0.9145139736551684


# **Extreme Boosting**

In [51]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import root_mean_squared_error
from sklearn.metrics import r2_score
ada_model=GradientBoostingRegressor()
ada_model.fit(X_train_transform,y_train)
y_predict=ada_model.predict(X_test_transform)
score1=root_mean_squared_error(y_test,y_predict)
score2=r2_score(y_test,y_predict)
print(f'The accuracy score of exxtreme boost algorithm is : {score1}')
print('\n')
print(f'The accuracy score of extreme boost algorithm is : {score2}')

The accuracy score of exxtreme boost algorithm is : 0.0754065208633561


The accuracy score of extreme boost algorithm is : 0.9811122053242316


# **Conclusion**



*   Among all the algorithms the best algorithm is
*   Logistic Regression scored 85 percentage of accuracy for root_mean_squared_error validation metrics


*  Extreme boost algorithm is scored  : 98 percentage accuracy for r2 score for metrics validation


