In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix,confusion_matrix, precision_score, recall_score
import numpy as np
from sklearn.preprocessing import MinMaxScaler, StandardScaler



In [2]:
def preprocess_data(df, axis=0):
    """Return a cleaned copy of ``df`` without modifying the caller's frame.

    Rows containing any null value are dropped, then exact duplicate rows are
    dropped, and finally the index is reset to a fresh 0..n-1 range.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to clean. It is left untouched.
    axis : int, optional
        Accepted for backward compatibility; it is not used by the cleaning steps.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the cleaned rows.
    """
    # Each step returns a new object, so the input frame keeps its original rows
    # and re-running the calling cell always starts from the same raw data.
    return df.dropna().drop_duplicates().reset_index(drop=True)


def scalerfit_data(X_train):
    """Fit a new StandardScaler on ``X_train`` and return the fitted scaler."""
    from sklearn.preprocessing import StandardScaler

    # fit() hands back the scaler itself, now holding the statistics of X_train
    fitted = StandardScaler().fit(X_train)
    return fitted


def standardize_data(X_train):
    """Fit a new StandardScaler on ``X_train`` and return the transformed data."""
    # Fitting and transforming happen in one call on a throwaway scaler
    scaled = StandardScaler().fit_transform(X_train)
    return scaled


# Report accuracy plus weighted precision and recall for a set of predictions
def print_metrics(y_test, y_pred):
    """Print accuracy, weighted precision and weighted recall.

    ``y_test`` holds the true labels and ``y_pred`` the predicted labels.
    Nothing is returned; the three scores are written to stdout.
    """
    # All three scores are computed before anything is printed
    scores = (
        ('Accuracy:', accuracy_score(y_test, y_pred)),
        ('Precision:', precision_score(y_test, y_pred, average='weighted')),
        ('Recall:', recall_score(y_test, y_pred, average='weighted')),
    )
    for label, value in scores:
        print(label, value)

In [3]:
def nb_train(X_train, X_test, y_train, y_tes):
    """Fit a Gaussian Naive Bayes classifier and print its test-set metrics.

    Parameters
    ----------
    X_train, y_train
        Training features and labels used to fit the classifier.
    X_test
        Features the fitted classifier predicts on.
    y_tes
        True labels for ``X_test``. The parameter name is kept as-is so existing
        callers are unaffected.

    Returns
    -------
    GaussianNB
        The fitted classifier.
    """
    # Define and fit the NB classifier
    nb = GaussianNB()
    nb.fit(X_train, y_train)
    # Predict labels for test data
    y_pred = nb.predict(X_test)
    # Evaluate against the labels passed in by the caller, not a notebook-level
    # `y_test` that may be undefined or belong to a different split.
    print_metrics(y_tes, y_pred)
    return nb

In [4]:
# Relative path of the features CSV read by this notebook
file_path = '../../csv/features_data-Sheet1.csv'
# Load the table; `df_` is the input to the cleaning step in a later cell
df_ = pd.read_csv(file_path)
# Show the loaded frame as the cell output
df_

Unnamed: 0,length,width,diameter,area,perimeter,rectangularity,circularity,ellipticity,aspect ratio,form factor,narrow factor,perimeter ratio of diameter,perimeter Ratio of Physiological Length and Physiological Width,Texture,Species
0,380.0,305.0,413.762665,70748.5,1269.207273,1.245902,0.551902,1.908372,1.245902,0.551902,1.088849,3.067477,1.852857,7.0,0.0
1,377.0,208.0,373.708954,56552.0,1043.234627,1.812500,0.652971,1.858469,1.812500,0.652971,0.991270,2.791570,1.783307,1.0,0.0
2,386.0,146.0,393.509216,33447.0,975.352375,2.643836,0.441819,3.276360,2.643836,0.441819,1.019454,2.478601,1.833369,8.0,0.0
3,387.0,317.0,408.597290,70679.0,1191.768679,1.220820,0.625340,1.613881,1.220820,0.625340,1.055807,2.916732,1.692853,9.0,0.0
4,387.0,182.0,384.071136,54103.0,1032.038666,0.470284,0.638321,2.142146,2.126374,0.638321,0.992432,2.687103,1.813776,9.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4173,217.0,108.0,226.773682,8754.0,536.014280,2.009259,0.382881,4.527912,2.009259,0.382881,1.045040,2.363653,1.649275,9.0,3.0
4174,385.0,147.0,387.669403,32310.5,952.222429,0.381818,0.447792,3.336835,2.619048,0.447792,1.006934,2.456274,1.789892,1.0,3.0
4175,379.0,107.0,352.927338,23339.0,903.754395,3.542056,0.359080,4.398380,3.542056,0.359080,0.931207,2.560738,1.859577,1.0,3.0
4176,387.0,137.0,392.361908,36179.0,975.470123,0.354005,0.477792,3.154620,2.824818,0.477792,1.013855,2.486149,1.861584,1.0,3.0


In [5]:
# Clean the loaded table with preprocess_data and show the result as the cell output
df = preprocess_data(df_)
df

Unnamed: 0,length,width,diameter,area,perimeter,rectangularity,circularity,ellipticity,aspect ratio,form factor,narrow factor,perimeter ratio of diameter,perimeter Ratio of Physiological Length and Physiological Width,Texture,Species
0,380.0,305.0,413.762665,70748.5,1269.207273,1.245902,0.551902,1.908372,1.245902,0.551902,1.088849,3.067477,1.852857,7.0,0.0
1,377.0,208.0,373.708954,56552.0,1043.234627,1.812500,0.652971,1.858469,1.812500,0.652971,0.991270,2.791570,1.783307,1.0,0.0
2,386.0,146.0,393.509216,33447.0,975.352375,2.643836,0.441819,3.276360,2.643836,0.441819,1.019454,2.478601,1.833369,8.0,0.0
3,387.0,317.0,408.597290,70679.0,1191.768679,1.220820,0.625340,1.613881,1.220820,0.625340,1.055807,2.916732,1.692853,9.0,0.0
4,387.0,182.0,384.071136,54103.0,1032.038666,0.470284,0.638321,2.142146,2.126374,0.638321,0.992432,2.687103,1.813776,9.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3063,217.0,108.0,226.773682,8754.0,536.014280,2.009259,0.382881,4.527912,2.009259,0.382881,1.045040,2.363653,1.649275,9.0,3.0
3064,385.0,147.0,387.669403,32310.5,952.222429,0.381818,0.447792,3.336835,2.619048,0.447792,1.006934,2.456274,1.789892,1.0,3.0
3065,379.0,107.0,352.927338,23339.0,903.754395,3.542056,0.359080,4.398380,3.542056,0.359080,0.931207,2.560738,1.859577,1.0,3.0
3066,387.0,137.0,392.361908,36179.0,975.470123,0.354005,0.477792,3.154620,2.824818,0.477792,1.013855,2.486149,1.861584,1.0,3.0


In [6]:
# Pick the feature columns and the target, then hold out 20% of the rows for testing
feature_columns = [
    'area',
    'circularity',
    'ellipticity',
    'form factor',
    'perimeter',
    'perimeter ratio of diameter',
    'perimeter Ratio of Physiological Length and Physiological Width',
]
X = df[feature_columns]
y = df['Species']
# Fixed random_state keeps the split reproducible across runs
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [7]:
import os

# function to save dataframe in csv folder
def save_dataframe(df, file_path):
    """Write ``df`` to ``file_path`` as CSV unless a file is already there.

    An existing file is never overwritten: the function prints a notice and
    leaves it as-is, so a file left over from an earlier run is kept. When the
    file does not exist, missing parent directories are created before writing.
    The index is not written.

    Parameters
    ----------
    df : pandas.DataFrame or pandas.Series
        Data to write.
    file_path : str
        Destination CSV path.

    Returns
    -------
    None
    """
    if os.path.exists(file_path):
        print(f'File {file_path} already exists. Skipping.')
        return None

    # to_csv fails when the target directory is missing, so create it first.
    # dirname is '' for a bare file name, in which case there is nothing to create.
    parent_dir = os.path.dirname(file_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    df.to_csv(file_path, index=False)
    print(f'File {file_path} saved successfully.')
    return None


# Write each train/test split to its own CSV file (existing files are skipped)
split_outputs = (
    (X_train, '../../csv/train_test/X_train_features.csv'),
    (X_test, '../../csv/train_test/X_test_features.csv'),
    (y_train, '../../csv/train_test/y_train_features.csv'),
    (y_test, '../../csv/train_test/y_test_features.csv'),
)
for split_data, csv_path in split_outputs:
    save_dataframe(split_data, csv_path)

import numpy as np

# If the data has already been split into train and test sets, the following code loads it.
split_csv_paths = [
    '../../csv/train_test/X_train_features.csv',
    '../../csv/train_test/X_test_features.csv',
    '../../csv/train_test/y_train_features.csv',
    '../../csv/train_test/y_test_features.csv',
]
X_train, X_test, y_train_frame, y_test_frame = [
    pd.read_csv(csv_path) for csv_path in split_csv_paths
]

# Flatten the label frames into 1-D arrays
y_train = y_train_frame.values.ravel()
y_test = y_test_frame.values.ravel()

File ../../csv/train_test/X_train_features.csv saved successfully.
File ../../csv/train_test/X_test_features.csv saved successfully.
File ../../csv/train_test/y_train_features.csv saved successfully.
File ../../csv/train_test/y_test_features.csv saved successfully.


# Naive Bayes

In [8]:
# NOTE(review): `X_standard_scaled` is computed here but is not passed to `nb_train`;
# the classifier below is fitted and evaluated on the unscaled `X_train` / `X_test`.
X_standard_scaled = standardize_data(X_train)
nb = nb_train(X_train, X_test, y_train, y_test)

Accuracy: 0.6351791530944625
Precision: 0.623010282528313
Recall: 0.6351791530944625


# Save/Load Model

In [9]:
import joblib

# Save the trained Naive Bayes model
joblib.dump(nb, '../../models/plant_prediction_model_nb.pkl')
# Fit a StandardScaler on the training features and save it as well
fitted_scaler = scalerfit_data(X_train)
joblib.dump(fitted_scaler, '../../models/X_standard_scaler.pkl')

['../../models/X_standard_scaler.pkl']

In [10]:
import joblib
from sklearn import svm

# Load the saved Naive Bayes model
loaded_model = joblib.load('../../models/plant_prediction_model_nb.pkl')

# Make predictions on the test data
y_pred = loaded_model.predict(X_test)

# Evaluate the reloaded model on the test set
print_metrics(y_test, y_pred)

Accuracy: 0.6351791530944625
Precision: 0.623010282528313
Recall: 0.6351791530944625


In [11]:
# Raw (unscaled) measurements for one sample, in the column order used for training.
# Naming the columns lets scikit-learn match them against the names seen during fit.
sample_columns = [
    'area',
    'circularity',
    'ellipticity',
    'form factor',
    'perimeter',
    'perimeter ratio of diameter',
    'perimeter Ratio of Physiological Length and Physiological Width',
]
new_data = pd.DataFrame(
    [[70748.5, 0.5519020363926462, 1.908372278751408, 0.5519020363926462,
      1269.207272648811, 3.067476552718655, 1.8528573323340312]],
    columns=sample_columns,
)

# Standardized version of the sample, kept available for any model that is
# trained on standardized features.
X_standard_scaler = joblib.load('../../models/X_standard_scaler.pkl')
new_scaled_data = X_standard_scaler.transform(new_data)

# Make predictions. `nb` was fitted on the unscaled training features, so it must be
# given the raw sample; standardized values are on a different scale from what the
# model learned and would not be comparable to its per-class statistics.
predictions = nb.predict(new_data)
# Print the predicted class labels
print("Predicted Class Labels: ", predictions)

Predicted Class Labels:  [0.]


