In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix,confusion_matrix, precision_score, recall_score
import numpy as np
from sklearn.preprocessing import MinMaxScaler, StandardScaler



In [2]:
def preprocess_data(df, axis=0):
    """Return a cleaned copy of ``df`` without touching the caller's frame.

    Cleaning steps, in order:
      1. drop every row that contains at least one null value,
      2. drop rows that are exact duplicates of an earlier row,
      3. renumber the index from 0 so it is contiguous again.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. It is NOT modified; earlier versions of this helper
        used ``inplace=True`` and silently altered the caller's object, which
        made re-running notebook cells non-idempotent.
    axis : int, default 0
        Accepted for backward compatibility only; it is currently ignored.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the cleaned rows with a fresh ``RangeIndex``.
    """
    return (
        df.dropna()
        .drop_duplicates()
        .reset_index(drop=True)
    )


def scalerfit_data(X_train):
    """Fit a new ``StandardScaler`` on ``X_train`` and return the fitted scaler.

    The data itself is not transformed here; call ``.transform`` on the
    returned object to scale features.
    """
    from sklearn.preprocessing import StandardScaler

    fitted = StandardScaler().fit(X_train)
    return fitted


def standardize_data(X_train):
    """Return ``X_train`` transformed by a ``StandardScaler`` fitted on it.

    The scaler is created inside the function and discarded afterwards, so
    only the transformed data is available to the caller.
    """
    scaled = StandardScaler().fit_transform(X_train)
    return scaled


def print_metrics(y_test, y_pred):
    """Print accuracy, weighted precision and weighted recall.

    Parameters
    ----------
    y_test : array-like
        True labels.
    y_pred : array-like
        Predicted labels, aligned with ``y_test``.

    Nothing is returned; the three scores are written to stdout, one per line.
    """
    # Compute every score first, then print them in a fixed order.
    scores = (
        ('Accuracy:', accuracy_score(y_test, y_pred)),
        ('Precision:', precision_score(y_test, y_pred, average='weighted')),
        ('Recall:', recall_score(y_test, y_pred, average='weighted')),
    )
    for label, value in scores:
        print(label, value)

In [3]:
def nb_train(X_train, X_test, y_train, y_tes):
    """Fit a Gaussian Naive Bayes classifier and print its test-set metrics.

    Parameters
    ----------
    X_train : array-like
        Training features.
    X_test : array-like
        Held-out features used for evaluation.
    y_train : array-like
        Training labels.
    y_tes : array-like
        Held-out labels matching ``X_test``. The parameter name is kept as-is
        (sic) so existing keyword callers keep working.

    Returns
    -------
    GaussianNB
        The fitted classifier.

    Notes
    -----
    The evaluation uses the ``y_tes`` argument. Previously the body read a
    name ``y_test`` that is not a parameter, so it silently picked up the
    notebook-level global (or raised ``NameError`` when none existed) and
    ignored the labels the caller passed in.
    """
    nb = GaussianNB()
    nb.fit(X_train, y_train)
    # Predict once and score those predictions against the supplied labels.
    y_pred = nb.predict(X_test)
    print_metrics(y_tes, y_pred)
    return nb

In [4]:
# Read the feature table from CSV into `df_` (the raw, uncleaned frame).
file_path = '../../csv/fin_features.csv'
df_ = pd.read_csv(file_path)
# Bare expression: the notebook renders the frame as this cell's output.
# NOTE(review): the rendered output shows `Unnamed: 0.1` / `Unnamed: 0`
# columns, which looks like a saved index being read back as data — confirm.
df_

Unnamed: 0.1,Unnamed: 0,length,width,diameter,area,perimeter,rectangularity,circularity,ellipticity,apect_ratio,...,perimeter_ratio_of_diameter,perimeter_ratio_of_PLW,fractal_dim,entropy,eccentricity,curvature,vein_density,color,teeth,species
0,0,177,146,180.917511,16650.0,539.612262,1.212329,0.718556,1.582544,1.212329,...,2.982643,1.670626,1.890963,4.896521,0.355155,0.158420,0.027938,151,58,0
1,1,168,97,173.085022,13001.0,467.362479,1.731959,0.747961,1.727679,1.731959,...,2.700190,1.763632,1.845597,4.855767,0.414043,0.224843,0.029066,231,37,0
2,2,181,66,164.964050,7281.5,451.605119,2.742424,0.448656,2.978130,2.742424,...,2.737597,1.828361,1.944664,4.720797,0.640082,0.152909,0.038162,141,43,0
3,3,187,152,189.208450,17753.0,572.357426,1.230263,0.681000,1.565289,1.230263,...,3.025010,1.688370,1.852114,4.062104,0.343600,0.143289,0.020292,215,47,0
4,4,287,133,289.048981,30347.0,766.901584,0.463415,0.648405,2.178899,2.157895,...,2.653189,1.825956,1.820155,4.927390,0.516427,0.414601,0.019083,195,48,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34092,34092,165,83,174.651871,5400.0,408.132031,1.987952,0.407382,4.333301,1.987952,...,2.336832,1.645694,1.940472,0.792677,0.765777,0.055295,0.043552,193,35,18
34093,34093,276,107,281.507416,18000.5,680.457932,0.387681,0.488531,3.204195,2.579439,...,2.417194,1.776653,1.921504,3.780989,0.689449,0.094697,0.022295,103,56,18
34094,34094,155,47,153.985474,5246.5,370.693432,3.297872,0.479788,3.589358,3.297872,...,2.407327,1.835116,1.955055,3.962493,0.713840,0.135895,0.045069,118,30,18
34095,34095,287,97,263.738831,17571.5,720.658941,0.337979,0.425167,3.402805,2.958763,...,2.732472,1.876716,1.910407,4.519591,0.611948,0.200701,0.023595,89,49,18


In [5]:
# Clean the raw frame: rows with nulls and duplicate rows are dropped and the
# index is renumbered (see preprocess_data).
df = preprocess_data(df_)
# Bare expression: display the cleaned frame as the cell output.
df

Unnamed: 0.1,Unnamed: 0,length,width,diameter,area,perimeter,rectangularity,circularity,ellipticity,apect_ratio,...,perimeter_ratio_of_diameter,perimeter_ratio_of_PLW,fractal_dim,entropy,eccentricity,curvature,vein_density,color,teeth,species
0,0,177,146,180.917511,16650.0,539.612262,1.212329,0.718556,1.582544,1.212329,...,2.982643,1.670626,1.890963,4.896521,0.355155,0.158420,0.027938,151,58,0
1,1,168,97,173.085022,13001.0,467.362479,1.731959,0.747961,1.727679,1.731959,...,2.700190,1.763632,1.845597,4.855767,0.414043,0.224843,0.029066,231,37,0
2,2,181,66,164.964050,7281.5,451.605119,2.742424,0.448656,2.978130,2.742424,...,2.737597,1.828361,1.944664,4.720797,0.640082,0.152909,0.038162,141,43,0
3,3,187,152,189.208450,17753.0,572.357426,1.230263,0.681000,1.565289,1.230263,...,3.025010,1.688370,1.852114,4.062104,0.343600,0.143289,0.020292,215,47,0
4,4,287,133,289.048981,30347.0,766.901584,0.463415,0.648405,2.178899,2.157895,...,2.653189,1.825956,1.820155,4.927390,0.516427,0.414601,0.019083,195,48,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33803,34092,165,83,174.651871,5400.0,408.132031,1.987952,0.407382,4.333301,1.987952,...,2.336832,1.645694,1.940472,0.792677,0.765777,0.055295,0.043552,193,35,18
33804,34093,276,107,281.507416,18000.5,680.457932,0.387681,0.488531,3.204195,2.579439,...,2.417194,1.776653,1.921504,3.780989,0.689449,0.094697,0.022295,103,56,18
33805,34094,155,47,153.985474,5246.5,370.693432,3.297872,0.479788,3.589358,3.297872,...,2.407327,1.835116,1.955055,3.962493,0.713840,0.135895,0.045069,118,30,18
33806,34095,287,97,263.738831,17571.5,720.658941,0.337979,0.425167,3.402805,2.958763,...,2.732472,1.876716,1.910407,4.519591,0.611948,0.200701,0.023595,89,49,18


In [6]:
# Columns used as model inputs; `species` is the prediction target.
feature_columns = [
    'area',
    'perimeter',
    'rectangularity',
    'circularity',
    'ellipticity',
    'apect_ratio',
    'form_factor',
    'narrow_factor',
    'perimeter_ratio_of_diameter',
    'perimeter_ratio_of_PLW',
    'fractal_dim',
    'entropy',
    'eccentricity',
    'curvature',
    'vein_density',
    'color',
    'teeth',
]
X = df[feature_columns]
y = df['species']

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [7]:
import os

# function to save dataframe in csv folder
def save_dataframe(df, file_path, overwrite=False):
    """Write ``df`` to ``file_path`` as CSV (without the index).

    Parameters
    ----------
    df : pandas.DataFrame or pandas.Series
        Object to persist; anything exposing ``to_csv`` works.
    file_path : str
        Destination path. Missing parent directories are created so a fresh
        checkout does not fail with ``OSError`` on the first run.
    overwrite : bool, default False
        When False (the original behaviour) an existing file is left untouched
        and a "Skipping" message is printed. Pass True to replace a file that
        may be stale, e.g. after changing the feature set or the split.

    Returns
    -------
    None
    """
    if os.path.exists(file_path) and not overwrite:
        print(f'File {file_path} already exists. Skipping.')
        return None

    # dirname is '' for a bare file name; makedirs('') would raise.
    parent_dir = os.path.dirname(file_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    df.to_csv(file_path, index=False)
    print(f'File {file_path} saved successfully.')
    return None


# Persist each split to disk. save_dataframe leaves an already-existing file
# untouched, so files written by an earlier run are not refreshed here.
split_files = {
    'X_train': '../../csv/train_test/X_train_features_19.csv',
    'X_test': '../../csv/train_test/X_test_features_19.csv',
    'y_train': '../../csv/train_test/y_train_features_19.csv',
    'y_test': '../../csv/train_test/y_test_features_19.csv',
}
for split_path, split_frame in zip(split_files.values(), (X_train, X_test, y_train, y_test)):
    save_dataframe(split_frame, split_path)

import numpy as np

# Reload the splits from disk; this also works when the split was produced by
# a previous run and only the CSV files are available.
X_train, X_test, y_train, y_test = (
    pd.read_csv(split_path) for split_path in split_files.values()
)

# The labels are read back as DataFrames; flatten them to 1-D arrays.
y_train = y_train.values.ravel()
y_test = y_test.values.ravel()

File ../../csv/train_test/X_train_features_19.csv saved successfully.
File ../../csv/train_test/X_test_features_19.csv saved successfully.
File ../../csv/train_test/y_train_features_19.csv saved successfully.
File ../../csv/train_test/y_test_features_19.csv saved successfully.


# Naive Bayes

In [8]:
# Standardised copy of the training features.
# NOTE(review): this array is not passed to nb_train below — the classifier is
# fit on the unscaled X_train, while a later cell scales a new sample before
# calling nb.predict. Confirm which representation the model is meant to see
# and make training, evaluation and inference agree.
X_standard_scaled = standardize_data(X_train)
# Fit Gaussian Naive Bayes and print its metrics on the held-out split.
nb = nb_train(X_train, X_test, y_train, y_test)

Accuracy: 0.2925170068027211
Precision: 0.2892206873964112
Recall: 0.2925170068027211


  _warn_prf(average, modifier, msg_start, len(result))


# Save/Load Model

In [9]:
import joblib

# Save the trained Naive Bayes model (`nb` from the training cell)
joblib.dump(nb, '../../models/plant_prediction_model_nb.pkl')
# Fit a StandardScaler on X_train and save it alongside the model
fitted_scaler = scalerfit_data(X_train)
joblib.dump(fitted_scaler, '../../models/X_standard_scaler.pkl')

['../../models/X_standard_scaler.pkl']

In [10]:
import joblib
# NOTE(review): `svm` is not referenced anywhere in this cell.
from sklearn import svm

# Load the saved Naive Bayes model
loaded_model = joblib.load('../../models/plant_prediction_model_nb.pkl')

# Make predictions on the test data (unscaled X_test, as in the training cell)
y_pred = loaded_model.predict(X_test)

# Evaluate the model on the test set
# NOTE(review): the metrics recorded under this cell differ from those recorded
# under the training cell even though the same model and split are named;
# presumably the outputs come from different kernel states — re-run from a
# fresh kernel to confirm.
print_metrics(y_test, y_pred)

Accuracy: 0.6351791530944625
Precision: 0.623010282528313
Recall: 0.6351791530944625


In [11]:
# Reload the persisted scaler and scale one hand-written sample.
# NOTE(review): the sample below has 7 values, whereas the feature selection
# cell picks 17 columns for X_train (which the saved scaler was fitted on) —
# confirm the feature set this sample is meant to match.
X_standard_scaler = joblib.load('../../models/X_standard_scaler.pkl')
new_scaled_data = X_standard_scaler.transform(np.array([[70748.5,0.5519020363926462,1.908372278751408,0.5519020363926462,1269.207272648811,3.067476552718655,1.8528573323340312]]))

# Make predictions
# NOTE(review): this uses the in-memory `nb`, not `loaded_model`, and feeds it
# scaled input although the training cell fits `nb` on unscaled features.
predictions = nb.predict(new_scaled_data)
# Print the predicted class labels
print("Predicted Class Labels: ", predictions)

Predicted Class Labels:  [0.]


