In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix,confusion_matrix, precision_score, recall_score
import numpy as np
from sklearn.preprocessing import MinMaxScaler, StandardScaler



In [2]:
def preprocess_data(df, axis=0):
    """Return a cleaned copy of ``df`` without modifying the input.

    Rows containing any null value are removed, then exact duplicate rows are
    removed, and finally the index is renumbered from 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to clean. It is left untouched so the raw data stays available
        and re-running the calling cell gives the same result.
    axis : int, default 0
        Accepted for backward compatibility only; it is not used.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the cleaned rows.
    """
    # Each step returns a new frame, so the caller's object is never mutated.
    cleaned = df.dropna().drop_duplicates().reset_index(drop=True)
    return cleaned


def scalerfit_data(X_train):
    """Fit a new StandardScaler on ``X_train`` and return the fitted scaler.

    The data itself is not transformed here; the returned object can be used
    later to transform other inputs with the statistics learned from
    ``X_train``.
    """
    return StandardScaler().fit(X_train)


def standardize_data(X_train):
    """Return ``X_train`` standardized by a StandardScaler fitted on that same data.

    The scaler is created inside the function and not returned, so only the
    transformed values are available to the caller.
    """
    return StandardScaler().fit_transform(X_train)


def set_optimal_perimeters():
    """Build the hyperparameter grid used for tuning.

    Returns
    -------
    dict
        Keys ``'C'``, ``'kernel'``, ``'degree'`` and ``'gamma'`` (in that
        order). The parameter names presumably target an sklearn SVC; confirm
        against the caller.

    Notes
    -----
    An earlier, now disabled, single-point grid pinned ``C=101`` and
    ``gamma=2.976351441631316``.
    """
    # C candidates 1, 51, 101: the upper bound is padded by one step so the
    # first value past `c_max` is still generated.
    c_min, c_max, c_step = 1, 100, 50
    c_values = np.arange(c_min, c_max + c_step, c_step)

    # 30 log-spaced gamma candidates from 1e-3 to 1e3, tried in addition to
    # the two named strategies.
    gamma_values = [*np.logspace(-3, 3, 30)]

    return dict(
        C=c_values,
        kernel=['rbf'],
        degree=[2],
        gamma=['scale', 'auto'] + gamma_values,
    )


# Report accuracy, weighted precision and weighted recall for a set of predictions
def print_metrics(y_test, y_pred):
    """Print accuracy, precision and recall of ``y_pred`` against ``y_test``.

    Precision and recall use ``average='weighted'``. All three scores are
    computed before anything is printed. Nothing is returned.
    """
    weighted = {'average': 'weighted'}
    scores = (
        ('Accuracy:', accuracy_score(y_test, y_pred)),
        ('Precision:', precision_score(y_test, y_pred, **weighted)),
        ('Recall:', recall_score(y_test, y_pred, **weighted)),
    )
    for label, value in scores:
        print(label, value)

In [3]:
def rf_train(X_train, X_test, y_train, y_test, random_state=42, n_jobs=None):
    """Tune a random forest with 5-fold grid search and report test metrics.

    Parameters
    ----------
    X_train, y_train
        Training features and labels used for the cross-validated search.
    X_test, y_test
        Held-out data, used only for the final metric report.
    random_state : int or None, default 42
        Seed passed to the forest so the search and the returned model are
        reproducible. Pass ``None`` for unseeded behaviour.
    n_jobs : int or None, default None
        Forwarded to ``GridSearchCV``; ``-1`` uses all cores for the
        162-combination search.

    Returns
    -------
    RandomForestClassifier
        The best estimator found, fitted on all of ``X_train``.
    """
    # Hyperparameter grid to search over
    param_grid = {
        'n_estimators': [10, 50, 100],  # Number of trees in the forest
        'max_depth': [None, 10, 20],  # Maximum depth of the trees
        'min_samples_split': [2, 5, 10],  # Minimum number of samples required to split a node
        'min_samples_leaf': [1, 2, 4],  # Minimum number of samples required to be at a leaf node
        'max_features': ['sqrt', 'log2']  # Number of features to consider for the best split
    }
    grid_search = GridSearchCV(
        RandomForestClassifier(random_state=random_state),
        param_grid,
        cv=5,
        n_jobs=n_jobs,
    )
    grid_search.fit(X_train, y_train)

    print("Best Parameters: ", grid_search.best_params_)
    print("Best Accuracy: ", grid_search.best_score_)

    # GridSearchCV refits the best configuration on the whole training set by
    # default, so a second manual fit is unnecessary; reusing the refitted
    # estimator also keeps the returned model tied to the reported search.
    best_rf = grid_search.best_estimator_

    # Evaluate the model on the test set
    print_metrics(y_test, best_rf.predict(X_test))
    return best_rf

In [4]:
# Load the feature table from CSV; the bare `df_` on the last line renders it as the cell output.
# NOTE(review): the output shows 'Unnamed: 0.1' and 'Unnamed: 0' columns — presumably index
# columns written by earlier to_csv calls; confirm and consider dropping them at the source.
file_path = '../../csv/fin_features.csv'
df_ = pd.read_csv(file_path)
df_

Unnamed: 0.1,Unnamed: 0,length,width,diameter,area,perimeter,rectangularity,circularity,ellipticity,apect_ratio,...,perimeter_ratio_of_diameter,perimeter_ratio_of_PLW,fractal_dim,entropy,eccentricity,curvature,vein_density,color,teeth,species
0,0,177,146,180.917511,16650.0,539.612262,1.212329,0.718556,1.582544,1.212329,...,2.982643,1.670626,1.890963,4.896521,0.355155,0.158420,0.027938,151,58,0
1,1,168,97,173.085022,13001.0,467.362479,1.731959,0.747961,1.727679,1.731959,...,2.700190,1.763632,1.845597,4.855767,0.414043,0.224843,0.029066,231,37,0
2,2,181,66,164.964050,7281.5,451.605119,2.742424,0.448656,2.978130,2.742424,...,2.737597,1.828361,1.944664,4.720797,0.640082,0.152909,0.038162,141,43,0
3,3,187,152,189.208450,17753.0,572.357426,1.230263,0.681000,1.565289,1.230263,...,3.025010,1.688370,1.852114,4.062104,0.343600,0.143289,0.020292,215,47,0
4,4,287,133,289.048981,30347.0,766.901584,0.463415,0.648405,2.178899,2.157895,...,2.653189,1.825956,1.820155,4.927390,0.516427,0.414601,0.019083,195,48,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34092,34092,165,83,174.651871,5400.0,408.132031,1.987952,0.407382,4.333301,1.987952,...,2.336832,1.645694,1.940472,0.792677,0.765777,0.055295,0.043552,193,35,18
34093,34093,276,107,281.507416,18000.5,680.457932,0.387681,0.488531,3.204195,2.579439,...,2.417194,1.776653,1.921504,3.780989,0.689449,0.094697,0.022295,103,56,18
34094,34094,155,47,153.985474,5246.5,370.693432,3.297872,0.479788,3.589358,3.297872,...,2.407327,1.835116,1.955055,3.962493,0.713840,0.135895,0.045069,118,30,18
34095,34095,287,97,263.738831,17571.5,720.658941,0.337979,0.425167,3.402805,2.958763,...,2.732472,1.876716,1.910407,4.519591,0.611948,0.200701,0.023595,89,49,18


In [5]:
# Clean the loaded table (null rows and duplicate rows removed, index renumbered) and display it.
df = preprocess_data(df_)
df

Unnamed: 0.1,Unnamed: 0,length,width,diameter,area,perimeter,rectangularity,circularity,ellipticity,apect_ratio,...,perimeter_ratio_of_diameter,perimeter_ratio_of_PLW,fractal_dim,entropy,eccentricity,curvature,vein_density,color,teeth,species
0,0,177,146,180.917511,16650.0,539.612262,1.212329,0.718556,1.582544,1.212329,...,2.982643,1.670626,1.890963,4.896521,0.355155,0.158420,0.027938,151,58,0
1,1,168,97,173.085022,13001.0,467.362479,1.731959,0.747961,1.727679,1.731959,...,2.700190,1.763632,1.845597,4.855767,0.414043,0.224843,0.029066,231,37,0
2,2,181,66,164.964050,7281.5,451.605119,2.742424,0.448656,2.978130,2.742424,...,2.737597,1.828361,1.944664,4.720797,0.640082,0.152909,0.038162,141,43,0
3,3,187,152,189.208450,17753.0,572.357426,1.230263,0.681000,1.565289,1.230263,...,3.025010,1.688370,1.852114,4.062104,0.343600,0.143289,0.020292,215,47,0
4,4,287,133,289.048981,30347.0,766.901584,0.463415,0.648405,2.178899,2.157895,...,2.653189,1.825956,1.820155,4.927390,0.516427,0.414601,0.019083,195,48,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33803,34092,165,83,174.651871,5400.0,408.132031,1.987952,0.407382,4.333301,1.987952,...,2.336832,1.645694,1.940472,0.792677,0.765777,0.055295,0.043552,193,35,18
33804,34093,276,107,281.507416,18000.5,680.457932,0.387681,0.488531,3.204195,2.579439,...,2.417194,1.776653,1.921504,3.780989,0.689449,0.094697,0.022295,103,56,18
33805,34094,155,47,153.985474,5246.5,370.693432,3.297872,0.479788,3.589358,3.297872,...,2.407327,1.835116,1.955055,3.962493,0.713840,0.135895,0.045069,118,30,18
33806,34095,287,97,263.738831,17571.5,720.658941,0.337979,0.425167,3.402805,2.958763,...,2.732472,1.876716,1.910407,4.519591,0.611948,0.200701,0.023595,89,49,18


In [6]:
# Model inputs: the derived descriptors only. The index columns and the
# length / width / diameter columns are not used as features.
feature_columns = [
    'area', 'perimeter', 'rectangularity', 'circularity', 'ellipticity',
    'apect_ratio', 'form_factor', 'narrow_factor',
    'perimeter_ratio_of_diameter', 'perimeter_ratio_of_PLW', 'fractal_dim',
    'entropy', 'eccentricity', 'curvature', 'vein_density', 'color', 'teeth',
]
X = df[feature_columns]
# Target: the species label
y = df['species']
# Hold out 20% of the rows for testing; the seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [7]:
import os

# Save a DataFrame/Series as CSV unless the target file already exists
def save_dataframe(df, file_path):
    """Write ``df`` to ``file_path`` as CSV (without the index) if no file is there yet.

    Parameters
    ----------
    df : pandas.DataFrame or pandas.Series
        Object to persist; anything exposing ``to_csv`` works.
    file_path : str
        Destination path. Missing parent directories are created.

    Returns
    -------
    None
        A message is printed saying whether the file was written or skipped.
        An existing file is never overwritten.
    """
    if os.path.exists(file_path):
        print(f'File {file_path} already exists. Skipping.')
        return None

    # to_csv does not create directories; make sure the parent exists first.
    # A bare filename has an empty dirname, which makedirs would reject.
    parent_dir = os.path.dirname(file_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    df.to_csv(file_path, index=False)
    print(f'File {file_path} saved successfully.')
    return None


# Persist each split to disk; save_dataframe leaves files that already exist untouched
split_targets = (
    (X_train, '../../csv/train_test/finX_train_features.csv'),
    (X_test, '../../csv/train_test/finX_test_features.csv'),
    (y_train, '../../csv/train_test/finy_train_features.csv'),
    (y_test, '../../csv/train_test/finy_test_features.csv'),
)
for split_data, split_path in split_targets:
    save_dataframe(split_data, split_path)

import numpy as np

# Reload the four splits from disk so the following cells work from the persisted copies.
# NOTE: save_dataframe skips files that already exist, so these files may come from an
# earlier run rather than from the split computed above.
split_paths = [
    '../../csv/train_test/finX_train_features.csv',
    '../../csv/train_test/finX_test_features.csv',
    '../../csv/train_test/finy_train_features.csv',
    '../../csv/train_test/finy_test_features.csv',
]
X_train, X_test, y_train, y_test = [pd.read_csv(path) for path in split_paths]

# read_csv returns the targets as frames; flatten them to 1-D label arrays
y_train = y_train.values.ravel()
y_test = y_test.values.ravel()

File ../../csv/train_test/finX_train_features.csv saved successfully.
File ../../csv/train_test/finX_test_features.csv saved successfully.
File ../../csv/train_test/finy_train_features.csv saved successfully.
File ../../csv/train_test/finy_test_features.csv saved successfully.


# Random Forest

In [8]:
# NOTE(review): X_standard_scaled is computed here but is not passed to rf_train below;
# the forest is fitted on the unscaled X_train.
X_standard_scaled = standardize_data(X_train)
# Grid-search and fit the random forest, printing the best parameters and the test metrics.
rf = rf_train(X_train, X_test, y_train, y_test)

Best Parameters:  {'max_depth': None, 'max_features': 'log2', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best Accuracy:  0.8948828491160586
Accuracy: 0.9174800354924578
Precision: 0.9189644138055827
Recall: 0.9174800354924578


# Save/Load Model

In [9]:
import joblib

# Save the tuned random forest model returned by rf_train (not an SVM)
joblib.dump(rf, '../../models/plant_prediction_model_rf_19.pkl')
# Fit a StandardScaler on the training features and save it next to the model
fitted_scaler = scalerfit_data(X_train)
joblib.dump(fitted_scaler, '../../models/X_standard_scaler_19.pkl')

['../../models/X_standard_scaler_19.pkl']

In [10]:
import joblib
from sklearn import svm

# Load the saved random forest model (not an SVM)
# NOTE: joblib.load unpickles the file; only load model files from a trusted source.
loaded_model = joblib.load('../../models/plant_prediction_model_rf_19.pkl')

# Make predictions on the test data
y_pred = loaded_model.predict(X_test)

# Evaluate the reloaded model on the test set; the recorded output matches the metrics
# printed when the model was trained
print_metrics(y_test, y_pred)

Accuracy: 0.9174800354924578
Precision: 0.9189644138055827
Recall: 0.9174800354924578


# Single Input Testing

In [21]:
# One sample with the 17 feature values in the same column order as X_train.
# Building a DataFrame with the training column names keeps the input
# consistent with what the scaler and the forest were fitted on.
new_data = pd.DataFrame(
    [[4270.5,356.8355673333515594,0.4,0.4214560751723232364247,4.276187519589548,2.5,0.4214560751764247,1.016973164876302,2.339200152500642,1.6992169874055045,1.943252101849597,0.8767040812938101,0.7591508887491394,0.07618301789006197,0.05087847730600293,169,33]],
    columns=X_train.columns,
)

# The persisted scaler is still loaded and applied so `new_scaled_data` remains
# available for scale-sensitive models, but the forest must not be fed it.
X_standard_scaler = joblib.load('../../models/X_standard_scaler_19.pkl')
new_scaled_data = X_standard_scaler.transform(new_data)

# Make predictions. The random forest was fitted on the raw (unscaled)
# training features, so the sample has to be passed unscaled too; standardized
# values would fall in a different numeric range from the learned thresholds.
predictions = rf.predict(new_data)

# Print the predicted class labels
print("Predicted Class Labels: ", predictions)

Predicted Class Labels:  [0]


