In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.decomposition import PCA
from imblearn.over_sampling import SMOTE, ADASYN
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
import xgboost as xgb
from sklearn.ensemble import VotingClassifier

In [None]:
# Load dataset
# The path is written as a raw string. The previous non-raw literal contained
# "\M", "\d" and "\g", which are unrecognised escape sequences: Python 3.12+
# emits a SyntaxWarning for them, and a folder name starting with e.g. "t" or
# "n" would silently have become a control character. The resulting path value
# is unchanged.
df = pd.read_csv(r"D:\MED_LEAF_ID\data\glcm_features.csv")

# Drop the Image_Name column as it's not needed
df = df.drop(columns=["Image_Name"])

# Encode target variable: Class_Name is replaced by integer codes.
# label_encoder is kept so the codes can be mapped back to the original
# class names later (label_encoder.inverse_transform).
label_encoder = LabelEncoder()
df["Class_Name"] = label_encoder.fit_transform(df["Class_Name"])

# Split features and target: every remaining column except Class_Name is a feature.
X = df.drop(columns=["Class_Name"])
y = df["Class_Name"]

In [None]:
# Exploratory Data Analysis (EDA)
# Bar chart of how many samples fall into each (integer-encoded) class.
plt.figure(figsize=(12, 5))
class_ax = sns.countplot(x=y)
class_ax.set_title("Class Distribution")
class_ax.set_xlabel("Plant Class")
class_ax.set_ylabel("Count")
# Rotate the tick labels so they stay readable when there are many classes.
plt.xticks(rotation=90)
plt.show()

In [None]:
# Pairwise correlation between feature columns, drawn as a heatmap with the
# colour scale pinned to the full [-1, 1] range.
feature_corr = X.corr()
plt.figure(figsize=(12, 10))
sns.heatmap(
    feature_corr,
    vmin=-1.0,
    vmax=1.0,
    cmap="coolwarm",
    square=True,
)
plt.title("Feature Correlation Heatmap")
plt.show()

In [None]:
# Feature Selection using Recursive Feature Elimination (RFE)
# RFE repeatedly drops the features the logistic regression weights least.
# Those weights are only comparable between features that share a scale, so
# the ranking is computed on a standardized copy of X; otherwise the ranking
# mostly reflects each feature's units (and the solver converges poorly).
log_reg = LogisticRegression(max_iter=1000)
rfe = RFE(log_reg, n_features_to_select=30)
rfe.fit(StandardScaler().fit_transform(X), y)

# rfe.support_ is a boolean mask over the columns of X; the selected columns
# are taken from the original, unscaled frame so downstream cells see the
# same kind of data as before.
# NOTE(review): the selector is still fitted on every row, including rows that
# later land in the test split - consider fitting it on training data only.
X_selected = X.loc[:, rfe.support_]

In [None]:
# Standardize every feature column, then project onto the principal
# components that together explain 95% of the variance.
# NOTE(review): X_scaled / X_pca are not referenced by any later cell visible
# in this notebook - confirm whether this step is still needed.
scaler = StandardScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)

pca = PCA(random_state=42, n_components=0.95)
X_pca = pca.fit_transform(X_scaled)

In [None]:
from imblearn.combine import SMOTETomek

smote_tomek = SMOTETomek(random_state=42)
X_resampled, y_resampled = smote_tomek.fit_resample(X_selected, y)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42, stratify=y_resampled)

# Define parameter grid for Random Search

In [None]:
# Candidate hyper-parameter values sampled by the random search over the
# random forest. A max_depth of None is one of the candidates.
param_dist = dict(
    n_estimators=[50, 100, 200, 300],
    max_depth=[10, 20, 30, None],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    bootstrap=[True, False],
)

In [None]:
# Base random forest handed to the hyper-parameter search below; the seed is
# fixed so repeated runs build the same trees.
rf = RandomForestClassifier(random_state=42)

In [None]:
# Randomized hyper-parameter search for the random forest: 50 sampled
# settings from param_dist, each scored with 5-fold cross-validation on the
# training data, using all available cores (n_jobs=-1).
search_settings = dict(
    estimator=rf,
    param_distributions=param_dist,
    n_iter=50,
    cv=5,
    verbose=1,
    n_jobs=-1,
    random_state=42,
)
random_search = RandomizedSearchCV(**search_settings)
random_search.fit(X_train, y_train)

In [None]:
# Keep the best forest found by the search; it becomes the 'rf' member of the
# voting ensemble built below.
best_rf = random_search.best_estimator_

In [None]:
# Gradient-boosted tree classifier, the second member of the voting ensemble.
# The ensemble uses soft voting, i.e. it consumes predict_proba output, so the
# probability-producing objective 'multi:softprob' is requested instead of
# 'multi:softmax' (the hard-label variant, for which predict_proba is not
# reliably supported across XGBoost releases).
# num_class is the number of distinct encoded labels in the full target.
xgb_clf = xgb.XGBClassifier(
    objective='multi:softprob',
    num_class=len(np.unique(y)),
    random_state=42,
)
xgb_clf.fit(X_train, y_train)

In [None]:
# Combine the tuned random forest and the XGBoost model into one classifier
# using soft voting, then fit the ensemble on the training data.
base_models = [('rf', best_rf), ('xgb', xgb_clf)]
ensemble = VotingClassifier(estimators=base_models, voting='soft')
ensemble.fit(X_train, y_train)

In [None]:
# Score the ensemble on the held-out split: fraction of test rows whose
# predicted class matches the true class.
y_pred = ensemble.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

In [None]:
# Report the winning random-forest settings and the ensemble's test accuracy.
for label, value in (
    ("Best Parameters (RF):", random_search.best_params_),
    ("Ensemble Model Accuracy:", accuracy),
):
    print(label, value)

In [None]:
import joblib
import os

# Where the fitted ensemble is written.
model_path = r"D:\MED_LEAF_ID\models\plant_classifier.pkl"

# Create the target folder first (no error if it is already there).
model_dir = os.path.dirname(model_path)
os.makedirs(model_dir, exist_ok=True)

# Serialize the ensemble to disk.
# NOTE(review): only `ensemble` is written; `label_encoder` and the RFE column
# mask are not persisted here, so a consumer of this file would need them from
# elsewhere to prepare inputs and decode predictions - confirm.
joblib.dump(ensemble, model_path)

print(f"Model saved successfully at: {model_path}")