In [None]:

#Import Libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import joblib

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.feature_selection import SelectKBest, f_classif


In [None]:
# Load & Combine Datasets
# NOTE(review): these are absolute paths; adjust them if the CSVs live elsewhere.
file1 = "/content/plant_disease_dataset.csv"
file2 = "/content/plant_disease_data.csv"

df1 = pd.read_csv(file1)
df2 = pd.read_csv(file2)

# Stack both files row-wise and renumber the index from 0.
df = pd.concat([df1, df2], ignore_index=True)

print("Dataset shape after combining:", df.shape)
print("\nDataset Info:")
# DataFrame.info() writes its report itself and returns None, so it must not
# be wrapped in print() (that would append a stray "None" line).
df.info()
print("\nSummary Stats:")
print(df.describe())

# Correlation (numeric features only)
numeric_cols = ["Leaf_Spot_Size", "Humidity", "Temperature"]
print("\nCorrelation Matrix:")
print(df[numeric_cols].corr())


In [None]:
# Exploratory Data Analysis

# Count plots for the two categorical columns (plant type, then disease status).
# `hue` is set to the same column as `x` because seaborn >= 0.13 deprecates
# passing `palette` without `hue`; legend=False hides the redundant legend.
for column, palette, title in (
    ("Plant_Type", "Set2", "Distribution of Plant Types"),
    ("Disease_Status", "Set1", "Distribution of Disease Status"),
):
    plt.figure(figsize=(6,4))
    sns.countplot(data=df, x=column, hue=column, palette=palette, legend=False)
    plt.title(title)
    plt.show()

# Boxplots: one figure per numeric feature, grouped by disease status.
for col in ["Leaf_Spot_Size", "Humidity", "Temperature"]:
    plt.figure(figsize=(6,4))
    sns.boxplot(
        data=df,
        x="Disease_Status",
        y=col,
        hue="Disease_Status",  # same reason as above: palette requires hue
        palette="coolwarm",
        legend=False,
    )
    plt.title(f"{col} vs Disease Status")
    plt.show()


In [None]:
# Encode Categorical Features
# One LabelEncoder per column so each mapping can be inverted on its own later.
le_plant, le_color, le_status = LabelEncoder(), LabelEncoder(), LabelEncoder()

# Each listed column of df is overwritten in place with the encoder's output.
for column_name, encoder in (
    ("Plant_Type", le_plant),
    ("Leaf_Color", le_color),
    ("Disease_Status", le_status),
):
    df[column_name] = encoder.fit_transform(df[column_name])


In [None]:
# Train/Test Split
# Target column, and every remaining column except the identifier as features.
y = df["Disease_Status"]
X = df.drop(columns=["Plant_ID", "Disease_Status"])

# Hold out 20% of the rows, stratified on the target, with a fixed seed.
split_parts = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
X_train, X_test, y_train, y_test = split_parts


In [None]:
# Train Random Forest Model
# Hyperparameters are collected in one place; the seed keeps runs repeatable.
rf_params = {"n_estimators": 100, "random_state": 42}
model = RandomForestClassifier(**rf_params)
model.fit(X_train, y_train)


In [None]:
import numpy as np
from sklearn.metrics import classification_report, confusion_matrix

# Predictions
y_pred = model.predict(X_test)

# Collect every label that occurs in either the true or the predicted values
unique_labels = np.unique(np.concatenate((y_test, y_pred)))

# Map the encoded labels back to class names, as a list of plain strings
target_names = list(map(str, le_status.inverse_transform(unique_labels)))

# Classification report
report_text = classification_report(
    y_test,
    y_pred,
    labels=unique_labels,
    target_names=target_names,
)
print("Classification Report:\n", report_text)

# Confusion matrix (rows and columns follow unique_labels)
cm = confusion_matrix(y_test, y_pred, labels=unique_labels)

fig, ax = plt.subplots(figsize=(6,4))
sns.heatmap(
    cm,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=target_names,
    yticklabels=target_names,
    ax=ax,
)
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
ax.set_title("Confusion Matrix Heatmap")
plt.show()


In [None]:
# Feature Importance
importances = model.feature_importances_
features = X.columns

# Pair each importance with its column name and sort so the chart reads from
# the most to the least important feature.
importance_ranking = pd.Series(importances, index=features).sort_values(ascending=False)

plt.figure(figsize=(8,5))
# `hue` mirrors `y` because seaborn >= 0.13 deprecates `palette` without `hue`;
# legend=False hides the redundant legend.
sns.barplot(
    x=importance_ranking.values,
    y=importance_ranking.index,
    hue=importance_ranking.index,
    palette="viridis",
    legend=False,
)
plt.title("Feature Importance in Disease Prediction")
plt.show()


In [None]:
# Save Model & Encoders
# Output filename -> object to serialize with joblib.
artifacts = {
    "plant_disease_model.pkl": model,
    "plant_encoder.pkl": le_plant,
    "color_encoder.pkl": le_color,
    "status_encoder.pkl": le_status,
}
for filename, artifact in artifacts.items():
    joblib.dump(artifact, filename)

print("✅ Model and encoders saved successfully!")


In [None]:
#  Feature Selection
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.impute import SimpleImputer

# Features are every column except the identifier and the target.
X = df.drop(columns=["Plant_ID", "Disease_Status"])
y = df["Disease_Status"]

# Encode the target only if it still has object dtype.
if y.dtype == "object":
    y = le_status.fit_transform(y)

# NOTE(review): get_dummies only expands object/category columns by default.
# If Plant_Type / Leaf_Color were already label-encoded to integers by the
# encoding cell, they pass through unchanged here and the F-test scores them
# as numeric codes — confirm this is intended.
X_enc = pd.get_dummies(X, drop_first=True)

# Mean-impute missing values, keeping the original column names and row index.
imputer = SimpleImputer(strategy="mean")
X_enc = pd.DataFrame(
    imputer.fit_transform(X_enc),
    columns=X_enc.columns,
    index=X_enc.index,
)

# k="all" keeps every feature; the selector is used here for its scores.
selector = SelectKBest(score_func=f_classif, k="all")
X_new = selector.fit_transform(X_enc, y)

feature_scores = pd.DataFrame({
    "Feature": X_enc.columns,
    "Score": selector.scores_
}).sort_values(by="Score", ascending=False)

print("\nFeature Importance Scores:")
print(feature_scores)

plt.figure(figsize=(8,5))
# `hue` mirrors `y` because seaborn >= 0.13 deprecates `palette` without `hue`;
# legend=False hides the redundant legend.
sns.barplot(
    x="Score",
    y="Feature",
    hue="Feature",
    data=feature_scores,
    palette="crest",
    legend=False,
)
plt.title("Feature Selection Scores (ANOVA F-test)")
plt.show()


In [None]:
# Cross Validation
# 5-fold accuracy for the classifier on the full feature matrix and target.
scores = cross_val_score(estimator=model, X=X, y=y, scoring="accuracy", cv=5)

# Report the per-fold scores first, then their mean.
for caption, value in (
    ("Cross-validation Accuracy Scores:", scores),
    ("Mean Accuracy:", scores.mean()),
):
    print(caption, value)


In [None]:
# Multi Sample Predictions
import pandas as pd
import numpy as np

# Raw new samples
new_samples_raw = pd.DataFrame({
    "Plant_Type": ["Corn", "Rice", "Potato"],
    "Leaf_Color": ["Green", "Yellow", "Brown"],
    "Leaf_Spot_Size": [1.2, 4.8, 6.0],
    "Humidity": [65.0, 80.0, 50.0],
    "Temperature": [27.0, 30.0, 35.0]
})

X_train_cols = X_train.columns

# The model was fitted on LabelEncoder integer codes for the categorical
# columns, so new samples must go through the same fitted encoders.
# One-hot encoding here would create columns the model never saw, and the
# zero-fill below would then set Plant_Type and Leaf_Color to 0 for every row.
new_samples_enc = new_samples_raw.copy()
for column_name, encoder in (("Plant_Type", le_plant), ("Leaf_Color", le_color)):
    unseen = sorted(set(new_samples_enc[column_name]) - set(encoder.classes_))
    if unseen:
        # Fail loudly with the offending values instead of predicting on garbage.
        raise ValueError(
            f"Unseen {column_name} value(s) {unseen}; "
            f"known values: {list(encoder.classes_)}"
        )
    new_samples_enc[column_name] = encoder.transform(new_samples_enc[column_name])

# Match the training column set and order; a training feature that the samples
# do not provide is filled with 0.
new_samples_enc = new_samples_enc.reindex(columns=X_train_cols, fill_value=0)

probs = model.predict_proba(new_samples_enc)
predictions = model.predict(new_samples_enc)

# predict_proba columns follow model.classes_ (encoded labels); decode those so
# every probability is paired with the right class name.
class_names = le_status.inverse_transform(model.classes_)
predicted_names = le_status.inverse_transform(predictions)

for i, (predicted_name, sample_probs) in enumerate(zip(predicted_names, probs), start=1):
    print(f"Sample {i}: {predicted_name}")
    for cls, p in zip(class_names, sample_probs):
        print(f"   {cls}: {p:.2f}")


In [None]:
# Outlier Detection
# The encoding cell replaces Plant_Type with integer codes, which would label
# the x axis with numbers; decode back to plant names for a readable plot.
# If the column is still non-numeric it is used unchanged.
plant_type = df["Plant_Type"]
if pd.api.types.is_numeric_dtype(plant_type):
    plant_type = pd.Series(
        le_plant.inverse_transform(plant_type),
        index=df.index,
        name="Plant_Type",
    )

plt.figure(figsize=(7,5))
sns.boxplot(x=plant_type, y=df["Leaf_Spot_Size"])
plt.title("Outliers in Leaf Spot Size by Plant Type")
plt.show()
