In [None]:
import os
import random
import numpy as np
import pandas as pd
import cv2
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
from torchvision.datasets import ImageFolder
from torchvision import transforms
from torch.utils.data import DataLoader
from skimage.feature import graycomatrix, graycoprops
import mahotas
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
import tensorflow as tf

# --- Step 1: Load Datasets ---
# Both splits share one transform that turns each loaded image into a tensor.
transform = transforms.ToTensor()
train_dataset, val_dataset = (
    ImageFolder(root=split_root, transform=transform)
    for split_root in (
        "/home/abhivandhana/computerVision/train",
        "/home/abhivandhana/computerVision/val",
    )
)

# Report how many images each split holds.
print(f"Training images: {len(train_dataset)}")
print(f"Validation images: {len(val_dataset)}")

def get_image_details(dataset) -> None:
    """Print the height, width and channel count of one randomly drawn image.

    Args:
        dataset: A sized, indexable dataset whose items are ``(image, label)``
            pairs. Each image is expected to be a 3-D tensor whose
            dimensions are read as (channels, height, width).

    An empty dataset is reported with a message instead of raising, because
    there is no image to inspect.
    """
    if len(dataset) == 0:
        print("Dataset is empty; no image details available.")
        return

    # shuffle=True makes the inspected image a random draw from the dataset.
    loader = DataLoader(dataset, batch_size=1, shuffle=True)
    images, _ = next(iter(loader))
    # Drop the leading batch dimension added by the loader.
    c, h, w = images.shape[1:]
    print(f"Image size: {h}x{w}, Channels: {c}")

# Print a heading and the sampled image dimensions for each split in turn.
for heading, split in (
    ("\nTraining Image Details:", train_dataset),
    ("\nValidation Image Details:", val_dataset),
):
    print(heading)
    get_image_details(split)

# --- Step 2: Class distribution ---
# NOTE(review): these assignments rebind train_dataset / val_dataset to
# datasets under a different root and without a transform, replacing the
# Step 1 objects — confirm this is intentional.
train_dataset = ImageFolder(
    root="/home/abhivandhana/Documents/life/PG/cv/project/archive (2)/train_images/dataset/train"
)
val_dataset = ImageFolder(
    root="/home/abhivandhana/Documents/life/PG/cv/project/archive (2)/train_images/dataset/val"
)

# Count how many samples carry each class index.
train_class_counts = Counter(train_dataset.targets)
val_class_counts = Counter(val_dataset.targets)
print("Train class distribution:", train_class_counts)
print("Val class distribution:", val_class_counts)

# --- Step 3: Feature Extraction ---
# Build one row of intensity, texture and shape features for every .jpg/.png
# file found directly inside image_folder.
image_folder = "/home/abhivandhana/Documents/life/PG/cv/project/train_images/train_images"
features = []

# os.listdir returns entries in arbitrary order. Sorting keeps the row order of
# the resulting DataFrame stable, so the seeded clustering and train/test split
# that consume it are reproducible across runs and machines.
for filename in sorted(os.listdir(image_folder)):
    if not filename.lower().endswith((".jpg", ".png")):
        continue

    img_path = os.path.join(image_folder, filename)
    img = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE)
    if img is None:
        # The file could not be read as an image; skip it.
        continue

    # Global intensity statistics over the whole grayscale image.
    mean_intensity = np.mean(img)
    std_intensity = np.std(img)

    # Texture: co-occurrence matrix at distance 5, angle 0, 256 grey levels.
    glcm = graycomatrix(img, [5], [0], 256, symmetric=True, normed=True)
    contrast = graycoprops(glcm, 'contrast')[0, 0]
    homogeneity = graycoprops(glcm, 'homogeneity')[0, 0]
    # All Haralick values are collapsed into a single scalar by averaging.
    haralick = mahotas.features.haralick(img).mean()

    # Shape: blur, binarise at a fixed threshold of 60, then measure the
    # largest external contour.
    blurred = cv2.GaussianBlur(img, (5, 5), 0)
    _, thresh = cv2.threshold(blurred, 60, 255, cv2.THRESH_BINARY)
    contours, _ = cv2.findContours(thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    # Defaults apply when the thresholded image contains no contour at all.
    max_radius, lesion_area, linearity = 0, 0, 0
    if contours:
        largest = max(contours, key=cv2.contourArea)
        lesion_area = cv2.contourArea(largest)
        (_, _), max_radius = cv2.minEnclosingCircle(largest)
        perimeter = cv2.arcLength(largest, True)
        # "Linearity" here is perimeter squared over area; it stays 0 for a
        # zero-area contour to avoid dividing by zero.
        if lesion_area > 0:
            linearity = (perimeter ** 2) / lesion_area

    features.append([
        filename, mean_intensity, std_intensity, contrast,
        homogeneity, haralick, max_radius, lesion_area, linearity
    ])

columns = ["File_Name", "Mean_Intensity", "STD_Intensity", "Contrast", "Homogeneity", "Haralick_Feature", "Radius", "Area", "Linearity"]
df = pd.DataFrame(features, columns=columns)

# --- Step 4: Clustering ---
# Every column after the leading File_Name is used as a clustering feature.
feature_columns = columns[1:]
X = df[feature_columns]

# Partition the images into five groups with a fixed seed.
kmeans = KMeans(n_clusters=5, random_state=42)
cluster_ids = kmeans.fit_predict(X)
df["BCLC_Cluster"] = cluster_ids

# --- Step 5: Map Clusters to Stages ---
# Stage names listed in cluster-id order: position k names cluster k.
# NOTE(review): KMeans cluster ids presumably carry no inherent ordering, so
# this id-to-stage pairing may not reflect severity — confirm.
stage_names = [
    "Stage 0 (Very Early)",
    "Stage A (Early)",
    "Stage B (Intermediate)",
    "Stage C (Advanced)",
    "Stage D (End-Stage)",
]
cluster_mapping = dict(enumerate(stage_names))

# Attach the stage name to each row and persist the full feature table.
df["BCLC_Stage"] = df["BCLC_Cluster"].map(cluster_mapping)
df.to_csv("lesion_features_bclc.csv", index=False)

# --- Step 6: Random Forest Classification ---
# Encode the stage names as integer class labels.
label_encoder = LabelEncoder()
df["BCLC_Stage_Label"] = label_encoder.fit_transform(df["BCLC_Stage"])

# Hold out 20% of the rows, keeping the class proportions in both parts.
X, y = df[feature_columns], df["BCLC_Stage_Label"]
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Train the forest and score it on the held-out rows.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)
y_pred = rf_model.predict(X_test)

rf_accuracy = accuracy_score(y_test, y_pred)
rf_report = classification_report(y_test, y_pred)
print("Random Forest Accuracy:", rf_accuracy)
print("\nClassification Report:\n", rf_report)

# --- Step 7: Confusion Matrix ---
# Tabulate the random forest's test predictions against the true labels.
rf_confusion = confusion_matrix(y_test, y_pred)

plt.figure(figsize=(6, 5))
heatmap_ax = sns.heatmap(rf_confusion, annot=True, fmt="d", cmap="Blues")
heatmap_ax.set_title("Confusion Matrix")
heatmap_ax.set_xlabel("Predicted")
heatmap_ax.set_ylabel("Actual")
plt.show()

# --- Step 8: Neural Network ---
# Small fully connected classifier: two ReLU hidden layers feeding a
# five-way softmax output.
n_input_features = X_train.shape[1]

nn_model = Sequential()
nn_model.add(Dense(32, activation='relu', input_shape=(n_input_features,)))
nn_model.add(Dense(16, activation='relu'))
nn_model.add(Dense(5, activation='softmax'))

nn_model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

# The held-out test rows double as the validation set during training.
history = nn_model.fit(
    X_train,
    y_train,
    epochs=50,
    batch_size=16,
    validation_data=(X_test, y_test),
)

# --- Step 9: Plot NN Accuracy & Loss ---
# Accuracy curves, scaled from fractions to percentages.
for history_key, legend_label in (
    ('accuracy', 'Train Accuracy'),
    ('val_accuracy', 'Val Accuracy'),
):
    plt.plot(np.array(history.history[history_key]) * 100, label=legend_label)
plt.title("NN Accuracy Curve")
plt.xlabel("Epochs")
plt.ylabel("Accuracy (%)")
plt.legend()
plt.show()

# Loss curves, plotted as recorded.
for history_key, legend_label in (
    ('loss', 'Train Loss'),
    ('val_loss', 'Val Loss'),
):
    plt.plot(history.history[history_key], label=legend_label)
plt.title("NN Loss Curve")
plt.xlabel("Epochs")
plt.ylabel("Loss")
plt.legend()
plt.show()

# --- Step 10: Predict and Visualize ---
# Draw up to 30 distinct test rows. random.sample raises ValueError when asked
# for more items than the population holds, so the request is capped at the
# size of the test split.
n_samples = min(30, len(X_test))
random_indices = random.sample(range(len(X_test)), n_samples)
X_sample = X_test.iloc[random_indices]
y_sample_actual = y_test.iloc[random_indices]
y_sample_pred = rf_model.predict(X_sample)

# Turn the integer class labels back into stage names.
actual_labels = label_encoder.inverse_transform(y_sample_actual)
pred_labels = label_encoder.inverse_transform(y_sample_pred)

# Attach prediction info
sample_df = df.loc[X_sample.index].copy()
sample_df["Actual_Stage"] = actual_labels
sample_df["Predicted_Stage"] = pred_labels

image_dir = "/home/abhivandhana/Documents/life/PG/cv/project/train_images/train_images"
plt.figure(figsize=(20, 10))

# plot_slot is the 1-based position in the 5x6 grid. An image that fails to
# load leaves its slot empty rather than shifting the remaining images.
for plot_slot, (_, row) in enumerate(sample_df.iterrows(), start=1):
    img_path = os.path.join(image_dir, row["File_Name"])
    img = cv2.imread(img_path)

    if img is None:
        print(f"Could not load {img_path}")
        continue

    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    img_with_text = img.copy()

    # Predicted Stage on the image (bottom-left)
    cv2.putText(img_with_text,
                f"Predicted: {row['Predicted_Stage']}",
                org=(10, img.shape[0] - 10),
                fontFace=cv2.FONT_HERSHEY_SIMPLEX,
                fontScale=0.6,
                color=(255, 0, 0),  # Red: the image was converted to RGB above
                thickness=2,
                lineType=cv2.LINE_AA)

    # Plotting
    plt.subplot(5, 6, plot_slot)
    plt.imshow(img_with_text)
    plt.title(f"Actual: {row['Actual_Stage']}", fontsize=8)
    plt.axis('off')

plt.tight_layout()
plt.show()