In [None]:
# ============================================
# PART 1: Upload & Prepare New Incoming Dataset
# ============================================

import os
import shutil
import zipfile

import matplotlib.pyplot as plt
import numpy as np
from google.colab import files

# Ask the user for the archive. The keys of the returned mapping are used
# below as the path of the uploaded file.
print("üìÅ Upload your NEW DATASET ZIP file:")
uploaded = files.upload()

# An empty mapping (e.g. the dialog was cancelled) would otherwise surface as
# an opaque IndexError on the [0] lookup below.
if not uploaded:
    raise ValueError("No file was uploaded; re-run this cell and select a ZIP archive.")

# Only the first uploaded file is used.
zip_path = list(uploaded.keys())[0]
if not zipfile.is_zipfile(zip_path):
    raise zipfile.BadZipFile(f"Uploaded file {zip_path!r} is not a valid ZIP archive.")

# Extract folder
extract_to = "/content/new_data"
# Start from an empty folder: without this, re-running the cell with another
# archive would mix the previous upload's images into the "new" dataset.
shutil.rmtree(extract_to, ignore_errors=True)
os.makedirs(extract_to, exist_ok=True)

with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_to)

print("‚úÖ ZIP extracted to:", extract_to)

# ---------------------------------------------------
# Validate dataset structure and detect class folders
# ---------------------------------------------------
def find_class_folders(path):
    """Return the non-empty class folders located directly under ``path``.

    Only immediate sub-directories are inspected (no recursion). Hidden
    folders and the ``__MACOSX`` folder that macOS adds to ZIP archives are
    ignored so that they are not mistaken for classes.

    Args:
        path: Directory expected to contain one sub-folder per class.

    Returns:
        Alphabetically sorted list of full paths to the non-empty
        sub-folders. Empty list when ``path`` is not an existing directory
        or has no qualifying sub-folder.
    """
    if not os.path.isdir(path):
        return []

    folders = []
    # Sorted so the result does not depend on filesystem enumeration order.
    for name in sorted(os.listdir(path)):
        # Metadata folders (e.g. .ipynb_checkpoints, __MACOSX) are not classes.
        if name.startswith('.') or name == '__MACOSX':
            continue
        candidate = os.path.join(path, name)
        # Keep only directories that contain at least one entry.
        if os.path.isdir(candidate) and os.listdir(candidate):
            folders.append(candidate)
    return folders

# Locate the per-class folders inside the extracted archive.
class_folders = find_class_folders(extract_to)
print("üìÇ Detected class folders:", class_folders)

# A class is named after the last path component of its folder.
class_names = list(map(os.path.basename, class_folders))
print("üß™ Detected classes:", class_names)

# Labels the detected classes are compared against.
original_classes = ['Cyst', 'Normal', 'Stone', 'Tumor']

# Order-insensitive comparison; a mismatch is reported but does not stop the cell.
if sorted(class_names) == sorted(original_classes):
    print("‚úÖ Class names match original training data.")
else:
    print("‚ùå ERROR: Class mismatch!")
    print("Expected:", original_classes)
    print("Found:", class_names)

# Number of .jpg/.png/.jpeg files (case-insensitive) in each class folder,
# keyed by the folder's base name.
image_counts = {
    os.path.basename(folder): sum(
        1
        for entry in os.listdir(folder)
        if entry.lower().endswith(('.jpg', '.png', '.jpeg'))
    )
    for folder in class_folders
}

print("\nüìä Image distribution in NEW dataset:")
print(image_counts)

# One bar per class, bar height = image count.
plt.bar(image_counts.keys(), image_counts.values())
plt.title("New Dataset Class Distribution")
plt.show()


In [None]:
# ============================================
# PART 2: DATA DRIFT DETECTION
# ============================================

from scipy.stats import ks_2samp
import cv2
from tensorflow.keras.applications import VGG16
from tensorflow.keras.models import Model
from tqdm import tqdm

# Size tuple handed to cv2.resize for every image before comparison.
IMG_SIZE = (224, 224)

# Reference side of the comparison: the existing training images.
OLD_DATA_PATH = "/content/data/train"

# Candidate side of the comparison: the folder the uploaded ZIP was extracted to.
NEW_DATA_PATH = extract_to

def load_pixels(folder, img_size=None):
    """Load every readable image under ``folder`` as a flattened grayscale row.

    ``folder`` is scanned one level deep: each sub-directory is treated as a
    class folder and each ``.png``/``.jpg``/``.jpeg`` file inside it
    (case-insensitive) is read, resized and flattened.

    Args:
        folder: Directory containing one sub-directory per class.
        img_size: Optional size tuple passed to ``cv2.resize``. Defaults to
            the module-level ``IMG_SIZE``.

    Returns:
        2-D array with one row per loaded image and one column per resized
        pixel. When nothing could be loaded the shape is ``(0, n_pixels)``,
        so column indexing by callers still works.
    """
    size = IMG_SIZE if img_size is None else img_size
    pixel_values = []
    # Sorted so row order does not depend on filesystem enumeration order.
    for cls in sorted(os.listdir(folder)):
        cls_path = os.path.join(folder, cls)
        # Stray files next to the class folders would make os.listdir raise
        # NotADirectoryError, so only directories are descended into.
        if not os.path.isdir(cls_path):
            continue
        for img in sorted(os.listdir(cls_path)):
            if not img.lower().endswith(('.png', '.jpg', '.jpeg')):
                continue
            # Flag 0 asks OpenCV for a single-channel grayscale image.
            p = cv2.imread(os.path.join(cls_path, img), 0)
            if p is None:  # unreadable or corrupt file
                continue
            pixel_values.append(cv2.resize(p, size).flatten())

    if not pixel_values:
        return np.empty((0, size[0] * size[1]), dtype=np.uint8)
    return np.array(pixel_values)

print("üì• Loading OLD dataset pixels...")
old_pixels = load_pixels(OLD_DATA_PATH)

print("üì• Loading NEW dataset pixels...")
new_pixels = load_pixels(NEW_DATA_PATH)

# A drift verdict is meaningless if either side has no images; fail loudly
# instead of crashing later inside the statistics.
if len(old_pixels) == 0 or len(new_pixels) == 0:
    raise ValueError(
        f"Cannot run drift detection: loaded {len(old_pixels)} old and "
        f"{len(new_pixels)} new images."
    )

# ----------------------------
# 1) KS test on pixel values
# ----------------------------
print("üî¨ Running KS test...")

# Pixel positions are drawn at random (seeded, without replacement) so the
# test covers the whole image rather than only the first flattened pixels,
# which all come from the top rows.
N_PIXEL_SAMPLES = 1000
pixel_rng = np.random.default_rng(42)
n_pixels = old_pixels.shape[1]
pixel_idx = pixel_rng.choice(n_pixels, size=min(N_PIXEL_SAMPLES, n_pixels), replace=False)

# One two-sample KS test per sampled pixel position; only the p-value is kept.
ks_stats = []
for i in pixel_idx:
    _, p = ks_2samp(old_pixels[:, i], new_pixels[:, i])
    ks_stats.append(p)

# NOTE(review): the mean of per-pixel p-values is a heuristic summary, not a
# calibrated p-value; kept because the 0.05 cut-off below is defined on it.
ks_pvalue = np.mean(ks_stats)
print("üìå KS p-value =", ks_pvalue)

# Interpret KS
pixel_drift = ks_pvalue < 0.05

# ---------------------------------------------------------
# 2) Embedding drift using VGG16 (feature representations)
# ---------------------------------------------------------

# ImageNet-pretrained VGG16 without its classifier head; pooling='avg'
# reduces the final feature maps to one vector per image.
base = VGG16(
    include_top=False,
    weights="imagenet",
    pooling='avg',
)

# Thin wrapper model exposing the base network's input and pooled output.
embedder = Model(outputs=base.output, inputs=base.input)

def get_embeddings(folder, batch_size=32):
    """Compute one ``embedder`` feature vector per readable image in ``folder``.

    ``folder`` is scanned one level deep: each sub-directory is treated as a
    class folder and each ``.jpg``/``.png``/``.jpeg`` file inside it
    (case-insensitive) is read with OpenCV, resized to ``IMG_SIZE``, divided
    by 255 and passed through the module-level ``embedder``.

    Args:
        folder: Directory containing one sub-directory per class.
        batch_size: Number of images sent to ``embedder.predict`` per call.

    Returns:
        Array with one flattened embedding per loaded image, ordered by
        class folder name and then file name. Empty array when no image
        could be loaded.
    """
    embs = []

    def _flush(batch):
        # One forward pass for the whole batch instead of one call per image;
        # every output row is flattened into that image's embedding.
        feats = embedder.predict(np.stack(batch), batch_size=len(batch), verbose=0)
        embs.extend(feats.reshape(len(batch), -1))

    # Sorted so the embedding order does not depend on filesystem enumeration.
    for cls in sorted(os.listdir(folder)):
        cls_path = os.path.join(folder, cls)
        # Stray files next to the class folders would make os.listdir raise
        # NotADirectoryError, so only directories are descended into.
        if not os.path.isdir(cls_path):
            continue
        batch = []
        for img in tqdm(sorted(os.listdir(cls_path))):
            # Same extension set as the pixel loader, so both drift checks
            # look at the same files ('.jpeg' was previously skipped here).
            if not img.lower().endswith(('.jpg', '.png', '.jpeg')):
                continue
            p = cv2.imread(os.path.join(cls_path, img))
            if p is None:  # unreadable or corrupt file
                continue
            # NOTE(review): plain /255 scaling is used rather than the Keras
            # VGG16 preprocess_input; kept so scores stay comparable with the
            # existing drift threshold - confirm this is intended.
            batch.append(cv2.resize(p, IMG_SIZE) / 255.0)
            if len(batch) >= batch_size:
                _flush(batch)
                batch = []
        if batch:
            _flush(batch)
    return np.array(embs)

from scipy.spatial.distance import cosine

print("üì• Getting OLD embeddings...")
old_emb = get_embeddings(OLD_DATA_PATH)

print("üì• Getting NEW embeddings...")
new_emb = get_embeddings(NEW_DATA_PATH)

# Without this guard an empty side makes the mean below NaN, and
# `NaN > threshold` is False, which would be reported as "no drift".
n_pairs = min(len(old_emb), len(new_emb))
if n_pairs == 0:
    raise ValueError(
        f"Cannot compute embedding drift: got {len(old_emb)} old and "
        f"{len(new_emb)} new embeddings."
    )

# Compute avg cosine distance over index-aligned pairs.
# NOTE(review): pair i matches the i-th old image with the i-th new image,
# which are unrelated images (possibly of different classes); the score is
# therefore a rough proxy for distribution shift - confirm this is intended.
distances = [cosine(old_emb[i], new_emb[i]) for i in range(n_pairs)]

embedding_drift_score = np.mean(distances)
print("üìå Embedding drift score:", embedding_drift_score)

embedding_drift = embedding_drift_score > 0.25  # threshold

# -------------------------------------------
# Final verdict: either signal alone is enough to flag drift
# -------------------------------------------
if not (pixel_drift or embedding_drift):
    print("\n‚úÖ No drift detected. Safe to continue training.")
else:
    print("\nüö®üö® DATA DRIFT DETECTED üö®üö®")


In [None]:
# ============================================
# PART 3: TRAIN MODELS AGAIN (Custom + VGG)
# ============================================

from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.applications import VGG16, ResNet50
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import train_test_split
import shutil

# -------------------------------------------
# 1) Split NEW DATA into train/val/test again
# -------------------------------------------
DATASET_PATH = NEW_DATA_PATH
BASE = "/content/split_data"
SPLIT_SEED = 42  # fixed seed so the split is reproducible across runs

# Rebuild the split folders from scratch. Copying into folders left over from
# a previous run would let the same image end up in more than one split
# (train/val/test leakage).
shutil.rmtree(BASE, ignore_errors=True)
for d in ['train', 'val', 'test']:
    os.makedirs(os.path.join(BASE, d), exist_ok=True)
    for cls in class_names:
        os.makedirs(os.path.join(BASE, d, cls), exist_ok=True)

train_ratio, val_ratio, test_ratio = 0.7, 0.15, 0.15

for cls in class_names:
    cls_folder = os.path.join(DATASET_PATH, cls)
    # Sorted so that, together with the seed, the split does not depend on
    # filesystem enumeration order.
    imgs = sorted(
        f for f in os.listdir(cls_folder)
        if f.lower().endswith(('.jpg', '.png', '.jpeg'))
    )

    # First carve off the training share, then divide the remainder between
    # validation and test according to their relative ratios.
    train_imgs, temp_imgs = train_test_split(
        imgs, test_size=(1 - train_ratio), random_state=SPLIT_SEED
    )
    val_imgs, test_imgs = train_test_split(
        temp_imgs, test_size=test_ratio / (val_ratio + test_ratio), random_state=SPLIT_SEED
    )

    for split_name, split_imgs in (('train', train_imgs), ('val', val_imgs), ('test', test_imgs)):
        dest = os.path.join(BASE, split_name, cls)
        for img in split_imgs:
            shutil.copy(os.path.join(cls_folder, img), dest)

# -------------------------------------------
# 2) Directory-backed batch generators
# -------------------------------------------
# Training and validation share one generator config: pixel values are
# multiplied by 1/255 and nothing else is applied.
datagen = ImageDataGenerator(rescale=1./255)

# Options common to both directory iterators.
_flow_kwargs = dict(
    target_size=(224, 224),
    batch_size=32,
    class_mode='categorical',
)

train_gen = datagen.flow_from_directory(os.path.join(BASE, 'train'), **_flow_kwargs)

val_gen = datagen.flow_from_directory(os.path.join(BASE, 'val'), **_flow_kwargs)

# -------------------------------------------
# 3) Train models (short version)
# -------------------------------------------

# Custom CNN: two conv/pool stages followed by a dense classifier head,
# assembled layer by layer.
custom_cnn = Sequential()
custom_cnn.add(Conv2D(32, (3, 3), activation='relu', input_shape=(224, 224, 3)))
custom_cnn.add(MaxPooling2D(2, 2))
custom_cnn.add(Conv2D(64, (3, 3), activation='relu'))
custom_cnn.add(MaxPooling2D(2, 2))
custom_cnn.add(Flatten())
custom_cnn.add(Dense(256, activation='relu'))
custom_cnn.add(Dropout(0.5))
# One softmax output per detected class.
custom_cnn.add(Dense(len(class_names), activation='softmax'))

custom_cnn.compile(
    optimizer=Adam(learning_rate=1e-4),
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
custom_cnn.fit(train_gen, epochs=10, validation_data=val_gen)
custom_cnn.save("/content/custom_cnn_model.h5")

# VGG16 transfer model: ImageNet-pretrained convolutional base, frozen before
# compilation so only the new classifier head is trained.
base_vgg = VGG16(
    include_top=False,
    weights="imagenet",
    input_shape=(224, 224, 3),
)
base_vgg.trainable = False

vgg = Sequential()
vgg.add(base_vgg)
vgg.add(Flatten())
vgg.add(Dense(256, activation='relu'))
vgg.add(Dropout(0.5))
# One softmax output per detected class.
vgg.add(Dense(len(class_names), activation='softmax'))

vgg.compile(
    optimizer=Adam(learning_rate=1e-4),
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
vgg.fit(train_gen, epochs=10, validation_data=val_gen)
vgg.save("/content/vgg16_model.h5")

print(" Training complete! Models saved.")
