## **Library**

In [None]:
import numpy as np
import pandas as pd
import cv2
import os
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

# Mount Google Drive (Colab only) so the /content/drive/... paths used below resolve.
from google.colab import drive
drive.mount('/content/drive')

## **Load Data**

In [None]:
# Locations of the label CSV and of the two image folders on the mounted Drive.
csv_path = '/content/drive/MyDrive/PCD 2025/datasetPCD/Dataset.csv'
training_folder = '/content/drive/MyDrive/PCD 2025/datasetPCD/Original Images/Original Images'
testing_folder = '/content/drive/MyDrive/PCD 2025/datasetPCD/Faces/Faces'

df = pd.read_csv(csv_path)

# Quick structural overview of the label table.
print("Dataset Shape:", df.shape)
print("\nDataset Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() (that would emit a stray "None" line).
df.info()
print("\nFirst 5 rows:")
print(df.head())
print("\nUnique Labels:", df['label'].nunique())
print("\nLabel Distribution:")
print(df['label'].value_counts())

def load_images_from_folder(folder_path, label_df):
    """Load every readable image below ``folder_path`` as a grayscale array.

    ``folder_path`` is scanned for sub-directories; the name of each
    sub-directory is used as the label of every image found inside it.
    Entries that are not directories, and files OpenCV cannot decode, are
    skipped.

    Both directory listings are sorted because ``os.listdir`` returns entries
    in arbitrary order; sorting keeps the sample order (and therefore every
    seeded step that consumes it) reproducible between runs and machines.

    Args:
        folder_path: Root directory holding one sub-directory per label.
        label_df: Not used by the function; kept so existing calls still work.

    Returns:
        A tuple ``(images, labels, image_names)`` of three parallel lists:
        grayscale image arrays, sub-directory names, and file names.
    """
    images = []
    labels = []
    image_names = []

    for artist_folder in sorted(os.listdir(folder_path)):
        artist_path = os.path.join(folder_path, artist_folder)
        if not os.path.isdir(artist_path):
            continue

        for img_name in sorted(os.listdir(artist_path)):
            img = cv2.imread(os.path.join(artist_path, img_name))
            if img is None:
                # cv2.imread reports an unreadable or non-image file as None.
                continue

            images.append(cv2.cvtColor(img, cv2.COLOR_BGR2GRAY))
            labels.append(artist_folder)
            image_names.append(img_name)

    return images, labels, image_names

# Read the training images; the sub-folder names of training_folder become the labels.
train_images, train_labels, train_names = load_images_from_folder(training_folder, df)
print(
    f"Total training images loaded: {len(train_images)}",
    f"Unique artists: {len(set(train_labels))}",
    sep="\n",
)

# plt.figure(figsize=(12, 6))
# df['label'].value_counts().plot(kind='bar')
# plt.title('Distribusi Artis dalam Dataset')
# plt.xlabel('Nama Artis')
# plt.ylabel('Jumlah Gambar')
# plt.xticks(rotation=45)
# plt.tight_layout()
# plt.show()


## **EDA**

In [None]:
# 2. LABEL DISTRIBUTION ANALYSIS
print("\n\n2. ANALISIS DISTRIBUSI LABEL")
print("-" * 40)

# Images per label, ordered by label, plus the number of distinct labels.
label_counts = df['label'].value_counts().sort_index()
unique_labels = df['label'].nunique()

print(f"✓ Jumlah kelas/individu unik: {unique_labels}")
print("✓ Total sampel per kelas:")
for label, count in label_counts.items():
    print(f"   - {label}: {count} gambar")

# Summary statistics of the per-class sample counts.
print("\nStatistik distribusi:")
for caption, value in (
    ("Rata-rata sampel per kelas", f"{label_counts.mean():.2f}"),
    ("Median sampel per kelas", f"{label_counts.median():.2f}"),
    ("Std deviasi", f"{label_counts.std():.2f}"),
    ("Min sampel", f"{label_counts.min()}"),
    ("Max sampel", f"{label_counts.max()}"),
):
    print(f"   - {caption}: {value}")

# 3. VERIFY THAT EVERY IMAGE LISTED IN THE CSV EXISTS AND CAN BE DECODED
print("\n\n3. VALIDASI KEBERADAAN FILE GAMBAR")
print("-"*40)

# Folder the CSV ids are resolved against. The ids are joined straight onto the
# folder (no per-label sub-directory), which matches the flat layout that
# testing_folder is read with elsewhere in this notebook.
# NOTE(review): confirm that the ids in Dataset.csv are file names inside testing_folder.
img_folder = testing_folder

missing_images = []
existing_images = []
corrupted_images = []

for idx, row in df.iterrows():
    record = (idx, row['id'], row['label'])
    image_path = os.path.join(img_folder, row['id'])

    if not os.path.exists(image_path):
        missing_images.append(record)
        continue

    # cv2.imread returns None for a file it cannot decode. Catching Exception
    # (instead of a bare except) keeps unexpected decoder errors best-effort
    # without also swallowing KeyboardInterrupt/SystemExit.
    try:
        img = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
    except Exception:
        img = None

    if img is None:
        corrupted_images.append(record)
    else:
        existing_images.append(record)

print(f"✓ Gambar yang tersedia: {len(existing_images)}")
print(f"✗ Gambar yang hilang: {len(missing_images)}")
print(f"✗ Gambar yang corrupt: {len(corrupted_images)}")

if missing_images:
    # Show at most five missing files as examples.
    print("\nContoh gambar yang hilang:")
    for idx, img_id, label in missing_images[:5]:
        print(f"   - Index {idx}: {img_id} (Label: {label})")

# 4. IMAGE RESOLUTION ANALYSIS
# Counter is required for the resolution tally below.
from collections import Counter

print("\n\n4. ANALISIS RESOLUSI GAMBAR")
print("-"*40)
resolutions = []
file_sizes = []

# Record the (height, width) shape and the on-disk size of every readable image.
for idx, img_id, label in existing_images:
    image_path = os.path.join(img_folder, img_id)
    img = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
    resolutions.append(img.shape)
    file_sizes.append(os.path.getsize(image_path))

if resolutions:
    resolution_counts = Counter(resolutions)
    unique_resolutions = list(resolution_counts)
    print(f"✓ Resolusi unik yang ditemukan: {len(unique_resolutions)}")

    print("✓ Distribusi resolusi:")
    # Most frequent resolution first; shapes are (height, width), printed as WxH.
    for res, count in resolution_counts.most_common():
        print(f"   - {res[1]}x{res[0]}: {count} gambar ({count/len(resolutions)*100:.1f}%)")

    # Height/width statistics over all readable images.
    heights = [res[0] for res in resolutions]
    widths = [res[1] for res in resolutions]

    print("\nStatistik resolusi:")
    print(f"   - Tinggi: min={min(heights)}, max={max(heights)}, rata-rata={np.mean(heights):.1f}")
    print(f"   - Lebar: min={min(widths)}, max={max(widths)}, rata-rata={np.mean(widths):.1f}")

# 5. IMAGE QUALITY ANALYSIS
print("\n\n5. ANALISIS KUALITAS GAMBAR")
print("-" * 40)
brightness_stats = []
contrast_stats = []
blur_stats = []

# Only the first (at most 100) readable images are analysed to keep this cheap.
sample_size = min(100, len(existing_images))
print(f"Menganalisis kualitas dari {sample_size} gambar sample...")

for idx, img_id, label in existing_images[:sample_size]:
    image_path = os.path.join(img_folder, img_id)
    img = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)

    # Brightness: mean pixel intensity.
    brightness_stats.append(np.mean(img))
    # Contrast: standard deviation of the pixel intensities.
    contrast_stats.append(np.std(img))
    # Sharpness: variance of the Laplacian response.
    blur_stats.append(cv2.Laplacian(img, cv2.CV_64F).var())

if brightness_stats:
    for heading, values in (
        ("✓ Statistik Kecerahan:", brightness_stats),
        ("✓ Statistik Kontras:", contrast_stats),
        ("✓ Statistik Blur (semakin tinggi = semakin tajam):", blur_stats),
    ):
        print(heading)
        print(f"   - Rata-rata: {np.mean(values):.1f}")
        print(f"   - Min: {np.min(values):.1f}, Max: {np.max(values):.1f}")

    # Count the images falling outside the fixed quality thresholds.
    very_dark = len([b for b in brightness_stats if b < 50])
    very_bright = len([b for b in brightness_stats if b > 200])
    low_contrast = len([c for c in contrast_stats if c < 20])
    very_blurry = len([bl for bl in blur_stats if bl < 100])

    print("\n⚠️  Potensi masalah kualitas:")
    print(f"   - Gambar terlalu gelap (< 50): {very_dark}")
    print(f"   - Gambar terlalu terang (> 200): {very_bright}")
    print(f"   - Kontras rendah (< 20): {low_contrast}")
    print(f"   - Gambar blur (< 100): {very_blurry}")

# 6. CLASS BALANCE ANALYSIS
print("\n\n6. ANALISIS KESEIMBANGAN KELAS")
print("-" * 40)

# Share of the dataset taken by each label, ordered by label.
class_proportions = df['label'].value_counts(normalize=True).sort_index()

print("✓ Proporsi setiap kelas:")
for label, prop in class_proportions.items():
    print(f"   - {label}: {prop:.3f} ({prop*100:.1f}%)")

# Imbalance ratio: size of the largest class divided by the smallest one.
max_samples, min_samples = label_counts.max(), label_counts.min()
imbalance_ratio = max_samples / min_samples

print("\n✓ Analisis keseimbangan:")
print(f"   - Rasio ketidakseimbangan: {imbalance_ratio:.2f}:1")
# A ratio above 2 is reported as imbalanced.
print(
    "   ⚠️  Dataset tidak seimbang - pertimbangkan teknik balancing"
    if imbalance_ratio > 2
    else "   ✓ Dataset relatif seimbang"
)

# 7. VISUALISATION
print("\n\n7. MEMBUAT VISUALISASI")
print("-" * 40)

plt.figure(figsize=(12, 8))

# Panel 1: number of images per label.
ax_labels = plt.subplot(2, 2, 1)
label_counts.plot(kind='bar', color='skyblue', alpha=0.7, ax=ax_labels)
ax_labels.set_title("Distribusi Jumlah Gambar per Individu", fontsize=12, fontweight='bold')
ax_labels.set_xlabel("Label (Individu)")
ax_labels.set_ylabel("Jumlah Gambar")
plt.xticks(rotation=45)
ax_labels.grid(axis='y', alpha=0.3)

# Panel 2: width/height scatter of the image resolutions.
if resolutions:
    ax_res = plt.subplot(2, 2, 2)
    heights, widths = zip(*resolutions)
    ax_res.scatter(widths, heights, alpha=0.6, c='coral')
    ax_res.set_title("Distribusi Resolusi Gambar", fontsize=12, fontweight='bold')
    ax_res.set_xlabel("Lebar (pixels)")
    ax_res.set_ylabel("Tinggi (pixels)")
    ax_res.grid(True, alpha=0.3)

# Panel 3: histogram of the sampled brightness values.
if brightness_stats:
    ax_bright = plt.subplot(2, 2, 3)
    ax_bright.hist(brightness_stats, bins=20, color='lightgreen', alpha=0.7, edgecolor='black')
    ax_bright.set_title("Distribusi Kecerahan Gambar", fontsize=12, fontweight='bold')
    ax_bright.set_xlabel("Nilai Kecerahan")
    ax_bright.set_ylabel("Frekuensi")
    ax_bright.grid(axis='y', alpha=0.3)

# Panel 4: pie chart of the class proportions.
ax_pie = plt.subplot(2, 2, 4)
colors = plt.cm.Set3(np.linspace(0, 1, len(class_proportions)))
ax_pie.pie(
    class_proportions.values,
    labels=class_proportions.index,
    autopct='%1.1f%%',
    colors=colors,
    startangle=90,
)
ax_pie.set_title("Proporsi Kelas", fontsize=12, fontweight='bold')

plt.tight_layout()
plt.show()

# 8. SHOW EXAMPLE IMAGES FROM THE DATASET
print("\n8. CONTOH GAMBAR DARI DATASET")
print("-" * 40)

if existing_images:
    plt.figure(figsize=(15, 10))

    # Keep the first three readable images of every label, in order of appearance.
    samples_per_class = {}
    for idx, img_id, label in existing_images:
        bucket = samples_per_class.setdefault(label, [])
        if len(bucket) < 3:
            bucket.append((img_id, idx))

    # Flatten label by label and keep only what fits into the 3x5 grid.
    chosen = [
        (label, img_id)
        for label, samples in samples_per_class.items()
        for img_id, _ in samples
    ][:15]

    for plot_idx, (label, img_id) in enumerate(chosen, start=1):
        image_path = os.path.join(img_folder, img_id)
        img = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)

        plt.subplot(3, 5, plot_idx)
        plt.imshow(img, cmap='gray')
        plt.title(f"Label: {label}\nSize: {img.shape}", fontsize=8)
        plt.axis('off')

    plt.suptitle("Contoh Gambar dari Setiap Kelas", fontsize=14, fontweight='bold')
    plt.tight_layout()
    plt.show()


## **Preprocessing and Data Preparation**

### **Face Detection and Cropping**

In [None]:
# Lazily created Haar cascade shared by all calls of detect_and_crop_face.
_FACE_CASCADE = None


def _get_face_cascade():
    """Return the frontal-face Haar cascade, loading the XML on first use only."""
    global _FACE_CASCADE
    if _FACE_CASCADE is None:
        _FACE_CASCADE = cv2.CascadeClassifier(
            cv2.data.haarcascades + 'haarcascade_frontalface_default.xml'
        )
    return _FACE_CASCADE


def detect_and_crop_face(image):
    """Crop ``image`` to the largest detected frontal face.

    The detected box is enlarged on every side by 10 % of its width and then
    clipped to the image bounds.

    Args:
        image: Image array passed to ``CascadeClassifier.detectMultiScale``
            (the callers in this notebook pass 2-D grayscale arrays).

    Returns:
        The cropped face region, or ``image`` itself unchanged when no face
        is detected.
    """
    # The classifier is loaded once and reused; parsing the cascade file for
    # every image is loop-invariant work.
    faces = _get_face_cascade().detectMultiScale(
        image, scaleFactor=1.1, minNeighbors=5, minSize=(30, 30)
    )

    if len(faces) == 0:
        return image

    # With several detections, keep the one covering the largest area instead
    # of whichever happens to be listed first.
    x, y, w, h = max(faces, key=lambda box: box[2] * box[3])

    padding = int(0.1 * w)
    y1 = max(0, y - padding)
    y2 = min(image.shape[0], y + h + padding)
    x1 = max(0, x - padding)
    x2 = min(image.shape[1], x + w + padding)

    return image[y1:y2, x1:x2]

print("Detecting and cropping faces...")
cropped_faces, valid_labels, valid_names = [], [], []
total_images = len(train_images)

# Crop every training image; the three lists stay aligned index by index.
for position, (img, label, name) in enumerate(
    zip(train_images, train_labels, train_names), start=1
):
    face = detect_and_crop_face(img)
    if face is not None and face.size > 0:
        cropped_faces.append(face)
        valid_labels.append(label)
        valid_names.append(name)

    # Progress message every 100 images.
    if position % 100 == 0:
        print(f"Processed {position}/{total_images} images")

print(f"Successfully cropped {len(cropped_faces)} faces")

### **Data Augmentation**

In [None]:
def augment_data(images, labels):
    """Expand every image into four variants that share its label.

    For each input image the output contains, in this order: the image
    itself, its horizontal mirror, a brightened copy (alpha=1.2, beta=10)
    and a darkened copy (alpha=0.8, beta=-10).

    Args:
        images: Iterable of image arrays.
        labels: Iterable of labels, parallel to ``images``.

    Returns:
        ``(augmented_images, augmented_labels)`` as two parallel lists.
    """
    augmented_images = []
    augmented_labels = []

    for img, label in zip(images, labels):
        variants = (
            img,
            cv2.flip(img, 1),
            cv2.convertScaleAbs(img, alpha=1.2, beta=10),
            cv2.convertScaleAbs(img, alpha=0.8, beta=-10),
        )
        augmented_images.extend(variants)
        augmented_labels.extend([label] * len(variants))

    return augmented_images, augmented_labels

# Augment the cropped faces; labels are repeated alongside each variant.
print("Augmenting data...")
augmented_faces, augmented_labels = augment_data(images=cropped_faces, labels=valid_labels)
print(f"Augmented data size: {len(augmented_faces)}")

### **Resize Images**

In [None]:
def resize_images(images, size=(128, 128)):
    """Resize every image to ``size`` with ``cv2.resize`` and stack the results.

    Args:
        images: Iterable of image arrays.
        size: Target size handed to ``cv2.resize``.

    Returns:
        A NumPy array built from the list of resized images.
    """
    return np.array([cv2.resize(img, size) for img in images])

resized_faces = resize_images(augmented_faces)
print(f"Resized faces shape: {resized_faces.shape}")

# Preview the first ten faces. resized_faces is built from the augmented set,
# so the titles must come from augmented_labels: valid_labels has only one
# entry per un-augmented face, which would mislabel the panels (and can run
# out of entries before the images do).
fig, axes = plt.subplots(2, 5, figsize=(15, 6))
for i, ax in enumerate(axes.flat):
    if i < len(resized_faces):
        ax.imshow(resized_faces[i], cmap='gray')
        ax.set_title(augmented_labels[i])
        ax.axis('off')
plt.suptitle('Sample Cropped and Resized Faces')
plt.tight_layout()
plt.show()


### **Local Binary Pattern (LBP) Implementation**

In [None]:
def calculate_lbp(image, radius=1, n_points=8):
    """Compute the Local Binary Pattern (LBP) code image of a 2-D array.

    For every pixel at least ``radius`` pixels away from the border,
    ``n_points`` samples are taken on a circle of that radius around it.
    Each sample is bilinearly interpolated from its four surrounding pixels
    and compared with the centre pixel; sample ``p`` sets bit ``p`` of the
    code when its value is greater than or equal to the centre. Pixels
    closer than ``radius`` to the border keep the code 0.

    All interior pixels are processed at once per sample point; the
    arithmetic is performed in the same order as a pixel-by-pixel loop would.

    Args:
        image: 2-D array of pixel intensities.
        radius: Integer radius of the sampling circle, in pixels.
        n_points: Number of samples on the circle (bits per code).

    Returns:
        Array with the same shape as ``image`` holding the LBP codes. The
        dtype is the input dtype when that can hold ``2**n_points - 1`` and is
        widened otherwise, so codes built from more than 8 points do not
        overflow a uint8 image.
    """
    image = np.asarray(image)
    rows, cols = image.shape

    # Smallest dtype that can store both the input values and the largest code.
    max_code = (1 << n_points) - 1
    code_dtype = np.result_type(image.dtype, np.min_scalar_type(max_code))
    lbp_image = np.zeros((rows, cols), dtype=code_dtype)

    if rows <= 2 * radius or cols <= 2 * radius:
        # No pixel has a complete neighbourhood; every code stays 0.
        return lbp_image

    interior_rows = np.arange(radius, rows - radius)
    interior_cols = np.arange(radius, cols - radius)
    center = image[radius:rows - radius, radius:cols - radius]
    codes = np.zeros(center.shape, dtype=np.int64)

    for p in range(n_points):
        # Position of sample p relative to the centre (loop-invariant per pixel).
        theta = 2 * np.pi * p / n_points
        sample_r = interior_rows + radius * np.cos(theta)
        sample_c = interior_cols + radius * np.sin(theta)

        # Integer corners of the interpolation cell, clamped to the image.
        r1 = sample_r.astype(np.int64)
        c1 = sample_c.astype(np.int64)
        r2 = np.clip(r1 + 1, 0, rows - 1)
        c2 = np.clip(c1 + 1, 0, cols - 1)
        r1 = np.clip(r1, 0, rows - 1)
        c1 = np.clip(c1, 0, cols - 1)

        # Fractional offsets inside the cell, shaped for row/column broadcasting.
        fr = (sample_r - r1)[:, np.newaxis]
        fc = (sample_c - c1)[np.newaxis, :]

        # Bilinear interpolation of the sample value.
        neighbor_value = (
            (1 - fr) * (1 - fc) * image[np.ix_(r1, c1)]
            + fr * (1 - fc) * image[np.ix_(r2, c1)]
            + (1 - fr) * fc * image[np.ix_(r1, c2)]
            + fr * fc * image[np.ix_(r2, c2)]
        )

        codes |= (neighbor_value >= center).astype(np.int64) << p

    lbp_image[radius:rows - radius, radius:cols - radius] = codes.astype(code_dtype)
    return lbp_image

#### **Visualize LBP**

In [None]:
def visualize_lbp(image):
    """Show ``image`` and its LBP code image side by side.

    Args:
        image: Single 2-D image array accepted by ``calculate_lbp``.
    """
    panels = (
        ('Original Image', image),
        ('LBP Image', calculate_lbp(image)),
    )

    fig, axes = plt.subplots(1, len(panels), figsize=(10, 5))
    for ax, (title, picture) in zip(axes, panels):
        ax.imshow(picture, cmap='gray')
        ax.set_title(title)
        ax.axis('off')

    plt.tight_layout()
    plt.show()

# calculate_lbp unpacks a 2-D shape, while resized_faces is the whole stack of
# faces, so visualise a single face (the first one) rather than the stack.
if len(resized_faces) > 0:
    visualize_lbp(resized_faces[0])

### **Local Binary Pattern Histogram (LBPH) Implementation**

In [None]:
def calculate_lbph(image, radius=1, n_points=8, grid_x=8, grid_y=8):
    """Build the LBP-histogram (LBPH) feature vector of an image.

    The LBP code image is divided into ``grid_y`` x ``grid_x`` cells, a
    ``2**n_points``-bin histogram is taken per cell, and the concatenated
    histograms are divided by their total sum (plus 1e-7 to avoid a division
    by zero).

    Args:
        image: 2-D image array passed on to ``calculate_lbp``.
        radius: LBP sampling radius.
        n_points: Number of LBP sample points; also fixes the bin count.
        grid_x: Number of cells along the width.
        grid_y: Number of cells along the height.

    Returns:
        1-D array of length ``grid_x * grid_y * 2**n_points``.
    """
    lbp_image = calculate_lbp(image, radius, n_points)
    height, width = lbp_image.shape
    cell_h = height // grid_y
    cell_w = width // grid_x
    n_bins = 2 ** n_points

    # Cell boundaries; the last cell in each direction absorbs any remainder.
    row_edges = [r * cell_h for r in range(grid_y)] + [height]
    col_edges = [c * cell_w for c in range(grid_x)] + [width]

    # Row-major walk over the cells, one histogram each.
    histograms = [
        np.histogram(lbp_image[top:bottom, left:right], bins=n_bins, range=(0, n_bins))[0]
        for top, bottom in zip(row_edges, row_edges[1:])
        for left, right in zip(col_edges, col_edges[1:])
    ]

    feature_vector = np.concatenate(histograms)
    return feature_vector / (feature_vector.sum() + 1e-7)

print("Extracting LBPH features...")
lbph_features = []
n_faces = len(resized_faces)

# One LBPH vector per resized face, with a progress message every 50 faces.
for done, face in enumerate(resized_faces, start=1):
    lbph_features.append(
        calculate_lbph(face, radius=1, n_points=8, grid_x=8, grid_y=8)
    )
    if done % 50 == 0:
        print(f"Extracted features for {done}/{n_faces} images")

lbph_features = np.array(lbph_features)
print(f"LBPH features shape: {lbph_features.shape}")

### **Label Encoding**

In [None]:
# Encode the augmented labels (not valid_labels) so they line up one-to-one
# with the feature rows extracted from the augmented faces.
label_encoder = LabelEncoder().fit(augmented_labels)
encoded_labels = label_encoder.transform(augmented_labels)

print(f"Number of classes: {len(label_encoder.classes_)}")
print(f"Classes: {label_encoder.classes_}")
# Printed as a sanity check against the number of feature rows.
print(f"Number of encoded labels: {len(encoded_labels)}")

### **Data Split for Training and Validation**

In [None]:
# Each source face contributes several consecutive augmented variants (see
# augment_data). A plain random split scatters those near-duplicates over both
# sides, so validation scores are inflated by leakage. Splitting by source
# image keeps every variant of one face on the same side.
from sklearn.model_selection import StratifiedGroupKFold

if not valid_labels or len(encoded_labels) % len(valid_labels) != 0:
    raise ValueError(
        "Cannot derive source-image groups: the number of augmented samples "
        "is not a multiple of the number of cropped faces."
    )
variants_per_image = len(encoded_labels) // len(valid_labels)
# Sample k belongs to source image k // variants_per_image.
groups = np.arange(len(encoded_labels)) // variants_per_image

# Five stratified folds; the first held-out fold is used as validation set,
# i.e. roughly an 80/20 split that still balances the classes.
# NOTE(review): a class with very few source images may end up on one side only.
splitter = StratifiedGroupKFold(n_splits=5, shuffle=True, random_state=42)
train_idx, val_idx = next(splitter.split(lbph_features, encoded_labels, groups))

X_train, X_val = lbph_features[train_idx], lbph_features[val_idx]
y_train, y_val = encoded_labels[train_idx], encoded_labels[val_idx]

print(f"Training set size: {X_train.shape}")
print(f"Validation set size: {X_val.shape}")

## **Dimensionality Reduction Using PCA**

In [None]:
# Fit PCA on the training features only (keeping 90 % of the variance) and
# apply the same projection to the validation features.
pca = PCA(n_components=0.9)
X_train_pca = pca.fit_transform(X_train)
X_val_pca = pca.transform(X_val)

explained_ratio = pca.explained_variance_ratio_
print(f"Original features: {X_train.shape[1]}")
print(f"PCA features: {X_train_pca.shape[1]}")
print(f"Variance explained: {explained_ratio.sum():.2%}")

# Cumulative explained variance against the number of retained components.
cumsum_var = np.cumsum(explained_ratio)
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(np.arange(1, cumsum_var.size + 1), cumsum_var)
ax.set_xlabel('Number of Components')
ax.set_ylabel('Cumulative Explained Variance')
ax.set_title('PCA Explained Variance')
ax.grid(True)
plt.show()

## **Training Set**

### **Support Vector Classification**

In [None]:
# RBF-kernel SVC fitted on the PCA features and scored on the validation split.
print("Training SVC model...")
svm_model = SVC(kernel='rbf', C=10, gamma='scale', random_state=901)
svm_pred = svm_model.fit(X_train_pca, y_train).predict(X_val_pca)
svm_accuracy = accuracy_score(y_val, svm_pred)
print(f"SVC Accuracy: {svm_accuracy:.4f}")

### **Random Forest**

In [None]:
# Random forest (200 trees) fitted on the PCA features and scored on the validation split.
print("\nTraining Random Forest model...")
rf_model = RandomForestClassifier(n_estimators=200, random_state=901, n_jobs=-1)
rf_pred = rf_model.fit(X_train_pca, y_train).predict(X_val_pca)
rf_accuracy = accuracy_score(y_val, rf_pred)
print(f"Random Forest Accuracy: {rf_accuracy:.4f}")

### **KNN**

In [None]:
# 7-nearest-neighbour classifier with cosine distance on the PCA features.
print("\nTraining KNN model...")
knn_model = KNeighborsClassifier(n_neighbors=7, metric='cosine')
knn_pred = knn_model.fit(X_train_pca, y_train).predict(X_val_pca)
knn_accuracy = accuracy_score(y_val, knn_pred)
print(f"KNN Accuracy: {knn_accuracy:.4f}")

## **Validation Set**

### **RF Validation**

In [None]:
# Make the random forest the current model for the report below.
best_model, best_pred, best_accuracy = rf_model, rf_pred, rf_accuracy

print(f"\nBest Model: RF with accuracy {best_accuracy:.4f}")
print("\nClassification Report:")
print(classification_report(y_val, best_pred, target_names=label_encoder.classes_))

# Confusion matrix heatmap with the class names on both axes.
cm = confusion_matrix(y_val, best_pred)
plt.figure(figsize=(12, 10))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=label_encoder.classes_,
    yticklabels=label_encoder.classes_,
)
plt.title('Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.xticks(rotation=45)
plt.yticks(rotation=0)
plt.tight_layout()
plt.show()

### **KNN Validation**

In [None]:
# Make the KNN classifier the current model for the report below.
best_model, best_pred, best_accuracy = knn_model, knn_pred, knn_accuracy

print(f"\nBest Model: KNN with accuracy {best_accuracy:.4f}")
print("\nClassification Report:")
print(classification_report(y_val, best_pred, target_names=label_encoder.classes_))

# Confusion matrix heatmap with the class names on both axes.
cm = confusion_matrix(y_val, best_pred)
plt.figure(figsize=(12, 10))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=label_encoder.classes_,
    yticklabels=label_encoder.classes_,
)
plt.title('Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.xticks(rotation=45)
plt.yticks(rotation=0)
plt.tight_layout()
plt.show()

### **SVC Validation**

In [None]:
# Make the SVC the current model; best_model is what the test-set cells use later.
best_model, best_pred, best_accuracy = svm_model, svm_pred, svm_accuracy

print(f"\nBest Model: SVM with accuracy {best_accuracy:.4f}")
print("\nClassification Report:")
print(classification_report(y_val, best_pred, target_names=label_encoder.classes_))

# Confusion matrix heatmap with the class names on both axes.
cm = confusion_matrix(y_val, best_pred)
plt.figure(figsize=(12, 10))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=label_encoder.classes_,
    yticklabels=label_encoder.classes_,
)
plt.title('Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.xticks(rotation=45)
plt.yticks(rotation=0)
plt.tight_layout()
plt.show()

## **Testing Set**

### **Load Testing Data**

In [None]:
def load_test_images(test_folder):
    """Load every readable image directly inside ``test_folder`` as grayscale.

    The directory listing is sorted because ``os.listdir`` returns entries in
    arbitrary order; sorting keeps the order of the returned images (and of
    anything derived from it) the same on every run.

    Args:
        test_folder: Directory whose entries are read as images.

    Returns:
        ``(test_images, test_names)``: two parallel lists with the grayscale
        arrays and their file names. Entries OpenCV cannot decode are skipped.
    """
    test_images = []
    test_names = []

    for img_name in sorted(os.listdir(test_folder)):
        img = cv2.imread(os.path.join(test_folder, img_name))
        if img is None:
            # cv2.imread reports an unreadable or non-image entry as None.
            continue

        test_images.append(cv2.cvtColor(img, cv2.COLOR_BGR2GRAY))
        test_names.append(img_name)

    return test_images, test_names

# Read the test images from the flat testing folder.
print("Loading test images...")
test_images, test_names = load_test_images(test_folder=testing_folder)
print(f"Loaded {len(test_images)} test images")

### **Data Preparation for Testing Set**

In [None]:
# Apply the training pipeline to the test images: crop -> resize -> LBPH -> PCA.
test_cropped = []
valid_test_names = []

for img, name in zip(test_images, test_names):
    face = detect_and_crop_face(img)
    if face is None or face.size == 0:
        continue
    test_cropped.append(face)
    valid_test_names.append(name)

test_resized = resize_images(test_cropped)

print("Extracting features from test images...")
test_features = np.array([
    calculate_lbph(face, radius=1, n_points=8, grid_x=8, grid_y=8)
    for face in test_resized
])

# Project with the PCA already fitted on the training features.
test_features_pca = pca.transform(test_features)

### **Predict Test Images**

In [None]:
# Predict with the currently selected model and decode the class indices to names.
test_predictions = best_model.predict(test_features_pca)
test_pred_labels = label_encoder.inverse_transform(test_predictions)

results_df = pd.DataFrame(
    {
        'image_name': valid_test_names,
        'predicted_artist': test_pred_labels,
    }
)

print("\nTest Results:")
print(results_df.head(10))

# Persist the predictions next to the notebook.
output_csv = 'test_predictions.csv'
results_df.to_csv(output_csv, index=False)
print(f"\nPredictions saved to '{output_csv}'")

### **Visualize Test Predictions**

In [None]:
# Show up to ten test faces together with their predicted label.
n_samples = min(10, len(test_resized))
fig, axes = plt.subplots(2, 5, figsize=(15, 6))

shown = zip(axes.flat, test_resized[:n_samples], test_pred_labels[:n_samples])
for ax, face, predicted in shown:
    ax.imshow(face, cmap='gray')
    ax.set_title(f"Predicted: {predicted}")
    ax.axis('off')

plt.suptitle('Sample Test Predictions')
plt.tight_layout()
plt.show()


### **Ensemble Prediction**

In [None]:
# Weighted soft-voting ensemble of the three fitted models on the validation set.
weights = {'svm': 0.5, 'rf': 0.4, 'knn': 0.1}

# The SVC only exposes raw decision scores here; the other two models provide
# class probabilities. All three were fitted on the same y_train, so their
# columns follow the same class order.
svm_proba = svm_model.decision_function(X_val_pca)
rf_proba = rf_model.predict_proba(X_val_pca)
knn_proba = knn_model.predict_proba(X_val_pca)

if svm_proba.ndim == 1:
    # Two-class case: decision_function returns one signed score per sample;
    # expand it to one column per class so the softmax below applies.
    svm_proba = np.column_stack([-svm_proba, svm_proba])

# Softmax over the scores, shifted by the row maximum so np.exp cannot overflow.
# NOTE: these are score-derived weights, not calibrated probabilities.
svm_exp = np.exp(svm_proba - svm_proba.max(axis=1, keepdims=True))
svm_proba_norm = svm_exp / svm_exp.sum(axis=1, keepdims=True)

ensemble_proba = (weights['svm'] * svm_proba_norm +
                  weights['rf'] * rf_proba +
                  weights['knn'] * knn_proba)

# argmax yields a column position; translate it into the class label it stands for.
ensemble_pred = svm_model.classes_[np.argmax(ensemble_proba, axis=1)]
ensemble_accuracy = accuracy_score(y_val, ensemble_pred)

print(f"\nEnsemble Model Accuracy: {ensemble_accuracy:.4f}")

In [None]:
# # Cell 20: Save Final Model
# import pickle

# # Save the best model
# with open('face_recognition_model.pkl', 'wb') as f:
#     pickle.dump(best_model, f)

# # Save the PCA transformer
# with open('pca_transformer.pkl', 'wb') as f:
#     pickle.dump(pca, f)

# # Save the label encoder
# with open('label_encoder.pkl', 'wb') as f:
#     pickle.dump(label_encoder, f)

# print("Models saved successfully!")

## **Performance Metric Summary**

In [None]:
# Validation accuracy of every model, in reporting order.
model_accuracies = {
    'SVM': svm_accuracy,
    'Random Forest': rf_accuracy,
    'KNN': knn_accuracy,
    'Ensemble': ensemble_accuracy,
}

print("\n=== PERFORMANCE SUMMARY ===")
print(f"Total Images Processed: {len(resized_faces)}")
print(f"Number of Classes: {len(label_encoder.classes_)}")
print(f"Feature Dimension (Original): {lbph_features.shape[1]}")
print(f"Feature Dimension (After PCA): {X_train_pca.shape[1]}")
print("\nModel Accuracies:")
for model_name, accuracy in model_accuracies.items():
    print(f"- {model_name}: {accuracy:.4f}")

# Pick the winner from the measured accuracies instead of assuming it is the
# SVM; on a tie the model listed first is reported.
top_model = max(model_accuracies, key=model_accuracies.get)
print(f"\nBest Model: {top_model}")
print(f"Best Accuracy: {model_accuracies[top_model]:.4f}")

In [None]:
# Feature Importance Analysis (untuk Random Forest)
feature_importances = rf_model.feature_importances_

# Plot top 20 features
top_features_idx = np.argsort(feature_importances)[-20:]
top_features_importance = feature_importances[top_features_idx]

plt.figure(figsize=(10, 6))
plt.barh(range(len(top_features_idx)), top_features_importance)
plt.xlabel('Feature Importance')
plt.ylabel('Feature Index')
plt.title('Top 20 Most Important Features (Random Forest)')
plt.tight_layout()
plt.show()