# Pattern Recognition - Homework 4

### Libraries

In [None]:
!pip install numpy
!pip install opencv-python
!pip install tensorflow
!pip install matplotlib
!pip install scikit-learn


In [None]:
import os
import pickle
import numpy as np
import matplotlib.pyplot as plt

import tensorflow as tf
from tensorflow.keras.applications import VGG16
from tensorflow.keras.applications.vgg16 import preprocess_input
from tensorflow.keras.models import Model
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

## Loading data

In [None]:
def load_train_images(folder):
    """Load pickled training samples stored in one sub-directory per class.

    The sub-directories of ``folder`` are visited in sorted order, and the
    position of a sub-directory in that order is used as the integer label of
    every sample it contains. Entries of ``folder`` that are not directories
    are ignored, so a stray file neither crashes the loader nor shifts labels.

    Parameters
    ----------
    folder : str
        Path of the directory that contains one sub-directory per class.

    Returns
    -------
    images : np.ndarray
        The unpickled samples, in loading order.
    labels : np.ndarray
        The class index of each sample, aligned with ``images``.

    Notes
    -----
    Files that raise ``EOFError`` or ``pickle.UnpicklingError`` are reported on
    stdout and skipped.
    """
    images = []  # Unpickled samples
    labels = []  # Class index of each sample

    # Keep only real directories: calling os.listdir() on a plain file raises.
    candidate_paths = (os.path.join(folder, entry) for entry in sorted(os.listdir(folder)))
    class_folders = [path for path in candidate_paths if os.path.isdir(path)]

    for label, class_folder in enumerate(class_folders):
        for filename in sorted(os.listdir(class_folder)):
            if not filename.endswith('.pkl'):
                continue

            file_path = os.path.join(class_folder, filename)
            try:
                # SECURITY: pickle.load can execute arbitrary code; only point
                # this loader at data files you trust.
                with open(file_path, 'rb') as f:
                    image = pickle.load(f)
            except (EOFError, pickle.UnpicklingError):
                print(f"Error loading {file_path}, skipping this file.")
                continue

            images.append(image)
            labels.append(label)

    # Transform lists to arrays
    return np.array(images), np.array(labels)

def load_test_images(test_folder):
    """Load every ``.pkl`` file found directly inside ``test_folder``.

    Files are read in sorted filename order. A file that raises ``EOFError``
    or ``pickle.UnpicklingError`` is reported on stdout and left out of both
    returned arrays.

    Parameters
    ----------
    test_folder : str
        Directory containing the pickled test samples.

    Returns
    -------
    images : np.ndarray
        The unpickled samples.
    filenames : np.ndarray
        The file name (with extension) of each sample, aligned with ``images``.
    """
    loaded = []  # (filename, sample) pairs, in loading order

    pickle_names = [name for name in sorted(os.listdir(test_folder)) if name.endswith('.pkl')]
    for name in pickle_names:
        path = os.path.join(test_folder, name)
        try:
            with open(path, 'rb') as handle:
                sample = pickle.load(handle)
        except (EOFError, pickle.UnpicklingError):
            print(f"Error loading {path}, skipping this file.")
            continue
        loaded.append((name, sample))

    # Split the pairs back into two aligned arrays
    images = np.array([sample for _, sample in loaded])
    filenames = np.array([name for name, _ in loaded])

    return images, filenames

# Dataset locations (relative paths, resolved against the working directory)
train_folder = '../data/train'  # Train data
test_folder = '../data/test'    # Test data

# Read both splits from disk
train_images, train_labels = load_train_images(train_folder)
test_images, test_filenames = load_test_images(test_folder)

# Report the sample counts, then a blank line, then the array shapes
for _caption, _count in (
    ("Number of training images :", len(train_images)),
    ("Number of training labels :", len(train_labels)),
    ("Number of test images :", len(test_images)),
):
    print(_caption, _count)
print()
for _caption, _shape in (
    ("Shape of training images :", train_images.shape),
    ("Shape of training labels :", train_labels.shape),
    ("Shape of test images :", test_images.shape),
):
    print(_caption, _shape)

### Plotting data

In [None]:
# Find the indices of the samples labelled 0 and of the samples labelled 1
indices_class_0 = np.flatnonzero(train_labels == 0).tolist()
indices_class_1 = np.flatnonzero(train_labels == 1).tolist()

# For the first 5 samples of each class, keep the first element of the sample
images_class_0 = [train_images[idx][0] for idx in indices_class_0[:5]]
images_class_1 = [train_images[idx][0] for idx in indices_class_1[:5]]

# Helper that draws one image on a given axes
def show_image(image, ax):
    """Draw ``image`` (cast to uint8) on ``ax`` and hide the axis decorations."""
    pixels = image.astype(np.uint8)
    ax.imshow(pixels)
    ax.axis('off')

# One row of panels per class, five panels per row
fig, axes = plt.subplots(2, 5, figsize=(15, 6))

# Top row: class 0
for position, (panel, picture) in enumerate(zip(axes[0], images_class_0), start=1):
    show_image(picture, panel)
    panel.set_title(f'Classe 0 - {position}')

# Bottom row: class 1
for position, (panel, picture) in enumerate(zip(axes[1], images_class_1), start=1):
    show_image(picture, panel)
    panel.set_title(f'Classe 1 - {position}')

plt.tight_layout()
plt.show()

## Model

### Pre-trained model: VGG16

In [None]:
# VGG16 convolutional base with ImageNet weights and no classification head,
# built for 128x128 RGB inputs and used as a fixed feature extractor.
base_model = VGG16(weights='imagenet', include_top=False, input_shape=(128, 128, 3))
model = Model(inputs=base_model.input, outputs=base_model.output)

def extract_features(images):
    """Return one flattened VGG16 feature vector per input image.

    Parameters
    ----------
    images : array-like
        Batch of images to feed to ``model`` (which is built for inputs of
        shape ``(128, 128, 3)``). The caller's data is not modified.

    Returns
    -------
    np.ndarray
        2-D array with one row per image; each row is the model output for
        that image flattened to a vector.
    """
    # preprocess_input overwrites floating-point NumPy input in place. Work on a
    # copy so the caller's array keeps its raw pixel values and re-running the
    # cell does not preprocess the same data twice.
    processed_images = preprocess_input(np.copy(images))
    # verbose=0: this function is called once per bag, so the default progress
    # bar would flood the cell output.
    features = model.predict(processed_images, verbose=0)
    flattened_features = features.reshape((features.shape[0], -1))
    return flattened_features

# Run every element (bag) of train_images through the feature extractor and
# stack the per-bag feature matrices into one array
train_features = np.array(list(map(extract_features, train_images)))

# Collapse everything after the first axis so each bag becomes a single row
flattened_train_features = train_features.reshape((len(train_features), -1))

# Hold out 20% of the bags for validation (fixed seed for reproducibility)
X_train, X_val, y_train, y_val = train_test_split(
    flattened_train_features,
    train_labels,
    random_state=42,
    test_size=0.2,
)

### Searching for best model

In [None]:
# Candidate classifiers to compare, keyed by the name used when reporting scores
models = {}
models["Random Forest"] = RandomForestClassifier(random_state=42, n_estimators=200)
models["SVM"] = SVC(random_state=42, probability=True, kernel='linear')
models["KNN"] = KNeighborsClassifier(n_neighbors=5)
models["Gradient Boosting"] = GradientBoostingClassifier(random_state=42, n_estimators=100)
models["Logistic Regression"] = LogisticRegression(random_state=42, max_iter=200)

def evaluate_models(X_train, y_train, X_val, y_val, models):
    """Fit each model on the training data and score it on the validation data.

    Parameters
    ----------
    X_train, y_train
        Data passed to each model's ``fit``.
    X_val, y_val
        Data passed to each model's ``score``.
    models : dict
        Mapping from a display name to an object exposing ``fit`` and ``score``.
        The objects are fitted in place.

    Returns
    -------
    dict
        Mapping from each display name to the value returned by ``score``,
        in the same order as ``models``. Each score is also printed.
    """
    def _fit_and_score(label, estimator):
        # Train, score, and report a single candidate
        estimator.fit(X_train, y_train)
        score = estimator.score(X_val, y_val)
        print(f'{label} validation accuracy: {score}')
        return score

    return {label: _fit_and_score(label, estimator) for label, estimator in models.items()}

# Fit and score every candidate on the train/validation split
results = evaluate_models(X_train, y_train, X_val, y_val, models)

# Keep the candidate with the highest validation accuracy
# (the first one encountered wins on ties)
best_model_name = max(results.items(), key=lambda item: item[1])[0]
best_model = models[best_model_name]

print(f'Best model: {best_model_name} with accuracy {results[best_model_name]}')

# Refit the selected model on all training data (train + validation parts)
best_model.fit(flattened_train_features, train_labels)

### Trying a classification model

In [None]:
# Replace whatever `best_model` held before with a linear-kernel SVM
best_model = SVC(
    random_state=42,
    probability=True,
    kernel='linear',
)

# Fit it on the full set of training features
best_model.fit(flattened_train_features, train_labels)

## Predictions

In [None]:
# Extract features for every element (bag) of test_images, then collapse
# everything after the first axis so each bag becomes a single row
test_features = np.array(list(map(extract_features, test_images)))
flattened_test_features = test_features.reshape((len(test_features), -1))

# Predict one label per test bag with the fitted classifier
test_predictions = best_model.predict(flattened_test_features)

def create_submission_file(test_files, predictions, output_folder):
    """Write ``submission.csv`` with one ``image_id,y_pred`` row per prediction.

    Parameters
    ----------
    test_files : iterable of str
        File names; the extension is stripped to form the ``image_id``.
    predictions : iterable
        Predicted values, paired positionally with ``test_files``.
    output_folder : str
        Directory that receives ``submission.csv``; created if it is missing.
    """
    # Make sure the destination directory exists
    os.makedirs(output_folder, exist_ok=True)

    target = os.path.join(output_folder, 'submission.csv')
    with open(target, 'w') as handle:
        handle.write('image_id,y_pred\n')
        handle.writelines(
            f'{os.path.splitext(name)[0]},{label}\n'
            for name, label in zip(test_files, predictions)
        )

# Directory that will receive submission.csv
output_folder = "results"

create_submission_file(
    test_files=test_filenames,
    predictions=test_predictions,
    output_folder=output_folder,
)
print("Submission file created successfully.")