<a href="https://colab.research.google.com/github/Emma-Cap/Deep-Learning-Project/blob/main/PROVAAAA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
# Import necessary libraries
import os
import zipfile
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import cv2
import torch
from torchvision import transforms
from PIL import Image

# Step 1: Extract Dataset
# Assume the dataset zip file has been uploaded directly to Colab's file system
dataset_zip_path = '/content/archive.zip'
extracted_data_path = '/content/face_age_dataset'

# Extracting the dataset
with zipfile.ZipFile(dataset_zip_path, 'r') as zip_ref:
    zip_ref.extractall(extracted_data_path)

# Step 2: Load Dataset Labels
# The dataset contains two folders: 'faces' and 'faces_02'
faces_path = os.path.join(extracted_data_path, 'faces')
faces_02_path = os.path.join(extracted_data_path, 'faces_02')

# Load the CSV file containing the labels (assumed to be in the 'faces' folder)
dataset_csv_path = os.path.join(faces_path, 'age_data.csv')
df = pd.read_csv(dataset_csv_path)

# Step 3: Data Overview
# Let's take a look at the first few rows and general information about the dataset
print("First 5 rows of the dataset:")
print(df.head())
print("\nDataset Info:")
df.info()
print("\nMissing Values:")
print(df.isnull().sum())

# Step 4: Visualize Data Distribution
# Plotting the age distribution to understand the spread of ages
plt.figure(figsize=(10, 6))
df['age'].plot(kind='hist', bins=30, edgecolor='black')
plt.xlabel('Age')
plt.ylabel('Frequency')
plt.title('Distribution of Ages in the Dataset')
plt.show()

# Step 5: Display Sample Images with Ages
# Define a function to load and display some random images along with their age labels
def show_sample_images(df, img_folder_path, num_samples=5):
    plt.figure(figsize=(15, 5))
    samples = df.sample(num_samples)
    for i, (index, row) in enumerate(samples.iterrows()):
        img_path = os.path.join(img_folder_path, 'Train', row['img_name'])
        image = Image.open(img_path)
        plt.subplot(1, num_samples, i + 1)
        plt.imshow(image)
        plt.title(f"Age: {row['age']}")
        plt.axis('off')
    plt.tight_layout()
    plt.show()

# Display images from the 'faces' folder
image_folder_path = os.path.join(faces_path, 'Train')
show_sample_images(df, image_folder_path)

# Step 6: Basic Data Preparation and Cleaning
# Let's remove any data with missing labels or corrupted images
print("Cleaning the dataset...\n")
initial_count = len(df)
df.dropna(inplace=True)  # Drop rows with missing values

# Filter out any entries where the image file does not exist
valid_image_paths = df['img_name'].apply(lambda x: os.path.exists(os.path.join(image_folder_path, x)))
df = df[valid_image_paths]
print(f"Dropped {initial_count - len(df)} rows due to missing data or images.")

# Step 7: Load Additional Images from 'faces_02/part3'
# Load additional images from the 'faces_02' folder
additional_images_path = os.path.join(faces_02_path, 'part3')

# Display some sample images from 'faces_02/part3'
def show_sample_images_from_part3(img_folder_path, num_samples=5):
    img_files = os.listdir(img_folder_path)
    plt.figure(figsize=(15, 5))
    samples = np.random.choice(img_files, num_samples, replace=False)
    for i, img_name in enumerate(samples):
        img_path = os.path.join(img_folder_path, img_name)
        image = Image.open(img_path)
        plt.subplot(1, num_samples, i + 1)
        plt.imshow(image)
        plt.title(f"Image: {img_name}")
        plt.axis('off')
    plt.tight_layout()
    plt.show()

show_sample_images_from_part3(additional_images_path)

# Step 8: Transform the Images for PyTorch
# Define a basic transform to normalize images and resize them for further use
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# Test the transform on a sample image
sample_image_path = os.path.join(image_folder_path, df.iloc[0]['img_name'])
image = Image.open(sample_image_path)
transformed_image = transform(image)

print("Image Shape after transformation:", transformed_image.shape)

# Step 9: Check Device and Set Up GPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("Using device:", device)

# Next Steps:
# After this exploratory data analysis, we can proceed to create datasets and dataloaders for training and validation,
# and we can start experimenting with different PyTorch pre-trained models for fine-tuning.


FileNotFoundError: [Errno 2] No such file or directory: '/content/face_age_dataset/faces/age_data.csv'

In [4]:
# Import necessary libraries
import os
import zipfile
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import cv2
import torch
from torchvision import transforms
from PIL import Image

# Step 1: Extract Dataset
# Assume the dataset zip file has been uploaded directly to Colab's file system
dataset_zip_path = '/content/archive.zip'
extracted_data_path = '/content'

# Extracting the dataset
with zipfile.ZipFile(dataset_zip_path, 'r') as zip_ref:
    zip_ref.extractall(extracted_data_path)

# Step 2: Load Dataset Labels
# The dataset contains two folders: 'faces' and 'faces_02'
faces_path = os.path.join(extracted_data_path, 'faces')
faces_02_path = os.path.join(extracted_data_path, 'faces_02')

# Load the CSV file containing the labels (assumed to be in the 'faces' folder)
dataset_csv_path = os.path.join(faces_path, 'age_data.csv')
df = pd.read_csv(dataset_csv_path)

# Step 3: Data Overview
# Let's take a look at the first few rows and general information about the dataset
print("First 5 rows of the dataset:")
print(df.head())
print("\nDataset Info:")
df.info()
print("\nMissing Values:")
print(df.isnull().sum())

# Step 4: Visualize Data Distribution
# Plotting the age distribution to understand the spread of ages
plt.figure(figsize=(10, 6))
df['age'].plot(kind='hist', bins=30, edgecolor='black')
plt.xlabel('Age')
plt.ylabel('Frequency')
plt.title('Distribution of Ages in the Dataset')
plt.show()

# Step 5: Display Sample Images with Ages
# Define a function to load and display some random images along with their age labels
def show_sample_images(df, img_folder_path, num_samples=5):
    plt.figure(figsize=(15, 5))
    samples = df.sample(num_samples)
    for i, (index, row) in enumerate(samples.iterrows()):
        img_path = os.path.join(img_folder_path, 'Train', row['img_name'])
        image = Image.open(img_path)
        plt.subplot(1, num_samples, i + 1)
        plt.imshow(image)
        plt.title(f"Age: {row['age']}")
        plt.axis('off')
    plt.tight_layout()
    plt.show()

# Display images from the 'faces' folder
image_folder_path = os.path.join(faces_path, 'Train')
show_sample_images(df, image_folder_path)

# Step 6: Basic Data Preparation and Cleaning
# Let's remove any data with missing labels or corrupted images
print("Cleaning the dataset...\n")
initial_count = len(df)
df.dropna(inplace=True)  # Drop rows with missing values

# Filter out any entries where the image file does not exist
valid_image_paths = df['img_name'].apply(lambda x: os.path.exists(os.path.join(image_folder_path, x)))
df = df[valid_image_paths]
print(f"Dropped {initial_count - len(df)} rows due to missing data or images.")

# Step 7: Load Additional Images from 'faces_02/part3'
# Load additional images from the 'faces_02' folder
additional_images_path = os.path.join(faces_02_path, 'part3')

# Display some sample images from 'faces_02/part3'
def show_sample_images_from_part3(img_folder_path, num_samples=5):
    img_files = os.listdir(img_folder_path)
    plt.figure(figsize=(15, 5))
    samples = np.random.choice(img_files, num_samples, replace=False)
    for i, img_name in enumerate(samples):
        img_path = os.path.join(img_folder_path, img_name)
        image = Image.open(img_path)
        plt.subplot(1, num_samples, i + 1)
        plt.imshow(image)
        plt.title(f"Image: {img_name}")
        plt.axis('off')
    plt.tight_layout()
    plt.show()

show_sample_images_from_part3(additional_images_path)

# Step 8: Transform the Images for PyTorch
# Define a basic transform to normalize images and resize them for further use
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# Test the transform on a sample image
sample_image_path = os.path.join(image_folder_path, df.iloc[0]['img_name'])
image = Image.open(sample_image_path)
transformed_image = transform(image)

print("Image Shape after transformation:", transformed_image.shape)

# Step 9: Check Device and Set Up GPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("Using device:", device)

# Next Steps:
# After this exploratory data analysis, we can proceed to create datasets and dataloaders for training and validation,
# and we can start experimenting with different PyTorch pre-trained models for fine-tuning.


FileNotFoundError: [Errno 2] No such file or directory: '/content/faces/age_data.csv'

In [5]:
# Import necessary libraries
import os
import zipfile

# Step 1: Extract Dataset
# Assume the dataset zip file has been uploaded directly to Colab's file system
dataset_zip_path = '/content/archive.zip'
extracted_data_path = '/content'

# Extracting the dataset
with zipfile.ZipFile(dataset_zip_path, 'r') as zip_ref:
    zip_ref.extractall(extracted_data_path)

# Step 2: Explore Extracted Dataset
# List the contents of the extracted folder to understand its structure
extracted_items = os.listdir(extracted_data_path)
print("Contents of extracted dataset:")
print(extracted_items)

# List the contents of 'faces' and 'faces_02' folders if they exist
faces_path = os.path.join(extracted_data_path, 'faces')
faces_02_path = os.path.join(extracted_data_path, 'faces_02')

if os.path.exists(faces_path):
    faces_items = os.listdir(faces_path)
    print("\nContents of 'faces' folder:")
    print(faces_items)

if os.path.exists(faces_02_path):
    faces_02_items = os.listdir(faces_02_path)
    print("\nContents of 'faces_02' folder:")
    print(faces_02_items)

# Step 3: Check if specific files or folders are available
# Specifically, look for the CSV file and the image folders
expected_csv_path = os.path.join(faces_path, 'age_data.csv')
if os.path.exists(expected_csv_path):
    print("\nThe CSV file 'age_data.csv' is available.")
else:
    print("\nThe CSV file 'age_data.csv' is missing.")

train_images_path = os.path.join(faces_path, 'Train')
if os.path.exists(train_images_path):
    train_images = os.listdir(train_images_path)
    print(f"\nNumber of images in 'Train' folder: {len(train_images)}")
else:
    print("\nThe 'Train' folder is missing in 'faces'.")

part3_images_path = os.path.join(faces_02_path, 'part3')
if os.path.exists(part3_images_path):
    part3_images = os.listdir(part3_images_path)
    print(f"\nNumber of images in 'part3' folder: {len(part3_images)}")
else:
    print("\nThe 'part3' folder is missing in 'faces_02'.")


Contents of extracted dataset:
['.config', 'drive', 'faces_02', 'archive.zip', 'face_age_dataset', 'faces', 'sample_data']

Contents of 'faces' folder:
['train.csv', 'Train']

Contents of 'faces_02' folder:
['part3']

The CSV file 'age_data.csv' is missing.

Number of images in 'Train' folder: 19906

Number of images in 'part3' folder: 3252


In [6]:
# Import necessary libraries
import os
import zipfile

# Step 1: Extract Dataset
# Assume the dataset zip file has been uploaded directly to Colab's file system
dataset_zip_path = '/content/archive.zip'

# Step 2: Explore Contents of the Zip File
# List the contents of the archive without extracting
with zipfile.ZipFile(dataset_zip_path, 'r') as zip_ref:
    zip_contents = zip_ref.namelist()
    print("Contents of the zip file:")
    for item in zip_contents:
        print(item)

[1;30;43mOutput streaming troncato alle ultime 5000 righe.[0m
faces/Train/7917.jpg
faces/Train/7918.jpg
faces/Train/7919.jpg
faces/Train/792.jpg
faces/Train/7921.jpg
faces/Train/7922.jpg
faces/Train/7925.jpg
faces/Train/7926.jpg
faces/Train/7927.jpg
faces/Train/7928.jpg
faces/Train/793.jpg
faces/Train/7930.jpg
faces/Train/7931.jpg
faces/Train/7935.jpg
faces/Train/7936.jpg
faces/Train/7937.jpg
faces/Train/7938.jpg
faces/Train/794.jpg
faces/Train/7940.jpg
faces/Train/7941.jpg
faces/Train/7943.jpg
faces/Train/7944.jpg
faces/Train/7946.jpg
faces/Train/7947.jpg
faces/Train/7948.jpg
faces/Train/7949.jpg
faces/Train/795.jpg
faces/Train/7950.jpg
faces/Train/7951.jpg
faces/Train/7952.jpg
faces/Train/7953.jpg
faces/Train/7954.jpg
faces/Train/7955.jpg
faces/Train/7956.jpg
faces/Train/7958.jpg
faces/Train/7959.jpg
faces/Train/796.jpg
faces/Train/7962.jpg
faces/Train/7963.jpg
faces/Train/7965.jpg
faces/Train/7966.jpg
faces/Train/7967.jpg
faces/Train/7968.jpg
faces/Train/7969.jpg
faces/Train/797.j

In [8]:
# Import necessary libraries
import os
import zipfile
from collections import defaultdict

# Step 1: Extract Dataset
# Assume the dataset zip file has been uploaded directly to Colab's file system
dataset_zip_path = '/content/archive.zip'

# Step 2: Explore Contents of the Zip File
# List the contents of the archive without extracting
with zipfile.ZipFile(dataset_zip_path, 'r') as zip_ref:
    zip_contents = zip_ref.namelist()
    print("Contents of the zip file:")
    folder_structure = defaultdict(list)
    for item in zip_contents:
        parts = item.split('/')
        if len(parts) > 1:
            folder_structure[parts[0]].append(item)

    # Display number of elements in each subfolder
    for folder, items in folder_structure.items():
        print(f"Folder '{folder}' contains {len(items)} items.")

    # Step 3: Combine Images from 'faces' and 'faces_02' into a single list
    combined_images = []
    combined_images.extend(folder_structure['faces'])
    combined_images.extend(folder_structure['faces_02'])

    # Display the total number of images in the combined list
    print(f"Total number of images in the combined folder: {len(combined_images)}")

# Now combined_images contains all the paths of the images from both folders,
# and you can perform further operations on this variable as needed.


Contents of the zip file:
Folder 'faces' contains 19907 items.
Folder 'faces_02' contains 3252 items.
Total number of images in the combined folder: 23159


In [10]:
# Import necessary libraries
import matplotlib.pyplot as plt
from PIL import Image
import pandas as pd
import os
import zipfile

# Step 1: Load Labels from CSV File
# Load the CSV file containing the labels from inside the 'faces' folder of the zip file
dataset_zip_path = '/content/archive.zip'
with zipfile.ZipFile(dataset_zip_path, 'r') as zip_ref:
    with zip_ref.open('faces/age_data.csv') as csv_file:
        df = pd.read_csv(csv_file)

# Create a dictionary to map image names to their corresponding labels (ages)
labels_dict = dict(zip(df['img_name'], df['age']))

# Step 2: Define a Function to Display Sample Images with Labels
def show_sample_images(combined_images, labels_dict, num_samples=5):
    plt.figure(figsize=(15, 5))
    samples = combined_images[:num_samples]  # Get the first 'num_samples' images

    for i, img_path in enumerate(samples):
        with zipfile.ZipFile(dataset_zip_path, 'r') as zip_ref:
            # Extract the image to memory
            with zip_ref.open(img_path) as img_file:
                image = Image.open(img_file)
                img_name = os.path.basename(img_path)  # Extract the image name
                label = labels_dict.get(img_name, 'Unknown')  # Get the label (age)

                # Plot the image with the label
                plt.subplot(1, num_samples, i + 1)
                plt.imshow(image)
                plt.title(f"Age: {label}")
                plt.axis('off')
    plt.tight_layout()
    plt.show()

# Step 3: Display Sample Images from Combined List
show_sample_images(combined_images, labels_dict, num_samples=5)

# This block of code displays 5 images from the combined list along with their respective labels.


KeyError: "There is no item named 'faces/age_data.csv' in the archive"