# Section One - Creating a dataset
### Function Definitions
This cell contains the two main functions required for this section.
The function `pgm_to_matrix` reads an ASCII PGM file, applies a threshold to convert each pixel value into a binary value (`BLACK_PIXEL` or `WHITE_PIXEL`), and returns the result as a matrix.
The function `save_matrix_to_csv` writes the matrix (made up of a list of lists) into a CSV file.

In [None]:
import os
import csv

# Where the PGM images are read from and where the CSV matrices are written
INPUT_FOLDER = 'images/PGM'
OUTPUT_FOLDER = 'images/CSV'

# Binarisation settings: a grey value strictly below THRESHOLD is stored as
# BLACK_PIXEL, any other value as WHITE_PIXEL
THRESHOLD = 128
BLACK_PIXEL = 1
WHITE_PIXEL = 0

# Create the output folder up front; exist_ok makes re-running the cell harmless
os.makedirs(OUTPUT_FOLDER, exist_ok=True)

def pgm_to_matrix(pgm_file_path):
    """Read an ASCII (P2) PGM image and return it as a binary matrix.

    The file is parsed token by token, so ``#`` comments and line breaks
    may appear anywhere after the magic number. The file does not have to
    follow the exact "magic / comment / size / max value" line layout.

    Args:
        pgm_file_path: Path to the PGM file to read.

    Returns:
        A list of ``height`` rows, each a list of ``width`` values. A pixel
        whose grey value is below ``THRESHOLD`` becomes ``BLACK_PIXEL``;
        every other pixel becomes ``WHITE_PIXEL``.

    Raises:
        ValueError: If the file is not a P2 PGM, if the header is incomplete
            or not numeric, or if the number of pixel values differs from
            ``width * height``.
    """
    # P2 files are plain ASCII; replacing undecodable bytes stops a stray
    # non-ASCII byte in a comment from raising a decode error
    with open(pgm_file_path, 'r', encoding='ascii', errors='replace') as file:
        # Lazily yield whitespace-separated tokens, dropping everything from
        # a '#' to the end of its line
        tokens = (
            token
            for line in file
            for token in line.partition('#')[0].split()
        )

        if next(tokens, None) != 'P2':
            raise ValueError("PGM Files are not ASCII Formatted.")

        # Pull exactly three header tokens: width, height, maximum grey value.
        # range() comes first in zip() so no fourth token is consumed.
        header = [token for _, token in zip(range(3), tokens)]
        if len(header) != 3:
            raise ValueError(
                "PGM header is incomplete: expected width, height and maximum grey value."
            )
        # The maximum grey value is read only to step past it; the threshold
        # is applied to the raw pixel values
        width, height, _max_grey = map(int, header)

        pixel_values = [int(token) for token in tokens]

    expected_pixel_len = width * height
    if len(pixel_values) != expected_pixel_len:
        raise ValueError(f"Expected {expected_pixel_len}, got {len(pixel_values)}!")

    binary_pixels = [BLACK_PIXEL if value < THRESHOLD else WHITE_PIXEL for value in pixel_values]

    # Cut the flat pixel list into `height` consecutive rows of `width` values
    return [binary_pixels[i * width:(i + 1) * width] for i in range(height)]


def save_matrix_to_csv(matrix, csv_file_path):
    """Write a matrix to a UTF-8 encoded CSV file, one matrix row per line.

    Args:
        matrix: An iterable of rows, each row an iterable of cell values.
        csv_file_path: Destination path; an existing file is overwritten.
    """
    # newline='' leaves line endings to the csv module. The encoding is set
    # explicitly so the output does not depend on the platform default.
    with open(csv_file_path, 'w', newline='', encoding='utf-8') as csvfile:
        csv.writer(csvfile).writerows(matrix)

### File Processing
This cell loops through every file in the `INPUT_FOLDER` (which contains the PGM files exported from GIMP), converts each one to a binary matrix using the function found in the cell above, and then saves the matrix as a CSV file in the `OUTPUT_FOLDER`.

In [None]:
for filename in os.listdir(INPUT_FOLDER):
    # Only PGM images are converted; the extension match ignores case
    if not filename.lower().endswith('.pgm'):
        continue

    pgm_file_path = os.path.join(INPUT_FOLDER, filename)
    try:
        matrix = pgm_to_matrix(pgm_file_path)
    except Exception as e:
        # Report the unreadable image and carry on with the remaining files
        print(f"Error processing {pgm_file_path}: {e}")
    else:
        # Same base name as the source image, with a .csv extension
        csv_filename = f"{os.path.splitext(filename)[0]}.csv"
        csv_file_path = os.path.join(OUTPUT_FOLDER, csv_filename)
        save_matrix_to_csv(matrix, csv_file_path)
        print(f"Converted {pgm_file_path} to {csv_file_path}")

### File Verification
In this cell, we randomly select one CSV file from the `OUTPUT_FOLDER` and verify that it meets the assignment specifications.
The checks include:
- Confirming that the file is read using UTF-8 encoding.
- Ensuring the file is exactly 18x18.
- Verifying that every value in the file is either `0` or `1`.

In [None]:
import os
import csv
import random

OUTPUT_FOLDER = 'images/CSV'

all_csv_files = [file for file in os.listdir(OUTPUT_FOLDER) if file.lower().endswith('.csv')]
if not all_csv_files:
    print("No CSV files found in ", OUTPUT_FOLDER)
else:
    # Spot-check one randomly chosen file rather than the whole folder
    random_file = random.choice(all_csv_files)
    random_csv = os.path.join(OUTPUT_FOLDER, random_file)
    print(f"Chose {random_csv} to sample.")

    # Reading with an explicit UTF-8 codec fails loudly if the file is not valid UTF-8
    with open(random_csv, 'r', encoding='utf-8') as file:
        reader = csv.reader(file)
        matrix = list(reader)

    print("File contents:")
    for row in matrix:
        print(row)

    # Verify file to spec
    expected_rows = 18
    expected_columns = 18
    # Every failed check below must clear this flag, so the final verdict
    # reflects the row count as well as the per-row checks
    file_correct = True

    if len(matrix) != expected_rows:
        print(f"File does not have the expected row count {len(matrix)} instead of {expected_rows}")
        file_correct = False

    for i, row in enumerate(matrix):
        if len(row) != expected_columns:
            print(f"Row {i + 1} has {len(row)} columns instead of {expected_columns}")
            file_correct = False
        for j, val in enumerate(row):
            # csv.reader yields strings, so compare against the text forms
            if val not in ('0', '1'):
                print(f"Value {val} at row {i + 1}, column {j + 1} is not the expected '0' or '1'")
                file_correct = False

    if file_correct:
        print("Verification Passed! The file meets the specification.")
    else:
        print("Verification failed: The file does not meet the specification.")