# Environment Setup

## Installation Block

In [1]:
# ! pip install pandas matplotlib python-dotenv

## Imports

In [2]:
import os
import random
import zipfile
import pandas as pd
from dotenv import load_dotenv

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


## Load Environment Variables

In [3]:
PATH_TO_DOT_ENV = "../src/.env"


def _require_env(name: str) -> str:
    """Return the value of the environment variable `name`.

    Args:
        name: Name of the environment variable to read.

    Returns:
        The variable's value as a string.

    Raises:
        RuntimeError: If the variable is not set. Failing here gives a clear
            message instead of an opaque error in a later cell.
    """
    value = os.environ.get(name)
    if value is None:
        raise RuntimeError(
            f"Environment variable '{name}' is not set; "
            f"define it in '{PATH_TO_DOT_ENV}' or in the process environment."
        )
    return value


# load environment variables from the .env file
load_dotenv(PATH_TO_DOT_ENV)

# extract environment variables
# NOTE(review): the ".." prefix presumably rebases paths that are written
# relative to the repository root onto this notebook's directory - confirm.
APTOS_2019_ZIP_PATH = ".." + _require_env("APTOS_2019_ZIP_PATH")
# APTOS_2019_TEST_CSV = os.environ.get("APTOS_2019_TEST_CSV")
APTOS_2019_TRAIN_CSV = _require_env("APTOS_2019_TRAIN_CSV")
APTOS_2019_ID_COLUMN = _require_env("APTOS_2019_ID_COLUMN")
APTOS_2019_LABLE_COLUMN = _require_env("APTOS_2019_LABLE_COLUMN")
# Kept optional: no cell visible in this notebook reads this value.
APTOS_2019_FILE_EXTENSION = os.environ.get("APTOS_2019_FILE_EXTENSION")

PROCESSED_DATASET = ".." + _require_env("PROCESSED_DATASET")

# Data Preprocessing

## Load Dataset Meta Information

In [4]:
# Read the training metadata CSV straight out of the raw dataset archive,
# without unpacking the archive to disk. The image id column becomes the index.
# Only the train CSV is loaded; loading of the test CSV is disabled.
with zipfile.ZipFile(APTOS_2019_ZIP_PATH, mode='r') as raw_dataset_zip, \
        raw_dataset_zip.open(APTOS_2019_TRAIN_CSV) as raw_train_csv:
    train_df = pd.read_csv(raw_train_csv, index_col=APTOS_2019_ID_COLUMN)

## Identify Labels

In [5]:
# Collect the distinct values of the label column, in order of first appearance.
label_column = train_df[APTOS_2019_LABLE_COLUMN]
lables = label_column.unique()
print(f"Unique lables: {lables}")


Unique lables: [2 4 1 0 3]


## Stratified Split

### Divide Images by Labels

In [6]:
# Map every label to the list of image ids (index values) carrying that label.
lables_index = {}
for lable in lables:
    has_lable = train_df[APTOS_2019_LABLE_COLUMN] == lable
    lables_index[lable] = train_df.index[has_lable].to_list()

# Validate the division: every row of the train set must land in exactly one bucket.
split_lable_count = sum(map(len, lables_index.values()))
assert len(train_df) == split_lable_count, "Labes index were not split properly"

### Split Each Label into Train, Test, and Validation

In [7]:
# Fixed seed so that Restart & Run All (or re-running this cell alone) always
# reproduces the same split. A different split on re-run would also leave
# stale images from the previous split in the output folders.
SPLIT_SEED = 42
split_rng = random.Random(SPLIT_SEED)

porcessed_dataset_index = {
    "train": {},
    "test": {},
    "val": {}
}

for lable in lables:
    # shuffle a copy: mutating lables_index in place would make the result of
    # this cell depend on how many times it has already been executed
    shuffled_ids = list(lables_index[lable])
    split_rng.shuffle(shuffled_ids)

    # sizes for an 80% / 10% / 10% split; the test set absorbs the rounding remainder
    len_label = len(shuffled_ids)
    train_size = int(len_label * 0.8)
    val_size = int(len_label * 0.1)
    test_size = len_label - train_size - val_size

    # perform a stratified split of the label into train, test, and validation
    porcessed_dataset_index["train"][lable] = shuffled_ids[:train_size]
    porcessed_dataset_index["test"][lable] = shuffled_ids[train_size:train_size + test_size]
    porcessed_dataset_index["val"][lable] = shuffled_ids[train_size + test_size:]

    # validate the split: the three parts must add up to the whole label
    split_total = sum(len(porcessed_dataset_index[set_name][lable]) for set_name in ("train", "test", "val"))
    assert len_label == split_total, "lable split count does not add up"

    print(f"Finished splitting the lable: '{lable}' into train ({len(porcessed_dataset_index['train'][lable])}), test ({len(porcessed_dataset_index['test'][lable])}), and validation ({len(porcessed_dataset_index['val'][lable])})")
    

Finished splitting the lable: '2' into train (799), test (101), and validation (99)
Finished splitting the lable: '4' into train (236), test (30), and validation (29)
Finished splitting the lable: '1' into train (296), test (37), and validation (37)
Finished splitting the lable: '0' into train (1444), test (181), and validation (180)
Finished splitting the lable: '3' into train (154), test (20), and validation (19)


### Extract Data from the Zip Archive

In [8]:
def gen_processed_dir(set_type, lable):
    """Build the output directory path for one split / label combination.

    Args:
        set_type: Name of the split (the cells in this notebook pass
            "train", "test" or "val").
        lable: Label value used as the innermost folder name.

    Returns:
        The string PROCESSED_DATASET + set_type + "/" + lable.
    """
    return f"{PROCESSED_DATASET}{set_type}/{lable}"


# Make sure every <split>/<label> destination folder exists before extraction.
for lable in lables:
    for split_name in ("train", "test", "val"):
        os.makedirs(gen_processed_dir(split_name, lable), exist_ok=True)

In [9]:
SET_TYPES = ["train", "test", "val"]

# Size of the chunks used when copying an image out of the archive.
COPY_CHUNK_BYTES = 1024 * 1024

# Copy every image of every split / label from the archive into its
# destination folder. Each archive member is streamed directly to its final
# path, so no temporary "train_images" folder is created, nothing has to be
# renamed or removed afterwards, an empty split for a label is a no-op, and
# re-running the cell simply overwrites the same files.
with zipfile.ZipFile(APTOS_2019_ZIP_PATH, 'r') as raw_dataset_zip:
    for set_type in SET_TYPES:
        print(f"Started set: {set_type}")
        for lable in lables:
            # resolved once per label, before the image loop, so it is always bound
            processed_image_path = gen_processed_dir(set_type, lable)
            for image_name in porcessed_dataset_index[set_type][lable]:
                zip_image_path = f"train_images/{image_name}.png"
                with raw_dataset_zip.open(zip_image_path) as zipped_image:
                    with open(f"{processed_image_path}/{image_name}.png", "wb") as extracted_image:
                        for chunk in iter(lambda: zipped_image.read(COPY_CHUNK_BYTES), b""):
                            extracted_image.write(chunk)
            print(f"Extracted lable: {lable}")
        print(f"Finished set: {set_type}")


Started set: train
Extracted lable: 2
Extracted lable: 4
Extracted lable: 1
Extracted lable: 0
Extracted lable: 3
Finished set: train
Started set: test
Extracted lable: 2
Extracted lable: 4
Extracted lable: 1
Extracted lable: 0
Extracted lable: 3
Finished set: test
Started set: val
Extracted lable: 2
Extracted lable: 4
Extracted lable: 1
Extracted lable: 0
Extracted lable: 3
Finished set: val


In [10]:
# Sanity check: the number of entries across all <split>/<label> folders
# must equal the number of rows in the train metadata.
processed_len = sum(
    len(os.listdir(gen_processed_dir(set_type, lable)))
    for set_type in SET_TYPES
    for lable in lables
)

assert processed_len == len(train_df)
