<a href="https://colab.research.google.com/github/Boris2232/Machine-Learning-Project/blob/main/Creating_csv_files_of_datasets.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import csv
import shutil
import tarfile
import zipfile
import numpy as np
import os




# The UTKFace dataset ships in three parts named "part{k}.tar.gz".
# This code reads the image file names from each .tar.gz archive, parses the labels encoded in every name, and collects all the data into one csv file with the columns: ["file","age", "age_range","gender","race","split","emotion"]
# https://susanqq.github.io/UTKFace/

In [None]:
# I downloaded the UTK dataset from the site. UTK does not provide a 'split'.
# Collecting the UTK faces data:
# Build UTK_faces.csv from the three UTKFace archives (part1..part3.tar.gz).
# Only the archive member names are read; the labels are parsed out of each
# name, which is split on '_' into age, gender, race and a trailing part.
csv_headers = ["file", "age", "age_range", "gender", "race", "split", "emotion"]
# Open the output once in write mode (this truncates any previous run) rather
# than truncating it separately and re-opening it in append mode per archive.
# newline='' is what the csv module asks for so that it alone controls line endings.
with open('UTK_faces.csv', mode='w', encoding='utf-8', newline='') as file:
    # NOTE(review): rows are terminated with a bare '\r'; kept as-is so the
    # produced file stays byte-compatible with earlier runs.
    file_writer = csv.DictWriter(file, delimiter=';', fieldnames=csv_headers, lineterminator='\r')
    file_writer.writeheader()
    for k in range(1, 4):
        with tarfile.open(f'part{k}.tar.gz', 'r:gz') as folder:
            members = folder.getnames()
        # Drop the leading "part{k}/" directory from every member name.
        names = [member.split(f'part{k}/')[-1] for member in members]
        for i in names:
            # Everything except the last '_'-separated chunk carries the labels.
            obj = i.split('_')[:-1]
            # Skip entries (e.g. the directory itself or incomplete names)
            # that do not provide all three label fields.
            if len(obj) > 2:
                age, gender, race = obj[:3]
                # np.nan (lowercase) exists in every NumPy release; the
                # np.NaN alias was removed in NumPy 2.0.
                file_writer.writerow(
                    {"file": i, "age": age, "age_range": np.nan, "gender": gender, "race": race,
                     "split": np.nan, "emotion": np.nan})
# Now we have collected the data from all parts of the dataset and written it to a separate file named UTK_faces.csv.
# After collecting the data from the other datasets we can build a general table.
# Final step - collect the data from the FairFace dataset
# Build Fair-Face.csv from the two label tables (train first, then val).
# NOTE: the same export is repeated in the FairFace cell further down in this
# notebook; running either one produces the same Fair-Face.csv.
csv_headers = ["file", "age", "age_range", "gender", "race", "split", "emotion"]
# newline='' is what the csv module asks for on files it reads or writes.
with open('Fair-Face.csv', mode='wt', encoding='utf-8', newline='') as table:
    # NOTE(review): rows are terminated with a bare '\r'; kept as-is so the
    # produced file stays byte-compatible with earlier runs.
    file_writer = csv.DictWriter(table, delimiter=';', fieldnames=csv_headers, lineterminator='\r')
    file_writer.writeheader()
    # Both label files are converted the same way, so handle them in one loop
    # instead of two copy-pasted ones.
    for label_path in ('fairface_label_train.csv', 'fairface_label_val.csv'):
        with open(label_path, newline='') as csvfile:
            reader = csv.DictReader(csvfile, delimiter=',')
            for i in reader:
                # The source 'age' column goes into "age_range" and "age" is
                # left missing. The split is the first path component of the
                # 'file' value.
                # np.nan (lowercase) exists in every NumPy release; the
                # np.NaN alias was removed in NumPy 2.0.
                file_writer.writerow(
                    {"file": i['file'], "age": np.nan,
                     "age_range": i['age'],
                     "gender": i['gender'], "race": i['race'], "split": i['file'].split('/')[0],
                     "emotion": np.nan})

In [None]:
# RAF-DB is a dataset that contains emotion labels and is split into train/test parts
# http://www.whdeng.cn/raf/model1.html#dataset


# It's time to collect data from RAF-DB
# Build RAF_DB.csv from the RAF-DB annotation archive: one row per manual
# attribute file, joined with the emotion label of the matching image.
zip_name = 'Annotations-20230618T142719Z-001.zip'
# First unpack only the two members that are needed from the downloaded zip.
with zipfile.ZipFile(zip_name, 'r') as file:
    file.extract('Annotations/manual.zip')
    file.extract('Annotations/list_patition_label.txt')
# The manual attribute annotations are zipped themselves; unpack them as well.
with zipfile.ZipFile('Annotations/manual.zip', 'r') as file:
    file.extractall('Annotations')

csv_headers = ["file", "age", "age_range", "gender", "race", "split", "emotion"]
annotation_suffix = '_manu_attri.txt'
age_labels = {0: '0-3', 1: '4-19', 2: '20-39', 3: '40-69', 4: '70+'}

# Map image name (without extension) -> emotion label. Looking labels up by
# name replaces the previous positional pairing with os.listdir(), whose order
# is arbitrary and therefore attached emotions to the wrong files.
# Assumes every label line looks like "<image name> <emotion>", e.g.
# "test_0001.jpg 5" - TODO confirm against list_patition_label.txt.
emotions = {}
with open('Annotations/list_patition_label.txt', 'r') as label_file:
    for line in label_file:
        parts = line.split()
        if len(parts) >= 2:
            emotions[os.path.splitext(parts[0])[0]] = parts[1]

# newline='' is what the csv module asks for on files it writes.
with open('RAF_DB.csv', mode='wt', encoding='utf-8', newline='') as table:
    # NOTE(review): rows are terminated with a bare '\r'; kept as-is so the
    # produced file stays byte-compatible with earlier runs.
    file_writer = csv.DictWriter(table, delimiter=';', fieldnames=csv_headers, lineterminator='\r')
    file_writer.writeheader()
    # Sort the listing so the row order is reproducible across runs/platforms.
    for i in sorted(os.listdir('Annotations/manual')):
        with open(f'Annotations/manual/{i}', 'r') as file:
            new_data = file.readlines()
        # The first five lines are skipped; the next three are read as
        # gender, race and the age-range code, in that order.
        gender, race, age_range = [data.strip() for data in new_data[5:8]]
        # "test_0001_manu_attri.txt" -> "test_0001", the key used in `emotions`.
        if i.endswith(annotation_suffix):
            image_name = i[:-len(annotation_suffix)]
        else:
            image_name = os.path.splitext(i)[0]
        if image_name not in emotions:
            raise KeyError(f'no emotion label found for annotation file {i!r}')
        emotion = emotions[image_name]
        # NOTE(review): "age" receives the range text and "age_range" the
        # numeric code, the opposite of what the FairFace export does with
        # these two columns. Kept unchanged because existing RAF_DB.csv
        # consumers may rely on it - confirm before swapping.
        file_writer.writerow(
            {"file": i, "age": age_labels[int(age_range)],
             "age_range": age_range,
             "gender": gender, "race": race, "split": i.split('_')[0],
             "emotion": emotion})
# Remove the temporary extraction directory.
shutil.rmtree('Annotations')

In [None]:
# FairFace's authors provide all the data in csv tables. It is already split. (We created two separate tables.)
# https://github.com/joojs/fairface



# final step - collect data from a fair-face dataset
# Build Fair-Face.csv from the two FairFace label tables (train, then val).
csv_headers = ["file", "age", "age_range", "gender", "race", "split", "emotion"]
# newline='' is what the csv module asks for on files it reads or writes.
with open('Fair-Face.csv', mode='wt', encoding='utf-8', newline='') as table:
    # NOTE(review): rows are terminated with a bare '\r'; kept as-is so the
    # produced file stays byte-compatible with earlier runs.
    file_writer = csv.DictWriter(table, delimiter=';', fieldnames=csv_headers, lineterminator='\r')
    file_writer.writeheader()
    # Both label files share one conversion, so a single loop replaces the
    # two copy-pasted ones.
    for label_path in ('fairface_label_train.csv', 'fairface_label_val.csv'):
        with open(label_path, newline='') as csvfile:
            reader = csv.DictReader(csvfile, delimiter=',')
            for i in reader:
                # The source 'age' column is written to "age_range"; "age"
                # stays missing. The split is taken from the first path
                # component of the 'file' value.
                # np.nan (lowercase) exists in every NumPy release; the
                # np.NaN alias was removed in NumPy 2.0.
                file_writer.writerow(
                    {"file": i['file'], "age": np.nan,
                     "age_range": i['age'],
                     "gender": i['gender'], "race": i['race'], "split": i['file'].split('/')[0],
                     "emotion": np.nan})

In [None]:
# UTK
# file;age;age_range;gender;race;split;emotion
# 2_1_2_20161219202547820.jpg;2;nan;1;2;nan;nan
# 77_1_0_20170110122639530.jpg;77;nan;1;0;nan;nan
# 1_1_0_20170109190844250.jpg;1;nan;1;0;nan;nan
# 29_1_2_20170105164315483.jpg;29;nan;1;2;nan;nan
# 76_1_0_20170110131744527.jpg;76;nan;1;0;nan;nan
# 50_1_0_20170110154254311.jpg;50;nan;1;0;nan;nan
# 2_1_2_20161219152918020.jpg;2;nan;1;2;nan;nan
# 5_1_0_20170109194229104.jpg;5;nan;1;0;nan;nan
# 81_1_2_20170105174804349.jpg;81;nan;1;2;nan;nan
# 30_0_0_20170105164847516.jpg;30;nan;0;0;nan;nan



# RAF
# file;age;age_range;gender;race;split;emotion
# test_0001_manu_attri.txt;20-39;2;1;2;test;5
# test_0002_manu_attri.txt;4-19;1;1;2;test;5
# test_0003_manu_attri.txt;4-19;1;1;0;test;4
# test_0004_manu_attri.txt;20-39;2;1;0;test;4
# test_0005_manu_attri.txt;4-19;1;2;2;test;5
# test_0006_manu_attri.txt;20-39;2;1;0;test;1
# test_0007_manu_attri.txt;20-39;2;0;0;test;5
# test_0008_manu_attri.txt;20-39;2;0;0;test;4
# test_0009_manu_attri.txt;20-39;2;1;0;test;4
# test_0010_manu_attri.txt;40-69;3;1;0;test;1
# test_0011_manu_attri.txt;20-39;2;1;0;test;4
# test_0012_manu_attri.txt;20-39;2;0;2;test;1
# test_0013_manu_attri.txt;40-69;3;0;0;test;1
# test_0014_manu_attri.txt;20-39;2;1;2;test;4
# test_0015_manu_attri.txt;20-39;2;0;0;test;5
# test_0016_manu_attri.txt;4-19;1;1;0;test;4




# FairFace
# file;age;age_range;gender;race;split;emotion
# train/1.jpg;nan;50-59;Male;East Asian;train;nan
# train/2.jpg;nan;30-39;Female;Indian;train;nan
# train/3.jpg;nan;3-9;Female;Black;train;nan
# train/4.jpg;nan;20-29;Female;Indian;train;nan
# train/5.jpg;nan;20-29;Female;Indian;train;nan
# train/6.jpg;nan;20-29;Male;White;train;nan
# train/7.jpg;nan;40-49;Male;Middle Eastern;train;nan
# train/8.jpg;nan;30-39;Female;Indian;train;nan
# train/9.jpg;nan;10-19;Male;White;train;nan
# train/10.jpg;nan;30-39;Male;Middle Eastern;train;nan
# train/11.jpg;nan;50-59;Male;East Asian;train;nan
# train/12.jpg;nan;20-29;Male;East Asian;train;nan
# train/13.jpg;nan;20-29;Male;Latino_Hispanic;train;nan
# train/14.jpg;nan;10-19;Male;Indian;train;nan