**Objetivo:** Particionar as imagens em conjuntos de treino, validação e teste para todos os experimentos

In [1]:
# Mount Google Drive (Colab only) so the '/content/gdrive/...' paths used by this notebook are reachable.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [2]:
import pandas as pd
import numpy as np
import shutil
import random
import os

In [3]:
# Root folder of the datasets on the mounted shared drive; every path in this notebook is built from it.
PATH = '/content/gdrive/Shareddrives/IA901 - Projeto Final/Datasets/'
# Create the folder that receives the processed splits (no error if it already exists).
os.makedirs(PATH+"Processed/", exist_ok=True)

# Cancer Classification

## Baseline

In [None]:
# Folder holding the interim images that are copied into the splits
image_dir = PATH+"Interim/Images/"

# Destination folder of each split
train_dir = PATH+"Processed/TumorClassification/Baseline/train/"
validation_dir = PATH+"Processed/TumorClassification/Baseline/val/"
test_dir = PATH+"Processed/TumorClassification/Baseline/test/"

# Load the table with the class and tissue type of every image
df = pd.read_csv(PATH+"Interim/ClassAndTypes.csv")

# Drop the rows whose 'Unnamed: 0' id lies in 7526..7537 (corrupted files)
CorruptedIndices = list(np.arange(7526, 7538,1))
is_corrupted = df['Unnamed: 0'].isin(CorruptedIndices)
df = df[~is_corrupted]

# One group of rows per tissue type
groups = df.groupby("Types")

# Every split gets one sub-folder per binary class ("0" and "1")
for split_dir in (train_dir, validation_dir, test_dir):
    os.makedirs(split_dir, exist_ok=True)
    for class_folder in ("0/", "1/"):
        os.makedirs(split_dir+class_folder, exist_ok=True)


In [None]:
# Row labels (index values of `df`) assigned to each split, accumulated over all tissues
TrainIndices = []
ValIndices = []
TestIndices = []

# Split every tissue type separately so each split keeps the same tissue proportions
for tissue_type, group in groups:
    # Shuffle the row labels of the group (fixed seed keeps the split reproducible)
    indices = group.sample(frac=1, random_state=42).index.tolist()

    num_images = len(indices)
    print("Tissue: {}, N_Images: {}".format(tissue_type, num_images))
    print("Indices", indices)

    # 10% validation, 20% test, the remainder is train
    num_validation = int(0.1 * num_images)
    num_test = int(0.2 * num_images)
    val_start = num_images - num_validation - num_test
    test_start = num_images - num_test

    train_indices = indices[:val_start]
    validation_indices = indices[val_start:test_start]
    test_indices = indices[test_start:]

    TrainIndices.extend(train_indices)
    ValIndices.extend(validation_indices)
    TestIndices.extend(test_indices)

    # Copy each image into <split>/<class>/, in the order train, validation, test
    for split_indices, split_dir in (
        (train_indices, train_dir),
        (validation_indices, validation_dir),
        (test_indices, test_dir),
    ):
        for index in split_indices:
            source_path = os.path.join(image_dir, "Image_{}.png".format(index))
            # `index` is a row LABEL. Rows were filtered out of `df`, so labels and positions
            # no longer coincide: the lookup must be label-based (.loc), not positional (.iloc).
            class_folder = "1" if df.loc[index, "Class"] == 1 else "0"
            destination_path = os.path.join(split_dir, "{}/Image_{}.png".format(class_folder, index))
            shutil.copyfile(source_path, destination_path)


Tissue: Adrenal_gland, N_Images: 437
Indices [6605, 1318, 4051, 6295, 6315, 6279, 3991, 1523, 6642, 6612, 6280, 1315, 3998, 6604, 3708, 1316, 6641, 3778, 1313, 3706, 1180, 3728, 1201, 6621, 1545, 1573, 1310, 6579, 6632, 1271, 6633, 6396, 3767, 6297, 6585, 1312, 3815, 1542, 1210, 6281, 6375, 1572, 4007, 1520, 3707, 1204, 3710, 6292, 1317, 4023, 6592, 1186, 6614, 1567, 1171, 1190, 4053, 6597, 1272, 3993, 6368, 1534, 6277, 6274, 6273, 6580, 1196, 6627, 6370, 1554, 1213, 3718, 6594, 6620, 6301, 1565, 6294, 1193, 1217, 1531, 4018, 1557, 6634, 1273, 3690, 1195, 1188, 1306, 4038, 6615, 1202, 1525, 6289, 3726, 1535, 4000, 6638, 3705, 6382, 4045, 6628, 1176, 1216, 3701, 6637, 6586, 4048, 1187, 6613, 1558, 6302, 1174, 1189, 3721, 1276, 6618, 6391, 1303, 1551, 4050, 6377, 1200, 3693, 3823, 4002, 1197, 1178, 3729, 6606, 1549, 6307, 6617, 1559, 1555, 3694, 3821, 6313, 6323, 3810, 6319, 6598, 6310, 6284, 3766, 3811, 1550, 6640, 3698, 4010, 6590, 6607, 1524, 6616, 3725, 1314, 6393, 6596, 1560, 6306, 

In [None]:
# The *Indices lists hold row LABELS of `df` (taken from `.index`), and rows were filtered out of
# `df`, so the selection must be label-based (.loc); positional .iloc would pick the wrong rows.
TrainDF = df.loc[TrainIndices]
ValDF = df.loc[ValIndices]
TestDF = df.loc[TestIndices]

# Persist the label table of each split next to its image folders
TrainDF.to_csv(PATH+"Processed/TumorClassification/Baseline/TrainLabels.csv", index=False, encoding='utf-8')
ValDF.to_csv(PATH+"Processed/TumorClassification/Baseline/ValLabels.csv", index=False, encoding='utf-8')
TestDF.to_csv(PATH+"Processed/TumorClassification/Baseline/TestLabels.csv", index=False, encoding='utf-8')

### Checando outputs

In [None]:
import pandas as pd
from collections import Counter

# Dataset root on the mounted shared drive; the label CSVs checked in this section are read from under it.
PATH = '/content/gdrive/Shareddrives/IA901 - Projeto Final/Datasets/'

In [None]:
# Train dataframe: reload the saved labels, print summary statistics of the numeric columns,
# then display the number of images per tissue type.
Train = pd.read_csv(PATH+"Processed/TumorClassification/Baseline/TrainLabels.csv")
print(Train.describe())
Counter(Train['Types'])

        Unnamed: 0        Class
count  5546.000000  5546.000000
mean   3893.464299     0.537865
std    2287.466787     0.498609
min       0.000000     0.000000
25%    1900.250000     0.000000
50%    3885.500000     1.000000
75%    5871.500000     1.000000
max    7900.000000     1.000000


Counter({'Adrenal_gland': 307,
         'Bile-duct': 294,
         'Bladder': 103,
         'Breast': 1646,
         'Cervix': 206,
         'Colon': 1008,
         'Esophagus': 298,
         'HeadNeck': 270,
         'Kidney': 95,
         'Liver': 158,
         'Lung': 130,
         'Ovarian': 103,
         'Pancreatic': 137,
         'Prostate': 128,
         'Skin': 132,
         'Stomach': 103,
         'Testis': 138,
         'Thyroid': 159,
         'Uterus': 131})

In [None]:
# Display the train labels table (the last expression of a cell is rendered as its output).
Train

Unnamed: 0.1,Unnamed: 0,Types,Class
0,6605,Adrenal_gland,0.0
1,1318,Adrenal_gland,1.0
2,4051,Adrenal_gland,1.0
3,6295,Adrenal_gland,1.0
4,6315,Adrenal_gland,1.0
...,...,...,...
5541,2580,Uterus,0.0
5542,7752,Uterus,0.0
5543,7794,Uterus,1.0
5544,7799,Uterus,1.0


In [None]:
# Validation dataframe: reload the saved labels, print summary statistics of the numeric columns,
# then display the number of images per tissue type.
Val = pd.read_csv(PATH+"Processed/TumorClassification/Baseline/ValLabels.csv")
print(Val.describe())
Counter(Val['Types'])

        Unnamed: 0       Class
count   782.000000  782.000000
mean   3985.749361    0.531969
std    2212.497218    0.499296
min      28.000000    0.000000
25%    2167.500000    0.000000
50%    3988.500000    1.000000
75%    5886.750000    1.000000
max    7899.000000    1.000000


Counter({'Adrenal_gland': 43,
         'Bile-duct': 42,
         'Bladder': 14,
         'Breast': 235,
         'Cervix': 29,
         'Colon': 144,
         'Esophagus': 42,
         'HeadNeck': 38,
         'Kidney': 13,
         'Liver': 22,
         'Lung': 18,
         'Ovarian': 14,
         'Pancreatic': 19,
         'Prostate': 18,
         'Skin': 18,
         'Stomach': 14,
         'Testis': 19,
         'Thyroid': 22,
         'Uterus': 18})

In [None]:
# Display the validation labels table.
Val

Unnamed: 0.1,Unnamed: 0,Types,Class
0,1175,Adrenal_gland,1.0
1,4029,Adrenal_gland,1.0
2,6601,Adrenal_gland,0.0
3,1541,Adrenal_gland,1.0
4,3999,Adrenal_gland,1.0
...,...,...,...
777,7758,Uterus,0.0
778,3586,Uterus,1.0
779,6184,Uterus,1.0
780,7789,Uterus,1.0


In [None]:
# Test dataframe: reload the saved labels, print summary statistics of the numeric columns,
# then display the number of images per tissue type.
Test = pd.read_csv(PATH+"Processed/TumorClassification/Baseline/TestLabels.csv")
print(Test.describe())
Counter(Test['Types'])

        Unnamed: 0        Class
count  1573.000000  1573.000000
mean   4131.558169     0.503497
std    2283.120101     0.500147
min       1.000000     0.000000
25%    2190.000000     0.000000
50%    4160.000000     1.000000
75%    6117.000000     1.000000
max    7898.000000     1.000000


Counter({'Adrenal_gland': 87,
         'Bile-duct': 84,
         'Bladder': 29,
         'Breast': 470,
         'Cervix': 58,
         'Colon': 288,
         'Esophagus': 84,
         'HeadNeck': 76,
         'Kidney': 26,
         'Liver': 44,
         'Lung': 36,
         'Ovarian': 29,
         'Pancreatic': 39,
         'Prostate': 36,
         'Skin': 37,
         'Stomach': 29,
         'Testis': 39,
         'Thyroid': 45,
         'Uterus': 37})

In [None]:
# Display the test labels table.
Test

Unnamed: 0.1,Unnamed: 0,Types,Class
0,3824,Adrenal_gland,1.0
1,6285,Adrenal_gland,1.0
2,6608,Adrenal_gland,0.0
3,4024,Adrenal_gland,1.0
4,6636,Adrenal_gland,0.0
...,...,...,...
1568,7733,Uterus,1.0
1569,2589,Uterus,1.0
1570,7719,Uterus,1.0
1571,7806,Uterus,1.0


### Checando se houve data leak

In [None]:
# Is any image id shared between two of the splits?
#   True: a shared id exists (data leak), False: the two splits are disjoint
#
# `Series.eq` aligns both series on their index and compares them row by row, so it can only
# detect an id that sits at the same row position in both tables. `isin` tests membership
# regardless of position, which is what a leak check needs.

print('Treino e validação? ', Train['Unnamed: 0'].isin(Val['Unnamed: 0']).any())
print('Treino e Teste? ', Train['Unnamed: 0'].isin(Test['Unnamed: 0']).any())
print('Teste e validação? ', Test['Unnamed: 0'].isin(Val['Unnamed: 0']).any())

Treino e validação?  [False]
Treino e Teste?  [False]
Teste e validação?  [False]


## Experimento 1: Separando dados deixando de fora um tecido específico


In [None]:
# Tissue type kept out of train/validation: the split loop of this experiment sends all of its images to the test set.
Tissue_to_Exclude = 'Breast'

In [None]:
# Dataset root on the mounted shared drive.
PATH = '/content/gdrive/Shareddrives/IA901 - Projeto Final/Datasets/'
# Create the output folder of this experiment (no error if it already exists).
os.makedirs(PATH+"Processed/TumorClassification/Experiment_I", exist_ok=True)

In [None]:
# Folder holding the interim images that are copied into the splits
image_dir = PATH+"Interim/Images/"

# Destination folder of each split
train_dir = PATH+"Processed/TumorClassification/Experiment_I/train/"
validation_dir = PATH+"Processed/TumorClassification/Experiment_I/val/"
test_dir = PATH+"Processed/TumorClassification/Experiment_I/test/"

# Load the table with the class and tissue type of every image
df = pd.read_csv(PATH+"Interim/ClassAndTypes.csv")

# Drop the rows whose 'Unnamed: 0' id lies in 7526..7537 (corrupted files)
CorruptedIndices = list(np.arange(7526, 7538,1))
is_corrupted = df['Unnamed: 0'].isin(CorruptedIndices)
df = df[~is_corrupted]

# One group of rows per tissue type
groups = df.groupby("Types")

# Every split gets one sub-folder per binary class ("0" and "1")
for split_dir in (train_dir, validation_dir, test_dir):
    os.makedirs(split_dir, exist_ok=True)
    for class_folder in ("0/", "1/"):
        os.makedirs(split_dir+class_folder, exist_ok=True)


In [None]:
# Row labels (index values of `df`) assigned to each split, accumulated over all tissues
TrainIndices = []
ValIndices = []
TestIndices = []

for tissue_type, group in groups:
    # Shuffle the row labels of the group (fixed seed keeps the split reproducible)
    indices = group.sample(frac=1, random_state=42).index.tolist()

    # Count the images BEFORE printing: the held-out branch used to print the count
    # left over from the previous tissue because it never computed its own.
    num_images = len(indices)
    print("Tissue: {}, N_Images: {}".format(tissue_type, num_images))
    print("Indices", indices)

    if tissue_type == Tissue_to_Exclude:
        # The held-out tissue goes entirely to the test set
        train_indices, validation_indices, test_indices = [], [], indices
    else:
        # Every other tissue: 70% train, 30% validation, nothing in test
        num_validation = int(0.3 * num_images)
        train_indices = indices[:num_images - num_validation]
        validation_indices = indices[num_images - num_validation:]
        test_indices = []

    TrainIndices.extend(train_indices)
    ValIndices.extend(validation_indices)
    TestIndices.extend(test_indices)

    # Copy each image into <split>/<class>/
    for split_indices, split_dir in (
        (train_indices, train_dir),
        (validation_indices, validation_dir),
        (test_indices, test_dir),
    ):
        for index in split_indices:
            source_path = os.path.join(image_dir, "Image_{}.png".format(index))

            # Missing images are reported and skipped; their labels stay in the index lists
            if not os.path.isfile(source_path):
                print('Missing File detected (index: {})! Lets go to the next one..'.format(index))
                continue

            # `index` is a row LABEL. Rows were filtered out of `df`, so labels and positions
            # no longer coincide: the lookup must be label-based (.loc), not positional (.iloc).
            class_folder = "1" if df.loc[index, "Class"] == 1 else "0"
            destination_path = os.path.join(split_dir, "{}/Image_{}.png".format(class_folder, index))
            shutil.copyfile(source_path, destination_path)


Tissue: Adrenal_gland, N_Images: 437
Indices [6605, 1318, 4051, 6295, 6315, 6279, 3991, 1523, 6642, 6612, 6280, 1315, 3998, 6604, 3708, 1316, 6641, 3778, 1313, 3706, 1180, 3728, 1201, 6621, 1545, 1573, 1310, 6579, 6632, 1271, 6633, 6396, 3767, 6297, 6585, 1312, 3815, 1542, 1210, 6281, 6375, 1572, 4007, 1520, 3707, 1204, 3710, 6292, 1317, 4023, 6592, 1186, 6614, 1567, 1171, 1190, 4053, 6597, 1272, 3993, 6368, 1534, 6277, 6274, 6273, 6580, 1196, 6627, 6370, 1554, 1213, 3718, 6594, 6620, 6301, 1565, 6294, 1193, 1217, 1531, 4018, 1557, 6634, 1273, 3690, 1195, 1188, 1306, 4038, 6615, 1202, 1525, 6289, 3726, 1535, 4000, 6638, 3705, 6382, 4045, 6628, 1176, 1216, 3701, 6637, 6586, 4048, 1187, 6613, 1558, 6302, 1174, 1189, 3721, 1276, 6618, 6391, 1303, 1551, 4050, 6377, 1200, 3693, 3823, 4002, 1197, 1178, 3729, 6606, 1549, 6307, 6617, 1559, 1555, 3694, 3821, 6313, 6323, 3810, 6319, 6598, 6310, 6284, 3766, 3811, 1550, 6640, 3698, 4010, 6590, 6607, 1524, 6616, 3725, 1314, 6393, 6596, 1560, 6306, 

In [None]:
# The *Indices lists hold row LABELS of `df` (taken from `.index`), and rows were filtered out of
# `df`, so the selection must be label-based (.loc); positional .iloc would pick the wrong rows.
TrainDF = df.loc[TrainIndices]
ValDF = df.loc[ValIndices]
TestDF = df.loc[TestIndices]

# Persist the label table of each split next to its image folders
TrainDF.to_csv(PATH+"Processed/TumorClassification/Experiment_I/TrainLabels.csv", index=False, encoding='utf-8')
ValDF.to_csv(PATH+"Processed/TumorClassification/Experiment_I/ValLabels.csv", index=False, encoding='utf-8')
TestDF.to_csv(PATH+"Processed/TumorClassification/Experiment_I/TestLabels.csv", index=False, encoding='utf-8')

## Experimento II: Separando 'Breast' para treino e os demais tecidos para val/teste

In [4]:
# Only this tissue type feeds the train set; the split loop of this experiment divides every other tissue between test and validation.
Tissue_to_Train = 'Breast'

In [5]:
# Dataset root on the mounted shared drive.
PATH = '/content/gdrive/Shareddrives/IA901 - Projeto Final/Datasets/'
# Create the output folder of this experiment (no error if it already exists).
os.makedirs(PATH+"Processed/TumorClassification/Experiment_II", exist_ok=True)

In [17]:
# Folder holding the interim images that are copied into the splits
image_dir = PATH+"Interim/Images/"

# Destination folder of each split
train_dir = PATH+"Processed/TumorClassification/Experiment_II/train/"
validation_dir = PATH+"Processed/TumorClassification/Experiment_II/val/"
test_dir = PATH+"Processed/TumorClassification/Experiment_II/test/"

# Load the table with the class and tissue type of every image
df = pd.read_csv(PATH+"Interim/ClassAndTypes.csv")

# Safeguards for corrupted files (disabled in this experiment)
#CorruptedIndices = list(np.arange(7526, 7538,1))
#df = df[~df['Unnamed: 0'].isin(CorruptedIndices)]

# One group of rows per tissue type
groups = df.groupby("Types")

# Every split gets one sub-folder per binary class ("0" and "1")
for split_dir in (train_dir, validation_dir, test_dir):
    os.makedirs(split_dir, exist_ok=True)
    for class_folder in ("0/", "1/"):
        os.makedirs(split_dir+class_folder, exist_ok=True)


In [18]:
# Row labels (index values of `df`) assigned to each split, accumulated over all tissues
TrainIndices = []
ValIndices = []
TestIndices = []

for tissue_type, group in groups:
    # Shuffle the row labels of the group (fixed seed keeps the split reproducible)
    indices = group.sample(frac=1, random_state=42).index.tolist()

    num_images = len(indices)
    print("Tissue: {}, N_Images: {}".format(tissue_type, num_images))
    print("Indices", indices)

    if tissue_type == Tissue_to_Train:
        # The training tissue goes entirely to the train set
        train_indices, validation_indices, test_indices = indices, [], []
    else:
        # Every other tissue: first 70% to test, last 30% to validation, nothing in train
        num_validation = int(0.3 * num_images)
        train_indices = []
        test_indices = indices[:num_images - num_validation]
        validation_indices = indices[num_images - num_validation:]

    TrainIndices.extend(train_indices)
    TestIndices.extend(test_indices)
    ValIndices.extend(validation_indices)

    # Copy each image into <split>/<class>/
    for split_indices, split_dir in (
        (train_indices, train_dir),
        (test_indices, test_dir),
        (validation_indices, validation_dir),
    ):
        for index in split_indices:
            source_path = os.path.join(image_dir, "Image_{}.png".format(index))

            # Missing images are reported and skipped; their labels stay in the index lists
            if not os.path.isfile(source_path):
                print('Missing File detected (index: {})! Lets go to the next one..'.format(index))
                continue

            # `index` is a row LABEL, so look the class up by label (.loc). This matches the
            # label-based selection used when the label tables of this experiment are saved and
            # stays correct if rows are ever filtered out of `df`.
            class_folder = "1" if df.loc[index, "Class"] == 1 else "0"
            destination_path = os.path.join(split_dir, "{}/Image_{}.png".format(class_folder, index))
            shutil.copyfile(source_path, destination_path)


Tissue: Adrenal_gland, N_Images: 437
Indices [6605, 1318, 4051, 6295, 6315, 6279, 3991, 1523, 6642, 6612, 6280, 1315, 3998, 6604, 3708, 1316, 6641, 3778, 1313, 3706, 1180, 3728, 1201, 6621, 1545, 1573, 1310, 6579, 6632, 1271, 6633, 6396, 3767, 6297, 6585, 1312, 3815, 1542, 1210, 6281, 6375, 1572, 4007, 1520, 3707, 1204, 3710, 6292, 1317, 4023, 6592, 1186, 6614, 1567, 1171, 1190, 4053, 6597, 1272, 3993, 6368, 1534, 6277, 6274, 6273, 6580, 1196, 6627, 6370, 1554, 1213, 3718, 6594, 6620, 6301, 1565, 6294, 1193, 1217, 1531, 4018, 1557, 6634, 1273, 3690, 1195, 1188, 1306, 4038, 6615, 1202, 1525, 6289, 3726, 1535, 4000, 6638, 3705, 6382, 4045, 6628, 1176, 1216, 3701, 6637, 6586, 4048, 1187, 6613, 1558, 6302, 1174, 1189, 3721, 1276, 6618, 6391, 1303, 1551, 4050, 6377, 1200, 3693, 3823, 4002, 1197, 1178, 3729, 6606, 1549, 6307, 6617, 1559, 1555, 3694, 3821, 6313, 6323, 3810, 6319, 6598, 6310, 6284, 3766, 3811, 1550, 6640, 3698, 4010, 6590, 6607, 1524, 6616, 3725, 1314, 6393, 6596, 1560, 6306, 

In [19]:
# Select the rows of each split by label
TrainDF, ValDF, TestDF = (df.loc[row_labels] for row_labels in (TrainIndices, ValIndices, TestIndices))

# Write one label table per split
for split_frame, csv_path in (
    (TrainDF, PATH+"Processed/TumorClassification/Experiment_II/TrainLabels.csv"),
    (ValDF, PATH+"Processed/TumorClassification/Experiment_II/ValLabels.csv"),
    (TestDF, PATH+"Processed/TumorClassification/Experiment_II/TestLabels.csv"),
):
    split_frame.to_csv(csv_path, index=False, encoding='utf-8')

# Tissue Classification

## Experimento I: Breast (positivo) vs Rest (negativo)

In [None]:
# Tissue type treated as the positive class: its images are copied to folder "1", every other tissue to folder "0".
PositiveFeature = 'Breast'

In [None]:
# Dataset root on the mounted shared drive.
PATH = '/content/gdrive/Shareddrives/IA901 - Projeto Final/Datasets/'
# Make sure the folder that receives the processed splits exists.
os.makedirs(PATH+"Processed/", exist_ok=True)

In [None]:
# Folder holding the interim images that are copied into the splits
image_dir = PATH+"Interim/Images/"

# Destination folder of each split
train_dir = PATH+"Processed/TissueClassification/Experiment_I/train/"
validation_dir = PATH+"Processed/TissueClassification/Experiment_I/val/"
test_dir = PATH+"Processed/TissueClassification/Experiment_I/test/"

# Load the table with the class and tissue type of every image
df = pd.read_csv(PATH+"Interim/ClassAndTypes.csv")

# Drop the rows whose 'Unnamed: 0' id lies in 7526..7537 (corrupted files)
CorruptedIndices = list(np.arange(7526, 7538,1))
is_corrupted = df['Unnamed: 0'].isin(CorruptedIndices)
df = df[~is_corrupted]

# One group of rows per tissue type
groups = df.groupby("Types")

# Every split gets one sub-folder per binary class ("0" and "1")
for split_dir in (train_dir, validation_dir, test_dir):
    os.makedirs(split_dir, exist_ok=True)
    for class_folder in ("0/", "1/"):
        os.makedirs(split_dir+class_folder, exist_ok=True)


In [None]:
# Row labels (index values of `df`) assigned to each split, accumulated over all tissues
TrainIndices = []
ValIndices = []
TestIndices = []

# Split every tissue type separately so each split keeps the same tissue proportions
for tissue_type, group in groups:
    # Shuffle the row labels of the group (fixed seed keeps the split reproducible)
    indices = group.sample(frac=1, random_state=42).index.tolist()

    num_images = len(indices)
    print("Tissue: {}, N_Images: {}".format(tissue_type, num_images))
    print("Indices", indices)

    # 10% validation, 20% test, the remainder is train
    num_validation = int(0.1 * num_images)
    num_test = int(0.2 * num_images)
    val_start = num_images - num_validation - num_test
    test_start = num_images - num_test

    train_indices = indices[:val_start]
    validation_indices = indices[val_start:test_start]
    test_indices = indices[test_start:]

    TrainIndices.extend(train_indices)
    ValIndices.extend(validation_indices)
    TestIndices.extend(test_indices)

    # Copy each image into <split>/1/ (positive tissue) or <split>/0/ (any other tissue)
    for split_indices, split_dir in (
        (train_indices, train_dir),
        (validation_indices, validation_dir),
        (test_indices, test_dir),
    ):
        for index in split_indices:
            source_path = os.path.join(image_dir, "Image_{}.png".format(index))
            # `index` is a row LABEL. Rows were filtered out of `df`, so labels and positions
            # no longer coincide: the lookup must be label-based (.loc), not positional (.iloc).
            class_folder = "1" if df.loc[index, "Types"] == PositiveFeature else "0"
            destination_path = os.path.join(split_dir, "{}/Image_{}.png".format(class_folder, index))
            shutil.copyfile(source_path, destination_path)


Tissue: Adrenal_gland, N_Images: 437
Indices [6605, 1318, 4051, 6295, 6315, 6279, 3991, 1523, 6642, 6612, 6280, 1315, 3998, 6604, 3708, 1316, 6641, 3778, 1313, 3706, 1180, 3728, 1201, 6621, 1545, 1573, 1310, 6579, 6632, 1271, 6633, 6396, 3767, 6297, 6585, 1312, 3815, 1542, 1210, 6281, 6375, 1572, 4007, 1520, 3707, 1204, 3710, 6292, 1317, 4023, 6592, 1186, 6614, 1567, 1171, 1190, 4053, 6597, 1272, 3993, 6368, 1534, 6277, 6274, 6273, 6580, 1196, 6627, 6370, 1554, 1213, 3718, 6594, 6620, 6301, 1565, 6294, 1193, 1217, 1531, 4018, 1557, 6634, 1273, 3690, 1195, 1188, 1306, 4038, 6615, 1202, 1525, 6289, 3726, 1535, 4000, 6638, 3705, 6382, 4045, 6628, 1176, 1216, 3701, 6637, 6586, 4048, 1187, 6613, 1558, 6302, 1174, 1189, 3721, 1276, 6618, 6391, 1303, 1551, 4050, 6377, 1200, 3693, 3823, 4002, 1197, 1178, 3729, 6606, 1549, 6307, 6617, 1559, 1555, 3694, 3821, 6313, 6323, 3810, 6319, 6598, 6310, 6284, 3766, 3811, 1550, 6640, 3698, 4010, 6590, 6607, 1524, 6616, 3725, 1314, 6393, 6596, 1560, 6306, 

In [None]:
# The *Indices lists hold row LABELS of `df` (taken from `.index`), and rows were filtered out of
# `df`, so the selection must be label-based (.loc); positional .iloc would pick the wrong rows.
TrainDF = df.loc[TrainIndices]
ValDF = df.loc[ValIndices]
TestDF = df.loc[TestIndices]

# Persist the label table of each split next to its image folders
TrainDF.to_csv(PATH+"Processed/TissueClassification/Experiment_I/TrainLabels.csv", index=False, encoding='utf-8')
ValDF.to_csv(PATH+"Processed/TissueClassification/Experiment_I/ValLabels.csv", index=False, encoding='utf-8')
TestDF.to_csv(PATH+"Processed/TissueClassification/Experiment_I/TestLabels.csv", index=False, encoding='utf-8')

## Experimento II: Colon (positivo) vs Rest (negativo)

In [None]:
# Tissue type treated as the positive class: its images are copied to folder "1", every other tissue to folder "0".
PositiveFeature = 'Colon'

In [None]:
# Dataset root on the mounted shared drive.
PATH = '/content/gdrive/Shareddrives/IA901 - Projeto Final/Datasets/'
# Make sure the folder that receives the processed splits exists.
os.makedirs(PATH+"Processed/", exist_ok=True)

In [None]:
# Folder holding the interim images that are copied into the splits
image_dir = PATH+"Interim/Images/"

# Destination folder of each split
train_dir = PATH+"Processed/TissueClassification/Experiment_II/train/"
validation_dir = PATH+"Processed/TissueClassification/Experiment_II/val/"
test_dir = PATH+"Processed/TissueClassification/Experiment_II/test/"

# Load the table with the class and tissue type of every image
df = pd.read_csv(PATH+"Interim/ClassAndTypes.csv")

# Drop the rows whose 'Unnamed: 0' id lies in 7526..7537 (corrupted files)
CorruptedIndices = list(np.arange(7526, 7538,1))
is_corrupted = df['Unnamed: 0'].isin(CorruptedIndices)
df = df[~is_corrupted]

# One group of rows per tissue type
groups = df.groupby("Types")

# Every split gets one sub-folder per binary class ("0" and "1")
for split_dir in (train_dir, validation_dir, test_dir):
    os.makedirs(split_dir, exist_ok=True)
    for class_folder in ("0/", "1/"):
        os.makedirs(split_dir+class_folder, exist_ok=True)


In [None]:
# Row labels (index values of `df`) assigned to each split, accumulated over all tissues
TrainIndices = []
ValIndices = []
TestIndices = []

# Split every tissue type separately so each split keeps the same tissue proportions
for tissue_type, group in groups:
    # Shuffle the row labels of the group (fixed seed keeps the split reproducible)
    indices = group.sample(frac=1, random_state=42).index.tolist()

    num_images = len(indices)
    print("Tissue: {}, N_Images: {}".format(tissue_type, num_images))
    print("Indices", indices)

    # 10% validation, 20% test, the remainder is train
    num_validation = int(0.1 * num_images)
    num_test = int(0.2 * num_images)
    val_start = num_images - num_validation - num_test
    test_start = num_images - num_test

    train_indices = indices[:val_start]
    validation_indices = indices[val_start:test_start]
    test_indices = indices[test_start:]

    TrainIndices.extend(train_indices)
    ValIndices.extend(validation_indices)
    TestIndices.extend(test_indices)

    # Copy each image into <split>/1/ (positive tissue) or <split>/0/ (any other tissue)
    for split_indices, split_dir in (
        (train_indices, train_dir),
        (validation_indices, validation_dir),
        (test_indices, test_dir),
    ):
        for index in split_indices:
            source_path = os.path.join(image_dir, "Image_{}.png".format(index))
            # `index` is a row LABEL. Rows were filtered out of `df`, so labels and positions
            # no longer coincide: the lookup must be label-based (.loc), not positional (.iloc).
            class_folder = "1" if df.loc[index, "Types"] == PositiveFeature else "0"
            destination_path = os.path.join(split_dir, "{}/Image_{}.png".format(class_folder, index))
            shutil.copyfile(source_path, destination_path)


Tissue: Adrenal_gland, N_Images: 437
Indices [6605, 1318, 4051, 6295, 6315, 6279, 3991, 1523, 6642, 6612, 6280, 1315, 3998, 6604, 3708, 1316, 6641, 3778, 1313, 3706, 1180, 3728, 1201, 6621, 1545, 1573, 1310, 6579, 6632, 1271, 6633, 6396, 3767, 6297, 6585, 1312, 3815, 1542, 1210, 6281, 6375, 1572, 4007, 1520, 3707, 1204, 3710, 6292, 1317, 4023, 6592, 1186, 6614, 1567, 1171, 1190, 4053, 6597, 1272, 3993, 6368, 1534, 6277, 6274, 6273, 6580, 1196, 6627, 6370, 1554, 1213, 3718, 6594, 6620, 6301, 1565, 6294, 1193, 1217, 1531, 4018, 1557, 6634, 1273, 3690, 1195, 1188, 1306, 4038, 6615, 1202, 1525, 6289, 3726, 1535, 4000, 6638, 3705, 6382, 4045, 6628, 1176, 1216, 3701, 6637, 6586, 4048, 1187, 6613, 1558, 6302, 1174, 1189, 3721, 1276, 6618, 6391, 1303, 1551, 4050, 6377, 1200, 3693, 3823, 4002, 1197, 1178, 3729, 6606, 1549, 6307, 6617, 1559, 1555, 3694, 3821, 6313, 6323, 3810, 6319, 6598, 6310, 6284, 3766, 3811, 1550, 6640, 3698, 4010, 6590, 6607, 1524, 6616, 3725, 1314, 6393, 6596, 1560, 6306, 

In [None]:
# The *Indices lists hold row LABELS of `df` (taken from `.index`), and rows were filtered out of
# `df`, so the selection must be label-based (.loc); positional .iloc would pick the wrong rows.
TrainDF = df.loc[TrainIndices]
ValDF = df.loc[ValIndices]
TestDF = df.loc[TestIndices]

# Persist the label table of each split next to its image folders
TrainDF.to_csv(PATH+"Processed/TissueClassification/Experiment_II/TrainLabels.csv", index=False, encoding='utf-8')
ValDF.to_csv(PATH+"Processed/TissueClassification/Experiment_II/ValLabels.csv", index=False, encoding='utf-8')
TestDF.to_csv(PATH+"Processed/TissueClassification/Experiment_II/TestLabels.csv", index=False, encoding='utf-8')

## Experimento III: 6 tissues vs 6 tissues

### Train/val/Test split

In [None]:
# Tissue types kept for this experiment; the setup cell filters `df` down to rows of these types.
TissueClasses = ['Breast','Colon','Esophagus','HeadNeck','Adrenal_gland','Bile-duct'] # classes

In [None]:
# Folder holding the interim images that are copied into the splits
image_dir = PATH+"Interim/Images/"

# Destination folder of each split
train_dir = PATH+"Processed/TissueClassification/Experiment_III/train/"
validation_dir = PATH+"Processed/TissueClassification/Experiment_III/val/"
test_dir = PATH+"Processed/TissueClassification/Experiment_III/test/"

# Load the table with the class and tissue type of every image
df = pd.read_csv(PATH+"Interim/ClassAndTypes.csv")

# Drop the rows whose 'Unnamed: 0' id lies in 7526..7537 (corrupted files)
CorruptedIndices = list(np.arange(7526, 7538,1))
is_corrupted = df['Unnamed: 0'].isin(CorruptedIndices)
df = df[~is_corrupted]

# Keep only the images of the tissue classes of interest
is_selected_tissue = df['Types'].isin(TissueClasses)
df = df[is_selected_tissue]

# One group of rows per tissue type
groups = df.groupby("Types")

# Root folder of every split
for split_dir in (train_dir, validation_dir, test_dir):
    os.makedirs(split_dir, exist_ok=True)

# One numbered class folder per tissue type (numbered in group iteration order) inside every split
Classcount = 0
for tissue_type, group in groups:
    for split_dir in (train_dir, validation_dir, test_dir):
        os.makedirs(split_dir+"{}/".format(Classcount), exist_ok=True)
    Classcount += 1

In [None]:
# Row labels (index values of `df`) assigned to each split, accumulated over all tissues
TrainIndices = []
ValIndices = []
TestIndices = []

# Numeric class id of the current tissue; it names the destination sub-folder
Classcount = 0
for tissue_type, group in groups:
    # Shuffled row labels of this tissue (fixed seed)
    indices = group.sample(frac=1, random_state=42).index.tolist()

    num_images = len(indices)
    print("Tissue: {}, N_Images: {}".format(tissue_type, num_images))
    print("Indices", indices)
    print('Class ID: ', Classcount)

    # 10% validation, 20% test, the remainder is train
    num_validation = int(0.1 * num_images)
    num_test = int(0.2 * num_images)
    val_start = num_images - num_validation - num_test
    test_start = num_images - num_test

    train_indices = indices[:val_start]
    validation_indices = indices[val_start:test_start]
    test_indices = indices[test_start:]

    TrainIndices.extend(train_indices)
    ValIndices.extend(validation_indices)
    TestIndices.extend(test_indices)

    # Copy each image into <split>/<class id>/, in the order train, validation, test
    for split_indices, split_dir in (
        (train_indices, train_dir),
        (validation_indices, validation_dir),
        (test_indices, test_dir),
    ):
        for index in split_indices:
            source_path = os.path.join(image_dir, "Image_{}.png".format(index))
            destination_path = os.path.join(split_dir, "{}/Image_{}.png".format(Classcount,index))
            shutil.copyfile(source_path, destination_path)

    Classcount += 1


Tissue: Adrenal_gland, N_Images: 437
Indices [6605, 1318, 4051, 6295, 6315, 6279, 3991, 1523, 6642, 6612, 6280, 1315, 3998, 6604, 3708, 1316, 6641, 3778, 1313, 3706, 1180, 3728, 1201, 6621, 1545, 1573, 1310, 6579, 6632, 1271, 6633, 6396, 3767, 6297, 6585, 1312, 3815, 1542, 1210, 6281, 6375, 1572, 4007, 1520, 3707, 1204, 3710, 6292, 1317, 4023, 6592, 1186, 6614, 1567, 1171, 1190, 4053, 6597, 1272, 3993, 6368, 1534, 6277, 6274, 6273, 6580, 1196, 6627, 6370, 1554, 1213, 3718, 6594, 6620, 6301, 1565, 6294, 1193, 1217, 1531, 4018, 1557, 6634, 1273, 3690, 1195, 1188, 1306, 4038, 6615, 1202, 1525, 6289, 3726, 1535, 4000, 6638, 3705, 6382, 4045, 6628, 1176, 1216, 3701, 6637, 6586, 4048, 1187, 6613, 1558, 6302, 1174, 1189, 3721, 1276, 6618, 6391, 1303, 1551, 4050, 6377, 1200, 3693, 3823, 4002, 1197, 1178, 3729, 6606, 1549, 6307, 6617, 1559, 1555, 3694, 3821, 6313, 6323, 3810, 6319, 6598, 6310, 6284, 3766, 3811, 1550, 6640, 3698, 4010, 6590, 6607, 1524, 6616, 3725, 1314, 6393, 6596, 1560, 6306, 

In [None]:
# Sanity check: largest row label that was assigned to the train split.
max(TrainIndices)

7900

In [None]:
# Select the rows of each split by label
TrainDF, ValDF, TestDF = (df.loc[row_labels] for row_labels in (TrainIndices, ValIndices, TestIndices))

# Write one label table per split
for split_frame, csv_path in (
    (TrainDF, PATH+"Processed/TissueClassification/Experiment_III/TrainLabels.csv"),
    (ValDF, PATH+"Processed/TissueClassification/Experiment_III/ValLabels.csv"),
    (TestDF, PATH+"Processed/TissueClassification/Experiment_III/TestLabels.csv"),
):
    split_frame.to_csv(csv_path, index=False, encoding='utf-8')

## Experimento IV: classificação multiclasse com todos os tecidos

### Train/val/Test split

In [None]:
#TissueClasses = ['Breast','Colon','Lung','Kidney','Prostate','Bladder','Stomach','Ovarian','Esophagus','Pancreatic','Uterus','Thyroid','Skin','Cervix','Adrenal_gland','Bile-duct','Testis','HeadNeck','Liver']

In [None]:
# Folder that holds every interim image.
image_dir = PATH+"Interim/Images/"

# Root folder of this experiment.
os.makedirs(PATH+"Processed/TissueClassification/Experiment_IV/", exist_ok=True)

# One output folder per split.
train_dir = PATH+"Processed/TissueClassification/Experiment_IV/train/"
validation_dir = PATH+"Processed/TissueClassification/Experiment_IV/val/"
test_dir = PATH+"Processed/TissueClassification/Experiment_IV/test/"

# Load the label table.
df = pd.read_csv(PATH+"Interim/ClassAndTypes.csv")

# Drop the rows whose 'Unnamed: 0' value falls in the corrupted range 7526..7537.
CorruptedIndices = list(np.arange(7526, 7538, 1))
is_corrupted = df['Unnamed: 0'].isin(CorruptedIndices)
df = df[~is_corrupted]

# One group per tissue type.
groups = df.groupby("Types")

# Create the split folders first ...
for split_dir in (train_dir, validation_dir, test_dir):
    os.makedirs(split_dir, exist_ok=True)

# ... then one numbered class sub-folder per tissue type inside each split.
Classcount = 0
for tissue_type, group in groups:
    print('Tissue: {}, Class: {}'.format(tissue_type, Classcount))
    for split_dir in (train_dir, validation_dir, test_dir):
        os.makedirs(split_dir+"{}/".format(Classcount), exist_ok=True)
    Classcount += 1

In [None]:
# Accumulators with the index labels assigned to each split, across all tissues.
TrainIndices, ValIndices, TestIndices = [], [], []

Classcount = 0
# Each tissue type is split on its own so every split keeps the tissue proportions.
for tissue_type, group in groups:
    # Shuffled index labels of this tissue (fixed seed).
    indices = group.sample(frac=1, random_state=42).index.tolist()

    num_images = len(indices)
    print("Tissue: {}, N_Images: {}".format(tissue_type, num_images))
    print("Indices", indices)
    print('Class ID: ', Classcount)

    # 10% validation, 20% test (both truncated), remainder is train.
    num_validation = int(0.1 * num_images)
    num_test = int(0.2 * num_images)
    val_start = num_images - num_validation - num_test
    test_start = num_images - num_test

    # Consecutive slices of the shuffled list: train | validation | test.
    train_indices = indices[:val_start]
    validation_indices = indices[val_start:test_start]
    test_indices = indices[test_start:]

    TrainIndices.extend(train_indices)
    ValIndices.extend(validation_indices)
    TestIndices.extend(test_indices)

    # Copy every image into <split folder>/<class id>/.
    for split_dir, split_indices in (
        (train_dir, train_indices),
        (validation_dir, validation_indices),
        (test_dir, test_indices),
    ):
        for index in split_indices:
            source_path = os.path.join(image_dir, "Image_{}.png".format(index))
            destination_path = os.path.join(split_dir, "{}/Image_{}.png".format(Classcount,index))
            shutil.copyfile(source_path, destination_path)

    Classcount += 1


Tissue: Adrenal_gland, N_Images: 437
Indices [6605, 1318, 4051, 6295, 6315, 6279, 3991, 1523, 6642, 6612, 6280, 1315, 3998, 6604, 3708, 1316, 6641, 3778, 1313, 3706, 1180, 3728, 1201, 6621, 1545, 1573, 1310, 6579, 6632, 1271, 6633, 6396, 3767, 6297, 6585, 1312, 3815, 1542, 1210, 6281, 6375, 1572, 4007, 1520, 3707, 1204, 3710, 6292, 1317, 4023, 6592, 1186, 6614, 1567, 1171, 1190, 4053, 6597, 1272, 3993, 6368, 1534, 6277, 6274, 6273, 6580, 1196, 6627, 6370, 1554, 1213, 3718, 6594, 6620, 6301, 1565, 6294, 1193, 1217, 1531, 4018, 1557, 6634, 1273, 3690, 1195, 1188, 1306, 4038, 6615, 1202, 1525, 6289, 3726, 1535, 4000, 6638, 3705, 6382, 4045, 6628, 1176, 1216, 3701, 6637, 6586, 4048, 1187, 6613, 1558, 6302, 1174, 1189, 3721, 1276, 6618, 6391, 1303, 1551, 4050, 6377, 1200, 3693, 3823, 4002, 1197, 1178, 3729, 6606, 1549, 6307, 6617, 1559, 1555, 3694, 3821, 6313, 6323, 3810, 6319, 6598, 6310, 6284, 3766, 3811, 1550, 6640, 3698, 4010, 6590, 6607, 1524, 6616, 3725, 1314, 6393, 6596, 1560, 6306, 

In [None]:
# Label rows of each split, selected by index label.
TrainDF, ValDF, TestDF = [
    df.loc[split_rows] for split_rows in (TrainIndices, ValIndices, TestIndices)
]

# Persist one label file per split; the DataFrame index is not written.
csv_by_split = (
    (TrainDF, "Processed/TissueClassification/Experiment_IV/TrainLabels.csv"),
    (ValDF, "Processed/TissueClassification/Experiment_IV/ValLabels.csv"),
    (TestDF, "Processed/TissueClassification/Experiment_IV/TestLabels.csv"),
)
for split_frame, relative_csv in csv_by_split:
    split_frame.to_csv(PATH+relative_csv, index=False, encoding='utf-8')

### Data augmentation nas classes minoritárias

In [None]:
# Applying data augmentation to the minority classes

# Import the libraries needed by the augmentation cells below

from __future__ import print_function, division

import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim import lr_scheduler
import numpy as np
import torchvision
from torchvision import datasets, models, transforms
from torchsummary import summary
import os
import random
import copy
import cv2 as cv

torch.manual_seed(42) # fix the seed of torch's global random generator (Python's `random` module is not seeded in this cell)

<torch._C.Generator at 0x7f2eca044210>

In [None]:
def Load_Image(Path):
    """Read the image file at ``Path`` with OpenCV.

    The value is returned exactly as ``cv.imread`` produces it; no colour
    conversion or validation is applied here.
    """
    return cv.imread(Path)

In [None]:
# Augmentation pipelines. Each one first converts its input to a PIL image
# and then applies a single operation.

size = 200     # first argument of RandomCrop (crop size)
padding = 60   # second argument of RandomCrop (padding added before cropping)

# Horizontal flip applied with probability 1.
HorizontalFlip = transforms.Compose(
    [transforms.ToPILImage(), transforms.RandomHorizontalFlip(p=1.0)]
)

# Vertical flip applied with probability 1.
VerticalFlip = transforms.Compose(
    [transforms.ToPILImage(), transforms.RandomVerticalFlip(p=1.0)]
)

# Random crop taken after reflect-padding the image.
RandomCrop = transforms.Compose(
    [
        transforms.ToPILImage(),
        transforms.RandomCrop(size, padding, padding_mode='reflect'),
    ]
)

In [None]:
# Tissue types treated as minority classes; only their training images are augmented.
MinTissueClasses = ['Colon', 'Esophagus','HeadNeck','Adrenal_gland','Bile-duct']

# Names of the augmentation pipelines, in the order they are applied.
Transformations = ['HorizontalFlip', 'VerticalFlip', 'RandomCrop']

# Explicit name -> pipeline table (replaces the lookup through globals()).
TransformByName = {
    'HorizontalFlip': HorizontalFlip,
    'VerticalFlip': VerticalFlip,
    'RandomCrop': RandomCrop,
}

# Dedicated, seeded generator so the transformation drawn for each Colon image
# is the same on every run (the global `random` module is never seeded).
AugmentationRng = random.Random(42)

# The label file is the same for every tissue, so it is read only once.
TrainLabelsDF = pd.read_csv(PATH+"Processed/TissueClassification/Experiment_III/TrainLabels.csv")

for j in MinTissueClasses:

    # NOTE(review): the split cells in this notebook copy images into folders
    # named after the numeric class id ("{}/Image_{}.png".format(Classcount, index)),
    # not after the tissue name. Confirm that train/<tissue name>/ really exists;
    # otherwise this path must be mapped to the class id used by the split.
    source_path = PATH+"Processed/TissueClassification/Experiment_III/train/{}".format(j)

    # Keep only the training rows of the current tissue.
    tissue_rows = TrainLabelsDF[TrainLabelsDF['Types'] == j]

    # The label CSV is written with index=False, so after re-reading it the
    # DataFrame index is just the row position. The image id used in the file
    # names is taken from the 'Unnamed: 0' column instead.
    # NOTE(review): assumes 'Unnamed: 0' equals the index used to name Image_<id>.png
    # (the corrupted-file filter treats that column as the image id) - confirm.
    for k in tissue_rows['Unnamed: 0']:

        # The files on disk carry the .png extension.
        image_path = os.path.join(source_path, 'Image_{}.png'.format(k))
        image_bgr = Load_Image(image_path)
        if image_bgr is None:
            # cv.imread signals an unreadable path by returning None; fail with
            # a clear message instead of an obscure error inside the transform.
            raise FileNotFoundError("Could not read image: {}".format(image_path))

        # OpenCV decodes to BGR, while ToPILImage interprets arrays as RGB;
        # convert so the saved augmentation keeps the original colours.
        image_rgb = cv.cvtColor(image_bgr, cv.COLOR_BGR2RGB)

        if j == 'Colon':
            # Colon gets a single, randomly chosen transformation per image.
            selected_transformations = [AugmentationRng.choice(Transformations)]
        else:
            # Every other minority class gets all three transformations.
            selected_transformations = Transformations

        for transformation_name in selected_transformations:
            AugmentedImage = TransformByName[transformation_name](image_rgb)

            # Save every augmented image under its own name, inside the class
            # folder. No leading "/" in the file name: os.path.join would
            # otherwise discard source_path and write to the filesystem root.
            destination_path = os.path.join(
                source_path,
                "AugmentedImage_{}_{}.png".format(k, transformation_name),
            )
            AugmentedImage.save(destination_path, format="png")



In [None]:
# # Set the path to the directory containing the images
# image_dir = PATH+"Interim/Images/"

# AugmentedInterimPath = PATH+"Interim/AugmentedData/"
# # Create the output directory
# os.makedirs(AugmentedInterimPath, exist_ok=True)

# # Read the CSV file
# df = pd.read_csv(PATH+"Processed/TissueClassification/Experiment_III/TrainLabels.csv")

# # Safeguards for corrupted files
# CorruptedIndices = list(np.arange(7526, 7538,1))
# df = df[~df['Unnamed: 0'].isin(CorruptedIndices)]

# # Selecionando apenas imagens das classes de interesse:
# df = df[df['Types'].isin(TissueClasses)]

# # Group the images by tissue type
# groups = df.groupby("Types")


In [None]:
# # Applying augmentation and saving images

# AugmentedDataset = []
# IndexCounter = 0
# Transformations = ['HorizontalFlip', 'VerticalFlip', 'RandomCrop']
# # Iterate over the tissue types
# for tissue_type, group in groups:

#   print('Tissue processing: ', tissue_type)
#   indices = group.sample(frac=1, random_state=42).index.tolist()

#   for index in indices:

#     source_path = os.path.join(image_dir, "Image_{}.png".format(index))
#     destination_path = os.path.join(AugmentedInterimPath, "Image_{}.png".format(IndexCounter))

#     # Salvando imagem original
#     shutil.copyfile(source_path, destination_path)

#     if tissue_type!='Breast' and tissue_type!='Colon':

#       Image = Load_Image(source_path)
#       AugmentedDataset.append({'Index':IndexCounter, 'Type':tissue_type,'IsAugmented':0})

#       for i in range(0,3):
#         IndexCounter = IndexCounter + 1
#         AugmentedDataset.append({'Index': IndexCounter, 'Type':tissue_type,'IsAugmented':1})
#         AugmentedImage = globals()[str(Transformations[i])](Image)

#         # Saving transformed image
#         destination_path = os.path.join(AugmentedInterimPath, "Image_{}.png".format(IndexCounter))
#         AugmentedImage.save(destination_path, format="png")
#         # HorizontalImage = HorizontalFlip(Image)
#         # VerticalImage = VerticalFlip(Image)
#         # RandomCropImage = RandomCrop(Image)

#     else:
#       AugmentedDataset.append({'Index':IndexCounter, 'Type':tissue_type,'IsAugmented':0})

#     IndexCounter = IndexCounter+1


# Cell counting

In [None]:
# Folder that holds every interim image.
image_dir = PATH+"Interim/Images/"

# Root folder of the cell-counting task.
os.makedirs(PATH+"Processed/CellCounting/", exist_ok=True)

# One output folder per split.
train_dir = PATH+"Processed/CellCounting/train/"
validation_dir = PATH+"Processed/CellCounting/val/"
test_dir = PATH+"Processed/CellCounting/test/"

# Load the label table.
df = pd.read_csv(PATH+"Interim/ClassAndTypes.csv")

# Drop the rows whose 'Unnamed: 0' value falls in the corrupted range 7526..7537.
CorruptedIndices = list(np.arange(7526, 7538, 1))
is_corrupted = df['Unnamed: 0'].isin(CorruptedIndices)
df = df[~is_corrupted]

# One group per tissue type.
groups = df.groupby("Types")

# Make sure the three split folders exist.
for split_dir in (train_dir, validation_dir, test_dir):
    os.makedirs(split_dir, exist_ok=True)


In [None]:
# Accumulators with the index labels assigned to each split, across all tissues.
TrainIndices, ValIndices, TestIndices = [], [], []

# Each tissue type is split on its own so every split keeps the tissue proportions.
for tissue_type, group in groups:
    # Shuffled index labels of this tissue (fixed seed).
    indices = group.sample(frac=1, random_state=42).index.tolist()

    num_images = len(indices)
    print("Tissue: {}, N_Images: {}".format(tissue_type, num_images))
    print("Indices", indices)

    # 10% validation, 20% test (both truncated), remainder is train.
    num_validation = int(0.1 * num_images)
    num_test = int(0.2 * num_images)
    val_start = num_images - num_validation - num_test
    test_start = num_images - num_test

    # Consecutive slices of the shuffled list: train | validation | test.
    train_indices = indices[:val_start]
    validation_indices = indices[val_start:test_start]
    test_indices = indices[test_start:]

    TrainIndices.extend(train_indices)
    ValIndices.extend(validation_indices)
    TestIndices.extend(test_indices)

    # Copy every image straight into its split folder (no class sub-folders here).
    for split_dir, split_indices in (
        (train_dir, train_indices),
        (validation_dir, validation_indices),
        (test_dir, test_indices),
    ):
        for index in split_indices:
            source_path = os.path.join(image_dir, "Image_{}.png".format(index))
            destination_path = os.path.join(split_dir, "Image_{}.png".format(index))
            shutil.copyfile(source_path, destination_path)


Tissue: Adrenal_gland, N_Images: 437
Indices [6605, 1318, 4051, 6295, 6315, 6279, 3991, 1523, 6642, 6612, 6280, 1315, 3998, 6604, 3708, 1316, 6641, 3778, 1313, 3706, 1180, 3728, 1201, 6621, 1545, 1573, 1310, 6579, 6632, 1271, 6633, 6396, 3767, 6297, 6585, 1312, 3815, 1542, 1210, 6281, 6375, 1572, 4007, 1520, 3707, 1204, 3710, 6292, 1317, 4023, 6592, 1186, 6614, 1567, 1171, 1190, 4053, 6597, 1272, 3993, 6368, 1534, 6277, 6274, 6273, 6580, 1196, 6627, 6370, 1554, 1213, 3718, 6594, 6620, 6301, 1565, 6294, 1193, 1217, 1531, 4018, 1557, 6634, 1273, 3690, 1195, 1188, 1306, 4038, 6615, 1202, 1525, 6289, 3726, 1535, 4000, 6638, 3705, 6382, 4045, 6628, 1176, 1216, 3701, 6637, 6586, 4048, 1187, 6613, 1558, 6302, 1174, 1189, 3721, 1276, 6618, 6391, 1303, 1551, 4050, 6377, 1200, 3693, 3823, 4002, 1197, 1178, 3729, 6606, 1549, 6307, 6617, 1559, 1555, 3694, 3821, 6313, 6323, 3810, 6319, 6598, 6310, 6284, 3766, 3811, 1550, 6640, 3698, 4010, 6590, 6607, 1524, 6616, 3725, 1314, 6393, 6596, 1560, 6306, 

In [None]:
# Label rows of each split, selected by index label.
TrainDF, ValDF, TestDF = (
    df.loc[split_rows] for split_rows in (TrainIndices, ValIndices, TestIndices)
)

# Write one label file per split; the DataFrame index is not written.
for split_frame, relative_csv in (
    (TrainDF, "Processed/CellCounting/TrainLabels.csv"),
    (ValDF, "Processed/CellCounting/ValLabels.csv"),
    (TestDF, "Processed/CellCounting/TestLabels.csv"),
):
    split_frame.to_csv(PATH+relative_csv, index=False, encoding='utf-8')