# Creation of Training, Test and Validation Sets Based on Session ID

In this notebook, we assume that the image paths contain the 'session ids'.

A session contains multiple images from the same patient and from the same dermago request. 

Here, we want to create the train, test and validation sets by making sure that a session id is not in two or more sets at the same time. 

In [1]:
import pandas as pd
import os
import numpy as np

In [25]:
# Load the labelled image list. The path is a raw string so that the backslashes of the
# Windows path are taken literally instead of being parsed as (invalid) escape sequences.
df = pd.read_csv(r'D:\oro\GCP_stuff\dataset_1.1.csv', index_col=0).reset_index(drop=True)

# 1. Filter Dataset

Create the dataset by removing images with irrelevant classes.

In [26]:
df

Unnamed: 0,pathBucketImage,labelledby,reviewedby,Mole,No_mole,dateInserted,isresized,ratio,labels
0,resized/11017/AF175AAF9D/ailment_photos/AP-0c5...,Marc-Andre Dore,,True,False,2020-12-23,True,0.740573,Mole
1,resized/11017/AF175AAF9D/ailment_photos/AP-89e...,Marc-Andre Dore,,True,False,2020-12-23,True,0.692921,Mole
2,resized/112/625828998E/ailment_photos/AP-3ab67...,Marc-Andre Dore,,True,False,2020-12-23,True,0.587014,Mole
3,resized/112/625828998E/ailment_photos/AP-eee10...,Marc-Andre Dore,,True,False,2020-12-23,True,0.801559,Mole
4,resized/112/625828998E/ailment_photos/AP-6c400...,Marc-Andre Dore,,True,False,2020-12-23,True,0.819874,Mole
...,...,...,...,...,...,...,...,...,...
24967,resized/16868/E0236C2C08/ailment_photos/AP-b77...,Marc-Andre Dore,,False,True,2020-12-23,True,0.742566,No_mole
24968,resized/16870/1CC8EB0FD9/ailment_photos/AP-24f...,Marc-Andre Dore,,False,True,2020-12-23,True,0.730489,No_mole
24969,resized/16868/E0236C2C08/ailment_photos/AP-c2f...,Marc-Andre Dore,,False,True,2020-12-23,True,0.862823,No_mole
24970,resized/16871/5DE6DB1A51/ailment_photos/AP-144...,Marc-Andre Dore,,False,True,2020-12-23,True,0.868702,No_mole


In [17]:
#classes= ['acne_mixed','acne_scars','atopic_dermatitis','acne_cystic','acne_excoriated','rosacea_inflammatory','rosacea_erythemato_telangiectasique','acne_comedos','peri_oral_dermatitis','seborrheic_keratosis','psoriasis_vulgar','seborrheic_dermatitis','nummular_eczema','tinea_versicolor','chronic_hand_eczema','vulgar_warts','folliculitis','alopecia_androgenic','dyshidrosis','nevus','melasma','alopecia_areata','intertrigo','urticaria','vitiligo','keratosis_pilaris','molluscum','cheilitis_eczematous','tinea_corporis','prurigo_nodularis','actinic_keratosis','genital_warts','plane_warts','pityriasis_rosae','melanonychia','psoriasis_pustular_palmoplantar','granuloma_annulare','psoriasis_guttate','lichen_simplex_chronicus','shingles','herpes_simplex', 'no_disease']
#all_classes=  ['unspecified','other','rosacea_rhinophyma','acne_scars','lichen_planus','psoriasis_inverse','chelitis_actinic','acne_mixed','atopic_dermatitis','acne_cystic','acne_excoriated','rosacea_inflammatory','rosacea_erythemato_telangiectasique','acne_comedos','peri_oral_dermatitis','seborrheic_keratosis','psoriasis_vulgar','seborrheic_dermatitis','nummular_eczema','tinea_versicolor','chronic_hand_eczema','vulgar_warts','folliculitis','alopecia_androgenic','dyshidrosis','nevus','melasma','alopecia_areata','intertrigo','urticaria','vitiligo','keratosis_pilaris','molluscum','cheilitis_eczematous','tinea_corporis','prurigo_nodularis','actinic_keratosis','genital_warts','plane_warts','pityriasis_rosae','melanonychia','psoriasis_pustular_palmoplantar','granuloma_annulare','psoriasis_guttate','lichen_simplex_chronicus','shingles','herpes_simplex', 'no_disease']

# Columns holding the one-hot class flags.
classes = [
    'Mole',
    'No_mole',
]
# Despite its name, this lists every column of the CSV: the class flags plus the metadata.
all_classes = [
    'pathBucketImage',
    'labelledby',
    'reviewedby',
    'Mole',
    'No_mole',
    'dateInserted',
    'isresized',
    'ratio',
    'labels',
]

In [18]:
len(classes)

2

In [19]:
set(all_classes) - set(classes)

{'dateInserted',
 'isresized',
 'labelledby',
 'labels',
 'pathBucketImage',
 'ratio',
 'reviewedby'}

In [20]:
# Columns that are not class flags. The list is built in `all_classes` order rather than
# through a set difference, so its content and order are identical on every run.
unused_classes = [column for column in all_classes if column not in classes]

In [35]:
unused_classes

['reviewedby',
 'pathBucketImage',
 'labelledby',
 'ratio',
 'dateInserted',
 'labels',
 'isresized']

In [32]:
df.loc[(df[classes].sum(axis=1) == 1), :]

Unnamed: 0,pathBucketImage,labelledby,reviewedby,Mole,No_mole,dateInserted,isresized,ratio,labels
0,resized/11017/AF175AAF9D/ailment_photos/AP-0c5...,Marc-Andre Dore,,True,False,2020-12-23,True,0.740573,Mole
1,resized/11017/AF175AAF9D/ailment_photos/AP-89e...,Marc-Andre Dore,,True,False,2020-12-23,True,0.692921,Mole
2,resized/112/625828998E/ailment_photos/AP-3ab67...,Marc-Andre Dore,,True,False,2020-12-23,True,0.587014,Mole
3,resized/112/625828998E/ailment_photos/AP-eee10...,Marc-Andre Dore,,True,False,2020-12-23,True,0.801559,Mole
4,resized/112/625828998E/ailment_photos/AP-6c400...,Marc-Andre Dore,,True,False,2020-12-23,True,0.819874,Mole
...,...,...,...,...,...,...,...,...,...
24967,resized/16868/E0236C2C08/ailment_photos/AP-b77...,Marc-Andre Dore,,False,True,2020-12-23,True,0.742566,No_mole
24968,resized/16870/1CC8EB0FD9/ailment_photos/AP-24f...,Marc-Andre Dore,,False,True,2020-12-23,True,0.730489,No_mole
24969,resized/16868/E0236C2C08/ailment_photos/AP-c2f...,Marc-Andre Dore,,False,True,2020-12-23,True,0.862823,No_mole
24970,resized/16871/5DE6DB1A51/ailment_photos/AP-144...,Marc-Andre Dore,,False,True,2020-12-23,True,0.868702,No_mole


In [33]:
# Keep only the rows where exactly one class flag is set. The explicit .copy() makes the
# result an independent frame, so adding columns to it later does not act on a slice.
df = df.loc[df[classes].sum(axis=1) == 1, :].copy()

In [34]:
# The label is the string form of the list of active class names, e.g. "['Mole']".
# assign() returns a new frame, so the column is never set on a filtered slice
# (which is what triggers pandas' SettingWithCopyWarning).
df = df.assign(
    label=df.apply(lambda row: str(row[classes][row[classes]].index.tolist()), axis=1)
)
df

In [38]:
# The image path column is called 'filename' from here on.
df = df.rename({'pathBucketImage': 'filename'}, axis='columns')

Unnamed: 0,filename,labelledby,reviewedby,Mole,No_mole,dateInserted,isresized,ratio,labels,label
0,resized/11017/AF175AAF9D/ailment_photos/AP-0c5...,Marc-Andre Dore,,True,False,2020-12-23,True,0.740573,Mole,['Mole']
1,resized/11017/AF175AAF9D/ailment_photos/AP-89e...,Marc-Andre Dore,,True,False,2020-12-23,True,0.692921,Mole,['Mole']
2,resized/112/625828998E/ailment_photos/AP-3ab67...,Marc-Andre Dore,,True,False,2020-12-23,True,0.587014,Mole,['Mole']
3,resized/112/625828998E/ailment_photos/AP-eee10...,Marc-Andre Dore,,True,False,2020-12-23,True,0.801559,Mole,['Mole']
4,resized/112/625828998E/ailment_photos/AP-6c400...,Marc-Andre Dore,,True,False,2020-12-23,True,0.819874,Mole,['Mole']
...,...,...,...,...,...,...,...,...,...,...
24967,resized/16868/E0236C2C08/ailment_photos/AP-b77...,Marc-Andre Dore,,False,True,2020-12-23,True,0.742566,No_mole,['No_mole']
24968,resized/16870/1CC8EB0FD9/ailment_photos/AP-24f...,Marc-Andre Dore,,False,True,2020-12-23,True,0.730489,No_mole,['No_mole']
24969,resized/16868/E0236C2C08/ailment_photos/AP-c2f...,Marc-Andre Dore,,False,True,2020-12-23,True,0.862823,No_mole,['No_mole']
24970,resized/16871/5DE6DB1A51/ailment_photos/AP-144...,Marc-Andre Dore,,False,True,2020-12-23,True,0.868702,No_mole,['No_mole']


In [40]:
# Drop the metadata columns, keeping the image path, the class flags and the label.
# 'pathBucketImage' has been renamed to 'filename', so it is taken out of the drop list;
# the membership test keeps this cell safe to run more than once.
if 'pathBucketImage' in unused_classes:
    unused_classes.remove('pathBucketImage')
# errors='ignore' lets a re-run succeed when the columns have already been dropped.
df = df.drop(columns=unused_classes, errors='ignore')

In [41]:
df

Unnamed: 0,filename,Mole,No_mole,label
0,resized/11017/AF175AAF9D/ailment_photos/AP-0c5...,True,False,['Mole']
1,resized/11017/AF175AAF9D/ailment_photos/AP-89e...,True,False,['Mole']
2,resized/112/625828998E/ailment_photos/AP-3ab67...,True,False,['Mole']
3,resized/112/625828998E/ailment_photos/AP-eee10...,True,False,['Mole']
4,resized/112/625828998E/ailment_photos/AP-6c400...,True,False,['Mole']
...,...,...,...,...
24967,resized/16868/E0236C2C08/ailment_photos/AP-b77...,False,True,['No_mole']
24968,resized/16870/1CC8EB0FD9/ailment_photos/AP-24f...,False,True,['No_mole']
24969,resized/16868/E0236C2C08/ailment_photos/AP-c2f...,False,True,['No_mole']
24970,resized/16871/5DE6DB1A51/ailment_photos/AP-144...,False,True,['No_mole']


In [42]:
# Discard the images whose path mentions 'signature' or 'avatar'.
searchfor = ['signature', 'avatar']
unwanted_pattern = '|'.join(searchfor)
is_unwanted = df['filename'].str.contains(unwanted_pattern)
df = df.loc[~is_unwanted].reset_index(drop=True)

In [43]:
# Last path component of every image, then the ones that occur more than once.
short_filenames = df['filename'].map(lambda path: path.rsplit('/', 1)[-1])
short_filenames[short_filenames.duplicated()]

22672    Screenshot_20201008_122532_comandroidchrome.jpg
Name: filename, dtype: object

In [15]:
# Warn when two images share the same file name (last path component).
short_filenames = df['filename'].map(lambda path: path.rsplit('/', 1)[-1])
duplicated = short_filenames[short_filenames.duplicated()]
if len(duplicated) > 0:
    print(f'WARNING - There are duplicates in the dataset: { duplicated.values}')



# 2. Create Sessions

In [44]:
# Split each path once. Component 1 becomes the patient id, and components 1 and 2
# together ('<patient>/<code>') become the session id.
path_parts = df['filename'].map(lambda path: path.split('/'))
df['session'] = path_parts.map(lambda parts: parts[1] + '/' + parts[2])
df['patient_id'] = path_parts.map(lambda parts: parts[1])

In [45]:
df

Unnamed: 0,filename,Mole,No_mole,label,session,patient_id
0,resized/11017/AF175AAF9D/ailment_photos/AP-0c5...,True,False,['Mole'],11017/AF175AAF9D,11017
1,resized/11017/AF175AAF9D/ailment_photos/AP-89e...,True,False,['Mole'],11017/AF175AAF9D,11017
2,resized/112/625828998E/ailment_photos/AP-3ab67...,True,False,['Mole'],112/625828998E,112
3,resized/112/625828998E/ailment_photos/AP-eee10...,True,False,['Mole'],112/625828998E,112
4,resized/112/625828998E/ailment_photos/AP-6c400...,True,False,['Mole'],112/625828998E,112
...,...,...,...,...,...,...
24928,resized/16868/E0236C2C08/ailment_photos/AP-b77...,False,True,['No_mole'],16868/E0236C2C08,16868
24929,resized/16870/1CC8EB0FD9/ailment_photos/AP-24f...,False,True,['No_mole'],16870/1CC8EB0FD9,16870
24930,resized/16868/E0236C2C08/ailment_photos/AP-c2f...,False,True,['No_mole'],16868/E0236C2C08,16868
24931,resized/16871/5DE6DB1A51/ailment_photos/AP-144...,False,True,['No_mole'],16871/5DE6DB1A51,16871


In [46]:
# Count distinct patients, sessions and images (missing values count as a value, as before).
n_patients = df['patient_id'].nunique(dropna=False)
n_sessions = df['session'].nunique(dropna=False)
n_images = df['filename'].nunique(dropna=False)
print(f"There are {n_patients} unique patients." )
print(f"There are {n_sessions} unique sessions" )
print(f"There are {n_images} unique images" )

There are 6948 unique patients.
There are 7501 unique sessions
There are 24933 unique images


In [47]:
# Number of distinct labels seen in each session, sessions with the most labels first.
session_label_pairs = df[['session', 'label']].drop_duplicates()
df_group = (
    session_label_pairs
    .groupby('session')
    .count()
    .sort_values(by='label', ascending=False)
)

In [48]:
df_group

Unnamed: 0_level_0,label
session,Unnamed: 1_level_1
1126/51DEDC9314,2
9561/087B2A2799,2
13671/7DE670B053,2
9577/47FB969D46,2
1126/0D3B5B58C4,2
...,...
15585/B8D89055B7,1
15583/A8432F8163,1
15581/11D607B0E0,1
15573/9A3A3E5A94,1


In [49]:
# Sessions with exactly one distinct label versus all sessions.
has_single_label = df_group['label'] == 1
n_single_label = df_group.loc[has_single_label, 'label'].sum()
n_sessions_total = len(df['session'].unique())
print(f"There are {n_single_label} sessions that have all the same label for all images")
print(f"There are {n_sessions_total - n_single_label} sessions that have various labels")

There are 7460 sessions that have all the same label for all images
There are 41 sessions that have various labels


Let's look at a few sessions whose images carry more than one label.

In [50]:
# Print the rows of (at most) the first seven sessions that have more than one label.
multi_label_sessions = df_group.index[df_group['label'] != 1]
for session in multi_label_sessions[:7]:
    print('Muliple labels')
    print(df.loc[df['session'] == session, ['session', 'label']])
    

Muliple labels
               session        label
6      1126/51DEDC9314     ['Mole']
15     1126/51DEDC9314     ['Mole']
3133   1126/51DEDC9314  ['No_mole']
3240   1126/51DEDC9314  ['No_mole']
21242  1126/51DEDC9314  ['No_mole']
Muliple labels
               session        label
343    9561/087B2A2799     ['Mole']
18951  9561/087B2A2799  ['No_mole']
19100  9561/087B2A2799  ['No_mole']
Muliple labels
               session        label
54    13671/7DE670B053     ['Mole']
5245  13671/7DE670B053  ['No_mole']
5300  13671/7DE670B053  ['No_mole']
5330  13671/7DE670B053  ['No_mole']
5355  13671/7DE670B053  ['No_mole']
5426  13671/7DE670B053  ['No_mole']
Muliple labels
               session        label
344    9577/47FB969D46     ['Mole']
19002  9577/47FB969D46  ['No_mole']
19052  9577/47FB969D46  ['No_mole']
19165  9577/47FB969D46  ['No_mole']
24758  9577/47FB969D46  ['No_mole']
Muliple labels
               session        label
13     1126/0D3B5B58C4     ['Mole']
14     1126/0D3B5B58C4   

In [23]:
len(['no_disease', 'acne_mixed','acne_scars','atopic_dermatitis','acne_cystic','acne_excoriated','rosacea_inflammatory','rosacea_erythemato_telangiectasique','acne_comedos','peri_oral_dermatitis','seborrheic_keratosis','psoriasis_vulgar','seborrheic_dermatitis','nummular_eczema','tinea_versicolor','chronic_hand_eczema','vulgar_warts','folliculitis','alopecia_androgenic','dyshidrosis','nevus','melasma','alopecia_areata','intertrigo','urticaria','vitiligo','keratosis_pilaris','molluscum','cheilitis_eczematous','tinea_corporis','prurigo_nodularis','actinic_keratosis','genital_warts','plane_warts','pityriasis_rosae','melanonychia','psoriasis_pustular_palmoplantar','granuloma_annulare','psoriasis_guttate','lichen_simplex_chronicus','shingles','herpes_simplex'])

42

# 3. Sample Dataset

Since there are far more 'No_mole' images than 'Mole' images, we need to downsample the 'No_mole' class.



In [52]:
# Reload the raw CSV and keep only the rows where exactly one of the classes is True.
classes = ['Mole', 'No_mole']
# Raw string: the backslashes of the Windows path must not be parsed as escape sequences.
df = pd.read_csv(r'D:\oro\GCP_stuff\dataset_1.1.csv', index_col=0).reset_index(drop=True)
# The filter is applied once (it used to be repeated). .copy() makes the result an
# independent frame, so the columns added in the next cells are not set on a slice.
df = df.loc[df[classes].sum(axis=1) == 1, :].copy()

In [53]:
df[classes].idxmax(axis=1)

0           Mole
1           Mole
2           Mole
3           Mole
4           Mole
          ...   
24967    No_mole
24968    No_mole
24969    No_mole
24970    No_mole
24971    No_mole
Length: 24972, dtype: object

In [54]:
# Label every row with the name of its True class column; assign() returns a new frame,
# so the column is not set on a filtered slice.
df = df.assign(label=df[classes].idxmax(axis=1))
df = df.rename(columns={'pathBucketImage': 'filename'})
# Drop only the metadata columns that are really present. 'pathBucketImage' has just been
# renamed, so this no longer depends on an earlier cell having removed that name from
# `unused_classes`.
df = df.drop(columns=[column for column in unused_classes if column in df.columns])
df['session'] =  df['filename'].apply(lambda x: x.split('/')[1]+ '/' + x.split('/')[2])
df['patient_id'] =  df['filename'].apply(lambda x: x.split('/')[1])
# Remove the images named as 'signature' or 'avatar'.
searchfor = ['signature', 'avatar']
df = df[~df['filename'].str.contains('|'.join(searchfor))].reset_index(drop=True)


In [55]:
df

Unnamed: 0,filename,Mole,No_mole,label,session,patient_id
0,resized/11017/AF175AAF9D/ailment_photos/AP-0c5...,True,False,Mole,11017/AF175AAF9D,11017
1,resized/11017/AF175AAF9D/ailment_photos/AP-89e...,True,False,Mole,11017/AF175AAF9D,11017
2,resized/112/625828998E/ailment_photos/AP-3ab67...,True,False,Mole,112/625828998E,112
3,resized/112/625828998E/ailment_photos/AP-eee10...,True,False,Mole,112/625828998E,112
4,resized/112/625828998E/ailment_photos/AP-6c400...,True,False,Mole,112/625828998E,112
...,...,...,...,...,...,...
24928,resized/16868/E0236C2C08/ailment_photos/AP-b77...,False,True,No_mole,16868/E0236C2C08,16868
24929,resized/16870/1CC8EB0FD9/ailment_photos/AP-24f...,False,True,No_mole,16870/1CC8EB0FD9,16870
24930,resized/16868/E0236C2C08/ailment_photos/AP-c2f...,False,True,No_mole,16868/E0236C2C08,16868
24931,resized/16871/5DE6DB1A51/ailment_photos/AP-144...,False,True,No_mole,16871/5DE6DB1A51,16871


In [52]:
# we only want to keep the images labeled as 'no_disease' that have all the same label per session
###########

#df_no_disease = pd.read_csv('no_disease.csv', index_col=0)


In [49]:
#filename_all = set(df[df['label']=='no_disease']['filename'])
#filename_to_keep = set(df_no_disease['filename'])

In [52]:
#filename_to_remove_ = filename_all.difference(filename_to_keep)

In [54]:
#df= df[~df['filename'].isin(filename_to_remove_)].reset_index(drop=True)

In [56]:
# Downsampling helper used to reduce the size of over-represented classes.
def sample_per_disease(df, disease, n, random_state=42):
    """Downsample the rows flagged with `disease` so that at most `n` of them remain.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a boolean column named `disease`. It is modified in place.
    disease : str
        Name of the class column to downsample.
    n : int
        Number of rows of that class to keep.
    random_state : int, default 42
        Seed used to choose which rows are removed.

    Returns
    -------
    pandas.DataFrame
        The same frame, with the surplus rows dropped and the index reset to 0..len-1.
        If the class has `n` rows or fewer, no row is removed.
    """
    disease_rows = df[df[disease] == True]
    n_surplus = disease_rows.shape[0] - n
    # Without this guard a class that is already small enough would ask sample()
    # for a negative number of rows and raise.
    if n_surplus > 0:
        idx = disease_rows.sample(n=n_surplus, random_state=random_state).index
        df.drop(idx, axis=0, inplace=True)
    df.reset_index(drop=True, inplace=True)
    return df

In [58]:
# Keep 1700 'No_mole' images.
df = sample_per_disease(df, disease='No_mole', n=1700) # 1488

In [59]:
df

Unnamed: 0,filename,Mole,No_mole,label,session,patient_id
0,resized/11017/AF175AAF9D/ailment_photos/AP-0c5...,True,False,Mole,11017/AF175AAF9D,11017
1,resized/11017/AF175AAF9D/ailment_photos/AP-89e...,True,False,Mole,11017/AF175AAF9D,11017
2,resized/112/625828998E/ailment_photos/AP-3ab67...,True,False,Mole,112/625828998E,112
3,resized/112/625828998E/ailment_photos/AP-eee10...,True,False,Mole,112/625828998E,112
4,resized/112/625828998E/ailment_photos/AP-6c400...,True,False,Mole,112/625828998E,112
...,...,...,...,...,...,...
2146,resized/9324/8825B7F16A/ailment_photos/AP-fb2b...,False,True,No_mole,9324/8825B7F16A,9324
2147,resized/9340/E133B62547/ailment_photos/AP-9ec7...,False,True,No_mole,9340/E133B62547,9340
2148,resized/9362/5366E8E4F4/ailment_photos/AP-2ead...,False,True,No_mole,9362/5366E8E4F4,9362
2149,resized/9712/6DFF645764/ailment_photos/AP-5f31...,False,True,No_mole,9712/6DFF645764,9712


In [63]:
#df = sample_per_disease(df, 'no_disease', 1700) #1630

In [64]:
#df = sample_per_disease(df, 'atopic_dermatitis', 1300)

In [60]:
# Size of the sampled dataset.
n_images = len(df['label'])
n_patients = df['patient_id'].nunique(dropna=False)
n_sessions = df['session'].nunique(dropna=False)
print(f"There are {n_images} images in the final dataset.")
print(f"There are {n_patients} unique patients." )
print(f"There are {n_sessions} unique sessions" )

There are 2151 images in the final dataset.
There are 1608 unique patients.
There are 1674 unique sessions


Here is the label distribution of the new dataset:

In [61]:
# create final sets
df['label'].value_counts()

No_mole    1700
Mole        451
Name: label, dtype: int64

# 4. Create Final Sets

Here we create the final test, train and validation set.

We make sure that no session id appears in more than one of the sets.

In [62]:
from sklearn.model_selection import StratifiedGroupKFold

In [63]:
# Take the first of six folds, grouped by session and stratified by label.
cv = StratifiedGroupKFold(n_splits=6)
fold_iterator = cv.split(df[classes], df['label'], df['session'])
train_idxs, test_idxs = next(fold_iterator)

In [64]:
# split() returns positional row numbers, so the rows are selected with .iloc;
# .loc would look the numbers up as index labels.
df_train = df.iloc[train_idxs].copy().reset_index(drop=True)
df_test = df.iloc[test_idxs].copy().reset_index(drop=True)

In [65]:
# Hold out the first of nine folds of the training portion as the validation set,
# again grouped by session and stratified by label.
cv = StratifiedGroupKFold(n_splits=9)
train_idxs, test_idxs = next(cv.split(df_train[classes], df_train['label'], df_train['session']))
# The indices are positional, hence .iloc. df_val has to be extracted before df_train
# is overwritten with its reduced version.
df_val = df_train.iloc[test_idxs].copy().reset_index(drop=True)
df_train = df_train.iloc[train_idxs].copy().reset_index(drop=True)

In [66]:
# Number of images in each of the three sets.
n_test, n_train, n_val = len(df_test), len(df_train), len(df_val)
print(f"There are {n_test} images in  the test set.")
print(f"There are {n_train} images in  the training set." )
print(f"There are {n_val} images in  the validation set." )

There are 360 images in  the test set.
There are 1592 images in  the training set.
There are 199 images in  the validation set.


In [67]:
def get_number_classes_per_set(df, name='train'):
    """Print a warning if `df` holds fewer distinct labels than there are `classes`.

    `df` needs a 'label' column; `name` is only used in the message. The expected
    number of classes is the length of the notebook-level `classes` list.
    Nothing is returned.
    """
    labels_present = df['label'].value_counts()
    if len(classes) > labels_present.size:
        print(f"WARNING- There are {labels_present.size} classes in  the {name} set.")

# Check the three sets in the same order as before: train, test, val.
for subset, subset_name in ((df_train, 'train'), (df_test, 'test'), (df_val, 'val')):
    get_number_classes_per_set(subset, name=subset_name)

Here are the distributions of the three sets:

In [68]:
df_train['label'].value_counts()

No_mole    1259
Mole        333
Name: label, dtype: int64

Now, we need to make sure that there is no intersection between the three sets in terms of session ids.

In [69]:
def _n_shared(first, second, column):
    """Return how many distinct values of `column` occur in both frames."""
    return len(set(first[column]) & set(second[column]))

# Image-level overlap between the sets.
print(f"Intersection between test and train: {_n_shared(df_test, df_train, 'filename')}")
print(f"Intersection between test and val: {_n_shared(df_test, df_val, 'filename')}")
print(f"Intersection between val and train: {_n_shared(df_val, df_train, 'filename')}")

# Session-level overlap. The val/train check compares session ids on both sides; it used
# to compare val sessions with train filenames, which can never match.
print(f"Intersection between test and train in terms of session ids: {_n_shared(df_test, df_train, 'session')}")
print(f"Intersection between test and val in terms of session ids: {_n_shared(df_test, df_val, 'session')}")
print(f"Intersection between val and train in terms of session ids: {_n_shared(df_val, df_train, 'session')}")

Intersection between test and train: 0
Intersection between test and val: 0
Intersection between val and train: 0
Intersection between test and train in terms of session ids: 0
Intersection between test and val in terms of session ids: 0
Intersection between val and train in terms of session ids: 0


In [101]:
# Multiplying by 1 turns the boolean class flags into 0/1 before writing the CSV.
df_train.mul(1).to_csv('train_set_initial.csv')

In [102]:
# Multiplying by 1 turns the boolean class flags into 0/1 before writing the CSV.
df_test.mul(1).to_csv('test_set_initial.csv')

In [103]:
# Multiplying by 1 turns the boolean class flags into 0/1 before writing the CSV.
df_val.mul(1).to_csv('validation_set.csv')

Removing tumour images from the validation set

In [14]:
# Load validation_set_additional.csv.
# NOTE(review): no index_col is passed, so the index column stored in the file comes back
# as a regular 'Unnamed: 0' column (visible in the frame shown below) - confirm this is intended.
df = pd.read_csv('D:/oro/GCP_stuff/validation_set_additional.csv')

In [15]:
df['label'].value_counts()

Mole       163
No_mole    157
Name: label, dtype: int64

In [16]:
# Keep only the rows whose path does not contain 'tumours'.
is_tumour = df['filename'].str.contains('tumours')
df = df.loc[~is_tumour].reset_index(drop=True)

In [17]:
df

Unnamed: 0.1,Unnamed: 0,filename,Mole,No_mole,label,session,patient_id
0,0,resized/112/625828998E/ailment_photos/AP-3ab67...,1,0,Mole,112/625828998E,112
1,1,resized/112/625828998E/ailment_photos/AP-eee10...,1,0,Mole,112/625828998E,112
2,2,resized/112/625828998E/ailment_photos/AP-6c400...,1,0,Mole,112/625828998E,112
3,3,resized/112/625828998E/ailment_photos/AP-c2a20...,1,0,Mole,112/625828998E,112
4,4,resized/7983/D7A522DCFE/ailment_photos/AP-fcea...,1,0,Mole,7983/D7A522DCFE,7983
...,...,...,...,...,...,...,...
217,292,resized/additional_training_images/nevus/7-154...,1,0,Mole,557/00DB944F16,-124
218,294,resized/additional_training_images/nevus/7-130...,1,0,Mole,557/00DB944F16,-62
219,295,resized/additional_training_images/melanoma/7-...,1,0,Mole,557/00DB944F16,-37
220,304,resized/additional_training_images/melanoma/7-...,1,0,Mole,557/00DB944F16,-38


In [18]:
df['label'].value_counts()

No_mole    157
Mole        65
Name: label, dtype: int64

In [19]:
# Write the filtered validation set to a new file.
# NOTE(review): to_csv also writes the index by default, so the file gets one more
# unnamed column in addition to the existing 'Unnamed: 0' - confirm this is wanted.
df.to_csv('D:/oro/GCP_stuff/validation_set_additional_no_tumours.csv')

Modifying test set with only public images

In [2]:
# Load test_set_initial_additional.csv and display it.
# NOTE(review): no index_col is passed, so the index column stored in the file comes back
# as a regular 'Unnamed: 0' column - confirm this is intended.
df = pd.read_csv('D:/oro/GCP_stuff/test_set_initial_additional.csv')
df

Unnamed: 0.1,Unnamed: 0,filename,Mole,No_mole,label,session,patient_id
0,0,resized/13549/259DC2E95A/ailment_photos/AP-151...,1,0,Mole,13549/259DC2E95A,13549
1,1,resized/1442/72B1BEED1F/ailment_photos/AP-a15d...,1,0,Mole,1442/72B1BEED1F,1442
2,2,resized/1442/72B1BEED1F/ailment_photos/AP-7586...,1,0,Mole,1442/72B1BEED1F,1442
3,3,resized/1442/72B1BEED1F/ailment_photos/AP-fca5...,1,0,Mole,1442/72B1BEED1F,1442
4,4,resized/1461/A26464738D/ailment_photos/AP-d219...,1,0,Mole,1461/A26464738D,1461
...,...,...,...,...,...,...,...
415,415,resized/additional_training_images/tumours/lym...,1,0,Mole,5876/0E29A8F6E8,-469
416,416,resized/additional_training_images/nevus/7-154...,1,0,Mole,5876/0E29A8F6E8,-129
417,417,resized/additional_training_images/tumours/epi...,1,0,Mole,5876/0E29A8F6E8,-298
418,418,resized/additional_training_images/tumours/epi...,1,0,Mole,5876/0E29A8F6E8,-211


In [3]:
df['label'].value_counts()

No_mole    284
Mole       136
Name: label, dtype: int64

In [4]:
# Remove the tumour entries: keep only paths that do not contain 'tumours'.
is_tumour = df['filename'].str.contains('tumours')
df = df.loc[~is_tumour].reset_index(drop=True)

In [5]:
# Remove the production images: keep only paths under 'additional_training_images'.
is_additional = df['filename'].str.contains('additional_training_images')
df = df.loc[is_additional].reset_index(drop=True)

In [6]:
df

Unnamed: 0.1,Unnamed: 0,filename,Mole,No_mole,label,session,patient_id
0,368,resized/additional_training_images/nevus/7-134...,1,0,Mole,5876/0E29A8F6E8,-68
1,370,resized/additional_training_images/melanoma/7-...,1,0,Mole,5876/0E29A8F6E8,-49
2,373,resized/additional_training_images/nevus/7-128...,1,0,Mole,5876/0E29A8F6E8,-59
3,377,resized/additional_training_images/nevus/7-146...,1,0,Mole,5876/0E29A8F6E8,-99
4,385,resized/additional_training_images/nevus/7-137...,1,0,Mole,5876/0E29A8F6E8,-83
5,386,resized/additional_training_images/nevus/7-143...,1,0,Mole,5876/0E29A8F6E8,-96
6,392,resized/additional_training_images/melanoma/7-...,1,0,Mole,5876/0E29A8F6E8,-12
7,400,resized/additional_training_images/nevus/7-147...,1,0,Mole,5876/0E29A8F6E8,-104
8,408,resized/additional_training_images/nevus/7-134...,1,0,Mole,5876/0E29A8F6E8,-69
9,413,resized/additional_training_images/nevus/7-154...,1,0,Mole,5876/0E29A8F6E8,-127


In [23]:
import cv2

# Resize every image listed in df to 224x224. The last two path components of each
# filename (folder and file name) locate the source under 'moles' and the
# destination under 'resized/moles'.
source_root = os.path.join('D:\\', 'oro', 'Images_danderm', 'moles')
target_root = os.path.join('D:\\', 'oro', 'Images_danderm', 'resized', 'moles')
for parts in df['filename'].str.split('/').tolist():
    folder, image_name = parts[-2], parts[-1]
    source_path = os.path.join(source_root, folder, image_name)
    img = cv2.imread(source_path)
    if img is None:
        # cv2.imread returns None (it does not raise) when the file is missing or unreadable.
        print(f'WARNING - could not read {source_path}, skipping it')
        continue
    target_dir = os.path.join(target_root, folder)
    # cv2.imwrite does not create missing directories.
    os.makedirs(target_dir, exist_ok=True)
    target_path = os.path.join(target_dir, image_name)
    if not cv2.imwrite(target_path, cv2.resize(img, (224, 224))):
        print(f'WARNING - could not write {target_path}')

In [37]:
# Full path of every file found below the non_moles directory.
# Sorted because the order in which os.walk visits entries depends on the file system;
# `_dirs` replaces the former loop variable `dir`, which shadowed the builtin.
list_non_moles = sorted(
    os.path.join(root, file_name)
    for root, _dirs, files in os.walk('D:\\oro\\Images_danderm\\non_moles')
    for file_name in files
)


In [40]:
import random

# Pick 20 files with a fixed seed so that the same images are selected on every run.
# The candidates are sorted first because the seed only helps if the input order is stable.
list_non_moles = random.Random(42).sample(sorted(list_non_moles), 20)

In [41]:
list_non_moles

['D:\\oro\\Images_danderm\\non_moles\\drug_eruptions\\atrophic_dystrophic_diseases\\3-215.jpg',
 'D:\\oro\\Images_danderm\\non_moles\\diseases_of_derivatives\\inflammatory_nail_changes\\4-74-12.jpg',
 'D:\\oro\\Images_danderm\\non_moles\\infections_skin_diseases_infestations\\skin_diseases_parasites\\8-165.jpg',
 'D:\\oro\\Images_danderm\\non_moles\\infections_skin_diseases_infestations\\bacterial_infections\\8-42-3.jpg',
 'D:\\oro\\Images_danderm\\non_moles\\collagen_diseases\\vascular_disorders_ulcers_factitial dermatoses_granuloma annulare_necrobiosis_lipoidica_lichen_sclerosis_atrophicus\\6-104.jpg',
 'D:\\oro\\Images_danderm\\non_moles\\genodermatosis\\keratodermas\\5-81-12 .jpg',
 'D:\\oro\\Images_danderm\\non_moles\\eczema_dermatitis\\seborrhoeic_dermatitis\\2-117-3.jpg',
 'D:\\oro\\Images_danderm\\non_moles\\drug_eruptions\\viral_exanthemas\\3-81-slor.jpg',
 'D:\\oro\\Images_danderm\\non_moles\\drug_eruptions\\atrophic_dystrophic_diseases\\3-173-1.jpg',
 'D:\\oro\\Images_dander

In [42]:
# Resize to 224x224, in place, every image stored directly in the resized non_moles folder.
path = 'D:\\oro\\Images_danderm\\resized\\non_moles'
for x in os.listdir(path):
    image_path = os.path.join(path, x)
    img = cv2.imread(image_path)
    if img is None:
        # Sub-directories and files that are not readable images make cv2.imread return None.
        print(f'WARNING - could not read {image_path}, skipping it')
        continue
    cv2.imwrite(image_path, cv2.resize(img, (224, 224)))
