# Preliminary operations

In [None]:
#@title Drive mount

# Mount Google Drive at /content/gdrive (Colab only); the model, dataset and output
# paths used by the Colab cells below all live under this mount point.
from google.colab import drive
drive.mount('/content/gdrive')


Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [None]:
#@title Main libraries
import pandas as pd
import numpy as np

import os
import random

from sklearn.utils import  compute_class_weight

import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt

from tensorflow.keras.preprocessing.image import ImageDataGenerator

from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score

import tensorflow as tf
tfk = tf.keras
tfkl = tf.keras.layers

In [None]:
#@title Reproducibility
# One shared seed for Python's `random`, NumPy and TensorFlow, plus PYTHONHASHSEED in the
# process environment, so repeated runs start from the same pseudo-random state.
seed = 90
os.environ['PYTHONHASHSEED'] = str(seed)
for set_seed_fn in (random.seed, np.random.seed, tf.random.set_seed, tf.compat.v1.set_random_seed):
    set_seed_fn(seed)
print(f"The seed is: {seed}")

The seed is: 90


# Get predictions

In [None]:
# Folder read by the image generators below. The commented-out line is an alternative test
# folder kept for reference; the active value points at the hidden dataset.
#test_dir = '/content/gdrive/MyDrive/Assignment/Clean_And_Preprocessed_Dataset/CLEAN_and_HMP/TEST'
test_dir = '/content/gdrive/MyDrive/HIDDEN_DATASET/READY_FOR_PREDICTION'

In [None]:
# Letter label -> class index; used by the "checks" cell to encode the reference CSV labels.
label_inv = {'N':0, 'P':1, 'T':2}

In [None]:
# Load the saved 'EffNet_B0' Keras model from Drive and bundle it with its display name.
effnetb0 = {  'name': 'EffNet_B0',
              'model': tfk.models.load_model("/content/gdrive/MyDrive/AI project/models/5.2. HMP_EffNet_model_30_01") }

In [None]:
# Input settings for the EffNet generator built in the next cell.
# NOTE(review): this name is reassigned further down with the ResNet settings, so
# re-running the EffNet generator cell afterwards would silently pick up those values.
data_hyperparameters = {
    'batch_size': 128,
    'resized_shape_height': 224, # as in CheXNet
    'resized_shape_width': 224,
    'color_mode': 'rgb' 
}

In [None]:
# Test images for the EffNet model: raw pixel values (no rescaling), resized to the
# configured shape and colour mode. class_mode=None makes the generator yield images
# only, and shuffle=False keeps the order aligned with `test_set_effnet.filenames`.
effnet_input_size = (
    data_hyperparameters['resized_shape_height'],
    data_hyperparameters['resized_shape_width'],
)
effnet_datagen = ImageDataGenerator()
test_set_effnet = effnet_datagen.flow_from_directory(
    directory=test_dir,
    target_size=effnet_input_size,
    color_mode=data_hyperparameters['color_mode'],
    class_mode=None,
    batch_size=data_hyperparameters['batch_size'],
    shuffle=False,
    seed=seed,
)

Found 5144 images belonging to 1 classes.


In [None]:
# Run the EffNet model over the whole generator; the array displayed in the next cell has
# one row of three class scores per image.
eff_predictions = effnetb0['model'].predict(test_set_effnet)



In [None]:
# Display the raw EffNet score array (NumPy abbreviates the middle rows).
eff_predictions

array([[7.3878680e-16, 9.9999994e-01, 6.3266371e-29],
       [9.7495250e-02, 1.5212510e-02, 8.8729221e-01],
       [9.9999994e-01, 7.1355571e-10, 1.1889653e-09],
       ...,
       [3.9773434e-07, 9.9999958e-01, 4.3933485e-13],
       [9.9999994e-01, 1.3760048e-13, 2.9594753e-12],
       [4.9697895e-09, 9.9999994e-01, 1.3260074e-16]], dtype=float32)

In [None]:
# Persist the raw EffNet scores to Drive; the "Ensemble" section reloads this file.
np.save('/content/gdrive/MyDrive/AI project/models/HT_FINAL_PREDICTIONS_EFFNET.npy', eff_predictions)

In [None]:
#@title with Target
# NOT TO EXECUTE WITHOUT KNOWING THE TARGET
# Scores the EffNet predictions against the generator's `.classes`, which Keras derives
# from the sub-folder each file sits in - so this is only meaningful when `test_dir`
# contains one sub-folder per true class.
pred = eff_predictions
target = test_set_effnet.classes
pred_classes = np.argmax(pred, axis=-1)  # index of the highest score in each row

print("Accuracy: ", accuracy_score(target, pred_classes))
print("F1 score: ", f1_score(target, pred_classes, average=None))

# `cm` has true classes on its rows (each row normalised to sum to 1). It is transposed
# for plotting so predictions run along the y axis and ground truth along the x axis.
cm = confusion_matrix(target, pred_classes, normalize='true')
labels = ['N', 'P', 'T']
fig, ax = plt.subplots(figsize=(7, 7))
sns.heatmap(cm.T, annot=True, fmt='.2f', xticklabels=labels, yticklabels=labels, ax=ax)
ax.set_xlabel('True labels')
ax.set_ylabel('Predicted labels')
plt.show()

In [None]:
# Load the saved 'Resnet_SEblocks' Keras model from Drive and bundle it with its display name.
resnet = {  'name': 'Resnet_SEblocks',
            'model': tfk.models.load_model("/content/gdrive/MyDrive/AI project/models/2.2. HMP_ResNet_model_01022023")} 

In [None]:
# Input settings for the ResNet generator built in the next cell: 100x100 grayscale.
# NOTE(review): this overwrites the dict of the same name defined earlier for EffNet;
# cells that read it depend on which of the two definitions ran last.
data_hyperparameters = {
    'batch_size': 128,
    'resized_shape_height': 100,
    'resized_shape_width': 100,
    'color_mode': 'grayscale'
}

In [None]:
# Test images for the ResNet model: pixel values scaled by 1/255, resized to the
# configured shape and colour mode. shuffle=False keeps the order aligned with
# `test_set_resnet.filenames`.
resnet_input_size = (
    data_hyperparameters['resized_shape_height'],
    data_hyperparameters['resized_shape_width'],
)
resnet_datagen = ImageDataGenerator(rescale=1/255.)
test_set_resnet = resnet_datagen.flow_from_directory(
    directory=test_dir,
    target_size=resnet_input_size,
    color_mode=data_hyperparameters['color_mode'],
    class_mode='categorical',
    batch_size=data_hyperparameters['batch_size'],
    shuffle=False,
    seed=seed,
)

Found 5144 images belonging to 1 classes.


In [None]:
# Run the ResNet model over the whole generator.
resnet_predictions = resnet['model'].predict(test_set_resnet)



In [None]:
# Persist the raw ResNet scores to Drive; the "Ensemble" section reloads this file.
np.save('/content/gdrive/MyDrive/AI project/models/HT_FINAL_PREDICTIONS_RESNET.npy', resnet_predictions) 

In [None]:
# Both generators must list the files in the same order; otherwise averaging the two
# prediction arrays row by row would combine scores of different images.
test_set_resnet.filenames == test_set_effnet.filenames

True

In [None]:
# Save the generator's file order so the saved predictions can be mapped back to file
# names later (reloaded in the "Labels mapping" section).
fns = test_set_resnet.filenames
np.save('/content/gdrive/MyDrive/AI project/models/FINAL_PREDICTIONS_FILENAMES.npy', fns) 

In [None]:
#@title checks
# Sanity check: compare the order in which the generators read the files with the row
# order of the reference CSV, then score both models against the CSV labels.
ref = pd.read_csv('/content/gdrive/MyDrive/Assignment/OriginalDataset/test_merged/Test_list.csv')
ref = ref.assign(Num_label = ref['label'].map(lambda x: label_inv[x]))

df = pd.DataFrame()
df['file_csv'] = ref['file']
# Generator names look like 'all/P04220_1.jpeg'; drop the 4-character folder prefix so
# they can be compared with the bare names in the CSV.
df['file_gen_eff'] = [name[4:] for name in test_set_effnet.filenames]
df['file_gen_res'] = [name[4:] for name in test_set_resnet.filenames]
df['y_true'] = ref['Num_label']
df['y_effnet'] = np.argmax(eff_predictions, axis=-1)
df['y_resnet'] = np.argmax(resnet_predictions, axis=-1)
print(df)

# Number of rows where the two generators agree, and where generator and CSV agree.
n_same_generators = (df['file_gen_eff'] == df['file_gen_res']).sum()
n_same_csv = (df['file_gen_eff'] == df['file_csv']).sum()
print(n_same_generators)
print(n_same_csv)

# y_true is paired with the predictions purely by row position, so the accuracies below
# are only valid when every row refers to the same file in all three sources.
if n_same_generators != len(df) or n_same_csv != len(df):
    print("WARNING: file order differs between generators and CSV; the accuracies below are not reliable.")

print("Accuracy effnet: ", accuracy_score(df['y_true'], df['y_effnet']))
print("Accuracy resnet: ", accuracy_score(df['y_true'], df['y_resnet']))

In [None]:
#@title with Target
# NOT TO EXECUTE WITHOUT KNOWING THE TARGET
# Scores the ResNet predictions against the generator's `.classes`, which Keras derives
# from the sub-folder each file sits in - so this is only meaningful when `test_dir`
# contains one sub-folder per true class.
pred = resnet_predictions
target = test_set_resnet.classes
pred_classes = np.argmax(pred, axis=-1)  # index of the highest score in each row

print("Accuracy: ", accuracy_score(target, pred_classes))
print("F1 score: ", f1_score(target, pred_classes, average=None))

# `cm` has true classes on its rows (each row normalised to sum to 1). It is transposed
# for plotting so predictions run along the y axis and ground truth along the x axis.
cm = confusion_matrix(target, pred_classes, normalize='true')
labels = ['N', 'P', 'T']
fig, ax = plt.subplots(figsize=(7, 7))
sns.heatmap(cm.T, annot=True, fmt='.2f', xticklabels=labels, yticklabels=labels, ax=ax)
ax.set_xlabel('True labels')
ax.set_ylabel('Predicted labels')
plt.show()

# Ensemble

In [None]:
# Reload both models' saved score arrays from Drive so the ensemble can be built without
# re-running inference.
# NOTE(review): this rebinds `resnet` and `effnetb0`; after this cell the dicts have no
# 'model' key, so the prediction cells above fail if re-run without reloading the models.
resnet = {  'name': 'Resnet_SEblocks',
            'predictions': np.load('/content/gdrive/MyDrive/AI project/models/HT_FINAL_PREDICTIONS_RESNET.npy')}

effnetb0 = {  'name': 'EffNet_B0',
              'predictions': np.load('/content/gdrive/MyDrive/AI project/models/HT_FINAL_PREDICTIONS_EFFNET.npy')}

In [None]:
# Ensemble = element-wise mean of the two models' score arrays (equal weights).
final_predictions = (resnet['predictions'] + effnetb0['predictions'])/2
final_predictions.shape

(5144, 3)

In [None]:
# Persist the averaged scores to Drive; the "Labels mapping" section reloads this file.
np.save('/content/gdrive/MyDrive/AI project/models/HT_FINAL_PREDICTIONS_ENSEMBLE.npy', final_predictions) 

In [None]:
#@title with target
# NOT TO EXECUTE WITHOUT KNOWING THE TARGET
# Scores the ensemble predictions against the generator's `.classes`, which Keras derives
# from the sub-folder each file sits in - so this is only meaningful when `test_dir`
# contains one sub-folder per true class.
pred = final_predictions
target = test_set_resnet.classes
pred_classes = np.argmax(pred, axis=-1)  # index of the highest score in each row

print("Accuracy: ", accuracy_score(target, pred_classes))
print("F1 score: ", f1_score(target, pred_classes, average=None))

# `cm` has true classes on its rows (each row normalised to sum to 1). It is transposed
# for plotting so predictions run along the y axis and ground truth along the x axis.
cm = confusion_matrix(target, pred_classes, normalize='true')
labels = ['N', 'P', 'T']
fig, ax = plt.subplots(figsize=(7, 7))
sns.heatmap(cm.T, annot=True, fmt='.2f', xticklabels=labels, yticklabels=labels, ax=ax)
ax.set_xlabel('True labels')
ax.set_ylabel('Predicted labels')
plt.show()

# Labels mapping

In [None]:
# Reload the ensemble scores and the matching file names from Drive (Colab paths), then
# take the index of the highest score in each row as the predicted class.
fp = np.load('/content/gdrive/MyDrive/AI project/models/HT_FINAL_PREDICTIONS_ENSEMBLE.npy') 
fns = np.load('/content/gdrive/MyDrive/AI project/models/FINAL_PREDICTIONS_FILENAMES.npy') 
fpc = np.argmax(fp, axis=-1)

In [None]:
# like above but on local
# Only load when the local copies exist: on Colab these relative paths are missing, and an
# unconditional np.load would raise and stop a "Run all" even though the arrays were
# already loaded from Drive by the previous cell.
local_ensemble_path = 'models/HT_FINAL_PREDICTIONS_ENSEMBLE.npy'
local_filenames_path = 'models/FINAL_PREDICTIONS_FILENAMES.npy'
if os.path.exists(local_ensemble_path) and os.path.exists(local_filenames_path):
    fp = np.load(local_ensemble_path)
    fns = np.load(local_filenames_path)
    fpc = np.argmax(fp, axis=-1)  # predicted class index per row
else:
    print("Local prediction files not found; keeping the arrays already in memory.")

In [None]:
# Class index -> letter label (the inverse of `label_inv`).
legend = {0: 'N', 1: 'P', 2: 'T'}

In [None]:
final_pred = fpc
# Translate every predicted class index into its letter via `legend`.
final_predictions_label = [legend[class_idx] for class_idx in final_pred]

# The two lengths must be equal: one letter per prediction.
print(len(final_pred))
print(len(final_predictions_label))

5144
5144


# Final saving

In [None]:
# Display the ensemble score array (one row per image; columns in N, P, T order per `legend`).
fp

array([[1.5194212e-04, 9.9984640e-01, 1.6346513e-06],
       [4.8765130e-02, 5.0758594e-01, 4.4364882e-01],
       [9.9999577e-01, 6.4990999e-08, 4.1421499e-06],
       ...,
       [4.1663270e-03, 9.9583364e-01, 1.6098792e-08],
       [9.9990535e-01, 1.6975537e-08, 9.4663912e-05],
       [8.5046622e-07, 9.9999905e-01, 7.8642245e-08]], dtype=float32)

In [None]:
# Detailed results table: starts with one row per predicted file name.
final_df = pd.DataFrame({'files': fns})

In [None]:
# One column per class score; column i of `fp` belongs to class legend[i] (0=N, 1=P, 2=T).
final_df['y_N'] = fp[:,0]
final_df['y_P'] = fp[:,1]
final_df['y_T'] = fp[:,2]


In [None]:
# Predicted class index per file (converted to letters in the next cell).
final_df['y_labels'] = fpc

In [None]:
# need to change fpc (0,1,2) to N,P,T using legend
# Built from `fpc` rather than from the column itself so the cell can be re-run: mapping the
# column in place raises KeyError the second time, because it already holds letters.
final_df['y_labels'] = [legend[class_idx] for class_idx in fpc]

In [None]:
# Submission table: file name plus predicted letter label.
# need to change fpc (0,1,2) to N,P,T using legend
submission = pd.DataFrame({
    'file': fns,
    'label': [legend[class_idx] for class_idx in fpc],
})

In [None]:
# Display the file names; each one carries the 'all/' folder prefix that is stripped below.
fns

array(['all/P00009_1.jpeg', 'all/P00009_2.jpeg', 'all/P00015_1.png', ...,
       'all/P16086_1.jpeg', 'all/P16093_1.png', 'all/P16102_1.jpeg'],
      dtype='<U17')

In [None]:
# from fns remove the leading 'all/' folder prefix
# Strip only when the prefix is still present, so re-running this cell does not chop
# another 4 characters off names that were already cleaned.
submission['file'] = submission['file'].map(lambda x: x[4:] if x.startswith('all/') else x)

In [None]:
# saving to csv without index on local
#final_df.to_csv('models/FINAL_PREDICTIONS_ENSEMBLE.csv', index=False)
#submission.to_csv('models/PREDICTIONS_BOSCARINO_CASTELLANI_CAVALLINI.csv', index=False) 

In [None]:
# saving to csv without index on COLAB
# final_df holds the per-class scores plus the label; submission holds file and label only.
final_df.to_csv('/content/gdrive/MyDrive/AI project/models/FINAL_PREDICTIONS_ENSEMBLE.csv', index=False)
submission.to_csv('/content/gdrive/MyDrive/AI project/models/PREDICTIONS_BOSCARINO_CASTELLANI_CAVALLINI.csv', index=False) 

In [None]:
# compare with nelly target

# import from I:\Il mio Drive\HIDDEN_DATASET\NELLYZATION
#nelly_target = pd.read_csv('I:/Il mio Drive/HIDDEN_DATASET/NELLYZATION/labels_test.csv')

# Read the reference labels and order them by file name in a single chained expression.
nelly_labels_path = '/content/gdrive/MyDrive/HIDDEN_DATASET/NELLYZATION/labels_test.csv'
nelly_target = pd.read_csv(nelly_labels_path).sort_values(by=['file'])

In [None]:
# Display the reference labels after sorting by file name (the original row index is kept).
nelly_target

Unnamed: 0,file,label
2689,P00009_1.jpeg,P
2359,P00009_2.jpeg,P
367,P00015_1.png,N
52,P00015_2.png,N
309,P00017_1.png,N
...,...,...
214,P16074_1.jpeg,N
3526,P16084_1.jpeg,P
2696,P16086_1.jpeg,P
2440,P16093_1.png,N


In [None]:
# Display our predicted labels, for a visual comparison with the reference labels.
submission['label']

0       P
1       P
2       N
3       N
4       N
       ..
5139    N
5140    P
5141    P
5142    N
5143    P
Name: label, Length: 5144, dtype: object

In [None]:
# Display the reference labels in file-name order.
nelly_target['label']

2689    P
2359    P
367     N
52      N
309     N
       ..
214     N
3526    P
2696    P
2440    N
4107    P
Name: label, Length: 5144, dtype: object

In [None]:
# Compare our submission with Nelly's labels, matching rows by file name.
# The resulting columns are: file, submission, nelly_target, same.

# Name the two label columns before merging instead of renaming the merged frame by
# position: a positional rename breaks as soon as either input carries an extra column.
our_labels = submission[['file', 'label']].rename(columns={'label': 'submission'})
nelly_labels = nelly_target[['file', 'label']].rename(columns={'label': 'nelly_target'})
merged = pd.merge(our_labels, nelly_labels, on='file')

# True where both sources assign the same label to the file.
merged['same'] = merged['submission'] == merged['nelly_target']

# The merge is an inner join, so files present in only one source are dropped and would
# otherwise silently shrink the denominator of the match rate.
if len(merged) != len(submission) or len(merged) != len(nelly_target):
    print('WARNING: merged rows:', len(merged),
          '- submission rows:', len(submission),
          '- nelly rows:', len(nelly_target))

# Share of files on which the two label sets agree.
print('this is the match among our and nellys: ', merged['same'].sum()/len(merged))
merged

this is the match among our and nellys:  0.9663685847589425


Unnamed: 0,file,submission,nelly_target,same
0,P00009_1.jpeg,P,P,True
1,P00009_2.jpeg,P,P,True
2,P00015_1.png,N,N,True
3,P00015_2.png,N,N,True
4,P00017_1.png,N,N,True
...,...,...,...,...
5139,P16074_1.jpeg,N,N,True
5140,P16084_1.jpeg,P,P,True
5141,P16086_1.jpeg,P,P,True
5142,P16093_1.png,N,N,True


In [None]:
# Save the comparison table. index=False matches the other CSV exports in this notebook
# and keeps the meaningless 0..n-1 row index from being written as an unnamed first
# column (which pandas reads back as 'Unnamed: 0').
merged.to_csv('/content/gdrive/MyDrive/AI project/models/NELLY_COMPARISON.csv', index=False)