In [None]:
from scipy.special import softmax
import numpy as np
import matplotlib.pyplot as plt
import os
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_recall_fscore_support
from sklearn.tree import DecisionTreeClassifier
import statistics
import pickle

In [None]:
# Root directory of the revised classification data set.
BASE = "/home/dona/bm75/Project_Chathurika/Classification/data/revised_data/"
# Sub-directories (relative to BASE) for the models and the per-fold result files.
path_for_models = f"{BASE}remove_background/models/"
path_to_results = f"{BASE}remove_background/results/"
# Segment names used to select result files by file name.
segments = ["body", "head", "thorax", "abdomen"]
# Number of cross-validation folds iterated over when evaluating the ensemble.
fold = 5

In [None]:
def read_data(path_to_results, segment, phase):
    """Load the per-fold prediction files of one segment and phase.

    A file in ``path_to_results`` is selected when its lower-cased name
    contains ``segment`` and the characters immediately before its last four
    characters (the extension, e.g. ``.csv``) spell ``phase``. The file name is
    split on ``_`` and the second field is used as the fold identifier.

    Each file is read as comma-separated text. The first line is skipped
    (treated as a header); every following non-blank row supplies, in order,
    two raw model outputs, the true label (int) and the sample id (int).

    Parameters
    ----------
    path_to_results : str
        Directory holding the result files. A trailing path separator is
        optional.
    segment : str
        Substring that must appear in the lower-cased file name.
    phase : str
        ``"test"`` or ``"train"``. Any other value selects no files.

    Returns
    -------
    list of dict
        One dict per selected file, sorted by ``'fold'`` (string comparison),
        with keys:

        * ``'fold'`` - fold identifier taken from the file name (str)
        * ``'predicted_label'`` - softmax probability of the second output
        * ``'predicted_value'`` - the two raw outputs as ``[float, float]``
        * ``'actual_label'`` - true labels
        * ``'sample_id'`` - sample ids

    Raises
    ------
    IndexError
        If a selected file name has no ``_`` or a row has fewer than four
        fields.
    ValueError
        If a field cannot be parsed as a number.
    """
    file_names = sorted(os.listdir(path_to_results))

    if phase in ("test", "train"):
        # The phase tag sits right before the 4-character extension.
        tag_start = -4 - len(phase)
        selected = [
            name for name in file_names
            if name[tag_start:-4] == phase and segment in name.lower()
        ]
    else:
        selected = []

    data = []
    for name in selected:
        # Local name on purpose: do not clobber the `segment`/`phase` arguments.
        fold_id = name.split('_')[1]

        predicted_label = []
        predicted_values = []
        actual_label = []
        sample_id = []

        # `with` guarantees the handle is closed even if a row fails to parse.
        with open(os.path.join(path_to_results, name)) as handle:
            handle.readline()  # skip the header line
            for line in handle:
                line = line.strip()
                if not line:
                    # Tolerate blank lines (e.g. a trailing newline at EOF).
                    continue
                fields = line.split(',')
                raw_outputs = [float(fields[0]), float(fields[1])]
                predicted_values.append(raw_outputs)
                # Probability assigned to the second output (index 1).
                predicted_label.append(softmax(np.array(raw_outputs))[1])
                actual_label.append(int(fields[2]))
                sample_id.append(int(fields[3]))

        data.append({
            'fold': fold_id,
            'predicted_label': predicted_label,
            'predicted_value': predicted_values,
            'actual_label': actual_label,
            'sample_id': sample_id,
        })

    # NOTE: folds are compared as strings, so '10' sorts before '2'.
    return sorted(data, key=lambda record: record['fold'])

In [None]:
def confusion_matrix(labels, predicted_labels, segment = ""):
    """Compute accuracy, precision, recall and F1 for binary predictions.

    Despite its name, this function does not build a confusion matrix; it
    returns four summary scores.

    Parameters
    ----------
    labels : array-like
        True labels.
    predicted_labels : array-like
        Predicted labels.
    segment : str, optional
        Accepted for call-site compatibility; not used by the computation.

    Returns
    -------
    tuple
        ``(accuracy, precision, recall, f1)``; precision, recall and F1 are
        computed with ``average="binary"``.
    """
    accuracy = accuracy_score(labels, predicted_labels)
    precision, recall, f1, _support = precision_recall_fscore_support(
        labels, predicted_labels, average="binary"
    )
    return accuracy, precision, recall, f1

In [None]:
# Performance of each individual body-part model on its test folds.
for segment in segments:
    data = read_data(path_to_results, segment, 'test')
    # One entry per fold for every metric.
    acc_list, p_list, r_list, f1_list = [], [], [], []
    print("segment: ", segment)
    for d in data:
        actual_labels = np.array(d['actual_label']).reshape(-1, 1)
        predicted_labels = np.array(d['predicted_label']).reshape(-1, 1)
        # Turn probabilities into hard 0/1 labels using a 0.5 threshold.
        predicted_labels[predicted_labels > 0.5] = 1
        predicted_labels[predicted_labels <= 0.5] = 0
        acc, p, r, f = confusion_matrix(actual_labels, predicted_labels, segment)
        for scores, value in zip((acc_list, p_list, r_list, f1_list), (acc, p, r, f)):
            scores.append(value)
    metric_lists = (acc_list, p_list, r_list, f1_list)
    # Across-fold summary; statistics.stdev needs at least two folds.
    mean = [statistics.mean(values) for values in metric_lists]
    stdev = [statistics.stdev(values) for values in metric_lists]
    print('Mean values - accuracy: {}, precision: {}, recall: {}, f1-score: {}'.format(*mean))
    print('Stdv values - accuracy: {}, precision: {}, recall: {}, f1-score: {}'.format(*stdev))

In [None]:
# Load the per-fold records of every segment: test first, then train.
_PHASES = ('test', 'train')
head_test, head_train = [read_data(path_to_results, 'head', phase) for phase in _PHASES]
thorax_test, thorax_train = [read_data(path_to_results, 'thorax', phase) for phase in _PHASES]
abdomen_test, abdomen_train = [read_data(path_to_results, 'abdomen', phase) for phase in _PHASES]
body_test, body_train = [read_data(path_to_results, 'body', phase) for phase in _PHASES]

In [None]:
def process_preds(d):
    """Binarise the predicted probabilities of one fold record.

    Parameters
    ----------
    d : dict
        Record with a ``'predicted_label'`` entry holding the scores.

    Returns
    -------
    numpy.ndarray
        New array of shape ``(n, 1)`` in which values greater than 0.5 are
        replaced by 1 and values less than or equal to 0.5 by 0. The input
        record is not modified.
    """
    scores = np.array(d['predicted_label']).reshape(-1, 1)
    # Both masks are taken from the untouched scores before overwriting.
    is_positive = scores > 0.5
    is_negative = scores <= 0.5
    scores[is_negative] = 0
    scores[is_positive] = 1
    return scores

def plot_model_scores(model, is_needed = False):
    """Show a bar chart of the first row of ``model.coef_``.

    The four bars are labelled body, head, thorax and abdomen, in that order.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``coef_``; only read when ``is_needed`` is
        truthy.
    is_needed : bool, optional
        When falsy (the default) the function returns without plotting.
    """
    if not is_needed:
        return
    objects = ('body', 'head', 'thorax', 'abdomen')
    positions = np.arange(len(objects))
    plt.bar(positions, model.coef_[0], align='center', alpha=0.5)
    plt.xticks(positions, objects)
    plt.show()

def train_model(train_x, train_y, test_x, text_y):
    """Fit a logistic-regression ensemble and predict on train and test data.

    The fitted model is always pickled to ``ensembling_model.sav`` inside
    ``path_to_results``, overwriting any existing file of that name, and its
    coefficients are plotted via ``plot_model_scores``.

    Parameters
    ----------
    train_x, train_y
        Features and labels used to fit the model.
    test_x
        Features to predict on after fitting.
    text_y
        Accepted for call-site compatibility; not used.

    Returns
    -------
    tuple
        ``(ensemble_train_preds, ensemble_test_preds)`` - the model's
        predictions for ``train_x`` and ``test_x``.
    """
    model = LogisticRegression().fit(train_x, train_y)
    # NOTE(review): the model is written to the results directory although a
    # separate `path_for_models` exists - confirm this location is intended.
    file = path_to_results + 'ensembling_model.sav'
    # Context manager flushes and closes the file deterministically.
    with open(file, 'wb') as model_file:
        pickle.dump(model, model_file)
    ensemble_train_preds = model.predict(train_x)
    ensemble_test_preds = model.predict(test_x)
    plot_model_scores(model, True)
    return ensemble_train_preds, ensemble_test_preds

In [None]:
# Performance of the ensemble model that stacks the four per-segment predictions.
acc_list, p_list, r_list, f1_list = [], [], [], []
for i in range(fold):
    # Feature columns are ordered body, head, thorax, abdomen.
    # NOTE(review): rows are assumed to refer to the same samples in the same
    # order across all four segments - confirm (e.g. by comparing 'sample_id').
    train_columns = [process_preds(records[i])
                     for records in (body_train, head_train, thorax_train, abdomen_train)]
    train_x = np.concatenate(train_columns, axis=-1)
    train_y = np.array(body_train[i]['actual_label'])
    test_columns = [process_preds(records[i])
                    for records in (body_test, head_test, thorax_test, abdomen_test)]
    test_x = np.concatenate(test_columns, axis=-1)
    test_y = np.array(body_test[i]['actual_label'])
    ensemble_train_preds, ensemble_test_preds = train_model(train_x, train_y, test_x, test_y)
    acc, p, r, f = confusion_matrix(test_y, ensemble_test_preds, 'ensamble_test')
    for scores, value in zip((acc_list, p_list, r_list, f1_list), (acc, p, r, f)):
        scores.append(value)
metric_lists = (acc_list, p_list, r_list, f1_list)
# Across-fold summary; statistics.stdev needs at least two folds.
mean = [statistics.mean(values) for values in metric_lists]
stdev = [statistics.stdev(values) for values in metric_lists]
print('Mean values - accuracy: {}, precision: {}, recall: {}, f1-score: {}'.format(*mean))
print('Stdv values - accuracy: {}, precision: {}, recall: {}, f1-score: {}'.format(*stdev))