In [None]:
import pandas as pd
import numpy as np
import scipy.io
import matplotlib.pyplot as plt
import seaborn as sns
import math

from tensorflow.keras import Input
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import *
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import  ModelCheckpoint, EarlyStopping, ReduceLROnPlateau

from sklearn.inspection import permutation_importance

# Default figure size (width, height in inches) for every matplotlib figure in the notebook.
plt.rcParams.update({'figure.figsize': [18, 6]})

## Load Dataset

In [None]:
# Training table; the first rows are printed as a quick sanity check.
df_training = pd.read_csv(filepath_or_buffer = './bap-imgap-212022/train.csv')
print(f'{df_training.head()} \n')


# Test table, previewed the same way.
df_test = pd.read_csv(filepath_or_buffer = './bap-imgap-212022/test.csv')
print(df_test.head())


def get_data_from_mat(train_file, test_file):
    """Load the train and test arrays stored in two MATLAB ``.mat`` files.

    The ``'train_data'`` variable is read from ``train_file`` and the
    ``'test_data'`` variable from ``test_file``. The last axis of each 3-D
    array is moved to the front (``transpose(2, 0, 1)``), both resulting
    shapes are printed, and the two arrays are returned.

    Args:
        train_file: Path of the ``.mat`` file containing ``'train_data'``.
        test_file: Path of the ``.mat`` file containing ``'test_data'``.

    Returns:
        Tuple ``(train_np, test_np)`` of NumPy arrays.
    """
    train_contents = scipy.io.loadmat(train_file)
    test_contents = scipy.io.loadmat(test_file)

    # Axis order (a, b, c) -> (c, a, b): the former last axis becomes the first.
    axis_order = (2, 0, 1)
    train_np = np.transpose(np.array(train_contents['train_data']), axis_order)
    test_np = np.transpose(np.array(test_contents['test_data']), axis_order)

    for stacked in (train_np, test_np):
        print(stacked.shape)

    return train_np, test_np

# Load both matrix stacks; the shapes are printed by the loader itself.
train_data, test_data = get_data_from_mat(
    train_file = './bap-imgap-212022/train_data.mat',
    test_file = './bap-imgap-212022/test_data.mat',
)

#### Brain Regions

In [None]:
# Region lookup table; its 'id' column is discarded right after loading.
regions = pd.read_csv('./bap-imgap-212022/regions.csv').drop(columns = ['id'])

#### Data Treatment

In [None]:
# Flatten every matrix in `train_data` into one feature row holding the entries
# strictly below the main diagonal (i > j), visited row by row. This is the
# same order the previous nested loops produced, but it is done with one
# fancy-indexing operation instead of a Python loop per matrix element.
# NOTE(review): keeping only i > j presumes the matrices are symmetric — confirm.
train_stack = np.asarray(train_data)
lower_rows, lower_cols = np.tril_indices(train_stack.shape[1], k = -1, m = train_stack.shape[2])

# One column label per (i, j) pair: '<region i>--<region j>', looked up by position.
# The misspelled name `connecitvity` is kept because other cells may reference it.
region_names = [str(name) for name in regions['region'].tolist()]
connecitvity = [
    region_names[i] + '--' + region_names[j]
    for i, j in zip(lower_rows, lower_cols)
]

original_train_data = pd.DataFrame(train_stack[:, lower_rows, lower_cols], columns = connecitvity)

# Append the flattened features to the training table, then drop the identifier.
# NOTE: this rebinds `df_training`, so re-running the cell without reloading the
# CSV fails on the missing 'id' column.
df_training = pd.concat([df_training, original_train_data], axis = 1)

df_training = df_training.drop(['id'], axis = 1)

## Feature metrics

In [None]:
# How many top-ranked entries each feature-metric cell keeps
# (variance, skewness, kurtosis and correlation with age).
NUM_FEATURE_METRICS = 10

#### Variance

In [None]:
# Per-column variance, keyed by the labels pandas itself returns.
# The previous version paired `df_training.columns[i]` with the positional
# lookup `vars[i]`; integer-position access on a label-indexed Series is
# deprecated, and the pairing mislabels values whenever the reduction does not
# return exactly one entry per column.
# NOTE: the name `vars` shadows the Python builtin; it is kept so that cells
# relying on it keep working.
vars = df_training.var()
vars_dic = vars.to_dict()

# Largest variances first.
top_vars = sorted(vars_dic.items(), key = lambda x: x[1], reverse = True)[0:NUM_FEATURE_METRICS]
print(top_vars)

#### Skewness

In [None]:
# Per-column skewness, keyed by the labels pandas itself returns (no positional
# `skews[i]` lookups, which are deprecated on a label-indexed Series and can
# drift out of step with `df_training.columns`).
skews = df_training.skew()
skews_dic = skews.to_dict()

# Rank by absolute skewness, smallest first, skipping columns whose skewness is
# exactly zero, and keep the first NUM_FEATURE_METRICS entries.
top_skews = [
    e
    for e in sorted(skews_dic.items(), key = lambda x: abs(x[1]), reverse = False)
    if e[1] != 0.0
]

top_skews = top_skews[0:NUM_FEATURE_METRICS]
print(top_skews)

#### Kurtosis

In [None]:
# Per-column kurtosis, keyed by the labels pandas itself returns (no positional
# `kurts[i]` lookups, which are deprecated on a label-indexed Series and can
# drift out of step with `df_training.columns`).
kurts = df_training.kurt()
kurts_dic = kurts.to_dict()

# Rank by absolute kurtosis, smallest first, skipping columns whose kurtosis is
# exactly zero, and keep the first NUM_FEATURE_METRICS entries.
top_kurts = [
    e
    for e in sorted(kurts_dic.items(), key = lambda x: abs(x[1]), reverse = False)
    if e[1] != 0.0
]

top_kurts = top_kurts[0:NUM_FEATURE_METRICS]
print(top_kurts)

#### Correlation with Age

In [None]:
# Correlation of every column with 'age'. The target's own entry is removed by
# label rather than by slicing off the first position.
corrs = df_training.corr()['age'].drop(labels = 'age')

# Key each coefficient by its own label. The previous loop paired the sliced
# Series (which starts at the second column) with `df_training.columns[i]`
# (which starts at the first), so every coefficient was reported under the
# name of the preceding column. Undefined correlations (NaN) count as 0.0.
corrs_dic = corrs.fillna(0.0).to_dict()

# Strongest absolute correlations first.
top_corrs = sorted(corrs_dic.items(), key = lambda x: abs(x[1]), reverse = True)[0:NUM_FEATURE_METRICS]
print(top_corrs)

## Validation Split

In [None]:
# Model inputs: every column except the target. Target: the 'age' column.
x = df_training.drop(columns = ['age'])
y = df_training['age']

In [None]:
def countDup(data, elem):
    """Return how many items of ``data`` compare equal to ``elem``.

    Args:
        data: Iterable of values to scan.
        elem: Value to look for.

    Returns:
        Number of matching items (0 when there is none).
    """
    return sum(1 for candidate in data if candidate == elem)


def validationSplit(data):
    """Choose the positions that go to the validation set, per distinct value.

    Values are handled in order of first appearance:

    * more than 5 occurrences: the first ``int(count * 0.3)`` positions holding
      that value are selected;
    * 2 to 5 occurrences: only the first position is selected;
    * a single occurrence: nothing is selected.

    Positions are grouped in one pass instead of recounting the whole input
    for every element, so the result is unchanged but the cost is linear.

    Args:
        data: Sequence supporting ``len()`` and integer indexing
            (``data[0] .. data[len(data) - 1]``); values must be hashable.

    Returns:
        List of selected integer positions.
    """
    # value -> positions where it occurs; dicts keep first-appearance order.
    positions_by_value = {}
    for pos in range(len(data)):
        positions_by_value.setdefault(data[pos], []).append(pos)

    val = []
    for positions in positions_by_value.values():
        occurrences = len(positions)
        if occurrences > 5:
            val.extend(positions[:int(occurrences * 0.3)])
        elif occurrences > 1:
            val.append(positions[0])

    return val


def takePerc(data, perc, elem):
    """Return the first positions of ``elem`` in ``data``, up to a share of its count.

    Args:
        data: Sequence supporting ``len()`` and integer indexing.
        perc: Fraction of the occurrences of ``elem`` to take.
        elem: Value whose positions are collected.

    Returns:
        List with the first ``int(countDup(data, elem) * perc)`` positions at
        which ``data`` equals ``elem``.
    """
    quota = int(countDup(data, elem) * perc)
    picked = []

    for pos in range(len(data)):
        # Stop scanning as soon as the quota is filled.
        if len(picked) >= quota:
            break
        if data[pos] == elem:
            picked.append(pos)

    return picked


def split(x, y, index):
    """Split features and target into training and validation parts.

    ``index`` is interpreted as row *positions*. The previous version mixed
    positional lookups (``x.index[i]``) with label lookups (``y[i]``,
    ``drop([i])``), which only agree on a default 0..n-1 index. It also
    re-copied the whole frame once per selected row.

    Args:
        x: Feature ``DataFrame``.
        y: Target ``Series`` aligned row by row with ``x``.
        index: Integer positions of the rows that form the validation set.

    Returns:
        Tuple ``(x_train, x_val, y_train, y_val)``: ``x_train`` and ``y_train``
        are ``x`` and ``y`` without the selected rows; ``x_val`` and ``y_val``
        are NumPy arrays holding the selected rows, in the order given.
    """
    # An explicit integer dtype keeps an empty selection valid for .iloc.
    positions = np.asarray(index, dtype = int)

    x_val = x.iloc[positions].to_numpy()
    y_val = y.iloc[positions].to_numpy()

    # Remove all selected rows in a single operation.
    x_train = x.drop(index = x.index[positions])
    y_train = y.drop(index = y.index[positions])

    return x_train, x_val, y_train, y_val

# Select the validation rows from the target values, then carve them out of x / y.
index = np.array(validationSplit(y))
x_train, x_val, y_train, y_val = split(x, y, index)

# Age count plots for the full set, the training part and the validation part, in that order.
for ages in (y, y_train, y_val):
    sns.set_style('whitegrid')
    sns.countplot(x = 'age', data = pd.DataFrame(ages, columns = ['age']), palette = 'rainbow')
    plt.show()

## Data pipeline

#### Show Training History

In [None]:
def show_history(history):
    """Print the recorded metric names and plot training vs. validation loss.

    Args:
        history: Object whose ``history`` attribute is a mapping containing
            the per-epoch ``'loss'`` and ``'val_loss'`` sequences.
    """
    recorded = history.history
    print(recorded.keys())

    # Loss curves: training first, validation second (matches the legend order).
    for metric in ('loss', 'val_loss'):
        plt.plot(recorded[metric])

    plt.title('model loss')
    plt.ylabel('loss')
    plt.xlabel('epoch')
    plt.legend(['train', 'val'], loc = 'upper right')
    plt.show()

#### Prepare the training callbacks

In [None]:
# Batch size used for both fitting and evaluation.
BATCHSIZE = 1

def prepare_callbacks(file):
    """Build the Keras callbacks used while fitting a model.

    Args:
        file: Path where the best weights (lowest ``val_loss``) are saved.

    Returns:
        List ``[checkpoint, early stopping, learning-rate reduction]``, all
        monitoring ``val_loss``.
    """
    return [
        # Keep only the weights of the best epoch seen so far.
        ModelCheckpoint(
            filepath = file,
            monitor = 'val_loss',
            verbose = 1,
            save_weights_only = True,
            save_best_only = True,
        ),
        # Stop after 50 epochs without an improvement of at least 1e-5.
        EarlyStopping(
            monitor = 'val_loss',
            min_delta = 0.00001,
            patience = 50,
            verbose = 1,
        ),
        # Halve the learning rate after 5 stagnant epochs, down to 1e-9.
        ReduceLROnPlateau(
            monitor = 'val_loss',
            factor = 0.5,
            patience = 5,
            min_lr = 0.000000001,
            verbose = 1,
        ),
    ]

In [None]:
def createModel(output, input):
    """Build and compile the fully connected regression network.

    Architecture: flatten, eight ``Dense(512)`` + ``LeakyReLU(0.01)`` pairs,
    one more ``Dense(512)``, then a linear ``Dense(output)`` head. Compiled
    with Adam (learning rate 0.001) and mean absolute error loss.

    Args:
        output: Number of units in the output layer.
        input: Number of input features per sample.

    Returns:
        The compiled ``Sequential`` model.
    """
    model = Sequential()
    model.add(Flatten(input_shape = (input,)))

    # Eight identical hidden blocks.
    for _ in range(8):
        model.add(Dense(512))
        model.add(LeakyReLU(alpha = 0.01))

    # NOTE(review): this ninth hidden layer has no activation before the linear
    # head — confirm this is intentional before changing it (saved checkpoints
    # were trained with this exact layout).
    model.add(Dense(512))
    model.add(Dense(output, activation = 'linear'))

    model.compile(optimizer = Adam(learning_rate = 0.001), loss = 'mae')

    return model


# Throwaway instance used only to inspect the architecture.
test = createModel(1, x_train.shape[1])
# summary() prints the table itself and returns None, so wrapping it in print()
# only appended a stray "None" line to the output.
test.summary()

In [None]:
# Set to True to (re)train the ensemble; training is skipped otherwise.
TRAIN = False

def runningPipeline(path, n_models = 5, epochs = 1000):
    """Train ``n_models`` independent models and checkpoint each one's best weights.

    Model ``i`` is built with ``createModel`` and fitted on
    ``x_train`` / ``y_train`` with ``x_val`` / ``y_val`` as validation data.
    Its best weights are written to ``<path>/model<i>/cp.ckpt`` by the
    callbacks from ``prepare_callbacks``.

    Args:
        path: Directory under which the per-model checkpoint folders are created.
        n_models: Number of models to train (default 5, the previous fixed value).
        epochs: Maximum number of epochs per model (default 1000, the previous
            fixed value).

    Returns:
        List with the ``History`` object returned by each ``fit`` call, so the
        curves can be inspected afterwards (previously they were discarded).
    """
    histories = []

    for i in range(n_models):
        model = createModel(1, x_train.shape[1])

        history = model.fit(x = x_train, y = y_train, epochs = epochs, batch_size = BATCHSIZE,
                            validation_data = (x_val, y_val),
                            callbacks = prepare_callbacks(path + f'/model{i}/cp.ckpt'))
        histories.append(history)

    return histories

if TRAIN:
    runningPipeline('./models')

## Evaluate

In [None]:
# Set to False to skip evaluating the saved checkpoints.
EVALUATE = True

def evaluatePipeline(path, n_models = 5):
    """Evaluate every saved model on the validation set.

    For each ``i`` in ``range(n_models)`` a fresh model is built, the weights
    stored at ``<path>/model<i>/cp.ckpt`` are loaded, and the validation loss
    is printed.

    Args:
        path: Directory containing the per-model checkpoint folders.
        n_models: Number of checkpoints to evaluate (default 5, the previous
            fixed value).

    Returns:
        List of the values returned by ``model.evaluate``, in model order
        (previously they were only printed).
    """
    losses = []

    for i in range(n_models):
        model = createModel(1, x_train.shape[1])

        model.load_weights(path + f'/model{i}/cp.ckpt')

        loss = model.evaluate(x_val, y_val, batch_size = BATCHSIZE, verbose = 2)
        print(loss)
        losses.append(loss)

    return losses

if EVALUATE:
    evaluatePipeline('./models')

## Load Best Model

In [None]:
# Index of the checkpoint to use from here on; it is a hard-coded choice, not
# derived from the evaluation output.
bestModel = 2

model = createModel(output = 1, input = x_train.shape[1])
model.load_weights(f'./models/model{bestModel}/cp.ckpt')

## Feature Importance

In [None]:
# Set to True to compute permutation importances on the validation set.
FEATURE_IMPORTANCE = False

if FEATURE_IMPORTANCE:
    # A list-valued `scoring` makes the result a dict keyed by scorer name;
    # only the single requested entry is kept.
    r = permutation_importance(
        model,
        x_val,
        y_val,
        n_repeats = 5,
        random_state = 0,
        scoring = ['neg_mean_absolute_error'],
    )['neg_mean_absolute_error']

In [None]:
NUM_FEATURES = 10
count = 0
dic = {}

# `r` only exists when the importance cell actually ran. Without this guard the
# cell raised NameError on a fresh "Restart & Run All" with the flag off.
if FEATURE_IMPORTANCE:
    # Walk the features from highest to lowest mean importance and keep up to
    # NUM_FEATURES whose mean stays positive after subtracting two standard
    # deviations.
    for i in r.importances_mean.argsort()[::-1]:
        if count >= NUM_FEATURES:
            break

        if r.importances_mean[i] - 2 * r.importances_std[i] > 0:
            dic[x.columns[i]] = (r.importances_mean[i], r.importances_std[i])
            count += 1

    print(dic)

## Hard Subjects to Predict

In [None]:
# Rounded predictions for every row of the full feature table `x`.
predictions = np.around(model.predict(x), 0)

NUM_ROWS = 10

# 1-based row number -> absolute error between rounded prediction and true age.
dic = {}
for p, predicted in enumerate(predictions):
    dic[p + 1] = abs(predicted - y[p])[0]

# Re-create the mapping ordered by ascending error (ties keep their row order).
dic = dict(sorted(dic.items(), key = lambda item: item[1]))

# The NUM_ROWS largest errors sit at the end of the ordered mapping.
top = list(dic.items())[-NUM_ROWS:]
print(top)

## Predictions

In [None]:
# NOTE: this cell rebinds `df_test` and `test_data`, so it can only run once per
# kernel session after the loading cells (a second run fails on the missing
# 'id' column).
df_test = df_test.drop(['id'], axis = 1)

# Set to True to compute predictions for the test set.
PREDICT = False

# Flatten every test matrix into the entries strictly below the diagonal
# (i > j), visited row by row. This is the same order the previous nested loops
# produced, done with a single fancy-indexing operation.
test_stack = np.asarray(test_data)
lower_rows, lower_cols = np.tril_indices(test_stack.shape[1], k = -1, m = test_stack.shape[2])

new_test_data = pd.DataFrame(test_stack[:, lower_rows, lower_cols])

# Remaining test-table columns first, flattened matrix features after.
test_data = pd.concat([df_test, new_test_data], axis = 1)

if PREDICT:
    predictions = model.predict(test_data)
    predictions = np.around(predictions, 0)
    print(predictions)

In [None]:
# Set to True to write the test-set predictions to 'submission.csv'.
WRITE = False

if WRITE:
    # `predictions` is also assigned by the training-set error analysis cell.
    # Unless the test-set prediction step ran, the file would silently contain
    # training-set predictions.
    if not PREDICT:
        raise RuntimeError('PREDICT is False: `predictions` does not hold test-set predictions.')

    # The context manager flushes and closes the file even if a write fails;
    # the previous version never closed the handle.
    with open('submission.csv', 'w') as f:
        f.write('id,age\n')

        # Ids are 1-based row numbers; ages are written as integers.
        for i in range(len(predictions)):
            f.write(str(i + 1) + ',' + str(int(predictions[i][0])) + '\n')