In [None]:
import pandas as pd
import numpy as np
# Paths of the .npz archives passed to read_embeddings() below.
# The trailing "@param" markers are Colab form annotations; keep each assignment on one line.
tune_path = 'nlst_tune_with_labels.npz' # @param {type:"string"}
train_path = 'nlst_train_with_labels.npz' # @param {type:"string"}

def read_embeddings(f):
    """Load the first array of an ``.npz`` archive as a DataFrame.

    The first entry of the archive is unwrapped with ``.item()`` and is
    expected to be a dict of the form ``{row_key: {column: value, ...}}``;
    each outer key becomes a row label and each inner key a column.

    Args:
        f: Path or file object accepted by ``np.load``.

    Returns:
        pandas.DataFrame built with ``orient='index'``.

    Raises:
        IndexError: if the archive contains no arrays.
    """
    # SECURITY: allow_pickle=True unpickles arbitrary Python objects, which can
    # execute code. Only load archives from a trusted source.
    # The context manager closes the archive's file handle, which a bare
    # np.load() call would leave open for the life of the kernel.
    with np.load(f, allow_pickle=True) as data:
        key = data.files[0]
        # Materialize the dict before the archive is closed.
        records = data[key].item()
    return pd.DataFrame.from_dict(records, orient='index')

# Load the tune and train embedding tables (tune first, then train).
df_tune, df_train = (read_embeddings(path) for path in (tune_path, train_path))

# Per-participant table; add_demo() looks up age / gender / race in it by 'pid'.
demo = pd.read_csv('nlst_780_prsn_idc_20210527.csv')

In [None]:
def get_pid(df):
    """Add a ``pid`` column parsed from the row labels.

    Each index label is split on '/' and its second component is converted
    to ``int``. The frame is modified in place and also returned.
    """
    patient_ids = []
    for label in df.index:
        segments = label.split('/')
        patient_ids.append(int(segments[1]))
    df['pid'] = patient_ids
    return df
# Attach the 'pid' column to both tables (get_pid mutates and returns its argument).
df_tune, df_train = get_pid(df_tune), get_pid(df_train)

In [None]:
def add_demo(df):
    """Attach ``age``, ``gender`` and ``race`` from the global ``demo`` table.

    Rows are matched on ``pid``. When ``demo`` lists a pid more than once,
    the first occurrence is used. ``df`` is modified in place and returned.

    Args:
        df: DataFrame with a ``pid`` column.

    Returns:
        The same ``df`` object with the three demographic columns set.

    Raises:
        ValueError: if any pid in ``df`` is absent from ``demo``.
    """
    columns = ['age', 'gender', 'race']
    # A single indexed lookup replaces the per-row list.index() scan, which was
    # quadratic. keep='first' preserves the first-match behavior of list.index().
    lookup = demo.drop_duplicates(subset='pid', keep='first').set_index('pid')[columns]

    pids = df['pid']
    missing = pids[~pids.isin(lookup.index)]
    if len(missing) > 0:
        # Unmatched pids used to be skipped silently, which then surfaced as an
        # opaque length-mismatch error on assignment. Name the offenders instead.
        examples = sorted(set(missing.tolist()))[:10]
        raise ValueError(
            f"{missing.nunique()} pid(s) in df not found in demo, e.g. {examples}"
        )

    matched = lookup.loc[pids.to_numpy()]
    for col in columns:
        # Assign raw values so rows are paired by position, not by index label.
        df[col] = matched[col].to_numpy()

    return df

# Add the age / gender / race columns to the tune table.
df_tune = add_demo(df_tune)

In [None]:
# Add the age / gender / race columns to the train table.
df_train = add_demo(df_train)

## gender

In [None]:
import tensorflow as tf 
import keras
from sklearn.metrics import roc_auc_score
from keras.layers import Dense
from tensorflow.keras.losses import BinaryFocalCrossentropy
from keras.models import Sequential

import numpy as np
def make_mlp(input_dim=1408):
    """Build an uncompiled single-layer binary classifier.

    Args:
        input_dim: Length of each input feature vector. Defaults to 1408,
            the width that was previously hard-coded.

    Returns:
        A Keras ``Sequential`` model holding one ``Dense`` unit with a
        sigmoid activation.
    """
    model = Sequential()
    model.add(Dense(1, activation='sigmoid', input_dim=input_dim))
    return model


import os

# Feature matrices and labels for the gender model (train split / tune split).
X_train = np.array(list(df_train['embedding']))
y_train = np.array(list(df_train['gender']))
X_test = np.array(list(df_tune['embedding']))
y_test = np.array(list(df_tune['gender']))

print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

# Shift the labels down by one before training; the two prints show the label
# set before and after the shift.
# NOTE(review): assumes gender is coded {1, 2} in the demographics CSV so the
# shifted labels are {0, 1} — confirm against the data dictionary.
print(np.unique(y_train), np.unique(y_test))
y_train = y_train - 1
y_test = y_test - 1
print(np.unique(y_train), np.unique(y_test))


model = make_mlp()
model.compile(loss=tf.keras.losses.BinaryFocalCrossentropy(), optimizer=tf.keras.optimizers.Adam(learning_rate = 0.001), metrics=['AUC'])
from keras.callbacks import CSVLogger
# CSVLogger opens its file when training starts and does not create parent
# directories, so make sure the output folder exists first.
os.makedirs('saved_csv', exist_ok=True)
csv_logger = CSVLogger('saved_csv/gender.csv') #change the name of this to reflect your model

# use_multiprocessing was dropped: it only applies to generator / Sequence
# inputs (these are NumPy arrays) and newer Keras versions reject the argument.
history = model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=5, batch_size=16, verbose=1, shuffle=True, callbacks=[csv_logger])

## race

In [None]:
def clean_data(X, y):
    """Keep only the samples whose label is at most 3.

    Args:
        X: Array of features, indexed along its first axis by sample.
        y: Array of labels aligned with ``X``.

    Returns:
        Tuple ``(X_kept, y_kept)`` selected with the same index set.
    """
    keep = np.nonzero(y <= 3)
    X_kept = X[keep]
    y_kept = y[keep]
    return X_kept, y_kept

def integer_to_categorical(array, num_classes=3):
    """One-hot encode 1-based integer class labels.

    Args:
        array: Array-like of labels in ``1..num_classes``. Float input is
            accepted when every value is a whole number (e.g. a label column
            that pandas stored as float64).
        num_classes: Number of categories, i.e. output columns. Defaults to 3,
            the width that was previously hard-coded.

    Returns:
        Float array of shape ``(array.size, num_classes)`` with a single 1 per
        row, in column ``label - 1``.

    Raises:
        ValueError: if a label is not a whole number (including NaN) or lies
            outside ``1..num_classes``.
    """
    labels = np.asarray(array).ravel()
    if not np.issubdtype(labels.dtype, np.integer):
        # Fancy indexing needs integer indices; accept whole-valued floats but
        # refuse anything that would be silently truncated (NaN compares unequal).
        whole = np.rint(labels.astype(float))
        if not np.array_equal(whole, labels):
            raise ValueError("All labels must be whole numbers.")
        labels = whole.astype(int)
    if np.any((labels < 1) | (labels > num_classes)):
        # The message now reports the bound that is actually enforced.
        raise ValueError(f"All integers must be between 1 and {num_classes}.")
    # One row per label, one column per category.
    categorical = np.zeros((labels.size, num_classes))
    # Advanced indexing sets entry (row i, column label_i - 1) to 1.
    categorical[np.arange(labels.size), labels - 1] = 1
    return categorical

from sklearn.preprocessing import label_binarize

def compute_per_class_auc(y_true, y_score):
    """Compute one ROC AUC per column of ``y_score``.

    ``y_true`` is passed through ``label_binarize`` with classes
    ``0..n_classes-1`` (``n_classes`` being the number of score columns), and
    column ``i`` of the result is scored against column ``i`` of ``y_score``.

    Args:
        y_true: True labels, in any form ``label_binarize`` accepts for the
            class set above.
        y_score: 2-D array of per-class scores, one column per class.

    Returns:
        Dict mapping class index to the value returned by ``roc_auc_score``.
    """
    # The score matrix fixes the number of classes.
    num_classes = y_score.shape[1]
    class_ids = np.arange(num_classes)

    # Indicator matrix of the true labels, one column per class.
    indicator = label_binarize(y_true, classes=class_ids)

    # Score each class column independently.
    auc_by_class = {}
    for class_idx in range(num_classes):
        auc_by_class[class_idx] = roc_auc_score(
            indicator[:, class_idx], y_score[:, class_idx]
        )
    return auc_by_class

In [None]:

def make_mlp(input_dim=1408):
    """Build an uncompiled single-layer 3-class classifier.

    Args:
        input_dim: Length of each input feature vector. Defaults to 1408,
            the width that was previously hard-coded.

    Returns:
        A Keras ``Sequential`` model holding one ``Dense`` layer with three
        softmax outputs.
    """
    model = Sequential()
    model.add(Dense(3, activation='softmax', input_dim=input_dim))
    return model


import os
# Imported here as well so this cell does not depend on an earlier cell having run.
from keras.callbacks import CSVLogger

# Feature matrices and labels for the race model (train split / tune split).
X_train = np.array(list(df_train['embedding']))
y_train = np.array(list(df_train['race']))
X_test = np.array(list(df_tune['embedding']))
y_test = np.array(list(df_tune['race']))

#only use white, black, asian
# (clean_data keeps the samples whose label is <= 3)
X_train, y_train = clean_data(X_train, y_train)
X_test, y_test = clean_data(X_test, y_test)

# One-hot targets for the categorical cross-entropy loss.
y_train = integer_to_categorical(y_train)
y_test = integer_to_categorical(y_test)

print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

# Number of distinct one-hot rows present in each split.
print(np.unique(y_train, axis=0).shape, np.unique(y_test, axis=0).shape)

model = make_mlp()

model.compile(loss=tf.keras.losses.CategoricalCrossentropy(), optimizer=tf.keras.optimizers.Adam(learning_rate = 0.001), metrics=[tf.keras.metrics.AUC(multi_label = True)])
# CSVLogger opens its file when training starts and does not create parent
# directories, so make sure the output folder exists first.
os.makedirs('saved_csv', exist_ok=True)
csv_logger = CSVLogger('saved_csv/race.csv') #change the name of this to reflect your model
# use_multiprocessing was dropped: it only applies to generator / Sequence
# inputs (these are NumPy arrays) and newer Keras versions reject the argument.
history = model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=5, batch_size=16, verbose=1, shuffle=True, callbacks=[csv_logger])

# Class scores on the tune split, consumed by the per-class AUC cell below.
y_pred = model.predict(X_test)

In [None]:
# Display names for the three output columns, in column order.
race = ['White', "Black or African-American", "Asian"]

# Per-class AUC keyed by race name, rounded to 4 decimals.
per_class_auc = compute_per_class_auc(y_test, y_pred)
d = {race[class_idx]: np.round(auc, 4) for class_idx, auc in per_class_auc.items()}
d

In [None]:
# Mean and standard deviation of the per-class AUCs.
auc_values = list(d.values())
np.mean(auc_values), np.std(auc_values)

## age

In [None]:
def make_mlp(input_dim=1408):
    """Build an uncompiled single-layer linear regressor.

    Args:
        input_dim: Length of each input feature vector. Defaults to 1408,
            the width that was previously hard-coded.

    Returns:
        A Keras ``Sequential`` model holding one ``Dense`` unit with no
        activation argument (Keras' default for ``Dense``).
    """
    model = Sequential()
    model.add(Dense(1, input_dim=input_dim))
    return model


import os

# Feature matrices and regression targets for the age model.
# The target is age minus 55.
# NOTE(review): 55 is presumably the minimum enrolment age, making targets
# start at 0 — confirm against the dataset documentation.
X_train = np.array(list(df_train['embedding']))
y_train = np.array(list(df_train['age'])) - 55
X_test = np.array(list(df_tune['embedding']))
y_test = np.array(list(df_tune['age'])) - 55


print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

# Distinct shifted-age values present in each split.
print(np.unique(y_train, axis=0), np.unique(y_test, axis=0))

model = make_mlp()
model.compile(loss='mse', metrics=[tf.keras.metrics.RootMeanSquaredError()], optimizer=tf.keras.optimizers.Adam(learning_rate = 0.001))

from keras.callbacks import CSVLogger
# CSVLogger opens its file when training starts and does not create parent
# directories, so make sure the output folder exists first.
os.makedirs('saved_csv', exist_ok=True)
csv_logger = CSVLogger('saved_csv/age.csv') #change the name of this to reflect your model
# NOTE(review): patience=10 exceeds epochs=5, so this callback cannot stop the
# run early as configured. Raise epochs or lower patience if early stopping is wanted.
callback = tf.keras.callbacks.EarlyStopping(monitor='val_root_mean_squared_error', mode='min', verbose=1, patience=10)
# use_multiprocessing was dropped: it only applies to generator / Sequence
# inputs (these are NumPy arrays) and newer Keras versions reject the argument.
history = model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=5, batch_size=16, verbose=1, shuffle=True, callbacks=[callback, csv_logger])