In [76]:
import numpy as np
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.models import load_model, Model
from sklearn.preprocessing import StandardScaler, maxabs_scale
from keras.optimizers import Adam, SGD
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import roc_auc_score,f1_score, roc_curve, auc, accuracy_score, confusion_matrix
import mltest
import tensorflow as tf
import keras.backend as K

In [25]:
# Read the merged feature/label table, the processed metadata table, and the
# two photo-id tables (sam_photo_ids.csv / mam_photo_ids.csv).
merged_df = pd.read_csv(
    '/data/meron/kenya_data/meron_merged_mal(sam_mam_upsampled)_muac.csv'
)
meta_data = pd.read_csv(
    '/data/meron/kenya_data/meron_link_data/meron_meta_processed.csv'
)
sam_names = pd.read_csv('/data/meron/sam_photo_ids.csv')
mam_names = pd.read_csv('/data/meron/mam_photo_ids.csv')

# Keep only the rows whose 'wfh' value is strictly below 4.
merged_df = merged_df.loc[merged_df['wfh'] < 4]

In [26]:
# Row masks over merged_df: True where the row's photo_id appears in the
# corresponding id table.
in_sam_ids = merged_df['photo_id'].isin(sam_names['photo_id'])
in_mam_ids = merged_df['photo_id'].isin(mam_names['photo_id'])

sam_samples = merged_df.loc[in_sam_ids]
mam_samples = merged_df.loc[in_mam_ids]

# Rows whose photo_id is in neither id table.
other_samples = merged_df.loc[~(in_sam_ids | in_mam_ids)]

In [27]:
# Positional split of each group: the leading rows become the training part,
# everything after them becomes the validation part.
sam_train, sam_validation = sam_samples.iloc[:188], sam_samples.iloc[188:]
mam_train, mam_validation = mam_samples.iloc[:750], mam_samples.iloc[750:]

In [28]:
# Columns pulled out as targets in this cell.
label_cols = ['maln_mam', 'maln_normal', 'maln_sam']
muac_col = 'muac_prediction'

# First group: label matrices for train/validation plus the validation
# 'muac_prediction' vector.
target_sam_train = np.array(sam_train[label_cols])
target_sam_validation = np.array(sam_validation[label_cols])
target_sam_muac_validation = np.array(sam_validation[muac_col])

# Second group: same three arrays.
target_mam_train = np.array(mam_train[label_cols])
target_mam_validation = np.array(mam_validation[label_cols])
target_mam_muac_validation = np.array(mam_validation[muac_col])

# Remaining samples: one label matrix and one 'muac_prediction' vector
# covering all of other_samples.
target_other = np.array(other_samples[label_cols])
target_other_muac_validation = np.array(other_samples[muac_col])

In [29]:
# Columns excluded from the feature frames; every other column of merged_df
# is kept, in its original order.
deselect_cols = [
    'photo_id', 'maln_mam', 'maln_normal', 'maln_sam',
    'wfh', 'maln_class', 'muac_prediction',
]
select = [col for col in merged_df.columns if col not in deselect_cols]

# Feature frames for each subset, restricted to the selected columns.
sam_train_features = sam_train[select]
sam_validation_features = sam_validation[select]

mam_train_features = mam_train[select]
mam_validation_features = mam_validation[select]

other_features = other_samples[select]
other_features.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2040,2041,2042,2043,2044,2045,2046,2047,gender_male,age_months
4,0.0,0.369026,2.461378,2.946527,0.661946,4.072535,0.526728,6.682256,0.0,0.0,...,0.687893,0.0,0.0,1.743518,0.575827,0.0,2.062518,0.352439,1,18.0
9,6.018714,0.0,4.922188,0.250943,3.632266,0.441533,2.783796,9.161777,0.0,0.0,...,0.0,0.0,0.451506,3.653616,0.73475,0.0,0.0,0.0,0,25.0
10,0.302067,0.0,0.856374,4.006856,0.0,0.476372,4.314477,11.858264,0.0,0.0,...,1.534502,0.0,0.453811,16.605633,0.0,0.845772,0.506828,0.0,1,40.0
11,0.0,0.29694,4.927894,0.212098,0.008911,1.134834,3.333884,2.95491,1.047672,0.0,...,1.352508,0.005078,0.010983,8.27194,0.0,0.0,0.876071,0.032372,0,41.0
12,0.0,0.228124,0.801086,0.476915,1.492239,1.457904,1.052291,18.379776,0.0,0.0,...,0.005074,0.0,0.0,12.780549,0.740513,0.0,1.912864,1.549561,0,28.0


In [30]:
# Stack all five feature frames row-wise into a single table.
feature_frames = [
    other_features,
    sam_train_features,
    sam_validation_features,
    mam_train_features,
    mam_validation_features,
]
whole_data = pd.concat(feature_frames, axis=0)

In [31]:
### scale/standardize the features?
# A single StandardScaler is fitted on whole_data and then applied to each
# feature subset separately.
# NOTE(review): whole_data also contains the validation feature frames, so the
# scaler statistics are computed with held-out rows included — confirm that
# this is intended.
conv_scaler = StandardScaler()
conv_scaler.fit(whole_data)

sam_train_features_scaled = conv_scaler.transform(sam_train_features)
sam_validation_features_scaled = conv_scaler.transform(sam_validation_features)

mam_train_features_scaled = conv_scaler.transform(mam_train_features)
mam_validation_features_scaled = conv_scaler.transform(mam_validation_features)

other_features_scaled = conv_scaler.transform(other_features)

In [32]:
# Stratified 80/20 split of the scaled 'other' samples (fixed random_state).
X_train1, X_test1, y_train1, y_test1 = train_test_split(
    other_features_scaled,
    target_other,
    test_size=0.2,
    random_state=42,
    stratify=target_other,
)

# Training arrays: the train share of the split followed by the two
# pre-sliced training groups, stacked along the row axis.
train_feature_parts = (X_train1, sam_train_features_scaled, mam_train_features_scaled)
train_label_parts = (y_train1, target_sam_train, target_mam_train)
X_train = np.concatenate(train_feature_parts, axis=0)
y_train = np.concatenate(train_label_parts, axis=0)

# Test arrays: the test share of the split followed by the two validation
# groups, in the same order as the training arrays.
test_feature_parts = (X_test1, sam_validation_features_scaled, mam_validation_features_scaled)
test_label_parts = (y_test1, target_sam_validation, target_mam_validation)
X_test = np.concatenate(test_feature_parts, axis=0)
y_test = np.concatenate(test_label_parts, axis=0)

In [97]:

def check_dim(input_array, label_array, n_features=2050, n_classes=3):
    """Assert that a feature matrix and a label matrix have the expected widths.

    Parameters
    ----------
    input_array : object with a ``shape`` attribute
        Feature matrix; the size of its second axis must equal ``n_features``.
    label_array : object with a ``shape`` attribute
        Label matrix; the size of its second axis must equal ``n_classes``.
    n_features : int, optional
        Expected number of feature columns (default 2050).
    n_classes : int, optional
        Expected number of label columns (default 3).

    Raises
    ------
    AssertionError
        If either array's second-axis size differs from the expected value.
    """
    # The message reports both the expected and the actual width so a failing
    # check in the notebook is immediately diagnosable.
    assert input_array.shape[1] == n_features, (
        "expected {} feature columns, got {}".format(n_features, input_array.shape[1])
    )
    assert label_array.shape[1] == n_classes, (
        "expected {} label columns, got {}".format(n_classes, label_array.shape[1])
    )
    

In [98]:
# Verify the column counts of the assembled training arrays.
check_dim(input_array=X_train, label_array=y_train)

In [73]:
# Display the shape of the concatenated test label array.
y_test.shape

(838, 3)