# Survey data tidying code

*Jason Heeris, NAL*

The following code will recurse through the data structures for *challenge 2* and convert it to nested dicts and `ndarray`s. The keys remain the same. Within each dict you must index again by subject ID eg. `4222`. See the examples below.

In [1]:
import scipy.io

def dict_from_nobj(objdata):
    if objdata.dtype.kind != 'V':
        return objdata
        
    names = set(objdata.dtype.names)
    
    as_dict = {
        nm: dict_from_nobj(objdata[nm][()])
        for nm in names
    }
    return as_dict

def zip_with_index(data, index):
    if type(data) == dict:
        return {
            k: zip_with_index(data[k], index) for k in data
        }
    else:
        return {
            iv: data[num]            
            for num, iv in enumerate(index)
        }

def convert_survey_to_nested_dicts(data, index_name):
    index_vals = data[index_name][()]
    return zip_with_index(data, index_vals)

survey_data = scipy.io.loadmat('Hackathon Data/Challenge2/data/ClinicalData.mat', squeeze_me=True)
tidied = convert_survey_to_nested_dicts(dict_from_nobj(survey_data['MyIHeardata']), 'SubjID')

# Examples:

In [30]:
from pprint import pprint
print(tidied['TER']['SNR_Right'][4222])
print(tidied['PTA_Left'][998])
print(tidied['OAE_P2_Amplitude_Right'])

[13.5 23.5 23.2 23.5 10.1]
[15. 10.  0.  5. 10. 10. 15.]
{998: array([nan, nan, nan, nan, nan, nan, nan, nan]), 999: array([nan, nan, nan, nan, nan, nan, nan, nan]), 1001: array([11.56263  , 14.02164  ,  6.088096 , 15.07891  ,  0.5836863,
        8.160465 , 12.03649  , -1.902029 ]), 1002: array([ 9.167895,  9.211441,  8.229638,  2.466634,  8.86393 , 16.05083 ,
       16.94271 ,  4.87931 ]), 1003: array([21.04252 , 17.55815 , 11.55905 , 13.39359 , 16.43727 , 15.80256 ,
       16.20273 ,  3.330749]), 1004: array([15.33805 , 12.43796 , 11.1897  ,  9.48939 , 12.40654 ,  9.842617,
       13.869   ,  3.398509]), 1005: array([ 2.660547 , -0.4901747,  1.551638 ,  3.193802 ,  2.397072 ,
        7.669169 , -5.8142   , 14.94306  ]), 1006: array([ 5.210936, 12.14657 , 13.3499  ,  7.788217, 12.29796 , 15.21828 ,
       18.20079 , -2.99439 ]), 1007: array([19.5851 , 13.29337, 12.81538, 14.41912, 14.17577, 17.87767,
       17.86829, 12.06431]), 1010: array([14.27249 , 12.60135 ,  1.59484 ,  7.217408,

In [3]:
import numpy as np
from scipy.io import loadmat  # this is the SciPy module that loads mat-files
import matplotlib.pyplot as plt
from datetime import datetime, date, time
import pandas as pd

mat = loadmat('Hackathon Data/Challenge2/data/ClinicalData.mat')  # load mat-file
mdata = mat['MyIHeardata']  # variable in mat file
mdtype = mdata.dtype  # dtypes of structures are "unsized objects"
# * SciPy reads in structures as structured NumPy arrays of dtype object
# * The size of the array is the size of the structure array, not the number
#   elements in any particular field. The shape defaults to 2-dimensional.
# * For convenience make a dictionary of the data using the names from dtypes
# * Since the structure has only one element, but is 2-D, index it at [0, 0]
ndata = {n: mdata[n][0, 0] for n in mdtype.names}

In [4]:
indexs = ndata['SubjID']

In [5]:
good_col = []
bad_col = []
column_names = []
for key in ndata.keys():
    print(key)
    if ndata[key].shape[0] == 1:
        bad_col.append(key)
    else:
        if ndata[key].shape[1] > 1:

            add =  [key + str(k) for k in range(ndata[key].shape[1])]
            print(add)
            column_names += add
        else:
            column_names.append(key)
        good_col.append(key)

OAE_P2_Amplitude_Right
['OAE_P2_Amplitude_Right0', 'OAE_P2_Amplitude_Right1', 'OAE_P2_Amplitude_Right2', 'OAE_P2_Amplitude_Right3', 'OAE_P2_Amplitude_Right4', 'OAE_P2_Amplitude_Right5', 'OAE_P2_Amplitude_Right6', 'OAE_P2_Amplitude_Right7']
Adults_Dosimeter_personal_Stereo_moreThan85dBSPL_Left
SchooAge_Questioanire_Believes_Loud_Noisy_Leisure_Risk
SchooAge_Dosimeter_School_moreThan90dBA_SPL
Adults_Dosimeter_OtherLeisure_moreThan80dBA_SPL
Adults_Questioanire_EducationalLevel
SchooAge_Dosimeter_personal_Stereo_moreThan95dBA_SPL_Left
SchooAge_Questioanire_Behaviour_Avoids_Loud_Sounds
Adults_Dosimeter_personal_Stereo_moreThan70dBSPL_Right
Adults_Dosimeter_personal_Stereo_moreThan80dBSPL_Left
Otoscopy_stenosis_Right
SchooAge_Questioanire_Tinitus_Right
SchooAge_Dosimeter_personal_Stereo_moreThan80dBA_SPL_Left
Adults_Dosimeter_OtherLeisure_moreThan85dBA_SPL
Adults_Dosimeter_OtherLeisure_moreThan95dBA_SPL
SchooAge_Dosimeter_School_moreThan85dBA_SPL
Adults_Dosimeter_OtherLeisure_moreThan90dBSPL


In [7]:
data = np.hstack([ndata[c] for c in good_col])
print(data.shape)
print(len(column_names))

(1469, 249)
249
['OAE_P2_Amplitude_Right0', 'OAE_P2_Amplitude_Right1', 'OAE_P2_Amplitude_Right2', 'OAE_P2_Amplitude_Right3', 'OAE_P2_Amplitude_Right4', 'OAE_P2_Amplitude_Right5', 'OAE_P2_Amplitude_Right6', 'OAE_P2_Amplitude_Right7', 'Adults_Dosimeter_personal_Stereo_moreThan85dBSPL_Left', 'SchooAge_Questioanire_Believes_Loud_Noisy_Leisure_Risk', 'SchooAge_Dosimeter_School_moreThan90dBA_SPL', 'Adults_Dosimeter_OtherLeisure_moreThan80dBA_SPL', 'Adults_Questioanire_EducationalLevel', 'SchooAge_Dosimeter_personal_Stereo_moreThan95dBA_SPL_Left', 'SchooAge_Questioanire_Behaviour_Avoids_Loud_Sounds', 'Adults_Dosimeter_personal_Stereo_moreThan70dBSPL_Right', 'Adults_Dosimeter_personal_Stereo_moreThan80dBSPL_Left', 'Otoscopy_stenosis_Right', 'SchooAge_Questioanire_Tinitus_Right', 'SchooAge_Dosimeter_personal_Stereo_moreThan80dBA_SPL_Left', 'Adults_Dosimeter_OtherLeisure_moreThan85dBA_SPL', 'Adults_Dosimeter_OtherLeisure_moreThan95dBA_SPL', 'SchooAge_Dosimeter_School_moreThan85dBA_SPL', 'Adults_

In [8]:
temp = [ndata[c] for c in bad_col]

In [9]:
l = []
new_cols = []
for i in range(0,len(temp)):
    for name in temp[i].dtype.names:
        print(temp[i][name][0][0].shape)
        if temp[i][name][0][0].shape[1]>1:
            cols = [name + str(k) for k in range(temp[i][name][0][0].shape[1])]
            new_cols += cols
        else:
            new_cols.append(name)
        l.append(temp[i][name])  

(1469, 1)
(1469, 1)
(1469, 5)
(1469, 5)
(1469, 1)
(1469, 1)


In [17]:
bad_col

['Adults_Questioanire_Awareness', 'TER', 'SchooAge_Questioanire_Awareness']

In [18]:
x = np.hstack([np.hstack(l).T[i][0] for i in range(6)])

In [20]:
print(x.shape)
print(len(new_cols))
print(new_cols)

(1469, 14)
14
['Loud_Noise_Risk', 'Loud_Noise_Problems', 'SNR_Right0', 'SNR_Right1', 'SNR_Right2', 'SNR_Right3', 'SNR_Right4', 'Amp_Right0', 'Amp_Right1', 'Amp_Right2', 'Amp_Right3', 'Amp_Right4', 'Loud_Noise_Risk', 'Loud_Noise_Problems']


In [21]:
columns = column_names + new_cols

In [None]:
columns

In [22]:
import numpy as np
good_data = np.concatenate((data,x),axis=1)

In [28]:
df = pd.DataFrame(good_data,index=indexs[:,0],columns=columns)

In [32]:
#df.to_excel("uncleaned_data.xlsx")
df.to_csv("uncleaned_data.csv")

In [29]:
df.head()

Unnamed: 0,OAE_P2_Amplitude_Right0,OAE_P2_Amplitude_Right1,OAE_P2_Amplitude_Right2,OAE_P2_Amplitude_Right3,OAE_P2_Amplitude_Right4,OAE_P2_Amplitude_Right5,OAE_P2_Amplitude_Right6,OAE_P2_Amplitude_Right7,Adults_Dosimeter_personal_Stereo_moreThan85dBSPL_Left,SchooAge_Questioanire_Believes_Loud_Noisy_Leisure_Risk,...,SNR_Right2,SNR_Right3,SNR_Right4,Amp_Right0,Amp_Right1,Amp_Right2,Amp_Right3,Amp_Right4,Loud_Noise_Risk,Loud_Noise_Problems
998,,,,,,,,,,,...,,,,,,,,,,
999,,,,,,,,,,,...,,,,,,,,,,
1001,11.56263,14.02164,6.088096,15.07891,0.583686,8.160465,12.03649,-1.902029,,,...,-0.6,-6.4,-4.6,-0.4,0.9,-3.5,-20.9,-20.3,,
1002,9.167895,9.211441,8.229638,2.466634,8.86393,16.05083,16.94271,4.87931,,,...,1.0,1.9,6.4,3.7,1.0,-3.1,-4.1,-11.5,,
1003,21.04252,17.55815,11.55905,13.39359,16.43727,15.80256,16.20273,3.330749,,,...,26.3,27.1,14.7,8.6,15.3,14.3,8.5,-2.0,,


In [25]:
columns

['OAE_P2_Amplitude_Right0',
 'OAE_P2_Amplitude_Right1',
 'OAE_P2_Amplitude_Right2',
 'OAE_P2_Amplitude_Right3',
 'OAE_P2_Amplitude_Right4',
 'OAE_P2_Amplitude_Right5',
 'OAE_P2_Amplitude_Right6',
 'OAE_P2_Amplitude_Right7',
 'Adults_Dosimeter_personal_Stereo_moreThan85dBSPL_Left',
 'SchooAge_Questioanire_Believes_Loud_Noisy_Leisure_Risk',
 'SchooAge_Dosimeter_School_moreThan90dBA_SPL',
 'Adults_Dosimeter_OtherLeisure_moreThan80dBA_SPL',
 'Adults_Questioanire_EducationalLevel',
 'SchooAge_Dosimeter_personal_Stereo_moreThan95dBA_SPL_Left',
 'SchooAge_Questioanire_Behaviour_Avoids_Loud_Sounds',
 'Adults_Dosimeter_personal_Stereo_moreThan70dBSPL_Right',
 'Adults_Dosimeter_personal_Stereo_moreThan80dBSPL_Left',
 'Otoscopy_stenosis_Right',
 'SchooAge_Questioanire_Tinitus_Right',
 'SchooAge_Dosimeter_personal_Stereo_moreThan80dBA_SPL_Left',
 'Adults_Dosimeter_OtherLeisure_moreThan85dBA_SPL',
 'Adults_Dosimeter_OtherLeisure_moreThan95dBA_SPL',
 'SchooAge_Dosimeter_School_moreThan85dBA_SPL',
 '

In [27]:
indexs.shape

(1469, 1)