In [135]:
import numpy as np
import scipy
from activ import load_data
from activ.readfile import load_preprocessed, TrackTBIFile, encode
from activ.data_normalization import data_normalization
import pandas as pd
from sklearn import preprocessing
from collections import Counter

In [95]:
subdata = load_data()
subdata_oc = subdata.outcomes
subdata_bm = subdata.biomarkers
oc_features = subdata.outcome_features
bm_features = subdata.biomarker_features
sub_features = np.hstack((subdata.biomarker_features, subdata.outcome_features))
ids = subdata.patient_ids

In [13]:
fulldata = load_preprocessed()
fulldata.shape        

(586, 918)

In [47]:
gose12 = fulldata[np.logical_or(fulldata.GOSE_OverallScore6M == 1.0, fulldata.GOSE_OverallScore6M == 2.0)]

In [150]:
gose12_ids = gose12.index

In [164]:
fulldata.cause

BR-1001            MV(car)
BR-1003    MV(cyclist/ped)
BR-1004            Assault
BR-1006            MV(car)
BR-1007     MV(motorcycle)
BR-1008    MV(cyclist/ped)
BR-1009               Fall
BR-1010               Fall
BR-1011            Assault
BR-1012               Fall
BR-1013               Fall
BR-1014            Assault
BR-1015               Fall
BR-1016    MV(cyclist/ped)
BR-1017               Fall
BR-1018            Assault
BR-1019            Assault
BR-1020               Fall
BR-1021               Fall
BR-1023            MV(car)
BR-1024               Fall
BR-1025               Fall
BR-1026    MV(cyclist/ped)
BR-1027            MV(car)
BR-1028            MV(car)
BR-1029     MV(motorcycle)
BR-1030               Fall
BR-1031          Struck By
BR-1032               Fall
BR-1033            MV(car)
                ...       
SF-1315            Assault
SF-1316            MV(car)
SF-1318    MV(cyclist/ped)
SF-1319     MV(motorcycle)
SF-1320               Fall
SF-1321    MV(cyclist/ped)
S

In [106]:
gose12_responder6m = gose12['GOSE_Reponder6M']

In [115]:
# one-hot encoding oc
gose12['GOSE_Reponder6M_Patient.alone'] = np.zeros(29)
gose12['GOSE_Reponder6M_Patient.plus.relative'] = [0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [119]:
keep_cols = np.asarray(list(set(sub_features).intersection(gose12.columns)))
keep_oc = np.asarray(list(set(keep_cols).intersection(oc_features)))
keep_bm = np.asarray(list(set(keep_cols).intersection(bm_features)))

In [147]:
len(keep_cols)

389

In [143]:
bm_diff = sorted(list(set(bm_features).difference(keep_bm)))

In [144]:
bm_diff

['DispER_Discharge.Home',
 'DispER_ICU.Admit',
 'DispER_Operating.Room',
 'DispER_Stepdown.Admit',
 'DispER_Ward Admit',
 'PatientTypeCoded_ED.Discharge',
 'PatientTypeCoded_ICU.Admit',
 'PatientTypeCoded_Ward.Admit',
 'PresArrivalMethod_Ambulance',
 'PresArrivalMethod_Helicopter',
 'PresArrivalMethod_Walk.in',
 'RACE_3CAT_Black',
 'RACE_3CAT_Other',
 'RACE_3CAT_White',
 'cause_Assault',
 'cause_Fall',
 'cause_MV.car.',
 'cause_MV.cyclist.ped.',
 'cause_MV.motorcycle.',
 'cause_Other',
 'cause_Struck.By',
 'injurytype_Closed',
 'injurytype_Penetrating']

In [184]:
for c in fulldata.columns:
    if 'RACE' in c:
        print(c)

RACE_3CAT


In [187]:
Counter(fulldata.injurytype)

Counter({'Closed': 578, 'Blast': 1, 'Penetrating': 5, nan: 2})

In [188]:
gose12.injurytype

BR-1043         Closed
BR-1078         Closed
PI-1014         Closed
PI-1020         Closed
PI-1046         Closed
PI-1051         Closed
PI-1052         Closed
PI-1053         Closed
PI-1061         Closed
PI-1065         Closed
PI-1091         Closed
PI-1098         Closed
PI-1113         Closed
PI-1131         Closed
PI-1137         Closed
PI-1145         Closed
PI-1161         Closed
PI-1175         Closed
SF-1009         Closed
SF-1073         Closed
SF-1118         Closed
SF-1129    Penetrating
SF-1130         Closed
SF-1162         Closed
SF-1168         Closed
SF-1211         Closed
SF-1267         Closed
SF-1271         Closed
SF-1316         Closed
Name: injurytype, dtype: object

In [190]:
np.where(gose12.injurytype == 'Penetrating')

(array([21]),)

In [191]:
# one-hot encoding bm

gose12['cause_Assault'] = [0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0]
gose12['cause_Fall'] = [1,1,1,1,0,1,1,0,1,1,0,1,0,1,1,1,1,1,1,0,1,1,1,1,0,0,0,1,0]
gose12['cause_MV.car.'] = [0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1]
gose12['cause_MV.cyclist.ped.'] = [0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0]
gose12['cause_MV.motorcycle.'] = np.zeros(29)
gose12['cause_Other'] = np.zeros(29)
gose12['cause_Struck.By'] = np.zeros(29)
gose12['DispER_Discharge.Home'] = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1]
gose12['DispER_ICU.Admit'] = [0,1,0,1,1,0,1,0,1,1,1,1,0,1,0,0,1,1,0,1,1,1,0,1,0,0,0,0,0]
gose12['DispER_Operating.Room'] = [0,0,1,0,0,1,0,1,0,0,0,0,1,0,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0]
gose12['DispER_Stepdown.Admit'] = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0]
gose12['DispER_Ward Admit'] = [1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,1,0,0]
gose12['PatientTypeCoded_ED.Discharge'] = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1]
gose12['PatientTypeCoded_ICU.Admit'] = [0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0]
gose12['PatientTypeCoded_Ward.Admit'] = [1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,0]
gose12['PresArrivalMethod_Ambulance'] = [1,0,1,0,0,1,1,0,0,0,0,0,1,0,0,0,0,0,1,1,1,1,1,1,1,1,1,1,1]
gose12['PresArrivalMethod_Helicopter'] = [0,1,0,1,1,0,0,1,1,1,1,1,0,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0]
gose12['PresArrivalMethod_Walk.in'] = np.zeros(29)
gose12['RACE_3CAT_Black'] = np.zeros(29)
gose12['RACE_3CAT_Other'] = np.zeros(29)
gose12['RACE_3CAT_White'] = np.ones(29)
gose12['injurytype_Closed'] = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1]
gose12['injurytype_Penetrating'] = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the ca

In [194]:
gose12.shape

In [197]:
gose12

412

In [5]:
mask = pd.read_csv('/Users/ahyeon/Desktop/TBIanalysis/data/mask_feature_outcome.txt', header=None)
mask = mask.values
len(mask)

918

In [6]:
bm_mask = np.where(mask==0)[0]
oc_mask = np.where(mask==1)[0]

In [7]:
oc_mask

array([329, 330, 331, 332, 333, 334, 335, 336, 337, 338, 339, 340, 341,
       342, 343, 344, 345, 346, 347, 348, 349, 350, 351, 352, 353, 354,
       355, 356, 357, 358, 359, 360, 361, 362, 363, 364, 365, 366, 367,
       368, 369, 370, 371, 372, 373, 374, 375, 376, 377, 378, 379, 380,
       381, 382, 383, 384, 385, 386, 387, 388, 389, 390, 391, 392, 393,
       394, 395, 396, 397, 398, 399, 400, 401, 402, 403, 404, 405, 406,
       407, 408, 409, 410, 411, 412, 413, 414, 415, 416, 417, 418, 419,
       420, 421, 422, 423, 424, 425, 426, 427, 428, 429, 430, 431, 432,
       433, 434, 435, 436, 437, 438, 439, 440, 441, 442, 443, 444, 445,
       446, 447, 448, 449, 450, 451, 452, 453, 454, 455, 456, 457, 458,
       459, 460, 461, 462, 463, 464, 465, 466, 467, 468, 469, 470, 471,
       472, 473, 474, 475, 476, 477, 478, 479, 480, 481, 482, 483, 484,
       485, 486, 487, 488, 489, 490, 491, 492, 493, 494, 495, 496, 497,
       498, 499, 500, 501, 502, 503, 504, 505, 506, 507, 508, 50

In [8]:
data = load_preprocessed()
data.shape

(586, 918)

In [9]:
full_bm = np.asarray(data.columns[bm_mask])
full_oc = np.asarray(data.columns[oc_mask])
len(full_bm), len(full_oc)

(524, 394)

In [140]:
idx = [i for i, x in enumerate(data.columns) if 'GOSE_Overall' in x]

In [141]:
data.iloc[:, idx]

Unnamed: 0,GOSE_OverallScore3M,GOSE_OverallScore6M,GOSE_OverallScore12M
BR-1001,,,
BR-1003,,,
BR-1004,8.0,8.0,
BR-1006,,,
BR-1007,,,
BR-1008,6.0,5.0,4.0
BR-1009,7.0,,
BR-1010,,,6.0
BR-1011,,,
BR-1012,5.0,,


In [167]:
temp = data[np.logical_or(data['GOSE_OverallScore3M']==1.0, data['GOSE_OverallScore3M']==2.0)].iloc[:,oc_mask]

temp.isnull().sum().sum()/np.product(temp.shape)

0.9738672682835119

In [10]:
data_bm = data[full_bm]
data_oc = data[full_oc]
data_bm.shape

(586, 524)

In [90]:
def encode_bm(df, get_binvars=False):
    import pandas as pd
    from sklearn.preprocessing import LabelEncoder, LabelBinarizer
    cols = dict()
    enc = LabelEncoder()
    lb = LabelBinarizer()
    binvars = dict()
    for colname, coltype in df.dtypes.items():
        if coltype == object:
            newcol = enc.fit_transform(df[colname].astype(str))
            if len(enc.classes_) > 2:
                newcols = lb.fit_transform(newcol)
                for i, c in enumerate(newcols.T):
                    name = "%s=%s" % (colname, enc.classes_[lb.classes_[i]])
                    cols[name] = c
            else:
                cols[colname] = newcol
                binvars[colname] = enc.classes_
        else:
            cols[colname] = df[colname]
    newdf = pd.DataFrame(cols, index=df.index)
    if get_binvars:
        return newdf, binvars
    else:
        return newdf

In [108]:
def encode_oc(df, get_binvars=False):
    import pandas as pd
    from sklearn.preprocessing import LabelEncoder, LabelBinarizer
    cols = dict()
    enc = LabelEncoder()
    lb = LabelBinarizer()
    binvars = dict()
    for colname, coltype in df.dtypes.items():
        if coltype == object:
            newcol = enc.fit_transform(df[colname].astype(str))
            cols[colname] = newcol
            binvars[colname] = enc.classes_
        else:
            cols[colname] = df[colname]
    newdf = pd.DataFrame(cols, index=df.index)
    if get_binvars:
        return newdf, binvars
    else:
        return newdf

In [92]:
newdata_bm = encode_bm(data_bm)

In [4]:
newdata_bm.shape

NameError: name 'newdata_bm' is not defined

In [93]:
newdata_oc = encode_oc(data_oc)

In [94]:
len(newdata_bm.columns), len(newdata_oc.columns)

(830, 394)

In [96]:
TrackTBIFile.write('/Users/ahyeon/Desktop/TBIanalysis/data/fulldata.h5', newdata_bm, newdata_oc, biomarker_features=newdata_bm.columns, outcome_features=newdata_oc.columns)

In [97]:
for ii,c in enumerate(newdata_oc.columns):
    if 'GOSE_Overall' in c:
        print(c,ii)

GOSE_OverallScore3M 24
GOSE_OverallScore6M 77
GOSE_OverallScore12M 287


In [114]:
gose3m = newdata_oc.iloc[:,24].fillna(0).values.astype(int)
gose6m = newdata_oc.iloc[:,77].fillna(0).values.astype(int)
gose12m = newdata_oc.iloc[:,287].fillna(0).values.astype(int)

In [125]:
from collections import Counter
Counter(gose3m)



Counter({0: 130, 8: 129, 6: 72, 7: 133, 5: 53, 4: 20, 3: 22, 1: 25, 2: 2})

In [126]:
newdata_oc[gose3m==1]

Unnamed: 0,MR_result,GOSE_Responder3M,GOSE_SimpleCommand3M,GOSE_NeedAssistance3M,GOSE_NeedFreqHelp3M,GOSE_AssistanceBaseline3M,GOSE_Shop3M,GOSE_ShopBaseline3M,GOSE_Travel3M,GOSE_TravelBaseline3M,...,CVLTFreeRecallIntrusionsStandard_12mo,CVLTCuedRecallIntrusionsRaw_12mo,CVLTCuedRecallIntrusionsStandard_12mo,CVLTTotalIntrusionsRaw_12mo,CVLTTotalIntrusionsStandard_12mo,CVLTTotalRepetitionsRaw_12mo,CVLTTotalRepetitionsStandard_12mo,CVLTTotalRecognitionHitsRawScore_12mo,CVLTTotalRecognitionHitsStandardScore_12mo,CVLTTotalRecognitionDiscriminabilityRawScore_12mo
BR-1043,,3,2,2,2,2,2,2,2,2,...,,,,,,,,,,
BR-1078,,3,2,2,2,2,2,2,2,2,...,,,,,,,,,,
PI-1014,,3,2,2,2,2,2,2,2,2,...,,,,,,,,,,
PI-1020,,3,2,2,2,2,2,2,2,2,...,,,,,,,,,,
PI-1051,,3,2,2,2,2,2,2,2,2,...,,,,,,,,,,
PI-1052,,3,2,2,2,2,2,2,2,2,...,,,,,,,,,,
PI-1053,1.0,3,2,2,2,2,2,2,2,2,...,,,,,,,,,,
PI-1061,,3,2,2,2,2,2,2,2,2,...,,,,,,,,,,
PI-1065,,3,2,2,2,2,2,2,2,2,...,4.0,16.0,5.0,28.0,5.0,1.0,-1.0,15.0,0.0,1.0
PI-1091,1.0,3,2,2,2,2,2,2,2,2,...,,,,,,,,,,


In [120]:
count=0
patients3m = []
patients6m = []
patients1 = []
patients2 = []

for i in range(586):
    if gose3m[i] == 1 or gose3m[i] == 2:
        patients3m.append(i)
    if gose3m[i] ==1:
        patients1.append(i)
    if gose3m[i]==2:
        patients2.append(i)
    count+=1
print(len(patients3m))
bm_gose3m = np.zeros((len(patients3m), newdata_bm.shape[1]))
oc_gose3m = np.zeros((len(patients3m), newdata_oc.shape[1]))
for ii,p in enumerate(patients3m):
    bm_gose3m[ii,:] = newdata_bm.iloc[p,:]
    oc_gose3m[ii,:] = newdata_oc.iloc[p,:]
print(bm_gose3m.shape)
print(oc_gose3m.shape)

27
(27, 830)
(27, 394)


In [121]:
fname = '/Users/ahyeon/data/activ/gose1and2.h5'
TrackTBIFile.write(fname, bm_gose3m, oc_gose3m)

In [1]:
247+27

274