In [1]:
import os
import glob
import pandas as pd

import numpy as np
import nibabel as nib #Read / write access to some common neuroimaging file formats
import matplotlib.pyplot as plt

In [2]:
path = './openfmri'
t1wglob =  '/sub*/anat*/*T1w.nii.gz'

In [3]:
subdatasets = [d for d in os.listdir(path) if os.path.isdir(os.path.join(path,d))]

In [4]:
includes_tsv = []
missing_tsv = []
for subdataset in subdatasets:
    if(os.path.isfile(os.path.join(path,subdataset,'participants.tsv'))):
        includes_tsv.append(subdataset)
    else:
        missing_tsv.append(subdataset)

In [5]:
os.path.join(path,subdatasets[0],'participants.tsv')

'./openfmri/ds000258/participants.tsv'

In [6]:
print(len(includes_tsv), len(missing_tsv))

76 24


In [7]:
includes_tsv[0:5]

['ds000009', 'ds000245', 'ds000116', 'ds000005', 'ds000224']

In [8]:
dfs = []
for i,ds in enumerate(includes_tsv):
    df = pd.read_csv(os.path.join(path,ds,'participants.tsv'),sep='\t')
    dfs_columns = df.columns.tolist()
    #print(i, ds, dfs_columns[:5])
    dfs.append(df)
    #files = glob.glob(ds+t1wglob)

In [9]:
for df in dfs:
    new_cols = []
    for c in df.columns:
        new_name = c.lower() #lower case
        if new_name == 'gender':
            new_name = 'sex'
        new_cols.append(new_name)
    df.columns = new_cols
    #print(df.columns[:5])

In [10]:
for i, df in enumerate(dfs):
    dfs_columns = df.columns.tolist()
    if( ('participant_id' in dfs_columns) and ('sex' in dfs_columns) and ('age' in dfs_columns) ):
        df = df[['participant_id', 'age', 'sex']]
    print(i,includes_tsv[i],df.columns[:5])

0 ds000009 Index(['participant_id', 'age', 'sex'], dtype='object')
1 ds000245 Index(['participant_id', 'age', 'sex'], dtype='object')
2 ds000116 Index(['participant_id', 'age', 'sex'], dtype='object')
3 ds000005 Index(['participant_id', 'age', 'sex'], dtype='object')
4 ds000224 Index(['participant_id', 'age', 'sex'], dtype='object')
5 ds000144 Index(['participant_id', 'poverty', 'race', 'scanage', 'jsex'], dtype='object')
6 ds000002 Index(['participant_id', 'age', 'sex'], dtype='object')
7 ds000006 Index(['participant_id', 'age', 'sex'], dtype='object')
8 ds000202 Index(['participant_id', 'sex', 'age (years)', 'harm avoidance (ha)',
       'novelty seeking (ns)'],
      dtype='object')
9 ds000120 Index(['participant_id', 'age', 'sex'], dtype='object')
10 ds000229 Index(['participant_id', 'age', 'sex'], dtype='object')
11 ds000221 Index(['participant_id', 'sex', 'age (5-year bins)'], dtype='object')
12 ds000011 Index(['participant_id', 'age', 'sex'], dtype='object')
13 ds000115 Index(['

In [135]:
dfs[5].head(4)

Unnamed: 0,participant_id,poverty,race,scanage,jsex,hand,iq,imp_p,sadsx_p,saddx_p,...,saddx_f,gadsx_f,gaddx_f,socsx_f,socdx_f,anyanx_p,anyanx_f,internsx_p,internsx_f,scanner
0,sub-01,0.0,1,7,F,1,132,2,0,0,...,0,0,0,0,0,0,0,0,0,0
1,sub-02,0.0,1,7,F,1,127,0,1,0,...,0,1,0,0,0,0,0,4,2,0
2,sub-03,0.0,1,6,F,2,93,0,0,0,...,0,3,0,0,0,0,0,2,4,0
3,sub-04,1.0,1,6,F,1,100,3,4,1,...,0,1,0,0,0,1,0,7,2,0


In [12]:
delete_datasets = [5]

In [13]:
dfs[8].head(4)

Unnamed: 0,participant_id,sex,age (years),harm avoidance (ha),novelty seeking (ns),reward dependence (rd),persistence (p),self-directedness (sd),cooperativeness (co),self-transcedence (st),tr_class
0,sub-1001,Female,20,17,12,13,6,42,33,13,SHORT_TR
1,sub-1002,Female,20,17,33,20,2,22,39,24,SHORT_TR
2,sub-1003,Female,21,16,24,20,7,35,37,15,SHORT_TR
3,sub-1004,Female,20,11,24,17,3,36,37,9,SHORT_TR


In [16]:
df = dfs[8].iloc[:,:3]
df.head(3)

Unnamed: 0,participant_id,sex,age (years)
0,sub-1001,Female,20
1,sub-1002,Female,20
2,sub-1003,Female,21


In [17]:
df.columns = ['participant_id', 'sex', 'age']
df = df[['participant_id', 'age', 'sex']]
df.head(3)

Unnamed: 0,participant_id,age,sex
0,sub-1001,20,Female
1,sub-1002,20,Female
2,sub-1003,21,Female


In [18]:
dfs[8] = df

In [173]:
df = dfs[11]
df.head(4)

Unnamed: 0,participant_id,sex,age
0,sub-010001,f,55-60
1,sub-010002,f,65-70
2,sub-010004,f,65-70
3,sub-010005,m,25-30


In [418]:
def fix_sex(str_sex_old):
    str_sex = str(str_sex_old).lower()
    if(str_sex=='male' or str_sex=='m'):
        return 'm'
    elif(str_sex=='female' or str_sex=='f'):
        return 'f'
    else:
        print(str_sex,'--> -1')
        return '-1'

In [216]:
df.sex = df.sex.apply(fix_sex)
df.head(4)

Unnamed: 0,participant_id,sex,age
0,sub-010001,f,55-60
1,sub-010002,f,65-70
2,sub-010004,f,65-70
3,sub-010005,m,25-30


In [217]:
df.columns = ['participant_id', 'sex', 'age']
df.age[0:6].tolist()

['55-60', '65-70', '65-70', '25-30', '65-70', '65-70']

In [218]:
df.head(4)

Unnamed: 0,participant_id,sex,age
0,sub-010001,f,55-60
1,sub-010002,f,65-70
2,sub-010004,f,65-70
3,sub-010005,m,25-30


In [219]:
def fix_agebin(age):
    str_age = str(age)
    if len(str_age) != 5:
        print(str_age, '--> -1')
        return -1
    if int(str_age[3:5])-int(str_age[0:2]) != 5:
        assert False
    return int(str_age[0:2])+2.5

In [220]:
df.age = df.age.apply(fix_agebin)

nan --> -1
nan --> -1


In [221]:
df.head(4)

Unnamed: 0,participant_id,sex,age
0,sub-010001,f,57.5
1,sub-010002,f,67.5
2,sub-010004,f,67.5
3,sub-010005,m,27.5


In [222]:
dfs[11] = df

In [None]:
delete_datasets.append(19)

In [20]:
dfs[20].head(4)

Unnamed: 0,id,sex,agegroup,bmi1,bmi2,educationlevel,hads_anxiety,hads_depression,isi,ksq_sleepqualityindex,...,ppir_ir15,ppir_ir40,badd_total,badd_activation,badd_attention,badd_effort,badd_affect,badd_memory,sl_cond,trial
0,9001,Male,Young,1978997095,1978997095,Studerar för närvarande på universitet/högskola,0,1,12,475,...,11.0,29.0,16.0,6.0,2.0,2.0,6.0,0.0,2,B
1,9002,Male,Old,2179944511,2146915048,Har avslutat gymnasieskolan,2,3,9,55,...,10.0,32.0,33.0,5.0,8.0,6.0,8.0,6.0,2,B
2,9003,Male,Old,2049861496,2049861496,Har examen från universitet/högskola,2,3,10,525,...,12.0,26.0,13.0,4.0,4.0,4.0,1.0,0.0,1,B
3,9004,Female,Old,2294811574,2294811574,Har examen från universitet/högskola,1,2,11,425,...,10.0,28.0,24.0,3.0,7.0,5.0,6.0,3.0,1,B


In [33]:
df = dfs[20].iloc[:,:3]
df.columns = ['participant_id', 'sex', 'age']
df = df[['participant_id', 'age', 'sex']]
df.head(4)

Unnamed: 0,participant_id,age,sex
0,9001,Young,Male
1,9002,Old,Male
2,9003,Old,Male
3,9004,Old,Female


In [34]:
# Young = 20-30, old = 65-75
# notum bara miðgildi
def fix_age(str_age):
    str_age = str(str_age)
    if(str_age.lower()=='young'):
        return 25
    elif(str_age.lower()=='old'):
        return 70
    else:
        print(str_age,'--> -1')
        return -1

In [35]:
df.age = df.age.apply(fix_age)
df.head(4)

nan --> -1


Unnamed: 0,participant_id,age,sex
0,9001,25,Male
1,9002,70,Male
2,9003,70,Male
3,9004,70,Female


In [37]:
df.sex = df.sex.apply(fix_sex)
df.head(4)

Unnamed: 0,participant_id,age,sex
0,9001,25,m
1,9002,70,m
2,9003,70,m
3,9004,70,f


In [38]:
dfs[20] = df

In [341]:
dfs[22] = df
df.head(1000)

Unnamed: 0,participant_id,age,sex,m_ssrtquant,m_numpumps_avg,m_totalamt_avg,m_numexplavg,m_logit_k,m_an-sn,b_ssrtquant,...,panas_(post-pre)-irritable,panas_(post-pre)-alert,panas_(post-pre)-ashamed,panas_(post-pre)-inspired,panas_(post-pre)-nervous,panas_(post-pre)-determined,panas_(post-pre)-attentive,panas_(post-pre)-jittery,panas_(post-pre)-active,panas_(post-pre)-afraid
0,sub-01,20,F,154.8,5.33,46.0,8,0.0042,0.6,106.7,...,0,0,0,0,0,3,0,0,1,-1
1,sub-02,26,F,259.0,6.17,39.75,15,0.0014,1.65,126.2,...,0,1,0,1,1,0,0,0,0,0
2,sub-03,22,M,176.1,5.36,42.0,10,0.0646,0.665,157.8,...,-1,0,0,1,-1,0,0,0,1,0
3,sub-04,19,M,99.5,5.75,42.5,12,0.0021,0.755,134.5,...,0,1,0,1,-1,1,0,0,0,-1
4,sub-05,20,F,125.4,5.72,44.0,11,0.004,0.15,97.0,...,0,-2,0,2,-4,3,0,-2,-1,-3
5,sub-06,21,M,114.8,5.78,43.25,12,0.0025,1.2,98.8,...,-1,1,-1,0,-2,-2,1,-1,0,-2
6,sub-07,21,M,160.2,5.61,43.0,11,0.1252,0.7,143.7,...,-1,0,-1,1,0,0,0,0,0,0
7,sub-09,20,M,196.8,5.61,35.5,15,0.0016,0.9,169.6,...,1,0,0,0,-1,1,0,1,0,0
8,sub-10,19,F,114.6,5.53,37.25,14,0.0205,0.065,156.0,...,-1,3,0,0,-1,3,1,1,2,-2
9,sub-11,19,F,172.4,5.81,42.0,13,0.0031,0.4,192.3,...,0,-1,0,0,-1,0,-1,-2,1,0


In [226]:
# nr38 tvær myndir af hverjum, eitt á baseline og svo þremur árum seinna
print(len(dfs[38]))

42


In [96]:
###dfs[39] = pd.read_csv(os.path.join(path,includes_tsv[39],'participants.tsv'),sep='\t')

In [104]:
df = dfs[39]
df.head(4)

Unnamed: 0,participant_id,study_group,handedness,gender
0,sub-01,MODAFINIL,Right,Male
1,sub-02,MODAFINIL,Right,Male
2,sub-03,MODAFINIL,Right,Male
3,sub-04,PLACEBO,Right,Male


In [105]:
df = df[['participant_id','gender']]
df['gender']= df['gender'].apply(fix_sex)
age = -np.ones(len(df.gender))
df = pd.DataFrame(data={"participant_id": df.participant_id, "age": age, "sex": df.gender})
df.head(30)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


Unnamed: 0,participant_id,age,sex
0,sub-01,-1.0,m
1,sub-02,-1.0,m
2,sub-03,-1.0,m
3,sub-04,-1.0,m
4,sub-05,-1.0,m
5,sub-06,-1.0,m
6,sub-07,-1.0,m
7,sub-08,-1.0,m
8,sub-09,-1.0,m
9,sub-10,-1.0,m


In [None]:
df['sex']= df['sex'].apply(fix_sex)
age = -np.ones(len(df.sex))
df = pd.DataFrame(data={"participant_id": df.participant_id, "age": age, "sex": df.sex})
df.head(4)

In [106]:
dfs[39] = df

In [119]:
dfs[40] = pd.read_csv(os.path.join(path,includes_tsv[40],'participants.tsv'),sep='\t')

In [120]:
df = dfs[40]
df.head(4)

Unnamed: 0,participant_id,age,gender_F
0,sub-01,23,1
1,sub-02,26,0
2,sub-03,24,0
3,sub-04,24,0


In [121]:
df['gender_F'] = df['gender_F'].apply(lambda x : 'f' if x==1 else 'm')
df.head(4)

Unnamed: 0,participant_id,age,gender_F
0,sub-01,23,f
1,sub-02,26,m
2,sub-03,24,m
3,sub-04,24,m


In [122]:
df.columns = ['participant_id','age','sex']
df.head(4)

Unnamed: 0,participant_id,age,sex
0,sub-01,23,f
1,sub-02,26,m
2,sub-03,24,m
3,sub-04,24,m


In [254]:
df = dfs[43]
df.head(5)

Unnamed: 0,participant_id,sex,age
0,sub-01,9.796,F
1,sub-02,17.8535,F
2,sub-03,19.5866,F
3,sub-04,13.1034,M
4,sub-05,16.1588,M


In [255]:
df.columns = ['participant_id','age','sex']
df.head(5)

Unnamed: 0,participant_id,age,sex
0,sub-01,9.796,F
1,sub-02,17.8535,F
2,sub-03,19.5866,F
3,sub-04,13.1034,M
4,sub-05,16.1588,M


In [256]:
dfs[43] = df

In [257]:
dfs[52].head(4)

Unnamed: 0,participant_id,age,sex,dominant_hand
0,sub-emptyroom,,,
1,sub-0001,25.0,Male,Right


In [None]:
delete_datasets.append(52)

In [123]:
df = dfs[54]
df.head(4)

Unnamed: 0,participant_id,sex,group
0,sub-01,female,overweight
1,sub-02,female,overweight
2,sub-03,female,normalweight
3,sub-04,female,overweight


In [124]:
df.sex = df.sex.apply(fix_sex)
df.head(4)

Unnamed: 0,participant_id,sex,group
0,sub-01,f,overweight
1,sub-02,f,overweight
2,sub-03,f,normalweight
3,sub-04,f,overweight


In [127]:
age = -np.ones(len(df))
df = pd.DataFrame(data={"participant_id": df.participant_id, "age": age, "sex": df.sex})
df.head(4)

Unnamed: 0,participant_id,age,sex
0,sub-01,-1.0,f
1,sub-02,-1.0,f
2,sub-03,-1.0,f
3,sub-04,-1.0,f


In [128]:
dfs[54] = df

In [129]:
delete_datasets.append(60)

In [130]:
delete_datasets

[5, 19, 60]

In [280]:
dfs[64] = pd.read_csv(os.path.join(path,includes_tsv[64],'participants.tsv'),sep='\t')

In [281]:
dfs[64].head(5)

Unnamed: 0,participant_id,sex,age,handedness score
0,sub-01,M,395,0.3
1,sub-02,M,328,1.0
2,sub-04,M,269,0.8
3,sub-05,M,274,0.6
4,sub-06,M,331,0.7


In [282]:
df = dfs[64]

In [283]:
def fix_age_str(age):
    str_age=str(age)
    str_age = str_age.replace(',','.')
    return(float(str_age))       

In [284]:
df.age = df.age.apply(fix_age_str)
df.head(4)

Unnamed: 0,participant_id,sex,age,handedness score
0,sub-01,M,39.5,0.3
1,sub-02,M,32.8,1.0
2,sub-04,M,26.9,0.8
3,sub-05,M,27.4,0.6


In [267]:
dfs[64] = df
dfs[64].head(4)

Unnamed: 0,participant_id,age,sex
0,sub-01,9.796,F
1,sub-02,17.8535,F
2,sub-03,19.5866,F
3,sub-04,13.1034,M


In [285]:
df = dfs[69]
df.head(4)

Unnamed: 0,participant_id,sex,age,handedness,hearing_problems_current,hearing_problems_past,vision_problems_current,vision_problems_past,forrest_seen,forrest_seen_dist,forrest_seen_count,forrest_seen_languages,forrest_ad_known,forrest_av_rating,forrest_av_storydepth,forrest_av_fatigue,forrest_av_feeling,forrest_av_artist_count
0,sub-01,m,30-35,r,n,n,n,n,y,12.0,6.0,german;english,y,2.0,3,2,2,18
1,sub-02,m,35-40,r,n,n,n,n,y,0.0,20.0,german;english,y,4.0,3,2,4,30
2,sub-03,f,20-25,r,n,n,n,n,n,,,,y,4.0,4,1,4,8
3,sub-04,f,20-25,r,n,n,n,n,y,1.0,1.0,german,y,4.0,4,2,4,10


In [286]:
df.age = df.age.apply(fix_agebin)
df.head(4)

Unnamed: 0,participant_id,sex,age,handedness,hearing_problems_current,hearing_problems_past,vision_problems_current,vision_problems_past,forrest_seen,forrest_seen_dist,forrest_seen_count,forrest_seen_languages,forrest_ad_known,forrest_av_rating,forrest_av_storydepth,forrest_av_fatigue,forrest_av_feeling,forrest_av_artist_count
0,sub-01,m,32.5,r,n,n,n,n,y,12.0,6.0,german;english,y,2.0,3,2,2,18
1,sub-02,m,37.5,r,n,n,n,n,y,0.0,20.0,german;english,y,4.0,3,2,4,30
2,sub-03,f,22.5,r,n,n,n,n,n,,,,y,4.0,4,1,4,8
3,sub-04,f,22.5,r,n,n,n,n,y,1.0,1.0,german,y,4.0,4,2,4,10


In [287]:
dfs[69] = df

In [131]:
df = dfs[72]
df.head(4)

Unnamed: 0,participant_id,group,ageatfirstscanyears,handedness,sex,estimateofhoursplayingactionvideogamesperweekinlastyear
0,sub-01,Gamer,36,R,F,11
1,sub-02,Non-gamer,29,R,F,0
2,sub-03,Gamer,36,R,F,13
3,sub-04,Non-gamer,36,R,F,0


In [132]:
delete_datasets.append(72)

In [308]:
df=dfs[27]
df.head(4)

Unnamed: 0,participant_id,age,sex,education,iqverbal,lshs_14,lshsvoice_items4910,lshsaudit_items57,visualscore,visscore_high,...,omissions,falsealarms,correctrejections,trhit,tromission,trfalsealarm,trcorrectrej,trmean,pr,br
0,sub-01,35,1,6,29,5,0,2,3,1.0,...,18,4,41,1535,1649,1837,1413,1608.5,0.513149,0.200937
1,sub-02,26,2,7,20,2,1,1,2,,...,23,17,28,1331,1521,1678,1426,1489.0,0.119454,0.432044
2,sub-03,31,1,5,19,1,0,0,1,0.0,...,27,5,40,1938,1811,1872,1779,1850.0,0.291455,0.168747
3,sub-04,31,2,8,21,3,1,0,2,,...,23,5,38,1870,1931,2421,2061,2070.75,0.363525,0.196394


In [310]:
# "1": "male", "2": "female"
def fix12(sex):
    if sex==1 or sex=='M':
        return 'm'
    elif sex==2 or sex=='F':
        return 'f'
    else:
        print(sex,"--> -1")
        return -1

In [311]:
df.sex = df.sex.apply(fix12)

In [313]:
dfs[27]=df

In [314]:
dfs[27].head(5)

Unnamed: 0,participant_id,age,sex,education,iqverbal,lshs_14,lshsvoice_items4910,lshsaudit_items57,visualscore,visscore_high,...,omissions,falsealarms,correctrejections,trhit,tromission,trfalsealarm,trcorrectrej,trmean,pr,br
0,sub-01,35,m,6,29,5,0,2,3,1.0,...,18,4,41,1535,1649,1837,1413,1608.5,0.513149,0.200937
1,sub-02,26,f,7,20,2,1,1,2,,...,23,17,28,1331,1521,1678,1426,1489.0,0.119454,0.432044
2,sub-03,31,m,5,19,1,0,0,1,0.0,...,27,5,40,1938,1811,1872,1779,1850.0,0.291455,0.168747
3,sub-04,31,f,8,21,3,1,0,2,,...,23,5,38,1870,1931,2421,2061,2070.75,0.363525,0.196394
4,sub-05,33,m,8,25,6,0,1,1,0.0,...,6,19,25,1146,1537,1373,1433,1372.25,0.444249,0.779726


In [317]:
df=dfs[31]
df.head(5)

Unnamed: 0,participant_id,sex,age
0,sub-01,15.1184,M
1,sub-02,26.5325,F
2,sub-03,8.4517,F
3,sub-04,11.3895,M
4,sub-05,12.0794,M


In [318]:
df.columns = ['participant_id','age','sex']
df.head(5)

Unnamed: 0,participant_id,age,sex
0,sub-01,15.1184,M
1,sub-02,26.5325,F
2,sub-03,8.4517,F
3,sub-04,11.3895,M
4,sub-05,12.0794,M


In [319]:
dfs[31] = df

In [322]:
def fix_sex(str_sex_old):
    str_sex = str(str_sex_old).lower().strip()
    if(str_sex=='male' or str_sex=='m'):
        return 'm'
    elif(str_sex=='female' or str_sex=='f'):
        return 'f'
    else:
        print(str_sex,'--> -1')
        return -1

In [321]:
df=dfs[44]
df.head(20)

Unnamed: 0,participant_id,age,sex
0,sub-01,27,F
1,sub-02,29,F
2,sub-03,25,F
3,sub-04,30,F
4,sub-05,38,F
5,sub-06,22,F
6,sub-07,31,M
7,sub-08,36,M
8,sub-09,37,M
9,sub-10,24,F


In [337]:
df=dfs[0]
df.head(1000)

Unnamed: 0,participant_id,age,sex,m_ssrtquant,m_numpumps_avg,m_totalamt_avg,m_numexplavg,m_logit_k,m_an-sn,b_ssrtquant,...,panas_(post-pre)-irritable,panas_(post-pre)-alert,panas_(post-pre)-ashamed,panas_(post-pre)-inspired,panas_(post-pre)-nervous,panas_(post-pre)-determined,panas_(post-pre)-attentive,panas_(post-pre)-jittery,panas_(post-pre)-active,panas_(post-pre)-afraid
0,sub-01,20,F,154.8,5.33,46.0,8,0.0042,0.6,106.7,...,0,0,0,0,0,3,0,0,1,-1
1,sub-02,26,F,259.0,6.17,39.75,15,0.0014,1.65,126.2,...,0,1,0,1,1,0,0,0,0,0
2,sub-03,22,M,176.1,5.36,42.0,10,0.0646,0.665,157.8,...,-1,0,0,1,-1,0,0,0,1,0
3,sub-04,19,M,99.5,5.75,42.5,12,0.0021,0.755,134.5,...,0,1,0,1,-1,1,0,0,0,-1
4,sub-05,20,F,125.4,5.72,44.0,11,0.004,0.15,97.0,...,0,-2,0,2,-4,3,0,-2,-1,-3
5,sub-06,21,M,114.8,5.78,43.25,12,0.0025,1.2,98.8,...,-1,1,-1,0,-2,-2,1,-1,0,-2
6,sub-07,21,M,160.2,5.61,43.0,11,0.1252,0.7,143.7,...,-1,0,-1,1,0,0,0,0,0,0
7,sub-09,20,M,196.8,5.61,35.5,15,0.0016,0.9,169.6,...,1,0,0,0,-1,1,0,1,0,0
8,sub-10,19,F,114.6,5.53,37.25,14,0.0205,0.065,156.0,...,-1,3,0,0,-1,3,1,1,2,-2
9,sub-11,19,F,172.4,5.81,42.0,13,0.0031,0.4,192.3,...,0,-1,0,0,-1,0,-1,-2,1,0


In [424]:
new_df = pd.DataFrame(data={"dataset": [],"id": [] , "age": [], "sex": []})

In [425]:
for i, df in enumerate(dfs):
    if (i not in delete_datasets) and i != 38:
        dataset = [includes_tsv[i] for c in range(len(df))]
        print(i,includes_tsv[i])
        data = pd.DataFrame({"dataset": dataset, "id": df['participant_id'] , "age": df['age'], "sex": df['sex'].apply(fix_sex)})
        new_df = new_df.append(data, ignore_index=True)

0 ds000009
1 ds000245
2 ds000116
3 ds000005
4 ds000224
6 ds000002
7 ds000006
8 ds000202
9 ds000120
10 ds000229
11 ds000221
12 ds000011
13 ds000115
14 ds000219
15 ds000218
16 ds000231
17 ds000102
d --> -1
d --> -1
d --> -1
d --> -1
d --> -1
d --> -1
d --> -1
d --> -1
d --> -1
18 ds000256
20 ds000201
21 ds000234
22 ds000113c
23 ds000233
24 ds000171
25 ds000168
26 ds000017
nan --> -1
nan --> -1
nan --> -1
27 ds000203
28 ds000148
29 ds000217
30 ds000247
nan --> -1
31 ds000119
32 ds000212
33 ds000003
34 ds000236
35 ds000210
36 ds000235
37 ds000109
39 ds000133
40 ds000249
41 ds000228
42 ds000222
43 ds000121
44 ds000232
f  --> -1
f  --> -1
f  --> -1
f  --> -1
f  --> -1
f  --> -1
m  --> -1
m  --> -1
m  --> -1
f  --> -1
45 ds000238
46 ds000214
47 ds000138
48 ds000140
49 ds000030
50 ds000117
51 ds000240
52 ds000246
nan --> -1
53 ds000110
54 ds000213
55 ds000220
56 ds000008
57 ds000007
58 ds000254
59 ds000239
61 ds000053
62 ds000157
63 ds000170
64 ds000244
65 ds000172
66 ds000208
67 ds000177
68 d

In [436]:
import math
drop_rows = []
for index, row in new_df.iterrows():
    try:
        row['age'] = round(float(row['age']),2)
    except ValueError:
        print(index, row['dataset'], 'age:', row['age'],'--> -1')
        row['age'] = -1
    if math.isnan(row['age']):
        row['age'] = -1
        print(index, row['dataset'], 'age:', row['age'],'--> -1')
        
    if row['sex'] not in ['f', 'm','-1']:
        print(index, row['dataset'], 'sex:', row['sex'],'--> -1')
        row['sex'] = '-1'
    
    if row['age']==-1 and row['sex']=='-1':
        drop_rows.append(index)

In [427]:
drop_rows

[1014, 1016, 1020, 1143, 2165]

In [433]:
len(new_df)

2755

In [434]:
new_df = new_df.drop(drop_rows)

In [435]:
len(new_df)

2750

In [437]:
new_df.to_csv('filtered_data.csv', index=False)