# Analyze and Combine GAD datasets

In [23]:
import os
import glob
import numpy as np
import pandas as pd
from fnmatch import fnmatch

In [27]:
def get_files_from_subdirs(pattern, root='datasets/RE/GAD/'):
    """Recursively collect files under ``root`` whose name matches ``pattern``.

    Args:
        pattern: Shell-style wildcard (as understood by ``fnmatch``) that is
            tested against each bare file name, not against the full path.
        root: Directory at which the recursive walk starts.

    Returns:
        A list of paths (directory joined with file name) for every matching
        file, in the order ``os.walk`` visits them. The list is empty when
        nothing matches.
    """
    return [
        os.path.join(dirpath, fname)
        for dirpath, _dirnames, fnames in os.walk(root)
        for fname in fnames
        if fnmatch(fname, pattern)
    ]

In [96]:
def combine_files(file_list: list, sep: str, output_fname: str, header):
    """Concatenate delimited text files and write the result to disk.

    Every file in ``file_list`` is read with ``pd.read_csv`` using the given
    ``sep`` and ``header``; the resulting frames are stacked row-wise and
    written to ``<output_fname>.tsv`` (tab-delimited) when ``sep`` is a tab,
    otherwise to ``<output_fname>.csv`` (comma-delimited).

    Args:
        file_list: Paths of the files to combine; must not be empty.
        sep: Field delimiter of the input files.
        output_fname: Output path without extension.
        header: Forwarded to ``pd.read_csv`` as ``header``. When it is
            ``None`` the output file is written without a header row.

    Returns:
        The concatenated DataFrame. Row labels are kept as read from each
        file, so they repeat across inputs.

    Raises:
        ValueError: If ``file_list`` is empty.
    """
    if not file_list:
        raise ValueError('combine_files: file_list is empty, nothing to combine')

    # `sep` is passed by keyword: positional use is deprecated in pandas 1.x
    # and rejected by pandas 2.x.
    frames = [pd.read_csv(path, sep=sep, header=header) for path in file_list]
    combined = pd.concat(frames, sort=False)

    # Compare by value, not identity, and make the delimiter actually written
    # agree with the file extension.
    if sep == '\t':
        extension, out_sep = 'tsv', '\t'
    else:
        extension, out_sep = 'csv', ','

    combined.to_csv(
        '{}.{}'.format(output_fname, extension),
        sep=out_sep,
        # Emit a header row only when the inputs were read with one.
        header=header is not None,
        index=False,
        encoding='utf-8',
    )
    return combined

In [97]:
# Find every file under the default root (datasets/RE/GAD/) whose name ends
# in "train.tsv".
train_pattern = "*train.tsv"
train_fnames = get_files_from_subdirs(train_pattern)

In [98]:
train_fnames

['datasets/RE/GAD/6/train.tsv',
 'datasets/RE/GAD/10/train.tsv',
 'datasets/RE/GAD/2/train.tsv',
 'datasets/RE/GAD/8/train.tsv',
 'datasets/RE/GAD/3/train.tsv',
 'datasets/RE/GAD/5/train.tsv',
 'datasets/RE/GAD/7/train.tsv',
 'datasets/RE/GAD/4/train.tsv',
 'datasets/RE/GAD/9/train.tsv',
 'datasets/RE/GAD/1/train.tsv']

In [99]:
# Stack all training files into one frame and write it to train.tsv.
# header=None: the files are read without a header row, so the columns are
# numbered and no header row is written to the output.
train_combine_tsvs = combine_files(train_fnames, sep='\t', output_fname='train', header=None)

In [100]:
train_combine_tsvs.shape

(47970, 2)

In [101]:
train_combine_tsvs.head()

Unnamed: 0,0,1
0,The polymorphism of @GENE$ promoter -969(G>C) ...,1
1,"In conclusion, the presence of the @GENE$ gene...",0
2,These results suggest that genetic polymorphis...,0
3,These results do not support @GENE$ 3111C as a...,0
4,The results of our study indicate that GABRA 3...,1


In [75]:
# Same procedure for the test files: find every file whose name ends in
# "test.tsv" under the default root, stack them, and write test.tsv.
# header=0: the first line of each test file is read as the column names.
test_pattern = "*test.tsv"
test_fnames = get_files_from_subdirs(test_pattern)
test_combine_tsvs = combine_files(test_fnames, sep='\t', output_fname='test',  header=0)

In [76]:
test_combine_tsvs.shape

(5330, 3)

In [77]:
test_combine_tsvs.head()

Unnamed: 0,index,sentence,label
0,0,"Coupled with previous findings, our data sugge...",1
1,1,These results indicate a possible involvement ...,1
2,2,In the Amsterdam Cohort of homosexual men with...,1
3,3,"We conclude that @GENE$-589*T, but not TNF-al...",1
4,4,We suggest that exon 5 +3953 IL1beta and @GENE...,1


# Analyze and Combine EU-ADR datasets