# Transform Phenotypic Data into a Common Format
This script reads the extracted data in its original format and applies a mapping to harmonize it into a common tabular format.

In [66]:
import os
import json

import numpy as np
import pandas as pd

# Report the working directory; the relative paths used in this notebook
# ('../extract' and the current directory) are resolved against it.
working_dir = os.path.abspath(os.path.curdir)
print(working_dir)

/Users/nicholsn/Repos/metasearch/crawler/transform


In [3]:
# Root directory holding one sub-directory of extracted files per project.
extract_path = os.path.abspath('../extract')
# Root directory that receives the transformed projects (the current dir).
xfm_path = os.path.abspath(os.path.curdir)
# A project is any entry of the extract root whose name contains no dot.
project_names = [entry for entry in os.listdir(extract_path) if '.' not in entry]
# Per-project directories under the extract root and the transform root,
# in the same order as project_names.
extract_dirs, xfm_dirs = (
    [os.path.join(root, name) for name in project_names]
    for root in (extract_path, xfm_path)
)

In [178]:
def _code_keys(value):
    """Return the mapping keys that may describe a raw cell value."""
    keys = [str(value)]
    # A column holding NaN or decimals is read as float, so the code 1 shows
    # up as 1.0; also try its integer spelling so it still matches key "1".
    if isinstance(value, float) and value.is_integer():
        keys.append(str(int(value)))
    return keys


def apply_mapping(mapping, df_list):
    """
    Reads a dictionary mapping and list of dataframes, then merges the
    dataframes and converts the column names and values into a common
    format. Note: the dataframes should have a common structure.

    Parameters
    ==========
    mapping : dict
        Maps a source column name to a dict describing its target. The key
        "element" gives the output column name and "type" selects the
        conversion:
          - "category": each value is replaced by the label stored under the
            value's string form; values without a label become null.
          - "number": values whose string form is a key of the entry are
            treated as missing-value codes and become NaN.
          - anything else: non-null values are rendered as integer strings
            (ids read as float); if a value cannot be converted to int the
            column is kept unchanged.
        Entries without an "element" name are skipped.
    df_list : list of pandas.DataFrame
        Frames to stack row-wise before the mapping is applied.

    Returns
    =======
    pandas.DataFrame
        One column per distinct "element" name, sorted by name, indexed
        0..n-1 over the stacked rows. When several source columns map to the
        same element, each row keeps the first non-null value in mapping
        order. An empty mapping yields a frame with no columns.

    Raises
    ======
    KeyError
        If a mapped source column is absent from the stacked data.
    ValueError
        If df_list is empty (raised by pandas.concat).

    Example Mapping Structure
    =========================
    {
        "DX_GROUP":
      {
        "element": "diagnosis",
        "type": "category",
        "1": "autism",
        "2": "control"
      }
    }
    """
    # Concat all csv in an extract dir into one dataframe.
    csv = pd.concat(df_list)
    csv.reset_index(drop=True, inplace=True)

    harmonized = {}
    for col, elem in mapping.items():
        name = elem.get('element')
        if name is None:
            # No target column to write this entry to.
            continue
        series = csv[col]
        kind = elem.get('type')
        if kind == 'category':
            result = series.apply(
                lambda x: next((elem[k] for k in _code_keys(x) if k in elem), None))
        elif kind == 'number':
            result = series.apply(
                lambda x: np.nan if any(k in elem for k in _code_keys(x)) else x)
        else:
            try:
                # Handle ids being read as float.
                result = series.apply(lambda x: str(int(x)) if pd.notnull(x) else x)
            except ValueError:
                # Non-numeric ids (e.g. "sub-0030689") are kept as they are.
                result = series
        if name in harmonized:
            # Several source columns feed the same element: keep the earlier
            # value and only fill its gaps from the later column.
            harmonized[name] = harmonized[name].combine_first(result)
        else:
            harmonized[name] = result
    # Build the frame once, with columns in sorted order.
    return pd.DataFrame(harmonized, index=csv.index, columns=sorted(harmonized))

In [177]:
# Harmonize every project whose transform directory holds a mapping.json.
for project_name in project_names:
    df_list = []
    mapping = {}
    extract_dir = os.path.join(extract_path, project_name)
    xfm_dir = os.path.join(xfm_path, project_name)
    # All files must use the same data dictionary in a given directory.
    extract_files = [os.path.join(extract_dir, i) for i in os.listdir(extract_dir)]
    mapping_file = os.path.join(xfm_dir, 'mapping.json')
    # Only mapping files that exist and whose path contains 'indi' are
    # processed; every other project is skipped.
    if 'indi' not in mapping_file or not os.path.exists(mapping_file):
        continue
    with open(mapping_file, 'rb') as fi:
        mapping.update(json.load(fi))
    # Field separator keyed by file extension.
    # NOTE(review): any other extension yields sep=None for read_csv below;
    # confirm that is intended.
    ext_type = {'csv': ',', 'tsv': '\t'}
    for extract_file in extract_files:
        # The text after the last dot of the path selects the separator.
        ext = extract_file.rsplit('.', 1)[-1]
        sep_type = ext_type.get(ext)
        df = pd.read_csv(extract_file, sep=sep_type)
        df_list.append(df)
    xfm = apply_mapping(mapping, df_list)
    # Tag every row with the project it came from.
    xfm['project'] = project_name
    # The output is written as <project>_phenotype.csv in the transform dir.
    pheno_file = '{0}_phenotype.csv'.format(project_name)
    pheno_path = os.path.join(xfm_dir, pheno_file)
    xfm.to_csv(pheno_path, index=False)

In [239]:
# Group the columns of the last transformed frame by column label and take
# the first entry within each group.
# NOTE(review): groupby(..., axis=1) is deprecated in pandas >= 2.1; confirm
# the installed pandas version before re-running this cell.
g = xfm.groupby(level=0, axis=1).first()

In [240]:
# Inspect the grouped frame.
# NOTE(review): DataFrame.first() normally expects an offset argument, and
# its behaviour here depends on the pandas version; confirm this call is what
# produced the table shown below.
g.first()

Unnamed: 0,age,participant_id,project,sex
0,,sub-0030689,indi,Female
1,,sub-0030690,indi,Female
2,,sub-0030691,indi,Female
3,,sub-0030692,indi,Female
4,,sub-0030693,indi,Female
5,,sub-0030694,indi,Male
6,,sub-0030695,indi,Male
7,,sub-0030696,indi,Male
8,,sub-0030697,indi,Male
9,,sub-0030698,indi,Male
