In [3]:
%matplotlib inline

import pandas as pd, numpy as np, seaborn as sns
import matplotlib.pyplot as plt

from qiime.parse import parse_mapping_file
from qiime.format import format_mapping_file
from skbio.io.util import open_file
from scipy.stats import pearsonr, spearmanr

def load_mf(fn):
    with open_file(fn, 'U') as f:
        mapping_data, header, _ = parse_mapping_file(f)
        _mapping_file = pd.DataFrame(mapping_data, columns=header)
        _mapping_file.set_index('SampleID', inplace=True)
    return _mapping_file

def write_mf(f, _df):
    with open_file(f, 'w') as fp:
        lines = format_mapping_file(['SampleID'] + _df.columns.tolist(),
                                    list(_df.itertuples()))
        fp.write(lines+'\n')

# Obtaining the metadata and removing unused data for the analysis

We have a mapping file that we used for an overall initial analysis, see `Analysis.ipynb`, however we added more metadata (namely `indiv_g_protein_1000kcal_ME_group` and `indiv_g_fat_1000kcal_ME_group`) after that inital analysis was completed. Thus, in trying to ensure `Analysis.ipynb` continues to work, we'll use a new mapping file named **`mapping-file-full.txt`**. I received this file from Jan on October 8.

In [4]:
ls *.txt

log.txt                                 mapping-file.alpha.txt
mapping-file-excelified.txt             mapping-file.txt
mapping-file-full.alpha.L6index.txt     node-attributes.txt
mapping-file-full.alpha.txt             protective-inflammatory.txt
mapping-file-full.txt                   test.edgelist.2.txt
mapping-file.alpha.index.dogbyosis.txt  test.edgelist.txt
mapping-file.alpha.index.txt


In [12]:
st = load_mf('qiita/sample-template.txt')

In [13]:
mf = load_mf('mapping-file-full.txt')

In [17]:
st['indiv_g_fat_1000kcal_ME_group'] = mf['indiv_g_fat_1000kcal_ME_group'].copy()
st['indiv_g_protein_1000kcal_ME_group'] = mf['indiv_g_protein_1000kcal_ME_group'].copy()
st['collection_timestamp'] = mf.collection_date.copy()
st['physical_specimen_location'] = ['Texas A&M'] * len(st)
st.drop('collection_timestamp', axis=1, inplace=True)

# this cell had a value of 558.8, which I confirmed with Jan and the correct value should be 58.8
st = mf.set_value('Nor.C1', 'indiv_g_protein_1000kcal_ME_group', '58.8')

In [18]:
a = st.copy()
b = mf.copy()

for column_name in set(a.columns) & set(b.columns):
    try:
        pd.util.testing.assert_series_equal(a[column_name], b[column_name])
    except AssertionError as e:
        print column_name, 'changed from the old to the new file', str(e)

In [28]:
write_mf('qiita/sample-information.fixed.txt', st)

In [None]:
mf.LIBRARY_CONSTRUCTION_PROTOCOL

In [None]:
#library_construction_protocol, center_name, instrument_model 

In [43]:
pt = load_mf('qiita/prep-template.txt')

In [44]:
pt['library_construction_protocol'] = st.LIBRARY_CONSTRUCTION_PROTOCOL.copy()
pt['center_name'] = ['CCME'] * len(pt)
pt['instrument_model'] = ['Illumina HiSeq 2000'] * len(pt)

In [45]:
write_mf('qiita/prep-information.fixed.txt', pt)

In [31]:
st.illumina_technology.value_counts()

HiSeq    192
Name: illumina_technology, dtype: int64

In [33]:
st.EXPERIMENT_CENTER.value_counts()

Texas A&M    192
Name: EXPERIMENT_CENTER, dtype: int64

In [34]:
st.STUDY_CENTER.value_counts()

Texas A&M    192
Name: STUDY_CENTER, dtype: int64

In [35]:
st.SAMPLE_CENTER.value_counts()

Texas A&M    192
Name: SAMPLE_CENTER, dtype: int64

In [36]:
for i in st.columns:
    if 'center' in i.lower():
        print i

EXPERIMENT_CENTER
RUN_CENTER
SAMPLE_CENTER
STUDY_CENTER


In [37]:
st.RUN_CENTER.value_counts()

CCME    192
Name: RUN_CENTER, dtype: int64