In [1]:
import pandas as pd
import uuid
import numpy as np
from cStringIO import StringIO
from toolz import thread_first, thread_last, pipe
from pandas import DataFrame as df
execfile('./utils.py')

In [2]:
def mapdict(f,d):
    """Apply a two-argument function to every (key, value) pair of a dict.

    Args:
        f: callable invoked as ``f(key, value)``.
        d: dict whose items are visited in the dict's own iteration order.

    Returns:
        A list holding one result of ``f`` per item of ``d``.
    """
    # .items() exists on both Python 2 and Python 3 dicts, whereas
    # .iteritems() is Python 2 only.
    return [f(k, v) for k, v in d.items()]

def tail(x):
    """Return everything after the first element of a sliceable sequence."""
    rest = x[1:]
    return rest

def generate_sid():
    """Return a short random identifier: the final 12 hex digits of a UUID4."""
    # The last dash-separated group of a UUID's text form is exactly the
    # trailing 12 characters of its 32-character hex form.
    random_uuid = uuid.uuid4()
    return random_uuid.hex[-12:]

def create_well_df(cell_dict):
    """Build one DataFrame with a row per well, labelled by cell type.

    Args:
        cell_dict: dict mapping a cell-type label to its list of well names.

    Returns:
        The concatenation of one frame per cell type, each with the columns
        "Cell Type" and "Well Name". Each frame keeps its own index, so index
        values repeat across cell types.
    """
    def frame_for(cell_type, wells):
        return df({"Cell Type": cell_type, "Well Name": wells})

    per_type_frames = mapdict(frame_for, cell_dict)
    return pd.concat(per_type_frames)

def create_plate_df(well_df,plate_info, plate_name):
    """Stamp plate-level information onto a copy of the well table.

    Args:
        well_df: frame with at least a 'Cell Type' column; it is not modified.
        plate_info: string prepended (with a space) to each row's cell type
            to form the 'Condition' value.
        plate_name: value stored in the 'Plate Name' column of every row.

    Returns:
        A new frame with 'Plate Name', 'Condition' and 'Plate ID' added and
        the 'Cell Type' column removed.
    """
    plate = well_df.copy()
    plate['Plate Name'] = plate_name
    plate['Condition'] = plate_info + ' ' + plate['Cell Type']
    # generate_sid() is evaluated once, so all rows of this plate share one ID.
    plate['Plate ID'] = generate_sid()
    del plate['Cell Type']
    return plate

In [3]:
# Each entry is (plate_info, plate_name): the text that prefixes the per-well
# 'Condition' string, and the value stored in the 'Plate Name' column.
conditions = [
    ('PML ssC w RNase A', 24),
    ('PML APB w RNase A', 25),
    ('PML ssC wo RNase A', 26),
    ('PML IF only control no EtOH dehydrat', 27),
    ('PML ssC control no FISH', 28),
    ('IgG APB w LNA probe before denature', 29),
    ('IgG APB w LNA probe after denature', 30),
    ('IgG ssC w LNA probe no denature', 31),
]

# Cell type -> names of the wells that hold that cell type.
cell_dict = {
    "U2OS": ['A01', 'A02', 'B01', 'B02'],
    "143B": ['A03', 'A04', 'B03', 'B04'],
}

# Written into the 'Date' column of every row of the lookup table.
date = '06-18-2015'

In [4]:
# One row per well, then one copy of that table per plate/condition pair,
# stacked into a single lookup frame and stamped with the experiment date.
well_df = create_well_df(cell_dict)
condition_lookup = pd.concat([
    create_plate_df(well_df, plate_info, plate_name)
    for plate_info, plate_name in conditions
])
condition_lookup['Date'] = date

In [5]:
# Spot-check the first rows of the assembled lookup table.
condition_lookup.head()

Unnamed: 0,Well Name,Plate Name,Condition,Plate ID,Date
0,A03,24,PML ssC w RNase A 143B,0cf3a243881c,06-18-2015
1,A04,24,PML ssC w RNase A 143B,0cf3a243881c,06-18-2015
2,B03,24,PML ssC w RNase A 143B,0cf3a243881c,06-18-2015
3,B04,24,PML ssC w RNase A 143B,0cf3a243881c,06-18-2015
0,A01,24,PML ssC w RNase A U2OS,0cf3a243881c,06-18-2015


In [23]:
from pandas import DataFrame as df
import pandas as pd
import numpy as np
from toolz import thread_first,\
                  thread_last,\
                  juxt
from utils import curry_funcs,\
                  drop_matching_columns,\
                  add_normalized_columns,\
                  headers_to_column,\
                  groupby_and_summarize,\
                  identity

# NOTE(review): curry_funcs is a project helper from utils. Judging by the call
# sites further down (e.g. map(StringIO), pd.read_csv(delimiter=...)), it
# presumably rebinds the callables named here to curried versions so they can
# be partially applied inside thread_first pipelines -- confirm in utils.py.
# The targets are passed as strings, not as the objects themselves.
curry_funcs(['pd.read_csv',
             'df.dropna',
             'df.rename',
             'map'])

###################################################
### Cell Data Config ##############################
###################################################

# String -> String
def rename_column(col):
    """Return a cleaned-up version of the column name ``col``.

    The name is cut at the first '(' (dropping any parenthetic suffix), a
    leading 'Cell:' prefix is removed, then trailing '/' characters and
    finally surrounding spaces are stripped.

    Args:
        col: original column name.

    Returns:
        The cleaned column name.
    """
    prefix = 'Cell:'
    name = col.split('(')[0]
    if name.startswith(prefix):
        # Slice the prefix off. str.lstrip(prefix) would treat the argument
        # as a set of characters and also eat leading 'C', 'e', 'l', ':' that
        # belong to the real name (e.g. 'Cell:Cell ID' -> 'ID').
        name = name[len(prefix):]
    return name.rstrip('/').strip(' ')

def check_cell_data(dataframe):
    """Validation hook for cell data; currently passes the frame through unchanged."""
    checked = dataframe
    return checked

# type CellConfig = {
#        path            :: String,
#        plate_delimiter :: String,
#        delimiter       :: String,
#        skiprows        :: Int | [Int],
#        dropcols        :: [RegexString],
#        normcols        :: [[String,[String],[String]]],
#        colrename       :: (String -> String),
#        check           :: (DataFrame -> DataFrame | Exception) }

# Settings consumed by get_cell_data (see the CellConfig type comment above).
cell_config = dict(
    # Raw export to load.
    path = '/notebooks/moldev-data/original/06-19-2015/6.19.15 All ssC data - Jonah Simon.txt',
    # The file text is split on this marker; the chunk before the first
    # marker is discarded and every later chunk is parsed separately.
    plate_delimiter = "ATF",
    # Passed to pd.read_csv for each chunk.
    delimiter = '\t',
    skiprows = 4,
    # Patterns handed to drop_matching_columns. The last one is a raw string
    # so the regex escapes reach the regex engine untouched; it matches a
    # trailing '.' followed by digits (presumably the suffix pandas adds to
    # duplicated column names -- confirm).
    dropcols = ['Cell ID',
                'Site ID',
                'MEASUREMENT SET ID',
                '.*ObjectID.*',
                r'\.[0-9]*\Z'],
    # Handed to add_normalized_columns. Each entry is
    # [new column name, [columns], [columns]]; presumably numerator and
    # denominator columns -- confirm against utils.add_normalized_columns.
    normcols = [['Normalized APB spots',
                  ['# of APBs'],
                  ['# of FITC spots', '# of TxRed spots']],
                ['Normalized Coloc area',
                  ['Area_Coloc_Avg'],
                  ['Area_FITC','Area_TxRed']],
                ['Normalized Coloc spots',
                  ['# Coloc Spots'],
                  ['# of FITC spots', '# of TxRed spots']]],
    # Applied to every column label via DataFrame.rename.
    colrename = rename_column,
    # Final step of the pipeline; receives the cleaned frame.
    check = check_cell_data
    )

###################################################
### Lookup Config #################################
###################################################

def check_lookup_data(dataframe):
    """Validation hook for lookup data; currently passes the frame through unchanged."""
    checked = dataframe
    return checked

# type LookupConfig = {
#        path      :: String,
#        skiprows  :: Int | [Int],
#        check     :: (DataFrame -> DataFrame | Exception) }

# Settings for loading the condition/well lookup table
# (see the LookupConfig type comment above).
lookup_config = dict(
    path='../input/conditions_and_wells.csv',  # CSV holding the lookup table
    skiprows=[1],                              # row indices to skip on load
    check=check_lookup_data,                   # validation hook for the frame
)

# CellConfig -> DataFrame


# Loading-only prototype of get_cell_data: read the export, split it into
# per-plate chunks, parse each chunk and stack the results.
# The settings are taken from cell_config because no bare `path` name is
# defined in the visible cells, and the delimiter / skiprows literals used
# here previously were copies of the cell_config values.
data = thread_first(cell_config['path'],
                    open,
                    file.read,
                    (str.split, cell_config['plate_delimiter']),
                    tail,
                    map(StringIO),
                    map(pd.read_csv(delimiter=cell_config['delimiter'],
                                    skiprows=cell_config['skiprows'])),
                    pd.concat)


def get_cell_data(c):
    """Load and clean the cell-level measurements described by a CellConfig.

    Steps, in order:
      1. Read the whole file at ``c['path']``.
      2. Split the text on ``c['plate_delimiter']`` and discard the chunk
         before the first delimiter.
      3. Parse every remaining chunk with ``pd.read_csv`` using
         ``c['delimiter']`` and ``c['skiprows']``, then concatenate them.
      4. Drop columns that are entirely NaN.
      5. Pass the frame through ``drop_matching_columns`` with
         ``c['dropcols']``, rename columns with ``c['colrename']``, and pass
         it through ``add_normalized_columns`` with ``c['normcols']``.
      6. Return whatever ``c['check']`` returns for the resulting frame.

    Args:
        c: dict with the keys path, plate_delimiter, delimiter, skiprows,
           dropcols, normcols, colrename and check.

    Returns:
        The value returned by ``c['check']`` (a DataFrame for the checks
        defined in this notebook).

    NOTE(review): the one-argument ``map(...)`` and keyword-only
    ``pd.read_csv(...)`` / ``df.dropna(...)`` / ``df.rename(...)`` calls rely
    on those names having been curried beforehand (see the curry_funcs call
    earlier in this cell) -- confirm in utils.py.
    """
    # Read inside a with-block so the file handle is closed deterministically
    # instead of being left for the garbage collector.
    with open(c['path']) as source:
        raw_text = source.read()

    return thread_first(raw_text,
                        (str.split, c['plate_delimiter']),
                        tail,
                        map(StringIO),
                        map(pd.read_csv(delimiter=c['delimiter'], skiprows=c['skiprows'])),
                        pd.concat,
                        df.dropna(axis=1, how='all'),
                        (drop_matching_columns, c['dropcols']),
                        df.rename(columns=c['colrename']),
                        (add_normalized_columns, c['normcols']),
                        c['check'])

# data = pd.merge(get_cell_data(cell_config),
#                 get_lookup_data(lookup_config),
#                 on = 'Well Name')

# # Write to files
# data.to_csv('../output/moldev_cleaned.csv',index=False)

In [21]:
# Run the full load-and-clean pipeline; this rebinds `data`, which the
# previous cell also assigns.
data = get_cell_data(cell_config)

In [22]:
# Show only the leading rows; a bare `data` asks the notebook to render the
# entire cell-level frame.
data.head(10)

Unnamed: 0,Well Name,Plate ID,# of FITC spots,Area_FITC,NucArea_Avg,NucIntegrated Intensity_Average,Integrated Intensity_FITC,Area_TxRed,Integrated Intensity_TxRed,# of TxRed spots,...,Total_Int_Intensity_TxRed,FITCinAPB_Int_Intensity_Total,TxRinAPB_Int_Intensity_Total,FITC-TxRed_Area_Avg,FITC-TxRed_Area_Total,FITC-TxRed Spots,Laser focus score,Normalized APB spots,Normalized Coloc area,Normalized Coloc spots
0,A01,28,9,0.314608,166.787186,6676052,15065.444336,0.101124,1533,1,...,1533,,,,,0,29.000000,0,,0.000000
1,A01,28,30,0.380900,220.989655,7825328,45452.234375,,,0,...,,,,,,0,29.000000,0,,0.000000
2,A01,28,13,0.294729,154.607361,6503341,13281.845703,,,0,...,,,,,,0,29.000000,0,,0.000000
3,A01,28,13,0.279171,219.866043,11076027,21914.615234,,,0,...,,,,,,0,29.000000,0,,0.000000
4,A01,28,9,0.463173,88.404846,4023999,35109.667969,,,0,...,,,,,,0,31.000000,0,,0.000000
5,A01,28,15,0.406743,129.124115,5213599,27807.466797,0.101124,1529,1,...,1529,,,,,0,31.000000,0,,0.000000
6,A01,28,13,0.368195,94.652061,4387602,20871.230469,,,0,...,,,,,,0,31.000000,0,,0.000000
7,A01,28,12,0.336144,182.023193,10562550,21516.750000,,,0,...,,,,,,0,31.000000,0,,0.000000
8,A01,28,18,0.294009,168.450119,7898620,19877.945313,,,0,...,,,,,,0,31.000000,0,,0.000000
9,A01,28,25,0.317754,181.079376,8081118,21214.400391,,,0,...,,,,,,0,31.000000,0,,0.000000
