In [None]:
#
# Notebook for computing volatile metrics
#

In [None]:
import numpy as np
import pandas as pd

In [None]:
# Load the CSV for one rep, tolerating the known variations in file naming
def read_rep_csv(rep_id):
    """Read the data file for ``rep_id`` from the parent directory.

    Three file names are tried in order and the first one found is parsed:

    1. ``../<rep_id>.csv``
    2. ``../<rep_id> .csv`` (a space before the extension)
    3. the first name with the second-to-last character of ``rep_id``
       replaced by a ``.``

    Returns the DataFrame produced by ``pd.read_csv``. If none of the names
    exists, the FileNotFoundError from the third attempt propagates.
    """
    base_fn = '../{}.csv'.format(rep_id)
    candidate_fns = (
        base_fn,
        base_fn[:-4] + ' ' + base_fn[-4:],
        base_fn[:-6] + '.' + base_fn[-5:],
    )

    # Every candidate except the last is allowed to be missing.
    for candidate_fn in candidate_fns[:-1]:
        try:
            return pd.read_csv(candidate_fn)
        except FileNotFoundError:
            continue

    # Last resort: let any error from this read reach the caller.
    return pd.read_csv(candidate_fns[-1])

In [None]:
# Helper function for standardizing compound names
def standardize_compounds(compounds):
    """Return a lower-cased copy of ``compounds`` with Greek prefixes spelled out.

    Parameters
    ----------
    compounds : pandas.Series of str
        Compound names as they appear in a data file.

    Returns
    -------
    pandas.Series
        Names lower-cased, with the Greek letters alpha, beta, gamma and delta
        replaced by their English names, and a standalone ``t-`` prefix
        expanded to ``tau-``. Missing values stay missing.
    """
    std_comp = compounds.str.lower()

    greek_names = {'β': 'beta', 'γ': 'gamma', 'α': 'alpha', 'δ': 'delta'}
    for symbol, name in greek_names.items():
        # Literal replacement; stated explicitly because the default for
        # `regex` differs between pandas versions.
        std_comp = std_comp.str.replace(symbol, name, regex=False)

    # Expand 't-' only when it is not the tail of a longer word, so names
    # such as 'oct-1-en-3-ol' or 'tert-...' are left intact.
    std_comp = std_comp.str.replace(r'(?<![a-z])t-', 'tau-', regex=True)

    return std_comp

In [None]:
# Read the master list describing the data file of every rep
files_df = pd.read_csv('../Master List of Data Files.csv')

# Derive two helper columns used throughout the notebook:
#   'Round'        - the final character of the experiment ID
#   'Treatment ID' - round and both treatment levels joined by underscores
files_df['Round'] = files_df['Experiment ID'].str.slice(start=-1)
files_df['Treatment ID'] = (
    files_df['Round'] + '_' + files_df['Level 1'] + '_' + files_df['Level 2']
)

In [None]:
# Read every rep listed in the master file, tag its rows with round and
# treatment, and keep the intersection of compound names seen in all reps
rep_dfs = []
common_compound_set = None
for _, file_row in files_df.iterrows():
    rep_id = file_row['Profile ID']
    rep_df = read_rep_csv(rep_id)
    rep_df['COMPOUND'] = standardize_compounds(rep_df['COMPOUND'])
    # Abundance divided by the weight of the sample recorded in the master list
    rep_df['Concentration'] = rep_df['ABUNDANCE'] / file_row['Sample Weight (g)']
    rep_df['Round'] = file_row['Round']
    rep_df['Treatment ID'] = file_row['Treatment ID']
    rep_dfs.append(rep_df)

    # The first rep seeds the set; every later rep narrows it
    rep_compound_set = set(rep_df['COMPOUND'].unique())
    common_compound_set = (
        rep_compound_set
        if common_compound_set is None
        else common_compound_set & rep_compound_set
    )

In [None]:
# Stack all reps into one frame, discard the 'Unnamed: 0' column
# (presumably a stray index column from the CSV export - verify against the
# data files) and renumber the rows from zero
df = (
    pd.concat(rep_dfs)
    .drop(columns='Unnamed: 0')
    .reset_index(drop=True)
)

In [None]:
# Only consider compounds present in every rep (the intersection collected
# while loading the data files). An explicit copy is taken so that later
# assignments write to an independent frame instead of a slice of the
# concatenated data, which would trigger SettingWithCopyWarning.
df = df[df['COMPOUND'].isin(common_compound_set)].copy()

In [None]:
# Replace NaNs: a missing concentration is imputed with the mean of the
# non-missing concentrations for the same treatment and compound; values that
# are still missing afterwards (no measurement in that group) become 0.
#
# The mean is taken on the 'Concentration' column only. Reducing the whole
# frame would include the text columns, which raises TypeError on pandas >= 2.0.
# A group-wise transform also avoids rescanning the frame for every NaN row.
group_means = (
    df.groupby(['Treatment ID', 'COMPOUND'])['Concentration'].transform('mean')
)
df = df.assign(
    Concentration=df['Concentration'].fillna(group_means).fillna(0)
)

In [None]:
# Per-compound mean and standard deviation of concentration over all reps.
# The spelling 'overal_stds' is kept because the Z-Score cell reads that name.
compound_concentrations = df.groupby('COMPOUND')['Concentration']
overall_means = compound_concentrations.mean()
overal_stds = compound_concentrations.std()

In [None]:
# For each round, compute per-compound statistics of concentration:
#   round_means / round_stds - over every rep of the round
#   round_control_means      - over reps whose treatment ID contains 'control'
round_means, round_stds, round_control_means = {}, {}, {}
for r in ['1', '2', '3']:
    round_df = df[df['Round'] == r]
    round_concentrations = round_df.groupby('COMPOUND')['Concentration']
    round_means[r] = round_concentrations.mean()
    round_stds[r] = round_concentrations.std()

    is_control = round_df['Treatment ID'].str.contains('control')
    round_control_df = round_df[is_control]
    round_control_means[r] = (
        round_control_df.groupby('COMPOUND')['Concentration'].mean()
    )

In [None]:
# Compute R-Score for each rep: the mean, over the common compounds, of the
# rep's concentration divided by the control mean of that compound in the
# rep's round. Scores are collected per treatment in `treatment_rscores`.
reps = df['PROFILE.ID'].unique()
treatment_rscores = {}
rep_rscores = []  # declared for per-rep scores; not populated in this cell
for rep in reps:
    rep_df = df[df['PROFILE.ID'] == rep]
    treatment_id = rep_df['Treatment ID'].values[0]
    rep_round = rep_df['Round'].values[0]
    control_means = round_control_means[rep_round]
    compound_ratios = []
    for compound in common_compound_set:
        # .item() extracts the single matching value and raises ValueError
        # unless exactly one row matches; calling float() on a Series is
        # deprecated in recent pandas.
        concentration = rep_df.loc[rep_df['COMPOUND'] == compound,
                                   'Concentration'].item()
        compound_ratios.append(concentration / control_means[compound])
    rep_rscore = np.mean(compound_ratios)
    treatment_rscores.setdefault(treatment_id, []).append(rep_rscore)

In [None]:
# Compute Chemscore for each rep: the mean, over the common compounds, of the
# rep's concentration standardized with the mean and standard deviation of
# that compound within the rep's round. Scores are collected per treatment
# in `treatment_chemscores`.
reps = df['PROFILE.ID'].unique()
treatment_chemscores = {}
rep_chemscores = []  # declared for per-rep scores; not populated in this cell
for rep in reps:
    rep_df = df[df['PROFILE.ID'] == rep]
    treatment_id = rep_df['Treatment ID'].values[0]
    rep_round = rep_df['Round'].values[0]
    means_in_round = round_means[rep_round]
    stds_in_round = round_stds[rep_round]
    compound_zs = []
    for compound in common_compound_set:
        # .item() extracts the single matching value and raises ValueError
        # unless exactly one row matches; calling float() on a Series is
        # deprecated in recent pandas.
        concentration = rep_df.loc[rep_df['COMPOUND'] == compound,
                                   'Concentration'].item()
        z = (concentration - means_in_round[compound]) / stds_in_round[compound]
        compound_zs.append(z)
    rep_chemscore = np.mean(compound_zs)
    treatment_chemscores.setdefault(treatment_id, []).append(rep_chemscore)

In [None]:
# Compute Z-Score for each rep: the mean, over the common compounds, of the
# rep's concentration standardized with the mean and standard deviation of
# that compound over all reps. Scores are collected per treatment in
# `treatment_zscores`.
reps = df['PROFILE.ID'].unique()
treatment_zscores = {}
rep_zscores = []  # declared for per-rep scores; not populated in this cell
for rep in reps:
    rep_df = df[df['PROFILE.ID'] == rep]
    treatment_id = rep_df['Treatment ID'].values[0]
    rep_round = rep_df['Round'].values[0]
    compound_zs = []
    for compound in common_compound_set:
        # .item() extracts the single matching value and raises ValueError
        # unless exactly one row matches; calling float() on a Series is
        # deprecated in recent pandas.
        concentration = rep_df.loc[rep_df['COMPOUND'] == compound,
                                   'Concentration'].item()
        z = (concentration - overall_means[compound]) / overal_stds[compound]
        compound_zs.append(z)
    rep_zscore = np.mean(compound_zs)
    treatment_zscores.setdefault(treatment_id, []).append(rep_zscore)

In [None]:
# Treatments listed in the order used in the paper
treatment_ids = [
    '1_Phillips_long',
    '1_fluorescent_long',
    '1_ILLUMITEX_long',
    '1_fluorescent_control',
    '1_ILLUMITEX_control',
    '1_Phillips_control',
    '2_ILLUMITEX_long',
    '2_Phillips_long',
    '2_fluorescent_long',
    '2_Phillips_short',
    '2_fluorescent_short',
    '2_fluorescent_control',
    '2_ILLUMITEX_control',
    '2_Phillips_control',
    '3_ILLUMITEX_17 hours/day',
    '3_Phillips_4 hours/day',
    '3_fluorescent_24 hours/day',
    '3_Phillips_14 hours/day',
    '3_fluorescent_8 hours/day',
    '3_ILLUMITEX_10 hours/day',
    '3_fluorescent_control',
    '3_ILLUMITEX_control'
]

# One row per treatment; each metric column is the mean of that treatment's
# per-rep scores
scores_by_metric = {
    'R-Score': treatment_rscores,
    'Chemscore': treatment_chemscores,
    'Z-Score': treatment_zscores,
}
metric_columns = {'Treatment': treatment_ids}
for metric_name, treatment_scores in scores_by_metric.items():
    metric_columns[metric_name] = [
        np.mean(treatment_scores[tid]) for tid in treatment_ids
    ]
metrics_df = pd.DataFrame(metric_columns)
metrics_df