In this notebook the original CSV data is processed. It is first split into two DataFrames, one for the 16ms integration time data and another for the 32ms integration time data. Each of these DataFrames is then restructured into a hierarchically indexed DataFrame whose columns have four levels: growth phase (lag, log, or stat), 'gram-ness' (positive or negative), species (bc, ec, lm, pa, sa, or se), and replicate. Refer to the file `README.md` for more details on what each of these levels means.

In [1]:
import os

import numpy as np
import pandas as pd

%matplotlib inline

In [2]:
# Confirm the working directory, since the data paths used below are relative to it.
os.getcwd()

'/home/anthony/bacteria-project'

In [3]:
# List the contents of the working directory.
os.listdir()

['.ipynb_checkpoints',
 'data',
 '.idea',
 'experiments',
 'comparisons_plots.pdf',
 'Classification-Gramness-Copy1.ipynb',
 'Data Preparation.ipynb',
 '__pycache__',
 'other_experiments',
 'Data Analysis.ipynb',
 'environment.yml',
 '.gitignore',
 '.git',
 'Classification-Gramness.ipynb',
 'README.md',
 'Classification-Species.ipynb']

In [4]:
# List the contents of the data folder to locate the raw spectra file.
os.listdir('data')

['Classific.py',
 'graphs.pdf',
 'growth_stages.csv',
 '16ms_32ms_growth_phase_spectra.csv',
 'bacteria_32ms.csv',
 'PC6allwavelengths.png',
 'bacteria_16ms.csv',
 'wavelengths.csv',
 '16_ms_lag_codes.csv',
 'bacteria.csv']

In [5]:
# Load the combined spectra file; it holds the readings for both integration times.
df = pd.read_csv('data/16ms_32ms_growth_phase_spectra.csv')

In [6]:
# Summarise the frame: number of rows and columns, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1043 entries, 0 to 1042
Columns: 271 entries, wavelength to sa08_32ms_stat
dtypes: float64(271)
memory usage: 2.2 MB


In [7]:
# Per-column summary statistics, as a quick sanity check on the value ranges.
df.describe()

Unnamed: 0,wavelength,bc01_16ms_lag,bc02_16ms_lag,bc03_16ms_lag,bc04_16ms_lag,bc05_16ms_lag,bc06_16ms_lag,bc07_16ms_lag,bc08_16ms_lag,bc09_16ms_lag,...,sa07_16ms_stat,sa08_16ms_stat,sa01_32ms_stat,sa02_32ms_stat,sa03_32ms_stat,sa04_32ms_stat,sa05_32ms_stat,sa06_32ms_stat,sa07_32ms_stat,sa08_32ms_stat
count,1043.0,1043.0,1043.0,1043.0,1043.0,1043.0,1043.0,1043.0,1043.0,1043.0,...,1043.0,1043.0,1043.0,1043.0,1043.0,1043.0,1043.0,1043.0,1043.0,1043.0
mean,594.306442,340.917966,218.282554,410.26545,402.411758,147.930946,304.443366,690.243351,615.625463,508.217097,...,1617.105057,1424.455291,1678.851611,1582.324367,1752.327722,1072.6044,2487.669405,1599.289495,1902.815892,1755.026657
std,112.58039,517.453981,328.263253,632.792734,584.358827,213.381498,457.365956,1076.320701,1006.404767,829.675942,...,2683.151537,2381.357464,2879.515933,2702.094106,2988.042428,1838.334717,4268.994423,2653.006792,3166.96857,2929.745473
min,395.4941,-3.5104,-2.80076,-2.581292,-2.55417,-2.98582,-3.42937,-2.28078,-4.32739,-3.34039,...,-2.56342,-1.46297,-2.10333,-1.784151,-2.26983,-1.51549,-0.65987,-11.2086,-1.79664,-0.44701
25%,497.4822,3.738191,2.116139,3.61456,4.258268,3.017208,3.429759,4.382186,3.291279,3.09311,...,12.687795,10.883005,9.192903,7.764677,8.325252,6.301859,13.16108,11.004515,11.81157,10.710685
50%,596.3744,40.78024,27.02398,48.616896,58.97229,24.39627,37.31628,78.13851,46.40781,41.59408,...,130.2423,104.5619,108.0651,101.703279,111.6902,67.09159,156.4899,129.3464,145.2338,130.1348
75%,692.1706,496.75435,319.17355,571.80101,629.26035,223.73065,446.7392,940.8808,775.15455,630.9647,...,1936.3295,1679.898,1858.899,1765.909212,1966.2215,1192.3965,2769.3,1908.1025,2265.0205,2097.588
max,784.8709,1649.755,1044.867,2050.509445,1755.384,670.9825,1426.308,3542.053,3369.752,2848.575,...,9235.687,8223.58,9870.242,9234.608023,10242.69,6358.552,14776.43,9130.342,10880.64,10086.28


In [8]:
# True if any cell in the frame is missing.
df.isnull().values.any()

False

In [9]:
# Show the column names; apart from 'wavelength' they follow the pattern
# <species><replicate>_<integration time>_<growth phase>, e.g. 'bc01_16ms_lag'.
print(df.columns.values)

['wavelength' 'bc01_16ms_lag' 'bc02_16ms_lag' 'bc03_16ms_lag'
 'bc04_16ms_lag' 'bc05_16ms_lag' 'bc06_16ms_lag' 'bc07_16ms_lag'
 'bc08_16ms_lag' 'bc09_16ms_lag' 'bc10_16ms_lag' 'bc11_16ms_lag'
 'bc12_16ms_lag' 'bc01_32ms_lag' 'bc02_32ms_lag' 'bc03_32ms_lag'
 'bc04_32ms_lag' 'bc05_32ms_lag' 'bc06_32ms_lag' 'bc07_32ms_lag'
 'bc08_32ms_lag' 'bc09_32ms_lag' 'bc10_32ms_lag' 'bc11_32ms_lag'
 'bc12_32ms_lag' 'bc01_16ms_log' 'bc02_16ms_log' 'bc03_16ms_log'
 'bc04_16ms_log' 'bc05_16ms_log' 'bc06_16ms_log' 'bc07_16ms_log'
 'bc08_16ms_log' 'bc09_16ms_log' 'bc10_16ms_log' 'bc11_16ms_log'
 'bc12_16ms_log' 'bc01_32ms_log' 'bc02_32ms_log' 'bc03_32ms_log'
 'bc04_32ms_log' 'bc05_32ms_log' 'bc06_32ms_log' 'bc07_32ms_log'
 'bc08_32ms_log' 'bc09_32ms_log' 'bc10_32ms_log' 'bc11_32ms_log'
 'bc12_32ms_log' 'bc01_16ms_stat' 'bc02_16ms_stat' 'bc03_16ms_stat'
 'bc04_16ms_stat' 'bc05_16ms_stat' 'bc06_16ms_stat' 'bc07_16ms_stat'
 'bc08_16ms_stat' 'bc09_16ms_stat' 'bc10_16ms_stat' 'bc11_16ms_stat'
 'bc12_16ms_stat'

In [10]:
# Two-letter species codes used as the prefix of each spectrum column name.
# See README.md for what each code stands for.
bacteria_prefixes = ['bc', 'ec', 'lm', 'pa', 'sa', 'se']

# Species codes grouped by gram-ness (positive / negative).
gram_pos = ['bc', 'lm', 'sa']
gram_neg = ['ec', 'pa', 'se']

# Growth phase labels used as the suffix of each spectrum column name.
growth_phases = ['lag', 'log', 'stat']

In [11]:
def make_df(integration_time, source_df=None, *, phases=None, species_codes=None,
            gram_positive=None, gram_negative=None):
    """Make a dataframe for a given integration time.

    Columns of the source frame are expected to be named
    ``<species><replicate>_<integration time>_<growth phase>``, for example
    ``bc01_16ms_lag``. Columns that do not follow this pattern (such as
    ``wavelength``) or that belong to another integration time are left out
    of the result.

    The resulting dataframe has a hierarchical column index where the levels are:
    1. growth phase
    2. gramness (positive/negative)
    3. bacteria species
    4. replicate.

    Columns are ordered by growth phase and then by species, following the
    order of ``phases`` and ``species_codes``; replicates keep the order they
    have in the source frame. The row index of the source frame is preserved.

    Arguments:
        integration_time: The integration time of the readings to use. Either '16ms' or '32ms'.
        source_df: The frame holding the raw spectra. Defaults to the
            notebook-level ``df``.
        phases: Growth phase labels to include. Defaults to the notebook-level
            ``growth_phases``.
        species_codes: Species prefixes to include. Defaults to the
            notebook-level ``bacteria_prefixes``.
        gram_positive: Species codes labelled 'positive'. Defaults to the
            notebook-level ``gram_pos``.
        gram_negative: Species codes labelled 'negative'. Defaults to the
            notebook-level ``gram_neg``.

    Returns: the processed dataframe.

    Raises:
        AssertionError: if ``integration_time`` is not '16ms' or '32ms'.
        ValueError: if a species code is in neither ``gram_positive`` nor
            ``gram_negative``.
    """
    assert integration_time in ['16ms', '32ms'], (
        "integration_time must be '16ms' or '32ms', got %r" % (integration_time,))

    # Fall back to the notebook-level objects so existing calls keep working.
    if source_df is None:
        source_df = df
    if phases is None:
        phases = growth_phases
    if species_codes is None:
        species_codes = bacteria_prefixes
    if gram_positive is None:
        gram_positive = gram_pos
    if gram_negative is None:
        gram_negative = gram_neg

    # Resolve gramness up front so an unclassified species fails loudly instead
    # of being silently labelled 'negative'.
    gramness_of = {}
    for species in species_codes:
        if species in gram_positive:
            gramness_of[species] = 'positive'
        elif species in gram_negative:
            gramness_of[species] = 'negative'
        else:
            raise ValueError(
                "species %r is listed as neither gram positive nor gram negative" % (species,))

    # Parse every column name exactly once. Comparing whole name components
    # (rather than substring matching) means a species code can never be
    # confused with part of a growth phase label or vice versa.
    columns_by_group = {}
    for column in source_df.columns:
        parts = str(column).split('_')
        if len(parts) != 3 or parts[1] != integration_time:
            continue
        sample, _, phase = parts
        for species in species_codes:
            replicate = sample[len(species):]
            if sample.startswith(species) and replicate.isdigit():
                columns_by_group.setdefault((phase, species), []).append((column, replicate))
                break

    # Collect the source columns in output order together with their labels
    # for each of the four column levels.
    selected = []
    level_values = [[], [], [], []]
    for phase in phases:
        for species in species_codes:
            for column, replicate in columns_by_group.get((phase, species), []):
                selected.append(column)
                labels = (phase, gramness_of[species], species, replicate)
                for values, label in zip(level_values, labels):
                    values.append(label)

    # Copy so that relabelling the columns never touches the source frame.
    growth_phases_df = source_df.loc[:, selected].copy()
    growth_phases_df.columns = pd.MultiIndex.from_arrays(
        level_values,
        names=['growth_phase', 'gramness', 'species', 'replicate'])

    return growth_phases_df

In [12]:
# Build the hierarchically indexed table for the 16ms readings and preview its first rows.
bacteria_16ms = make_df(integration_time='16ms')
bacteria_16ms.head()

growth_phase,lag,lag,lag,lag,lag,lag,lag,lag,lag,lag,...,stat,stat,stat,stat,stat,stat,stat,stat,stat,stat
gramness,positive,positive,positive,positive,positive,positive,positive,positive,positive,positive,...,positive,positive,positive,negative,negative,negative,negative,negative,negative,negative
species,bc,bc,bc,bc,bc,bc,bc,bc,bc,bc,...,sa,sa,sa,se,se,se,se,se,se,se
replicate,01,02,03,04,05,06,07,08,09,10,...,06,07,08,01,02,03,04,05,06,07
0,0.0,0.622353,0.064528,-1.09465,-1.49281,-0.97976,-0.24007,0.060945,0.247423,2.628512,...,1.757536,-0.5258,-0.18286,0.918231,1.283523,-0.734724,0.183516,2.187136,-0.7746,0.423758
1,1.56006,-2.11599,0.645279,-2.55417,-0.5598,-0.06123,0.600163,-0.79228,0.123709,0.292028,...,-2.07109,-0.32862,-0.85334,-0.65587,0.192528,-1.269057,-1.34577,-1.0049,-0.19365,1.089653
2,0.715025,0.062234,-1.03245,-0.91221,2.923387,0.489875,0.540146,0.182837,-0.74227,-0.40884,...,-2.07134,-0.98586,-0.18286,0.393521,1.219347,-1.669815,0.367029,-3.36936,-1.0328,-0.36322
3,-0.52006,2.178335,-2.064991,0.851433,3.047967,-1.53094,-1.2004,0.853271,1.360885,-5.7415,...,3.714417,1.117374,2.255372,-1.18063,-3.33734,-0.667963,1.59054,-1.47788,-0.64554,-1.57402
4,0.130014,1.867163,-1.419711,3.709871,2.985793,3e-06,1.260425,-3.04742,2.226921,4.453517,...,-2.64411,0.591557,0.9753,1.246246,1.604498,0.868363,0.91763,-0.88673,1.420193,-1.8162


In [13]:
# Build the hierarchically indexed table for the 32ms readings and preview its first rows.
bacteria_32ms = make_df(integration_time='32ms')
bacteria_32ms.head()

growth_phase,lag,lag,lag,lag,lag,lag,lag,lag,lag,lag,...,stat,stat,stat,stat,stat,stat,stat,stat,stat,stat
gramness,positive,positive,positive,positive,positive,positive,positive,positive,positive,positive,...,positive,positive,positive,negative,negative,negative,negative,negative,negative,negative
species,bc,bc,bc,bc,bc,bc,bc,bc,bc,bc,...,sa,sa,sa,se,se,se,se,se,se,se
replicate,01,02,03,04,05,06,07,08,09,10,...,06,07,08,01,02,03,04,05,06,07
0,-0.08667,-0.45639,0.516225,-1.54062,0.456133,-1.18387,0.800218,0.487562,0.494845,3.310051,...,1.046171,-0.21908,-2e-06,0.655876,-1.28352,-0.578877,-0.48938,-0.39408,-0.73157,0.080716
1,0.996705,-0.16596,-0.043018,-1.0541,0.663466,0.612345,0.040013,0.162518,0.329893,1.829989,...,-2.80327,-0.74487,0.893972,-1.09312,-0.38506,-0.222642,0.448589,0.157629,0.559429,0.121073
2,0.650022,-0.74682,-0.731316,-0.64868,0.995199,-0.20411,-0.44012,-1.097,0.288661,0.97346,...,2e-06,-0.39434,1.178423,-0.34981,-0.38506,0.133583,0.163123,-1.93098,-0.5164,-0.84751
3,-0.34671,0.622385,0.817394,-0.20273,1.327006,0.285776,1.000333,0.203161,0.32991,-2.73398,...,6.001085,2.059493,1.341045,-0.08745,-1.06967,0.311717,0.122343,1.418768,0.086071,0.403608
4,1.993549,1.078807,-0.860428,0.243273,0.24881,0.449092,1.000333,-1.09708,0.659837,4.844247,...,-2.47632,0.438183,0.934681,1.005742,1.839845,0.534377,-0.20392,-0.47291,0.903758,-0.2018


In [14]:
# Write the two restructured tables and the wavelength column to the data folder.
# NOTE(review): the two spectra files carry one header row per column level, so
# reading them back presumably needs pd.read_csv(..., header=[0, 1, 2, 3], index_col=0)
# -- confirm against the code that consumes these files.
bacteria_16ms.to_csv('data/bacteria_16ms.csv')
bacteria_32ms.to_csv('data/bacteria_32ms.csv')
df['wavelength'].to_csv('data/wavelengths.csv')