In [1]:
# Input file handed to CurvesAnalysis in a later cell.
myfile = './shorter.data'

In [46]:
import pandas as pd
import re
import math

def floatify(val):
    """Convert ``val`` to a float, mapping unparsable values to NaN.

    Returns ``float(val)`` when the conversion succeeds. When ``float``
    raises ``ValueError`` (for instance on the '---' placeholder used for
    missing fields) NaN is returned instead. Any other exception raised by
    ``float`` propagates to the caller.
    """
    try:
        number = float(val)
    except ValueError:
        number = float("nan")
    return number
    
def make_key(line):
    """Build the numeric key for a split data line.

    ``line`` is a sequence of tokens; only its first element is used. Every
    ')' character is removed from that token (so '12)' becomes '12') and the
    remainder is converted with ``float``.

    Raises ``ValueError`` if the cleaned token is not a valid float and
    ``IndexError`` if ``line`` is empty.
    """
    # A plain str.replace does the same job as the former regex substitution
    # and avoids the invalid "\)" escape sequence in a non-raw string literal.
    return float(line[0].replace(")", ""))
    

class CurvesAnalysis(object):
    """Collect the numeric tables of a Curves output file into DataFrames.

    Basic idea: ``self.panels`` is a dict with one entry per "group" of data
    (the file has sections marked (A) to (E); groups A and E are parsed here).
    Within a group there is one DataFrame per "measurement" (e.g. 'xdisp',
    'W12'), one column per location key (the number that starts each data
    line) and one row per occurrence of that key in the file (one per time
    slice).

    Attributes (all set in ``__init__``):
        fname: default input file name, or None.
        setup: per group, a mapping of token index -> measurement name.
        group_labels: the groups that are turned into frames.
        panels: per group, the container of DataFrames keyed by measurement
            name. This is a ``pd.Panel`` when the installed pandas still
            provides a working one, otherwise a plain dict of DataFrames;
            both support ``panels[label][name]``.
        co_keys: per group, location key -> the companion tokens of the most
            recent line seen for that key.
        prep_data: per group, token index -> location key -> list of values.
    """

    def __init__(self, fname=None):
        """Set up the column layout and, if ``fname`` is given, parse it."""
        self.fname = fname

        # Token index (after line.split()) -> measurement name, per group.
        self.setup = {}
        self.setup['groupA'] = {
            4: 'xdisp',
            5: 'ydisp',
            6: 'inclin',
            7: 'tip',
            8: 'ax_bend'
        }

        self.setup['groupE'] = {
            3: "W12",
            4: "D12",
            5: "W21",
            6: "D21"
        }

        self.group_labels = ['groupA', 'groupE']

        self.panels = {}

        self.co_keys = {}
        self.prep_data = {}
        for label in self.group_labels:
            self.co_keys[label] = {}
            self.prep_data[label] = {}

        if self.fname is not None:
            self.read_curves_file(fname)

    def is_data(self, line):
        """Return True if the first token of ``line`` contains a digit or dot.

        Blank lines (no tokens at all) are reported as non-data. The result
        is always a bool.
        """
        tokens = line.split()
        if not tokens:
            return False
        return re.search(r"[0-9\.]+\)*", tokens[0]) is not None

    def read_curves_file(self, fname=None):
        """Parse ``fname`` (default: ``self.fname``) and rebuild ``self.panels``.

        A line containing "(A)" ... "(E)" switches the current section; data
        lines of sections A and E are accumulated through ``add_data``.
        Calling this more than once keeps appending to ``self.prep_data``.

        Returns:
            A 5-tuple with the number of (A), (B), (C), (D) and (E) section
            markers that were seen, in that order.

        Raises:
            RuntimeError: if neither ``fname`` nor ``self.fname`` is set.
        """
        if fname is None:
            fname = self.fname
        if fname is None:
            raise RuntimeError("No file defined for analysis!")

        sections = "ABCDE"
        counts = dict((letter, 0) for letter in sections)
        category = ""

        # The context manager guarantees the file is closed even if parsing
        # of some line raises.
        with open(fname, "r") as f:
            for line in f:
                # Same precedence as an if/elif chain: the first marker in
                # A..E order that occurs anywhere in the line wins.
                for letter in sections:
                    if "(" + letter + ")" in line:
                        category = letter
                        counts[letter] += 1
                        break

                if category == "A" and self.is_data(line):
                    group_label = 'groupA'
                    co_keys = range(1, 4)
                    float_data = range(4, 9)
                    str_data = []
                    splitter = self.line_prep(line, co_keys, float_data, str_data)
                    self.add_data(splitter, group_label, co_keys, float_data, str_data)
                elif category == "E" and self.is_data(line):
                    group_label = 'groupE'
                    co_keys = [1, 2]
                    float_data = range(3, 7)
                    str_data = []
                    splitter = self.line_prep(line, co_keys, float_data, str_data)
                    key = make_key(splitter)
                    if key - math.floor(key) > 0.01:  # we have an x.5 value
                        # x.5 lines get two placeholders after the key so the
                        # remaining tokens line up with the setup indices.
                        splitter.insert(1, '---')
                        splitter.insert(1, '---')
                    self.add_data(splitter, group_label, co_keys, float_data, str_data)

        for label in self.group_labels:
            # A group (or column) without any data yields an empty frame
            # instead of a KeyError.
            group_data = self.prep_data.get(label, {})
            dfs = {}
            for col, name in self.setup[label].items():
                dfs[name] = pd.DataFrame(group_data.get(col, {}))
            self.panels[label] = self._bundle_frames(dfs)

        return tuple(counts[letter] for letter in sections)

    @staticmethod
    def _bundle_frames(frames):
        """Wrap ``frames`` in a pd.Panel when possible, else return the dict."""
        panel_cls = getattr(pd, "Panel", None)
        if panel_cls is not None:
            try:
                return panel_cls(frames)
            except TypeError:
                # Some pandas releases only keep an argument-less Panel stub.
                pass
        return frames

    def line_prep(self, line, co_keys, float_data, str_data):
        """Split ``line`` on whitespace and pad it with '---' placeholders.

        The returned list is long enough that the largest index found in
        ``co_keys``, ``float_data`` and ``str_data`` is valid. Lines that
        are already long enough are returned unpadded.
        """
        splitter = line.split()
        needed = 0
        for indices in (co_keys, float_data, str_data):
            if len(indices) > 0:
                needed = max(needed, max(indices) + 1)

        # A non-positive repeat count produces an empty list, i.e. no padding.
        splitter.extend(['---'] * (needed - len(splitter)))
        return splitter

    def add_data(self, splitter, group_label, co_keys, float_data, str_data):
        """Record the values of one prepared data line.

        Args:
            splitter: token list of the line (see ``line_prep``).
            group_label: key into ``self.setup`` / ``self.prep_data``.
            co_keys: indices of the companion tokens stored in
                ``self.co_keys``.
            float_data: indices whose tokens are stored via ``floatify``.
            str_data: indices whose tokens are stored as ``str``.

        Columns listed in the group's setup but in neither ``float_data``
        nor ``str_data`` are skipped.
        """
        setup = self.setup[group_label]
        prep_data = self.prep_data.setdefault(group_label, {})

        key = make_key(splitter)
        self.co_keys.setdefault(group_label, {})[key] = [splitter[k] for k in co_keys]
        for col in setup.keys():
            if col in float_data:
                val = floatify(splitter[col])
            elif col in str_data:
                val = str(splitter[col])
            else:
                # No declared type for this column: skip it rather than
                # reuse the value left over from the previous column.
                continue
            prep_data.setdefault(col, {}).setdefault(key, []).append(val)


In [47]:
# Constructing with a file name parses the file immediately.
curves = CurvesAnalysis(fname=myfile)

In [48]:
# Display the 'W12' table of group E (columns are the location keys).
curves.panels['groupE']['W12']

Unnamed: 0,1.5,2.0,2.5,3.0,3.5,4.0,4.5,5.0,5.5,6.0,...,25.0,25.5,26.0,26.5,27.0,27.5,28.0,28.5,29.0,29.5
0,,,,7.0,6.4,6.0,6.1,6.0,5.5,5.5,...,5.7,7.1,8.3,7.2,5.6,4.8,,,,
1,,,,,6.1,6.6,6.9,7.1,6.6,6.1,...,6.9,5.9,4.8,4.3,4.1,4.7,,,,
2,,,,4.5,3.5,3.5,5.0,6.3,6.3,6.0,...,7.1,7.8,7.7,6.6,5.2,4.1,4.3,,,
3,,,,6.6,5.6,4.9,5.1,5.2,4.8,5.1,...,4.5,5.3,6.4,5.9,5.1,5.1,,,,


In [49]:
# Display the companion tokens stored per location key for group E;
# the x.5 keys carry the '---' placeholders inserted while parsing.
curves.co_keys['groupE']

{1.5: ['---', '---'],
 2.0: ['A', '3'],
 2.5: ['---', '---'],
 3.0: ['A', '4'],
 3.5: ['---', '---'],
 4.0: ['T', '5'],
 4.5: ['---', '---'],
 5.0: ['A', '6'],
 5.5: ['---', '---'],
 6.0: ['T', '7'],
 6.5: ['---', '---'],
 7.0: ['A', '8'],
 7.5: ['---', '---'],
 8.0: ['T', '9'],
 8.5: ['---', '---'],
 9.0: ['T', '10'],
 9.5: ['---', '---'],
 10.0: ['A', '11'],
 10.5: ['---', '---'],
 11.0: ['T', '12'],
 11.5: ['---', '---'],
 12.0: ['A', '13'],
 12.5: ['---', '---'],
 13.0: ['A', '14'],
 13.5: ['---', '---'],
 14.0: ['T', '15'],
 14.5: ['---', '---'],
 15.0: ['A', '16'],
 15.5: ['---', '---'],
 16.0: ['T', '17'],
 16.5: ['---', '---'],
 17.0: ['A', '18'],
 17.5: ['---', '---'],
 18.0: ['T', '19'],
 18.5: ['---', '---'],
 19.0: ['T', '20'],
 19.5: ['---', '---'],
 20.0: ['A', '21'],
 20.5: ['---', '---'],
 21.0: ['T', '22'],
 21.5: ['---', '---'],
 22.0: ['A', '23'],
 22.5: ['---', '---'],
 23.0: ['A', '24'],
 23.5: ['---', '---'],
 24.0: ['T', '25'],
 24.5: ['---', '---'],
 25.0: ['A',

In [50]:
# Display the summary repr of the whole group E container.
curves.panels['groupE']

<class 'pandas.core.panel.Panel'>
Dimensions: 4 (items) x 4 (major_axis) x 57 (minor_axis)
Items axis: D12 to W21
Major_axis axis: 0 to 3
Minor_axis axis: 1.5 to 29.5