# Getting started

In [1]:
# do basic imports and unpack McMurdo data

from pmagpy import ipmag
from programs import new_builder as nb
from programs import data_model3
import os
import pandas as pd
from pandas import DataFrame
from programs.new_builder import Contribution

wdir = os.path.join(os.getcwd(), "3_0", "McMurdo")
infile = os.path.join(wdir, "lawrence09.v30.txt")
ipmag.download_magic(infile, overwrite=True)


because the backend has already been chosen;
matplotlib.use() must be called *before* pylab, matplotlib.pyplot,
or matplotlib.backends is imported for the first time.



working on:  'contribution'
1  records written to file  ./contribution.txt
contribution  data put in  ./contribution.txt
working on:  'locations'
5  records written to file  ./locations.txt
locations  data put in  ./locations.txt
working on:  'sites'
391  records written to file  ./sites.txt
sites  data put in  ./sites.txt
working on:  'samples'
1418  records written to file  ./samples.txt
samples  data put in  ./samples.txt
working on:  'specimens'
1374  records written to file  ./specimens.txt
specimens  data put in  ./specimens.txt
working on:  'measurements'
25470  records written to file  ./measurements.txt
measurements  data put in  ./measurements.txt
working on:  'ages'
99  records written to file  ./ages.txt
ages  data put in  ./ages.txt
working on:  'images'
431  records written to file  ./images.txt
images  data put in  ./images.txt
working on:  'criteria'
24  records written to file  ./criteria.txt
criteria  data put in  ./criteria.txt
location_1:  McMurdo
directory  ./Locat

True

# Demo several ways of creating a contribution

In [2]:
reload(nb)

# test out various ways of creating a contribution

# make contribution with all defaults from working directory
con = nb.Contribution(wdir)
print con
print 'tables created:', con.tables.keys()

# make contribution with some custom filenames
con = nb.Contribution(wdir, custom_filenames={'specimens': 'custom_specimens.txt'})
print con
print 'tables created:', con.tables.keys()

# make contribution with custom filenames, and only specimen table to start
con = Contribution(wdir, read_tables=['specimens'], custom_filenames={'sites': 'crazy_site_file.txt',
                                                                  'specimens': 'custom_specimens.txt'})
print con
print 'tables created:', con.tables.keys()

# make contribution with a single, mystery file
con = nb.Contribution(wdir, single_file='sites.txt')
print con
print 'tables created:', con.tables.keys()

# make contribution with a single mystery file 
con = Contribution(wdir, single_file='custom_specimens.txt')
print con
print 'tables_created:', con.tables.keys()

-W- No such file: /Users/nebula/Python/PmagPy/3_0/McMurdo/specimens.txt
-W- No such file: /Users/nebula/Python/PmagPy/3_0/McMurdo/samples.txt
-W- No such file: /Users/nebula/Python/PmagPy/3_0/McMurdo/sites.txt
<programs.new_builder.Contribution object at 0x10d016d50>
tables created: ['measurements', 'ages', 'locations', 'criteria', 'images', 'contribution']
-W- No such file: /Users/nebula/Python/PmagPy/3_0/McMurdo/samples.txt
-W- No such file: /Users/nebula/Python/PmagPy/3_0/McMurdo/sites.txt
<programs.new_builder.Contribution object at 0x10d016bd0>
tables created: ['measurements', 'ages', 'locations', 'criteria', 'images', 'contribution', 'specimens']
<programs.new_builder.Contribution object at 0x10cf62a90>
tables created: ['specimens']
-W- Bad file /Users/nebula/Python/PmagPy/3_0/McMurdo/sites.txt
<programs.new_builder.Contribution object at 0x10cf62fd0>
tables created: []
<programs.new_builder.Contribution object at 0x103e12510>
tables_created: ['specimens']


# Demonstrate functionality with a contribution

In [3]:
# create full McMurdo contribution

reload(nb)

con = nb.Contribution(wdir, custom_filenames={'specimens': 'custom_specimens.txt', 'samples': 'custom_samples.txt',
                                             'sites': 'custom_sites.txt'})


### Dealing with criteria

In [4]:
#  grab a copy of the criteria and sites table to play with
criteria = con.tables['criteria'].df.copy()
sites = con.tables['sites'].df.copy()
locations = con.tables['locations'].df.copy()
specimens = con.tables['specimens'].df.copy()
sites.head()

Unnamed: 0_level_0,age,age_sigma,age_unit,analyst_names,citations,criteria_names,description,dir_alpha95,dir_comp_name,dir_dec,...,vadm_n_samples,vadm_sigma,vdm,vdm_n_samples,vdm_sigma,vgp_dm,vgp_dp,vgp_lat,vgp_lon,vgp_n_samples
site_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
mc01,1.18,0.005,Ma,@ltauxe,This study,DE-SPEC,Direction included in Pmag_Results.,4.2,A,258.6,...,,,,,,,,,,
mc01,1.18,0.005,Ma,@ltauxe,This study,IE-SPEC,,,,,...,,,,,,,,,,
mc01,1.18,0.005,Ma,@klawrenc,This study,DE-SITE,"VGP: Site (geog. coord) A comp: mc01, Site VGP...",4.2,A,258.6,...,,,,,,4.5,8.1,-67.3,95.2,7.0
mc02,0.33,0.01,Ma,@ltauxe,This study,DE-SPEC,Direction included in Pmag_Results.,2.1,A,328.6,...,,,,,,,,,,
mc02,0.33,0.01,Ma,@klawrenc,This study,DE-SITE,"VGP: Site (geog. coord) A comp: mc02, Site VGP...",2.1,A,328.6,...,,,,,,2.5,4.1,79.0,101.2,6.0


In [5]:
# get all criteria for sites

# only criteria with 'site' in table_column_name
cond = criteria.index.str.contains('site')
site_crit = criteria[cond].copy()
# remove table name from index
site_crit.index = site_crit.index.str.replace('sites.', '')
site_crit.index.name = 'column_name'


cols = site_crit.index
list(cols)
cols = list(cols)
cols.append('criteria_names')
site_crit

Unnamed: 0_level_0,citations,criterion_name,criterion_operation,criterion_value,description,table_column_name
column_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
dir_alpha95,This study,DE-SITE,<=,180,Criteria for selection of site direction,sites.dir_alpha95
dir_n_samples,This study,DE-SITE,>=,5,Criteria for selection of site direction,sites.dir_n_samples
dir_n_specimens_lines,This study,DE-SITE,>=,4,Criteria for selection of site direction,sites.dir_n_specimens_lines
dir_k,This study,DE-SITE,>=,50,Criteria for selection of site direction,sites.dir_k
int_abs_sigma,This study,IE-SITE,<=,-1,Criteria for selection of site intensity,sites.int_abs_sigma
int_abs_sigma_perc,This study,IE-SITE,<=,15,Criteria for selection of site intensity,sites.int_abs_sigma_perc
int_n_samples,This study,IE-SITE,>=,2,Criteria for selection of site intensity,sites.int_n_samples
dir_polarity,This study,NPOLE,=,n,Criteria for inclusion in normal mean,sites.dir_polarity
dir_polarity,This study,RPOLE,=,r,Criteria for inclusion in reverse mean,sites.dir_polarity


In [6]:
sites.head()[cols]

Unnamed: 0_level_0,dir_alpha95,dir_n_samples,dir_n_specimens_lines,dir_k,int_abs_sigma,int_abs_sigma_perc,int_n_samples,dir_polarity,dir_polarity,criteria_names
site_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
mc01,4.2,7.0,6.0,215.9,,,,r,r,DE-SPEC
mc01,,,,,3.374e-06,20.4,4.0,,,IE-SPEC
mc01,4.2,7.0,,215.9,,,,,,DE-SITE
mc02,2.1,6.0,5.0,1079.2,,,,n,n,DE-SPEC
mc02,2.1,6.0,,1079.2,,,,,,DE-SITE


In [7]:
# create string --> operator conversion    
import operator
ops = {"<": operator.lt, ">": operator.gt, "==": operator.eq, "<=": operator.le, ">=": operator.gt}

# function for applying criteria
def apply_crit(series, crit_series, criteria_type):
    """
    Apply 1 criterion (i.e., 1 row of the criteria table) to another table.
    Return series with boolean values for whether the row passes.
    """
    col_name = crit_series.name
    #print col_name
    # if there's no value, pass == True??  or == False?
    if not series[col_name]:
        return "{} not in row".format(col_name)
    # if we're missing criteria names, then what??
    elif not series['criteria_names']:
        return "no value in criteria_names"
    elif criteria_type not in series['criteria_names']:
        return "{} not in criteria_names for this row".format(criteria_type)
    crit_name = crit_series['criterion_name']
    crit_value = float(crit_series['criterion_value'])
    op_str = crit_series['criterion_operation']
    op = ops[op_str]
    value = float(series[col_name])
    #print value, op_str, crit_value
    result = op(value, crit_value)
    #print op
    #print result
    return result





In [8]:
# apply a criterion to a table
crit_name = 'dir_alpha95'
pass_name = 'pass_' + crit_name
crit1 = site_crit.ix[crit_name]
sites[pass_name] = sites.apply(apply_crit, axis=1, args=(crit1, 'DE-SITE'))
cond = sites[pass_name] == True
sites[cond][[crit_name, pass_name]].head()

Unnamed: 0_level_0,dir_alpha95,pass_dir_alpha95
site_name,Unnamed: 1_level_1,Unnamed: 2_level_1
mc01,4.2,True
mc02,2.1,True
mc03,2.3,True
mc04,4.6,True
mc06,4.8,True


In [9]:
for crit_name in site_crit.index[1:]:
    crit_series = site_crit.ix[crit_name]
    # if there are multiple records for a single crit_name, ignore that one
    if not isinstance(crit_series, pd.Series):
        continue
    sites['pass_' + crit_name] = sites.apply(apply_crit, axis=1, args=(crit_series, "DE-SITE"))

    
cond = sites.columns.str.contains('pass')
sites.head()
sites[sites.columns[cond]].head()

Unnamed: 0_level_0,pass_dir_alpha95,pass_dir_n_samples,pass_dir_n_specimens_lines,pass_dir_k,pass_int_abs_sigma,pass_int_abs_sigma_perc,pass_int_n_samples
site_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
mc01,DE-SITE not in criteria_names for this row,DE-SITE not in criteria_names for this row,DE-SITE not in criteria_names for this row,DE-SITE not in criteria_names for this row,int_abs_sigma not in row,int_abs_sigma_perc not in row,int_n_samples not in row
mc01,dir_alpha95 not in row,dir_n_samples not in row,dir_n_specimens_lines not in row,dir_k not in row,DE-SITE not in criteria_names for this row,DE-SITE not in criteria_names for this row,DE-SITE not in criteria_names for this row
mc01,True,True,dir_n_specimens_lines not in row,True,int_abs_sigma not in row,int_abs_sigma_perc not in row,int_n_samples not in row
mc02,DE-SITE not in criteria_names for this row,DE-SITE not in criteria_names for this row,DE-SITE not in criteria_names for this row,DE-SITE not in criteria_names for this row,int_abs_sigma not in row,int_abs_sigma_perc not in row,int_n_samples not in row
mc02,True,True,dir_n_specimens_lines not in row,True,int_abs_sigma not in row,int_abs_sigma_perc not in row,int_n_samples not in row


### Merging in data from other tables

In [10]:
spec_container = con.tables['specimens']
spec_df = spec_container.df

samp_container = con.tables['samples']
samp_df = samp_container.df

site_container = con.tables['sites']
site_df = site_container.df
site_df.ix[[0]]
spec_df.ix[[0]]
samp_df.ix[[0]]

spec_df[['sample_name']].head()

Unnamed: 0_level_0,sample_name
specimen_name,Unnamed: 1_level_1
mc01a,mc01a
mc01a,mc01a
mc01b,mc01b
mc01b,mc01b
mc01c,mc01c


In [11]:
## general way to import *_name column into a lower-down file
# location name into samples, specimens, or measurements
# site_name into specimens or measurements
# sample_name into measurements


from pmagpy import ipmag
from programs import data_model3
from programs import new_builder as nb
reload(nb)
from programs.new_builder import Contribution

import os
import pandas as pd
from pandas import DataFrame

wdir = os.path.join(os.getcwd(), "3_0", "McMurdo")
infile = os.path.join(wdir, "lawrence09.v30.txt")



col_name = 'location_name'
col_table_name = col_name.split("_")[0] + "s"
df_name = 'measurements'
fname = 'measurements.txt'

my_con = nb.Contribution(wdir, single_file=fname, custom_filenames={'samples': 'custom_samples.txt',
                                                                    'specimens': 'custom_specimens.txt',
                                                                    'sites': 'custom_sites.txt'})


ancestry = ['measurements', 'specimens', 'samples', 'sites', 'locations']

df = my_con.tables[df_name].df


def get_table_name(ind):
    if ind > -1:
        table_name = ancestry[ind]
        name = table_name[:-1] + "_name"
        return table_name, name
    return "", ""

def propagate_col_name_down(col_name, df_name):
    """
    Put the data for "col_name" into dataframe with df_name
    Used to add 'site_name' to specimen table, for example.  
    """
    if col_name == 'location_name' and df_name == 'measurements':
        print "FALSE"
        #return "FALSE"
    df = my_con.tables[df_name].df
    if col_name not in df.columns:
        # get names for each level
        grandparent_table_name = col_name.split('_')[0] + "s"
        grandparent_name = grandparent_table_name[:-1] + "_name"    
        ind = ancestry.index(grandparent_table_name) - 1
        # 
        parent_table_name, parent_name = get_table_name(ind)
        child_table_name, child_name = get_table_name(ind - 1)
        bottom_table_name, bottom_name = get_table_name(ind - 2)
        
  
        print col_name
        print 'grandparent_name', grandparent_name, grandparent_table_name
        print 'parent_name', parent_name, parent_table_name
        print 'child_name', child_name, child_table_name
        print 'bottom_name', bottom_name, bottom_table_name
        
        # merge in bottom level
        if child_name not in df.columns:
            if bottom_table_name not in my_con.tables:
                my_con.add_magic_table(bottom_table_name)
            add_df = my_con.tables[bottom_table_name].df
            df = df.merge(add_df[[child_name]], left_on=[bottom_name], right_index=True, how="left")
    
        # merge in one level above
        if parent_name not in df.columns:
            # add parent_name to df
            if child_table_name not in my_con.tables:
                my_con.add_magic_table(child_table_name)
            add_df = my_con.tables[child_table_name].df
            df = df.merge(add_df[[parent_name]], left_on=[child_name], right_index=True, how="left")
        
        # merge in two levels above
        if grandparent_name not in df.columns:
            if parent_table_name not in my_con.tables:
                my_con.add_magic_table(parent_table_name)
            # add grandparent name to df
            add_df = my_con.tables[parent_table_name].df
            df = df.merge(add_df[[grandparent_name]], left_on=[parent_name], right_index=True, how="left")
            
        return df

    else:
        print '{} already in {}'.format(col_name, df_name)
        return df
    
    
#if not (col_name == 'location_name' and df_name == 'measurements'):
#df[['specimen_name', 'sample_name', col_name]].head()
#df = propagate_col_name(col_name, df_name)
#df[['specimen_name', 'sample_name', 'site_name']].head()

cols = ['specimen_name', 'sample_name', 'site_name', 'location_name']
propagate_col_name_down('location_name', 'measurements')[cols].head()

FALSE
location_name
grandparent_name location_name locations
parent_name site_name sites
child_name sample_name samples
bottom_name specimen_name specimens


Unnamed: 0_level_0,specimen_name,sample_name,site_name,location_name
measurement_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
mc01f-LP-DIR-AF1,mc01f,mc01f,mc01,McMurdo
mc01f-LP-DIR-AF1,mc01f,mc01f,mc01,McMurdo
mc01f-LP-DIR-AF1,mc01f,mc01f,mc01,McMurdo
mc01f-LP-DIR-AF2,mc01f,mc01f,mc01,McMurdo
mc01f-LP-DIR-AF2,mc01f,mc01f,mc01,McMurdo


In [12]:
#  merge site_names into specimen table (from sample table)
#  (this snippet has been incorporated into new_builder)

if 'site_name' not in spec_df.columns:
    spec_df = spec_df.merge(samp_df[['site_name']], left_on=['sample_name'], right_index=True, how="left")
        
spec_df.head()[['sample_name', 'site_name']]
#new_df[new_df['site_name_x'] != new_df['site_name_y']]

Unnamed: 0_level_0,sample_name,site_name
specimen_name,Unnamed: 1_level_1,Unnamed: 2_level_1
mc01a,mc01a,mc01
mc01a,mc01a,mc01
mc01b,mc01b,mc01
mc01b,mc01b,mc01
mc01c,mc01c,mc01


In [13]:
# how to update a dictionary:

standard_filenames = {'measurements': 'measurements.txt', 'ages': 'ages.txt', 'sites': 'sites.txt', 'locations': 'locations.txt', 'samples': 'samples.txt', 'criteria': 'criteria.txt', 'images': 'images.txt', 'contribution': 'contribution.txt', 'specimens': 'specimens.txt'}
filenames = {'sites': 'crazy_site_file.txt'}
standard_filenames.update(filenames)

# Putting some pieces together

In [14]:

reload(nb)
from programs.new_builder import Contribution

# build a contribution with several custom filenames
# only build sites table to start
con2 = Contribution(wdir, read_tables=['sites'], custom_filenames={'specimens': 'custom_specimens.txt', 
                                                                   'samples': 'custom_samples.txt',
                                                                   'sites': 'custom_sites.txt'})
print con2.tables
# then add 'samples' table (no need to specify the filename here)
# we are providing data type but no filename
con2.add_magic_table('samples')
print con2.tables
# add site names
con2.propagate_col_name_down('site_name', 'specimens')
#con2.add_magic_table('fake')

# specimens table now has sample AND site names
con2.tables['specimens'].df[['sample_name', 'site_name']].head()

{'sites': <programs.new_builder.MagicDataFrame object at 0x10f970050>}
{'sites': <programs.new_builder.MagicDataFrame object at 0x10f970050>, 'samples': <programs.new_builder.MagicDataFrame object at 0x10d016e10>}


Unnamed: 0_level_0,sample_name,site_name
specimen_name,Unnamed: 1_level_1,Unnamed: 2_level_1
mc01a,mc01a,mc01
mc01a,mc01a,mc01
mc01b,mc01b,mc01
mc01b,mc01b,mc01
mc01c,mc01c,mc01


In [15]:
# add another table to the same contribution
# this time, provide a filename but no data type

con2.add_magic_table(dtype="unknown", fname="criteria.txt")
# criteria table now included
print con2.tables.keys()

['specimens', 'sites', 'samples', 'criteria']


## Scratch