# Pulling data logged by Sacred from a MongoDB database

In [1]:
import pandas as pd
from pymongo import MongoClient
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.axes_grid1 import make_axes_locatable

def _connect_mongo(host, port, username, password, db):
    """Open a MongoDB connection and return a handle to database ``db``.

    Args:
        host: Hostname or address of the MongoDB server.
        port: Port the server listens on.
        username: User name; credentials are only used when both
            ``username`` and ``password`` are truthy.
        password: Password; see ``username``.
        db: Name of the database to select on the connection.

    Returns:
        The ``conn[db]`` database handle obtained from a ``MongoClient``.
    """
    if username and password:
        # Imported locally so this helper brings its own dependency.
        from urllib.parse import quote_plus

        # Reserved characters such as '@', ':' or '/' inside the credentials
        # would otherwise break the URI, so percent-escape them first.
        mongo_uri = 'mongodb://%s:%s@%s:%s/%s' % (
            quote_plus(str(username)),
            quote_plus(str(password)),
            host,
            port,
            db,
        )
        conn = MongoClient(mongo_uri)
    else:
        conn = MongoClient(host, port)
    return conn[db]

def read_mongo(db, collection, query=None, host='localhost', port=27017, username=None, password=None, no_id=True):
    """Run a ``find`` query against a MongoDB collection and return a DataFrame.

    Args:
        db: Name of the database to read from.
        collection: Name of the collection inside ``db``.
        query: Filter passed to ``find``; ``None`` (the default) is treated
            as an empty filter ``{}``.
        host: MongoDB host name.
        port: MongoDB port.
        username: Optional user name (used only together with ``password``).
        password: Optional password (used only together with ``username``).
        no_id: When true, drop the ``_id`` column from the result if present.

    Returns:
        A ``pandas.DataFrame`` built from the list of documents the cursor
        yields. It has no rows and no columns when nothing matches.
    """
    # Connect to MongoDB
    database = _connect_mongo(host=host, port=port, username=username, password=password, db=db)
    try:
        # A fresh dict per call avoids sharing a mutable default argument.
        cursor = database[collection].find({} if query is None else query)
        # Materialise the cursor while the connection is still open.
        df = pd.DataFrame(list(cursor))
    finally:
        # Every call opens its own client, so release it once the data is read.
        database.client.close()
    # An empty result has no columns at all, so only drop '_id' if it exists.
    if no_id and '_id' in df.columns:
        del df['_id']
    return df

def parse_results(input_df, model_name):
    """Filter logged runs down to one model and flatten their results.

    Helps when many models are stored in one database or the database
    gets messy.

    Args:
        input_df: pandas DataFrame with one row per run. Each row is read
            through its ``config``, ``experiment`` and ``result`` fields.
        model_name: Identifier of the model to keep. It is compared, as a
            string, against ``config["model_name"]`` of every row.

    Returns:
        A pandas DataFrame with columns ``accuracy``, ``sensitivity``,
        ``specificity``, ``auc`` (taken from ``result[0]`` .. ``result[3]``)
        and ``config``, holding one row per matching run whose results could
        be read. Rows are numbered consecutively from 0.

    Rows that cannot be read, or that match but carry no usable results,
    are reported with ``print`` and skipped.
    """
    output_df = pd.DataFrame(columns=['accuracy','sensitivity','specificity','auc','config'])
    next_row = 0

    for index, row in input_df.iterrows():
        try:
            if str(model_name) != str(row.config["model_name"]):
                continue
            # NOTE(review): the experiment name is not used afterwards; the
            # lookup is kept so that rows lacking it are still reported as
            # unreadable, exactly as before.
            _ = row.experiment["name"]
            results = row.result
        except Exception:
            # Narrower than a bare ``except`` so that KeyboardInterrupt and
            # SystemExit are no longer swallowed.
            print("cant read entry at index", str(index))
            continue

        try:
            output_df.loc[next_row] = [results[0], results[1], results[2], results[3], row.config]
        except Exception:
            print('no results for', row.config["model_name"], 'at index', str(index))
        else:
            next_row += 1

    return output_df

def make_matrix(df, metric, step=10, gap_times=None, num_tune_samples=None):
    """Arrange one metric into a (lag x tune-samples) grid.

    Args:
        df: pandas DataFrame with ``lag`` and ``tune_samples`` columns plus
            the column named by ``metric``.
        metric: Name of the column whose values fill the grid.
        step: Spacing of the default grids (see below).
        gap_times: Lag values, one per matrix row. Defaults to
            ``range(10, 61, step)``.
        num_tune_samples: Tune-sample counts, one per matrix column.
            Defaults to ``range(0, 51, step)``.

    Returns:
        A float ``numpy`` array of shape
        ``(len(gap_times), len(num_tune_samples))``. Cell ``[i, j]`` holds the
        metric of the single row where ``lag == gap_times[i]`` and
        ``tune_samples == num_tune_samples[j]``. Cells with zero or several
        matching rows, or with a non-numeric value, are reported with
        ``print`` and left at 0.
    """
    if gap_times is None:
        gap_times = list(range(10, 61, step))
    if num_tune_samples is None:
        num_tune_samples = list(range(0, 51, step))

    matrix = np.zeros((len(gap_times), len(num_tune_samples)))
    print(matrix.shape)

    for i, gap in enumerate(gap_times):
        for j, samples in enumerate(num_tune_samples):
            matches = df[(df.lag == gap) & (df.tune_samples == samples)][metric]
            try:
                if len(matches) != 1:
                    raise ValueError("expected exactly one matching row")
                # Take the scalar explicitly rather than relying on implicit
                # conversion of a one-element Series.
                matrix[i, j] = matches.iloc[0]
            except (TypeError, ValueError):
                print(matches)
                print("messed up value...")
                matrix[i, j] = 0
    return matrix

In [2]:
# Load every document of the "runs" collection in the "brandon_sandbox"
# database into a single dataframe
df_whole = read_mongo(db="brandon_sandbox", collection="runs")

In [3]:
# Keep only the runs whose configured model name is 'TEST'
df_results = parse_results(input_df=df_whole, model_name='TEST')

cant read entry at index 18
cant read entry at index 19
cant read entry at index 20
cant read entry at index 21
no results for TEST at index 22
no results for TEST at index 24
no results for TEST at index 27
no results for TEST at index 35
no results for TEST at index 119


* Note: The `parse_results` function filters out entries that 1) are erroneous or 2) do not have logged results. This can happen for a number of reasons, such as model training being terminated before it finished running.

In [4]:
# Preview the first five filtered runs
df_results.head(n=5)

Unnamed: 0,accuracy,sensitivity,specificity,auc,config
0,0.72,0.76,0.68,0.7584,"{'batch_size': 64, 'checkpoint_dir': '/mnt/dat..."
1,0.766129,0.709677,0.822581,0.819459,"{'batch_size': 64, 'checkpoint_dir': '/mnt/dat..."
2,0.717742,0.854839,0.580645,0.730749,"{'batch_size': 64, 'checkpoint_dir': '/mnt/dat..."
3,0.758065,0.903226,0.612903,0.787201,"{'batch_size': 64, 'checkpoint_dir': '/mnt/dat..."
4,0.766129,0.870968,0.66129,0.80333,"{'batch_size': 64, 'checkpoint_dir': '/mnt/dat..."


* Note: `config` is a dict that contains the parameters used in that particular experiment.