In [1]:
import pandas as pd
import json
from collections import Counter

In [2]:
def parse_file(email, log_dir='/tmp/'):
    """Parse a JSON-lines session log into a runs table and a reviews table.

    Every non-blank line of the file is decoded as one JSON object. An object
    that contains a 'query' key is recorded as a run and starts a new
    iteration (numbered from 1). Any other object is recorded as a review and
    tagged with the iteration number of the most recent run seen so far
    (0 if no run has been seen yet).

    Parameters
    ----------
    email : str
        Name of the log file inside ``log_dir``.
    log_dir : str, optional
        Directory containing the log file. Defaults to '/tmp/', the location
        that was previously hard-coded.

    Returns
    -------
    (runs, reviews) : tuple of pandas.DataFrame
        ``runs`` is indexed by 'iteration' and its 'timestamp' column is
        shifted so that the first run is at 0. ``reviews`` keeps one row per
        review object, with an added 'iteration' column.

    Raises
    ------
    KeyError
        If the file contains no run (no object with a 'query' key), because
        the resulting empty frame has no 'iteration' column to index by.
    json.JSONDecodeError
        If a non-blank line is not valid JSON.
    """
    import os  # local import: keeps this cell runnable without touching the imports cell

    run_number = 0
    runs = []
    reviews = []
    with open(os.path.join(log_dir, email), 'r') as file:
        for line in file:
            # Blank lines (e.g. a trailing newline at end of file) are not
            # valid JSON documents; skip them instead of crashing.
            if not line.strip():
                continue
            parsed_line = json.loads(line)
            if 'query' in parsed_line:
                # A run opens a new iteration.
                run_number += 1
                parsed_line['iteration'] = run_number
                runs.append(parsed_line)
            else:
                # A review belongs to the iteration opened by the last run.
                parsed_line['iteration'] = run_number
                reviews.append(parsed_line)

    runs = pd.DataFrame(runs)
    runs = runs.set_index('iteration')

    # Express timestamps relative to the first run.
    start_time = runs.iloc[0]['timestamp']
    runs['timestamp'] = runs['timestamp'] - start_time

    reviews = pd.DataFrame(reviews)
    return runs, reviews

def get_reviews_per_iteration(runs, reviews):
    """Count how many review rows belong to each iteration.

    Parameters
    ----------
    runs : pandas.DataFrame
        One row per run; only its length is used, to know which iteration
        numbers (1..len(runs)) must be present in the result.
    reviews : pandas.DataFrame
        One row per review with an 'iteration' column. May be empty (no
        columns at all), in which case every iteration gets a count of 0.

    Returns
    -------
    pandas.Series
        Review counts keyed by iteration number. Iterations without any
        review are included with a count of 0.
    """
    # A frame built from an empty list has no 'iteration' column; treat that
    # as "no reviews" rather than raising KeyError.
    if 'iteration' in reviews.columns:
        reviews_per_run = Counter(reviews['iteration'])
    else:
        reviews_per_run = Counter()

    # Make sure every run appears, even when nothing was reviewed in it.
    for i in range(1, len(runs) + 1):
        if i not in reviews_per_run:
            reviews_per_run[i] = 0
    return pd.Series(reviews_per_run)

def get_per_iteration_metrics(runs, reviews):
    """Assemble a per-iteration table of review counts, timings and queries.

    The result has four columns:
    'number_objects_reviewed' (from get_reviews_per_iteration),
    'review_time' (difference between consecutive run timestamps, indexed
    1..len(runs)-1, so the last run has no value),
    'backend_time' (the runs' 'execution_time' column) and
    'queries' (the runs' 'query' column).
    """
    timestamps = runs['timestamp'].values
    n_runs = len(runs)

    # Gap between each run and the one that follows it.
    gaps = timestamps[1:] - timestamps[:-1]
    review_time = pd.Series(data=gaps, index=range(1, n_runs))

    backend_time = runs['execution_time']
    queries = runs['query']
    review_counts = get_reviews_per_iteration(runs, reviews)

    # Column-wise concat aligns the four series on their index.
    table = pd.concat(
        [review_counts, review_time, backend_time, queries],
        axis=1,
    )
    table.columns = [
        'number_objects_reviewed',
        'review_time',
        'backend_time',
        'queries',
    ]
    return table

def get_metrics(email):
    """Build the summary metrics dict for the session log named ``email``.

    Parses the log with parse_file and returns a dict with the keys
    'email', 'final_query', 'total_time', 'number_iterations',
    'number_objects_reviewed' and 'per_iteration_metrics'.
    """
    runs, reviews = parse_file(email)
    n_runs = len(runs)

    return {
        # 'email' field of the first run.
        'email': runs['email'].loc[1],
        # Query and relative timestamp of the last run.
        'final_query': runs.loc[n_runs]['query'],
        'total_time': runs.loc[n_runs]['timestamp'],
        # One less than the number of runs.
        'number_iterations': n_runs - 1,
        'number_objects_reviewed': len(reviews),
        'per_iteration_metrics': get_per_iteration_metrics(runs, reviews),
    }

In [3]:
# Name of the session log to analyse; get_metrics passes it to parse_file,
# which reads the file with this name under /tmp/.
email = 'lucisdp@gmail.com'
# Bare last expression so the notebook displays the returned metrics dict.
get_metrics(email)

{'email': 'lucisdp@gmail.com',
 'final_query': "select distinct make, model, price_msrp\nfrom car_fillblank\nwhere transmission LIKE '%automatic%'\n    and fuel_type NOT LIKE '%premium%'\n    and fuel_type not in ('hydrogen', 'diesel fuel')\n    and drive_type != 'front wheel drive'\n    and body_type not in ('coupe', 'minivan')\n    and class NOT like '%compact%'\n    and make not in ('cadillac')\n    and year >= 2017\n    and price_msrp <= 30000\n    and basic_year >= '5'\norder by price_msrp",
 'number_iterations': 9,
 'number_objects_reviewed': 41,
 'per_iteration_metrics':            number_objects_reviewed  review_time  backend_time  \
 iteration                                                       
 1                                2       23.950         0.016   
 2                                9      105.048         0.021   
 3                                4      242.166         0.017   
 4                                5      132.300         0.028   
 5                  