In [1]:
from collections import defaultdict

import dateutil
import matplotlib.pyplot as plt
import pandas as pd
import pymongo

import homer_db

First, open a connection to MongoDB with pymongo; the default address is localhost:27017.

In [2]:
# Client for the MongoDB server on this machine; every later cell reads through `conn`.
conn = pymongo.MongoClient(host='localhost', port=27017)

Use the cell below to re-load the sample data into the database. Because it passes drop_all=True, all data currently in the database will be deleted before the data in 'DataEngineerSampleData.txt' is loaded in.

In [None]:
# Destructive step (drop_all=True): the reload only runs when the answer is
# exactly 'YES'; any other input leaves the database untouched.
confirm = raw_input('Confirm re-load data (YES/No)?')
if confirm == 'YES':
    homer_db.load_data(
        'DataEngineerSampleData.txt',
        db=conn['homer2'],
        drop_all=True,
    )

In [5]:
# Count the users of each age in homer2.users and plot the counts as a bar chart.
pipeline = [
    # Keep only users that have a name and an age between 0 and 100 inclusive.
    {'$match': {'age': {'$gte': 0, '$lte': 100}, 'name': {'$exists': True}}},
    # One output document per distinct age: {'_id': <age>, 'count': <number of users>}.
    {'$group': {'_id': '$age', 'count': {'$sum': 1}}},
    # After $group the age lives in '_id' (there is no 'age' field any more),
    # so the sort has to target '_id' to actually order the results by age.
    {'$sort': {'_id': 1}}
]
age_counts = conn['homer2']['users'].aggregate(pipeline)
age_count_pairs = [(age_count['_id'], age_count['count']) for age_count in age_counts]
# zip(*[]) yields nothing to unpack, so fall back to empty sequences when no user matched.
ages, counts = zip(*age_count_pairs) if age_count_pairs else ((), ())
plt.bar(ages, counts)
plt.title('Age Distribution')
plt.xlabel('Age')
plt.ylabel('Count')
plt.show()

In [None]:
# Build a per-age count table (columns 'Age' and 'Count') from homer.events.
# NOTE(review): other cells read conn['homer2']; confirm that the 'homer'
# database and its 'events' collection are the intended source here.
pipeline = [
    # $exists takes a boolean; the string 'true' only worked because it is truthy.
    {'$match': {'age': {'$gte': 0, '$lte': 100}, 'name': {'$exists': True}}},
    # One output document per distinct age: {'_id': <age>, 'count': <number of documents>}.
    {'$group': {'_id': '$age', 'count': {'$sum': 1}}},
    # After $group the age is stored in '_id', so that is the field to sort on.
    {'$sort': {'_id': 1}}
]
db = conn['homer']
age_counts = db['events'].aggregate(pipeline)
age_counts = [(age_count['_id'], age_count['count']) for age_count in age_counts]
# Naming the columns up front also gives an empty result the 'Age'/'Count'
# columns that the following cells index into.
df = pd.DataFrame(age_counts, columns=['Age', 'Count'])

In [None]:
# Display the row of `df` whose 'Count' is the largest.
df.loc[df['Count'].idxmax(), :]

In [None]:
def _weighted_age_stats(age_table):
    """Return (mean, sample standard deviation) of age, weighting each row by 'Count'.

    `age_table` holds one row per distinct age, so the plain mean/std of the
    'Age' column would describe the set of distinct ages, not the records
    behind them. Weighting by 'Count' gives the statistics of the underlying
    population. NaN is returned where a statistic is undefined (no records
    for the mean, fewer than two records for the standard deviation).
    """
    nan = float('nan')
    n_records = float(age_table['Count'].sum())
    if n_records == 0:
        return nan, nan
    mean = (age_table['Age'] * age_table['Count']).sum() / n_records
    if n_records < 2:
        return mean, nan
    squared_dev = age_table['Count'] * (age_table['Age'] - mean) ** 2
    # n - 1 in the denominator: same sample-std convention as pandas' .std().
    return mean, (squared_dev.sum() / (n_records - 1)) ** 0.5


# "Clean" rows are those with 0 < Age < 10.
# NOTE(review): other cells in this notebook use 1 <= age <= 10 (inclusive);
# confirm whether age 10 is meant to be excluded here.
df2 = df[(df['Age'] > 0) & (df['Age'] < 10)]
unclean_mean, unclean_std = _weighted_age_stats(df)
clean_mean, clean_std = _weighted_age_stats(df2)

In [None]:
# Bar chart of the per-age counts held in `df`, drawn on the current axes.
ax = plt.gca()
ax.bar(df['Age'], df['Count'])
ax.set_title('Age Distribution')
ax.set_xlabel('Age')
ax.set_ylabel('Count')
plt.show()

In [None]:
# NOTE: get_lesson_histogram is defined in a later cell of this notebook; on a
# fresh kernel that cell has to be run before this one.
session_times = get_lesson_histogram(db=conn['homer'])

In [None]:
def get_lesson_histogram(db, unit_category=None, sample_fraction=0.025, seed=None):
    """Histogram the session times of a random sample of users aged 1 to 10.

    Parameters
    ----------
    db
        Database handle. ``db['users'].find(...)`` is called here, and the
        handle is passed on to ``homer_db.generate_session_times``.
    unit_category : list, optional
        Passed through to ``homer_db.generate_session_times``. When omitted,
        a fresh ``['LTR', 'DTW']`` list is used for each call.
    sample_fraction : float, optional
        Each matching user is kept independently with this probability, so
        roughly ``sample_fraction * number_of_users`` users are processed.
    seed : optional
        When given, the sample is drawn from a private ``random.Random(seed)``
        and is therefore reproducible. When omitted, the module-level
        ``random.random`` is used, as before.

    Returns
    -------
    collections.defaultdict
        Maps a session time, floored to a multiple of 60, to the number of
        sessions in that bucket (presumably seconds floored to whole
        minutes; confirm against generate_session_times).

    Progress is printed to stdout: first the user count and the expected
    sample size, then a percentage after each sampled user.
    """
    import random

    if unit_category is None:
        unit_category = ['LTR', 'DTW']
    draw = random.random if seed is None else random.Random(seed).random

    unique_users = [user['_p'] for user in db['users'].find({'age': {'$gte': 1, '$lte': 10}})]
    session_times = defaultdict(int)
    num_users = len(unique_users)
    print('%s > %s' % (num_users, num_users * sample_fraction))
    for i, _p in enumerate(unique_users):
        if draw() > sample_fraction:
            continue
        for _, st in homer_db.generate_session_times(db, _p, unit_category):
            # Explicit floor division: the bucketing must not depend on
            # Python 2's integer '/' (true division would make this a no-op).
            st = st // 60 * 60
            session_times[st] += 1
        print((i / float(num_users)) * 100)
    return session_times

In [None]:
from collections import defaultdict

# Re-bucket the histogram: each key of `session_times` is divided by 60,
# rounded and shifted up by one; buckets of 60 and above are dropped.
session_times_minutes = defaultdict(int)
for raw_time, occurrences in session_times.iteritems():
    minute_bucket = round(raw_time / 60) + 1
    if minute_bucket >= 60:
        continue
    session_times_minutes[minute_bucket] += occurrences

In [None]:
# Plot the share of sessions that falls into each minute bucket.
total = sum(session_times_minutes.values())
# Sorted by bucket so `times` and `counts` come out in ascending time order.
fractions = [(minute, count / float(total))
             for minute, count in sorted(session_times_minutes.items())]
# zip(*[]) yields nothing to unpack, so use empty sequences when there is no data.
times, counts = zip(*fractions) if fractions else ((), ())
plt.bar(times, counts)
plt.title('Lesson Time Distribution')
# The bars are per-minute-bucket fractions of the total, not raw counts, so
# the axis labels say so.
plt.xlabel('Time (minutes)')
plt.ylabel('Fraction of sessions')
plt.show()

In [None]:
# Print the first minute bucket at which the running total of sessions reaches
# 99.9% of all counted sessions.
target = 0.999 * total
cumulative = 0
# The running total only means "sessions up to bucket t" when the buckets are
# visited in ascending order; plain dict iteration order is not guaranteed to
# be sorted, so sort the keys explicitly.
for t in sorted(session_times_minutes):
    cumulative += session_times_minutes[t]
    if cumulative >= target:
        print(t)
        break

In [3]:
# Print what homer_db.get_average_times reports for 'EQYWS29T8S' (presumably a
# user id; confirm against homer_db), with count_outliers switched on.
average_times = homer_db.get_average_times(conn['homer2'], 'EQYWS29T8S', count_outliers=True)
print(average_times)

{'monthly': 89.0, 'outliers': [17, 4], 'daily': 4.0, 'weekly': 30.0}


In [6]:
def get_lessons_with_ages(db):
    """Run an aggregation on ``db['users2']`` that groups lesson data by user age.

    Stages, in order:
      1. keep documents whose ``age`` is between 1 and 10 inclusive;
      2. join the ``lessons`` collection on ``_p`` into a ``lessondata`` array;
      3. keep only ``age``, ``_p`` and the lesson title / unit category;
      4. unwind ``lessondata`` so each joined lesson becomes its own document;
      5. group by age, collecting the distinct manuscript titles and unit
         categories seen for that age.

    Returns whatever ``db['users2'].aggregate(...)`` returns.
    """
    keep_ages_1_to_10 = {'$match': {'age': {'$gte': 1, '$lte': 10}}}
    attach_lessons = {'$lookup': {
        'from': 'lessons',
        'localField': '_p',
        'foreignField': '_p',
        'as': 'lessondata',
    }}
    trim_fields = {'$project': {
        'age': 1,
        '_p': 1,
        'lessondata.manuscripttitle': 1,
        'lessondata.unitcategory': 1,
    }}
    one_doc_per_lesson = {'$unwind': '$lessondata'}
    collect_per_age = {'$group': {
        '_id': '$age',
        'manuscripts': {'$addToSet': '$lessondata.manuscripttitle'},
        'categories': {'$addToSet': '$lessondata.unitcategory'},
    }}
    stages = [
        keep_ages_1_to_10,
        attach_lessons,
        trim_fields,
        one_doc_per_lesson,
        collect_per_age,
    ]
    return db['users2'].aggregate(stages)

In [None]:
# Per-age lesson titles and unit categories, aggregated from the 'homer2' database.
lessons_by_age = get_lessons_with_ages(db=conn['homer2'])