In [69]:
# Sample measurements for four people kept as parallel lists: element i of
# each list is used together with element i of the other further down.
# Units are not stated here — presumably inches and pounds; TODO confirm.
height_list = [72, 74, 65, 68]
weight_list = [175, 155, 144, 123]

In [70]:
# Plain Python lists do not support element-wise arithmetic, so squaring a
# list raises a TypeError. The error is caught and printed (rather than left
# uncaught) so that "Restart & Run All" can continue past this demonstration.
try:
    weight_list / height_list ** 2
except TypeError as exc:
    print(f"{type(exc).__name__}: {exc}")

TypeError: unsupported operand type(s) for ** or pow(): 'list' and 'int'

In [None]:
def calc_bmi(w, h):
    """Return the ratio of ``w`` to the square of ``h``.

    Args:
        w: Weight value (numerator).
        h: Height value; it is squared to form the denominator.

    Returns:
        ``w / h ** 2``. No unit-conversion factor is applied.
    """
    height_squared = h ** 2
    return w / height_squared

# Apply calc_bmi pairwise to the weight and height lists — the pure-Python
# way of doing the element-wise calculation that lists do not support.
list(map(calc_bmi, weight_list, height_list))

In [None]:
import numpy as np

In [None]:
# Convert the plain lists to NumPy arrays so that arithmetic on them is
# applied element by element.
np_height = np.array(height_list)
np_weight = np.array(weight_list)

In [None]:
# Element-wise weight / height**2 for every person at once, scaled by 1000.
# NOTE(review): 1000 is not a standard BMI constant (the imperial formula
# uses 703) and calc_bmi applies no factor at all — confirm the intended
# units and scaling.
bmi = np_weight / np_height ** 2 * 1000
bmi

In [None]:
# A NumPy array holds a single element type: mixing an int, a str and a bool
# makes NumPy coerce every element to a string.
np.array([1, 'string', True])

In [None]:
# `+` on Python lists concatenates them into one longer list; it does not
# add the elements pairwise.
height_list + weight_list

In [None]:
# `+` on NumPy arrays adds the elements pairwise (unlike list concatenation).
np_weight + np_height

In [None]:
# Adding an array to itself doubles each element.
new_array = np.array([1, 2, 3])
new_array + new_array

In [None]:
# Select a single element; indexing is zero-based, so this is the second one.
bmi[1]

In [None]:
# Element-wise comparison: produces a boolean array, one flag per element.
bmi > 30

In [None]:
# Boolean-mask indexing: keep only the elements of bmi that exceed 30.
bmi[bmi > 30]

## 2D Arrays

In [None]:
import random

# Build a 2 x 10 array: two rows, each holding ten random integers between
# 0 and 10 (random.randint includes both endpoints). No seed is set, so the
# values differ from run to run.
np_2d = np.array(tuple(
    [random.randint(0, 10) for _ in range(10)]
    for _ in range(2)
))

np_2d

In [None]:
# Dimensions of the array as a (rows, columns) tuple.
np_2d.shape

In [None]:
# Sample 2-D array: five rows, two integer columns.
x = np.array([[28, 82], [24, 42], [12, 21], [65, 65], [90, 9]])

In [None]:
# Mean of the second column (all rows, column index 1).
x[:, 1].mean()

In [None]:
# Mean of the first row (row index 0, all columns).
x[0].mean()

# Analyzing Data

## 1. Getting the Data

In [185]:
import json

# Parse stuff.json (opened in binary mode) into `data`; the with-block closes
# the file as soon as parsing is done.
with open("stuff.json", 'rb') as json_file:
    data = json.load(json_file)

# Show the value stored under key '1' -> 'stuff'.
data['1']['stuff']

['Hello', 'There']

In [149]:
import unicodecsv

def read_csv(filename):
    """Read a CSV file into a list of row dictionaries.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        A list with one mapping per data row, keyed by the column names
        taken from the file's header row. All values are strings.
    """
    # The file is opened in binary mode for unicodecsv; the with-block
    # closes it automatically, even if parsing raises.
    with open(filename, 'rb') as f:
        return list(unicodecsv.DictReader(f))


enrollments = read_csv('enrollments.csv')

# Peek at the first record to see the available columns.
enrollments[0]

OrderedDict([('account_key', '448'),
             ('status', 'canceled'),
             ('join_date', '2014-11-10'),
             ('cancel_date', '2015-01-14'),
             ('days_to_cancel', '65'),
             ('is_enrolled', 'True'),
             ('is_canceled', 'True')])

In [150]:
# Load every row of student_engagement.csv as a dictionary keyed by the
# header names; the with-block closes the file automatically.
engagements = []
with open('student_engagement.csv', 'rb') as csv_file:
    engagements = list(unicodecsv.DictReader(csv_file))

# Peek at the first record to see the available columns.
engagements[0]

OrderedDict([('acct', '0'),
             ('utc_date', '2015-01-09'),
             ('num_courses_visited', '1.0'),
             ('total_minutes_visited', '11.6793745'),
             ('lessons_completed', '0.0'),
             ('projects_completed', '0.0')])

In [151]:
# Load every row of submissions.csv as a dictionary keyed by the header
# names; the with-block closes the file automatically.
submissions = []
with open('submissions.csv', 'rb') as csv_file:
    submissions = list(unicodecsv.DictReader(csv_file))

# Peek at the first record to see the available columns.
submissions[0]

OrderedDict([('creation_date', '2015-01-14'),
             ('completion_date', '2015-01-16'),
             ('assigned_rating', 'UNGRADED'),
             ('account_key', '256'),
             ('lesson_key', '3176718735'),
             ('processing_state', 'EVALUATED')])

## 2. Cleaning the Data

In [152]:
# fix date types

from datetime import datetime as dt

def convert_date(date):
    """Parse a ``YYYY-MM-DD`` string into a datetime.

    Args:
        date: Date text in ``%Y-%m-%d`` format, or an empty string.

    Returns:
        The parsed ``datetime`` (time part is midnight), or ``None`` when
        ``date`` is the empty string.
    """
    # Empty CSV fields stand for "no date".
    if date == '':
        return None
    return dt.strptime(date, '%Y-%m-%d')
    
def convert_int(integer):
    """Convert integer text to an ``int``.

    Args:
        integer: Text accepted by ``int()``, or an empty string.

    Returns:
        The converted ``int``, or ``None`` when ``integer`` is the empty
        string.
    """
    return None if integer == '' else int(integer)

In [153]:
# Replace the raw CSV strings of every enrollment record with typed values,
# overwriting the record in place.
# NOTE: not idempotent — convert_date hands its argument to strptime, so
# re-running this cell on already-converted records raises.
for enrollment in enrollments:
    for date_field in ('join_date', 'cancel_date'):
        enrollment[date_field] = convert_date(enrollment[date_field])
    enrollment['days_to_cancel'] = convert_int(enrollment['days_to_cancel'])
    # Flags arrive as text; only the exact text 'True' maps to True.
    for flag_field in ('is_enrolled', 'is_canceled'):
        enrollment[flag_field] = enrollment[flag_field] == str(True)

In [154]:
# Spot-check one enrollment record after the type conversion.
enrollments[2]

OrderedDict([('account_key', '448'),
             ('status', 'canceled'),
             ('join_date', datetime.datetime(2015, 1, 27, 0, 0)),
             ('cancel_date', datetime.datetime(2015, 1, 27, 0, 0)),
             ('days_to_cancel', 0),
             ('is_enrolled', True),
             ('is_canceled', True)])

In [155]:
# Replace the raw CSV strings of every engagement record with typed values,
# overwriting the record in place.
for engagement in engagements:
    engagement['utc_date'] = convert_date(engagement['utc_date'])
    # Counts are stored as float-formatted text (e.g. '1.0'), so parse them
    # as float first and then truncate to int.
    for count_field in ('num_courses_visited',
                        'lessons_completed',
                        'projects_completed'):
        engagement[count_field] = int(float(engagement[count_field]))
    engagement['total_minutes_visited'] = float(engagement['total_minutes_visited'])

In [156]:
# Spot-check one engagement record after the type conversion.
engagements[4]

OrderedDict([('acct', '0'),
             ('utc_date', datetime.datetime(2015, 1, 13, 0, 0)),
             ('num_courses_visited', 1),
             ('total_minutes_visited', 64.7796776667),
             ('lessons_completed', 0),
             ('projects_completed', 0)])

In [157]:
# Parse both date columns of every submission record, overwriting the
# string values in place.
for submission in submissions:
    for date_field in ('creation_date', 'completion_date'):
        submission[date_field] = convert_date(submission[date_field])

In [158]:
# Spot-check one submission record after the date conversion.
submissions[4]

OrderedDict([('creation_date', datetime.datetime(2015, 2, 17, 0, 0)),
             ('completion_date', datetime.datetime(2015, 3, 3, 0, 0)),
             ('assigned_rating', 'INCOMPLETE'),
             ('account_key', '434'),
             ('lesson_key', '3176718735'),
             ('processing_state', 'EVALUATED')])

In [159]:
# Total number of enrollment rows (not unique accounts).
len(enrollments)

1640

In [160]:
# Total number of engagement rows (not unique accounts).
len(engagements)

136240

In [162]:
# Rename the 'acct' key of every engagement record to 'account_key' so it
# matches the key used by the enrollment and submission records.
# The membership check makes the cell safe to re-run: records that were
# already renamed are skipped instead of raising a KeyError.
for e in engagements:
    if 'acct' in e:
        e['account_key'] = e.pop('acct')

In [170]:
# One shared helper (DRY) for collecting the distinct account keys of any
# of the data sets.

def unique_info(data):
    """Return the set of distinct ``'account_key'`` values in ``data``.

    Args:
        data: Iterable of mappings that each contain an ``'account_key'``
            entry.

    Returns:
        A ``set`` of the account keys seen; empty when ``data`` is empty.
    """
    return {record['account_key'] for record in data}

In [178]:
# Report how many distinct accounts appear in each data set, one per line.
print(
    f"Unique engagements: {len(unique_info(engagements))}",
    f"Unique enrollments: {len(unique_info(enrollments))}",
    f"Unique submissions: {len(unique_info(submissions))}",
    sep="\n",
)

Unique engagements: 1237
Unique enrollments: 1302
Unique submissions: 743


In [188]:
# There are more unique enrolled accounts than unique accounts with
# engagement records; this is the size of that gap (a difference of counts,
# not a set difference).
len(unique_info(enrollments)) - len(unique_info(engagements))

65