In [3]:
# Sample measurements; the two lists are paired by position in the calculations below.
height = [72, 74, 65, 68]
weight = [165, 150, 144, 178]

In [4]:
# Doesn't work: plain Python lists do not support element-wise arithmetic.
# The TypeError is the point of this cell, so it is caught and displayed rather
# than left uncaught, which would halt a "Restart & Run All".
try:
    weight / height ** 2
except TypeError as err:
    print('TypeError:', err)

TypeError: unsupported operand type(s) for ** or pow(): 'list' and 'int'

In [None]:
# Pythonic method - works, but inefficient
def calc_bmi(w, h):
    """Return w divided by the square of h (no unit scaling is applied)."""
    h_squared = h ** 2
    return w / h_squared

# Pair each weight with the height at the same position and compute one value per pair.
[calc_bmi(w, h) for w, h in zip(weight, height)]

In [None]:
# With numpy array, it can perform calculations over the entire array
import numpy as np

In [None]:
# Convert the height list to a numpy array so arithmetic applies to every element.
np_height = np.array(height)
np_height

In [None]:
# Same conversion for the weight list.
np_weight = np.array(weight)
np_weight

In [None]:
# Element-wise over the arrays: each weight divided by its squared height, then scaled by 1000.
# NOTE(review): calc_bmi above applies no scale factor, and the conventional BMI factor for
# pounds/inches is 703 — confirm the intended units and constant. The `bmi > 33`
# comparisons further down depend on the values produced here.
bmi = np_weight / np_height ** 2 * 1000
bmi

In [None]:
# A numpy array holds a single dtype: mixed inputs are coerced to one common type
# (here the int and the bool end up as strings alongside 'hello').
np.array([1, 'hello', True])

In [None]:
# Small array used below to contrast numpy addition with list addition.
np_array = np.array([1, 2, 3])

In [None]:
# Adding two Python lists concatenates them into one longer list.
p_l = [1, 2, 3]
p_l + p_l

In [None]:
# Adding two numpy arrays of the same shape sums the elements at matching indices.
np_array + np_array

## Indexing, Subsetting, Conditions

In [None]:
# Indexing is zero-based: this is the second element.
bmi[1]

In [None]:
# The comparison is applied element by element and yields a boolean array.
bmi > 33

In [None]:
# Indexing with the boolean array keeps only the elements where the condition is True.
bmi[bmi > 33]

## 2D Arrays

In [None]:
import random

# 2 rows x 10 columns of random integers, each drawn from 1..10 inclusive.
# No seed is set, so the values differ from run to run.
np_2d = np.array([[random.randint(1, 10) for _ in range(10)] for _ in range(2)])

np_2d

In [None]:
# (rows, columns) of the 2D array built above.
np_2d.shape

In [None]:
# 5 rows x 2 columns; each inner list is one row.
# NOTE(review): the cells below treat column 1 as height — confirm what column 0 holds.
x = np.array([[28, 18],
              [34, 14],
              [32, 16],
              [26, 23],
              [23, 17]])

x

In [None]:
# 1. Average height: take every row of column 1, then average that column.
x[:, 1].mean()

In [None]:
# 2. Get the average weight
# Height is read from column 1 above, so weight is the other column (index 0).
# `x[:, 0]` selects that whole column; `x[0, :]` would instead select the first row
# and average one height/weight pair together.
np.mean(x[:, 0])

## 1. Find the Data

In [5]:
import unicodecsv

## Longer version of code (replaced with shorter, equivalent version below)

# enrollments = []
# f = open('enrollments.csv', 'rb')
# reader = unicodecsv.DictReader(f)
# for row in reader:
#     enrollments.append(row)
# f.close()

with open('enrollments.csv', 'rb') as f:
    # DictReader yields one dictionary per CSV row; collect them all into a list.
    enrollments = [row for row in unicodecsv.DictReader(f)]

In [6]:
#####################################
#                 1                 #
#####################################

## Read in the data from daily_engagement.csv and project_submissions.csv 
## and store the results in the below variables.
## Then look at the first row of each table.

def read_csv(filename):
    """Read a CSV file and return its rows as a list of dictionaries.

    filename: path of the CSV file; it is opened in binary mode and handed to
    unicodecsv.DictReader, which converts each row into a dictionary.
    """
    with open(filename, 'rb') as csv_file:
        return list(unicodecsv.DictReader(csv_file))


engagements = read_csv('daily_engagement.csv')
submissions = read_csv('project_submissions.csv')

In [7]:
# First enrollment row as read from the CSV; every value is still a string at this point.
enrollments[0]

OrderedDict([('account_key', '448'),
             ('status', 'canceled'),
             ('join_date', '2014-11-10'),
             ('cancel_date', '2015-01-14'),
             ('days_to_cancel', '65'),
             ('is_enrolled', 'True'),
             ('is_canceled', 'True')])

In [8]:
# First engagement row; note the account column is named 'acct' in this table.
engagements[0]

OrderedDict([('acct', '0'),
             ('utc_date', '2015-01-09'),
             ('num_courses_visited', '1.0'),
             ('total_minutes_visited', '11.6793745'),
             ('lessons_completed', '0.0'),
             ('projects_completed', '0.0')])

In [9]:
# First project-submission row.
submissions[0]

OrderedDict([('creation_date', '2015-01-14'),
             ('completion_date', '2015-01-16'),
             ('assigned_rating', 'UNGRADED'),
             ('account_key', '256'),
             ('lesson_key', '3176718735'),
             ('processing_state', 'EVALUATED')])

## 2. Data Cleaning

In [10]:
# Fix data types

from datetime import datetime as dt

def parse_date(date):
    """Convert a 'YYYY-MM-DD' string into a datetime object.

    An empty string means no date was given and yields None.
    """
    return dt.strptime(date, '%Y-%m-%d') if date != '' else None
    
def parse_maybe_int(i):
    """Convert a string to an int, treating the empty string as a missing value (None)."""
    return None if i == '' else int(i)

In [11]:
# Convert the enrollment columns from strings to proper types, in place.
# NOTE: this is meant to run once per load; the converted values are no longer the
# strings these conversions expect.
for enrollment in enrollments:
    for date_field in ('cancel_date', 'join_date'):
        enrollment[date_field] = parse_date(enrollment[date_field])
    enrollment['days_to_cancel'] = parse_maybe_int(enrollment['days_to_cancel'])
    # The CSV stores booleans as the text 'True' / 'False'.
    for flag_field in ('is_canceled', 'is_enrolled'):
        enrollment[flag_field] = enrollment[flag_field] == 'True'

In [12]:
# Re-inspect the first row to confirm the type conversions took effect.
enrollments[0]

OrderedDict([('account_key', '448'),
             ('status', 'canceled'),
             ('join_date', datetime.datetime(2014, 11, 10, 0, 0)),
             ('cancel_date', datetime.datetime(2015, 1, 14, 0, 0)),
             ('days_to_cancel', 65),
             ('is_enrolled', True),
             ('is_canceled', True)])

In [13]:
# Convert the engagement columns from strings to proper types, in place.
for engagement in engagements:
    # Counts arrive as float-formatted text such as '1.0', so go through float to reach int.
    for count_field in ('lessons_completed', 'num_courses_visited', 'projects_completed'):
        engagement[count_field] = int(float(engagement[count_field]))
    engagement['total_minutes_visited'] = float(engagement['total_minutes_visited'])
    engagement['utc_date'] = parse_date(engagement['utc_date'])

engagements[0]

OrderedDict([('acct', '0'),
             ('utc_date', datetime.datetime(2015, 1, 9, 0, 0)),
             ('num_courses_visited', 1),
             ('total_minutes_visited', 11.6793745),
             ('lessons_completed', 0),
             ('projects_completed', 0)])

In [14]:
# Convert the two date columns of each submission from strings to datetimes, in place.
for submission in submissions:
    for date_field in ('completion_date', 'creation_date'):
        submission[date_field] = parse_date(submission[date_field])

submissions[0]

OrderedDict([('creation_date', datetime.datetime(2015, 1, 14, 0, 0)),
             ('completion_date', datetime.datetime(2015, 1, 16, 0, 0)),
             ('assigned_rating', 'UNGRADED'),
             ('account_key', '256'),
             ('lesson_key', '3176718735'),
             ('processing_state', 'EVALUATED')])

In [15]:
# Think of 5 questions to ask of the data

#### 1. Find unique enrollees

In [43]:
def unique_info(data):
    """Return the set of distinct 'account_key' values found in data.

    data: iterable of dict-like rows, each with an 'account_key' entry.
    Returns a set of the key values (empty when data is empty).
    Raises KeyError if a row has no 'account_key' entry.
    """
    # The set is returned to the caller; building it without returning it would
    # leave the function always yielding None.
    return {row['account_key'] for row in data}

In [19]:
# Total number of enrollment rows (not the number of distinct accounts).
len(enrollments)

1640

In [30]:
# Distinct account keys appearing in the enrollment table.
unique_students = {e['account_key'] for e in enrollments}

len(unique_students)

1302

In [31]:
# Total number of daily-engagement rows.
len(engagements)

136240

In [41]:
# Distinct account keys appearing in the engagement table.
# daily_engagement.csv names this column 'acct' (see engagements[0] above), so fall back
# to it when a row has not been renamed to 'account_key'. Without the fallback, a fresh
# "Restart & Run All" fails here with a KeyError.
unique_engagements = set()
for e in engagements:
    key_name = 'account_key' if 'account_key' in e else 'acct'
    unique_engagements.add(e[key_name])

len(unique_engagements)

1237

In [36]:
# Distinct account keys appearing in the project-submissions table.
unique_submissions = {s['account_key'] for s in submissions}

len(unique_submissions)

743

In [37]:
# Discrepancies
# 1. There are more unique students in the enrollment table than in the engagement table.
# 2. Column naming is inconsistent across the tables ('acct' vs 'account_key'). The
#    unique-count code is also repeated for each table, so it should become a function later.
########## Rename 'acct' to 'account_key', or vice versa

In [42]:
# Rename the engagement 'acct' column to 'account_key' so it matches the other tables.
# The membership check makes this safe to re-run: rows that were already renamed are
# left untouched, so the cell no longer has to be run exactly once and then commented out.
for e in engagements:
    if 'acct' in e:
        e['account_key'] = e.pop('acct')