In [25]:
height_list = [72, 74, 65, 68]
weight_list = [175, 155, 144, 123]

In [26]:
# Doesn't work the Pythonic way
weight / height ** 2

NameError: name 'weight' is not defined

In [27]:
def calc_bmi(w, h):
    return w / h ** 2

list(map(calc_bmi, weight, height))

NameError: name 'weight' is not defined

In [28]:
import numpy as np

In [29]:
np_height = np.array(height_list)
np_weight = np.array(weight_list)

In [30]:
bmi = np_weight / np_height ** 2 * 1000
bmi

array([33.75771605, 28.30533236, 34.08284024, 26.60034602])

In [31]:
np.array([1, 'string', True])

array(['1', 'string', 'True'], dtype='<U21')

In [32]:
height_list + weight_list

[72, 74, 65, 68, 175, 155, 144, 123]

In [33]:
np_weight + np_height

array([247, 229, 209, 191])

In [34]:
new_array = np.array([1, 2, 3])
new_array + new_array

array([2, 4, 6])

In [35]:
bmi[1]

28.305332359386412

In [36]:
bmi > 30

array([ True, False,  True, False])

In [37]:
bmi[bmi > 30]

array([33.75771605, 34.08284024])

## 2D Arrays

In [38]:
import random

np_2d = np.array(([random.randint(0, 10) for i in range(0, 10)],
                [random.randint(0, 10) for i in range(0, 10)]))

np_2d

array([[ 1,  9,  3,  8,  8,  4,  4, 10,  5,  5],
       [ 9, 10,  0, 10,  5,  4,  7,  3,  2,  8]])

In [39]:
np_2d.shape

(2, 10)

In [40]:
x = np.array([
    [28, 82],
    [24, 42],
    [12, 21],
    [65, 65],
    [90, 9]
])

In [41]:
np.mean(x[:, 1])

43.8

In [42]:
np.mean(x[0, :])

55.0

# Analyzing Data

## 1. Get the data

In [43]:
import json

with open("stuff.json", 'rb') as f:
    data = json.load(f)
    
data['1']['stuff']

FileNotFoundError: [Errno 2] No such file or directory: 'stuff.json'

In [44]:
import unicodecsv

enrollments = []
# f = open('enrollments.csv', 'rb')
# reader = unicodecsv.DictReader(f)
# for i in reader:
#     enrollments.append(i)
# f.close()

# Automatically closes the file
with open('enrollments.csv', 'rb') as f:
    reader = unicodecsv.DictReader(f)
    enrollments = list(reader)

enrollments[0]

OrderedDict([('account_key', '448'),
             ('status', 'canceled'),
             ('join_date', '2014-11-10'),
             ('cancel_date', '2015-01-14'),
             ('days_to_cancel', '65'),
             ('is_enrolled', 'True'),
             ('is_canceled', 'True')])

In [45]:
engagements = []
with open('student_engagement.csv', 'rb') as f:
    reader = unicodecsv.DictReader(f)
    engagements = list(reader)
    
engagements[0]

OrderedDict([('acct', '0'),
             ('utc_date', '2015-01-09'),
             ('num_courses_visited', '1.0'),
             ('total_minutes_visited', '11.6793745'),
             ('lessons_completed', '0.0'),
             ('projects_completed', '0.0')])

In [46]:
submissions = []
with open('submissions.csv', 'rb') as f:
    reader = unicodecsv.DictReader(f)
    submissions = list(reader)

submissions[0]

OrderedDict([('creation_date', '2015-01-14'),
             ('completion_date', '2015-01-16'),
             ('assigned_rating', 'UNGRADED'),
             ('account_key', '256'),
             ('lesson_key', '3176718735'),
             ('processing_state', 'EVALUATED')])

## 2. Cleaning the Data

In [47]:
# fix date types

from datetime import datetime as dt

def convert_date(date):
    if date == '':
        return None
    else:
        return dt.strptime(date, '%Y-%m-%d')
    
def convert_int(integer):
    if integer == '':
        return None
    else:
        return int(integer)

In [48]:
for e in enrollments:
    e['join_date'] = convert_date(e['join_date'])
    e['cancel_date'] = convert_date(e['cancel_date'])
    e['days_to_cancel'] = convert_int(e['days_to_cancel'])
    e['is_enrolled'] = e['is_enrolled'] == str(True)
    e['is_canceled'] = e['is_canceled'] == str(True)

In [49]:
enrollments[2]

OrderedDict([('account_key', '448'),
             ('status', 'canceled'),
             ('join_date', datetime.datetime(2015, 1, 27, 0, 0)),
             ('cancel_date', datetime.datetime(2015, 1, 27, 0, 0)),
             ('days_to_cancel', 0),
             ('is_enrolled', True),
             ('is_canceled', True)])

In [50]:
for e in engagements:
    e['utc_date'] = convert_date(e['utc_date'])
    e['num_courses_visited'] = int(float(e['num_courses_visited']))
    e['lessons_completed'] = int(float(e['lessons_completed']))
    e['projects_completed'] = int(float(e['projects_completed']))
    e['total_minutes_visited'] = float(e['total_minutes_visited'])

In [51]:
engagements[4]

OrderedDict([('acct', '0'),
             ('utc_date', datetime.datetime(2015, 1, 13, 0, 0)),
             ('num_courses_visited', 1),
             ('total_minutes_visited', 64.7796776667),
             ('lessons_completed', 0),
             ('projects_completed', 0)])

In [52]:
for s in submissions:
    s['creation_date'] = convert_date(s['creation_date'])
    s['completion_date'] = convert_date(s['completion_date'])

In [53]:
submissions[4]

OrderedDict([('creation_date', datetime.datetime(2015, 2, 17, 0, 0)),
             ('completion_date', datetime.datetime(2015, 3, 3, 0, 0)),
             ('assigned_rating', 'INCOMPLETE'),
             ('account_key', '434'),
             ('lesson_key', '3176718735'),
             ('processing_state', 'EVALUATED')])

In [54]:
len(enrollments)

1640

In [55]:
len(engagements)

136240

In [56]:
# Run this only once!
# Change the engagements['acct'] to engagements['account_key']

# for e in engagements:
#     e['account_key'] = e['acct']
#     del[e['acct']]

In [57]:
# Using DRY principles, create a function to get unique data across
# all data sets

def unique_info(data):
    unique_info = set()
    for i in data:
        unique_info.add(i['account_key'])
    return unique_info

In [59]:
print(f"Unique engagements: {len(unique_info(engagements))}")
print(f"Unique enrollments: {len(unique_info(enrollments))}")
print(f"Unique submissions: {len(unique_info(submissions))}")

KeyError: 'account_key'

In [60]:
# More unique enrollments than there are engagements
len(unique_info(enrollments)) - len(unique_info(engagements))
# list(unique_)

KeyError: 'account_key'

In [246]:
# num of students who were active longer than a day
num_students = 0
for i in range(len(enrollments)):
    if enrollments[i]['join_date'] != enrollments[i]['cancel_date']:
        num_students += 1
    
num_students

1548

In [276]:
# Find how many people who remained active for longer than 7 days
num_people = 0
for e in enrollments:
    if e['days_to_cancel'] is not None and e['days_to_cancel'] > 7:
        num_people+=1

num_people

574

In [304]:
# Find average amount of people who remain active for longer than 7 days
under_7 = []
above_7 = []
for e in enrollments:
    if e['days_to_cancel'] is not None and e['days_to_cancel'] < 7:
        under_7.append(e['days_to_cancel'])
    if e['days_to_cancel'] is not None and e['days_to_cancel'] > 7:
        above_7.append(e['days_to_cancel'])
under_7 = len(under_7)
print(f"Less than 7 days: {under_7}")
above_7 = len(above_7)
print(f"More than 7 days: {above_7}")
average = above_7 // (under_7 + above_7)
print(f"Average of students active longer than 7 days: {average}")

Less than 7 days: 374
More than 7 days: 574
Average of students active longer than 7 days: 0


In [278]:
unique_info(enrollments)

{'1304',
 '18',
 '743',
 '706',
 '1190',
 '1289',
 '1118',
 '977',
 '1233',
 '917',
 '1009',
 '664',
 '667',
 '897',
 '380',
 '284',
 '1229',
 '1159',
 '391',
 '425',
 '248',
 '15',
 '1255',
 '328',
 '472',
 '1274',
 '82',
 '254',
 '1130',
 '97',
 '124',
 '1177',
 '85',
 '67',
 '639',
 '944',
 '1061',
 '412',
 '1111',
 '244',
 '397',
 '629',
 '896',
 '441',
 '125',
 '607',
 '462',
 '353',
 '950',
 '912',
 '394',
 '1022',
 '122',
 '222',
 '266',
 '713',
 '344',
 '1273',
 '260',
 '592',
 '103',
 '204',
 '347',
 '181',
 '825',
 '697',
 '894',
 '510',
 '418',
 '497',
 '778',
 '999',
 '1049',
 '798',
 '882',
 '180',
 '603',
 '522',
 '131',
 '1035',
 '651',
 '573',
 '1281',
 '1235',
 '226',
 '1213',
 '571',
 '77',
 '837',
 '268',
 '1194',
 '630',
 '982',
 '990',
 '58',
 '238',
 '155',
 '118',
 '453',
 '429',
 '1045',
 '1180',
 '146',
 '148',
 '1046',
 '733',
 '362',
 '403',
 '741',
 '488',
 '737',
 '1104',
 '947',
 '626',
 '363',
 '4',
 '1231',
 '300',
 '24',
 '30',
 '376',
 '305',
 '773',
 