In [1]:
# activating the debugger
%pdb


Automatic pdb calling has been turned ON


In [2]:
# necessary packages
import unicodecsv
from datetime import datetime as dt


In [3]:
def read_csv(file_path):
    """Read a CSV file and return its rows as a list.

    file_path -- path of the CSV file to load.

    Every row is produced by unicodecsv.DictReader, i.e. one dict per data
    line of the file.
    """
    # The file is opened in binary mode and handed to unicodecsv for parsing.
    with open(file_path, 'rb') as csv_file:
        rows = list(unicodecsv.DictReader(csv_file))
    return rows
# Load the three raw tables, in the same order as the file paths below
enrollments, daily_engagement, project_submissions = map(
    read_csv,
    ('../../enrollments.csv',
     '../../daily_engagement.csv',
     '../../project_submissions.csv'))

# Peek at the first row of two of the tables
print(project_submissions[0])
print(daily_engagement[0])

{u'lesson_key': u'3176718735', u'processing_state': u'EVALUATED', u'account_key': u'256', u'assigned_rating': u'UNGRADED', u'completion_date': u'2015-01-16', u'creation_date': u'2015-01-14'}
{u'lessons_completed': u'0.0', u'num_courses_visited': u'1.0', u'total_minutes_visited': u'11.6793745', u'projects_completed': u'0.0', u'acct': u'0', u'utc_date': u'2015-01-09'}


In [4]:
def parse_date(date):
    """Convert a 'YYYY-MM-DD' string into a datetime object.

    date -- the date as a string; an empty string means "no date given".

    Returns None for the empty string, otherwise the parsed datetime.
    """
    return None if date == '' else dt.strptime(date, '%Y-%m-%d')
    
def parse_maybe_int(i):
    """Convert a string holding an integer into an int.

    i -- either an empty string or the text of an integer.

    Returns None for the empty string, otherwise int(i).
    """
    return None if i == '' else int(i)

# Convert the string fields of every enrollment row to proper Python types
for enrollment in enrollments:
    enrollment.update({
        'cancel_date': parse_date(enrollment['cancel_date']),
        'days_to_cancel': parse_maybe_int(enrollment['days_to_cancel']),
        # The flags arrive as the text 'True'/'False'; compare to get a bool.
        'is_canceled': enrollment['is_canceled'] == 'True',
        'is_udacity': enrollment['is_udacity'] == 'True',
        'join_date': parse_date(enrollment['join_date']),
    })

enrollments[0]

{u'account_key': u'448',
 u'cancel_date': datetime.datetime(2015, 1, 14, 0, 0),
 u'days_to_cancel': 65,
 u'is_canceled': True,
 u'is_udacity': True,
 u'join_date': datetime.datetime(2014, 11, 10, 0, 0),
 u'status': u'canceled'}

In [5]:
# Convert the string fields of every engagement row to proper Python types
for engagement_record in daily_engagement:
    # The three counters are stored like '1.0', so go through float before int.
    for counter in ('lessons_completed', 'num_courses_visited', 'projects_completed'):
        engagement_record[counter] = int(float(engagement_record[counter]))
    engagement_record['total_minutes_visited'] = float(engagement_record['total_minutes_visited'])
    engagement_record['utc_date'] = parse_date(engagement_record['utc_date'])

daily_engagement[0]

{u'acct': u'0',
 u'lessons_completed': 0,
 u'num_courses_visited': 1,
 u'projects_completed': 0,
 u'total_minutes_visited': 11.6793745,
 u'utc_date': datetime.datetime(2015, 1, 9, 0, 0)}

In [6]:
# Convert the two date columns of every submission row to datetime objects
for submission in project_submissions:
    for date_field in ('completion_date', 'creation_date'):
        submission[date_field] = parse_date(submission[date_field])

project_submissions[0]

{u'account_key': u'256',
 u'assigned_rating': u'UNGRADED',
 u'completion_date': datetime.datetime(2015, 1, 16, 0, 0),
 u'creation_date': datetime.datetime(2015, 1, 14, 0, 0),
 u'lesson_key': u'3176718735',
 u'processing_state': u'EVALUATED'}

In [7]:
# Rename the engagement table's 'acct' column to 'account_key', the key name
# the enrollments and submissions tables already use.
for engagement_record in daily_engagement:
    # The guard makes this cell safe to re-run: records that were already
    # renamed are skipped instead of raising KeyError on the missing 'acct'.
    if 'acct' in engagement_record:
        engagement_record['account_key'] = engagement_record.pop('acct')
daily_engagement[0]['account_key']

u'0'

In [8]:
def find_unique(table, key='account_key'):
    """
    Collect the distinct values stored under one column of a table.

    inputs:
        - table : list of dict.
        - key : the column whose distinct values are wanted
                (defaults to 'account_key').
    outputs:
        a set of the distinct values found under `key`; take len() of it
        to get the number of unique keys.

    """
    return {row[key] for row in table}

In [9]:
# Size of the enrollments table: total rows, then distinct account keys
enrollment_unique_students = find_unique(enrollments)
enrollment_num_rows = len(enrollments)
enrollment_num_unique_students = len(enrollment_unique_students)
print(enrollment_num_rows)
print(enrollment_num_unique_students)

1640
1302


In [10]:
# Size of the daily_engagement table: total rows, then distinct account keys.
# NOTE: despite the "num" in its name, engagement_num_unique_students holds the
# *set* of distinct account keys; later cells use it for membership tests.
engagement_num_rows = len(daily_engagement)
engagement_num_unique_students = find_unique(daily_engagement)

print(engagement_num_rows)
# Reuse the set computed above rather than scanning the whole table again.
print(len(engagement_num_unique_students))

136240
1237


In [11]:
# Size of the project_submissions table: total rows, then distinct account keys.
# NOTE: despite the "num" in its name, submission_num_unique_students holds the
# set of distinct account keys; its size is what gets printed.
submission_num_unique_students = find_unique(project_submissions)
submission_num_rows = len(project_submissions)
print(submission_num_rows)
print(len(submission_num_unique_students))

3642
743


In [12]:
# Enrollment rows whose student never appears in the daily_engagement table.
# Each matching row `e` is collected into only_in_enrollment itself; the
# previous version appended a stale `enrollment` variable to an undefined list.
only_in_enrollment = [
    e for e in enrollments
    if e['account_key'] not in engagement_num_unique_students
]
print(len(only_in_enrollment))

71


In [14]:
# Of the enrollments with no engagement records, keep those whose cancel_date
# differs from their join_date. A cancel_date of None (no cancel date given)
# also differs from the join date, so such rows are kept as well.
# The matching row `e` is stored, not the `enrollment` variable left over
# from an earlier loop.
not_a_day_enrollment = [
    e for e in enrollments
    if e['account_key'] not in engagement_num_unique_students
    and e['join_date'] != e['cancel_date']
]
print(len(not_a_day_enrollment))

3


In [None]:
# Account keys of every enrollment flagged as a Udacity test account
udacity_test_accounts = {
    enrollment['account_key']
    for enrollment in enrollments
    if enrollment['is_udacity']
}
len(udacity_test_accounts)

In [None]:
# Drops every record that belongs to a Udacity test account
def remove_udacity_accounts(data):
    """Return the records of `data` that are not from a Udacity test account.

    data -- iterable of dicts, each with an 'account_key' entry.

    A record is kept when its account_key is absent from the module-level
    `udacity_test_accounts` collection. The result is a new list; `data`
    itself is not modified.
    """
    return [
        record for record in data
        if record['account_key'] not in udacity_test_accounts
    ]

In [None]:
# Strip the Udacity test accounts out of each of the three tables
non_udacity_enrollments = remove_udacity_accounts(enrollments)
non_udacity_engagement = remove_udacity_accounts(daily_engagement)
non_udacity_submissions = remove_udacity_accounts(project_submissions)

# Report how many rows remain: enrollments, engagement, submissions
for remaining in (non_udacity_enrollments, non_udacity_engagement, non_udacity_submissions):
    print(len(remaining))