# Starting intro to data science in Udacity

In [1]:
import unicodecsv

In [47]:
def read_csv(filename):
    with open(filename, "rb") as f:
        reader = unicodecsv.DictReader(f)
        return list(reader)

enrollments = read_csv("enrollments.csv")
daily_engagements = read_csv("daily_engagement_full.csv")
project_submissions = read_csv("project_submissions.csv")

In [3]:
daily_e_small = read_csv('daily_engagement.csv')

In [4]:
from datetime import datetime as dt

def parse_date(date):
    if date == '' or date == None: 
        return None
    else:
        if (type(date) != dt):
            return dt.strptime(date, '%Y-%m-%d')
        else:
            return date
    
def parse_int(num):
    if num == '' or num == None:
        return None
    else:
        if (type(num) != int):
            return int(num)
        else:
            return num
def parse_float(num):
    if num == '' or num == None:
        return None
    else:
        if (type(num) != float):
            return float(num)
        else:
            return num

In [5]:
print (enrollments[0])

{u'status': u'canceled', u'is_udacity': u'True', u'is_canceled': u'True', u'join_date': u'2014-11-10', u'account_key': u'448', u'cancel_date': u'2015-01-14', u'days_to_cancel': u'65'}


In [6]:
print (daily_engagements[0])

{u'lessons_completed': u'0.0', u'sibling_key': u'ud359', u'course_title': u'Intro to Data Science', u'has_visited': u'0.0', u'registration_date': u'2014-08-05', u'course_key': u'ud359-nd', u'total_minutes_visited': u'0.0', u'projects_completed': u'0.0', u'account_key': u'2257038596', u'acct': u'448', u'subscription_start': u'2014-11-05', u'utc_date': u'2014-11-05'}


In [7]:
print (project_submissions[0])

{u'lesson_key': u'3176718735', u'processing_state': u'EVALUATED', u'account_key': u'256', u'assigned_rating': u'UNGRADED', u'completion_date': u'2015-01-16', u'creation_date': u'2015-01-14'}


In [48]:
for enrollment in enrollments:
    enrollment['join_date'] = parse_date(enrollment['join_date'])
    enrollment['cancel_date'] = parse_date(enrollment['cancel_date'])
    enrollment['days_to_cancel'] = parse_int(enrollment['days_to_cancel'])
    enrollment['is_udacity'] = enrollment['is_udacity'] == 'True' 
    enrollment['is_canceled'] = enrollment['is_canceled'] == 'True' 
print (enrollments[0])

{u'status': u'canceled', u'is_udacity': True, u'is_canceled': True, u'join_date': datetime.datetime(2014, 11, 10, 0, 0), u'account_key': u'448', u'cancel_date': datetime.datetime(2015, 1, 14, 0, 0), u'days_to_cancel': 65}


In [49]:
for engagement in daily_engagements:
    engagement['lessons_completed'] = parse_int(float(engagement['lessons_completed']))
    #engagement['num_courses_visited'] = parse_int(float(engagement['num_courses_visited']))
    engagement['total_minutes_visited'] = parse_float(engagement['total_minutes_visited'])
    engagement['projects_completed'] = parse_int(float(engagement['projects_completed']))
    engagement['utc_date'] = parse_date(engagement['utc_date'])
    engagement['has_visited'] = engagement['has_visited'] == '1.0'
    engagement['registration_date'] = parse_date(engagement['registration_date'])
    engagement['total_minutes_visited'] = parse_int(float(engagement['total_minutes_visited']))
    engagement['subscription_start'] = parse_date(engagement['subscription_start'])
print (daily_engagements[0])

{u'lessons_completed': 0, u'sibling_key': u'ud359', u'course_title': u'Intro to Data Science', u'has_visited': False, u'registration_date': datetime.datetime(2014, 8, 5, 0, 0), u'course_key': u'ud359-nd', u'total_minutes_visited': 0, u'projects_completed': 0, u'account_key': u'2257038596', u'acct': u'448', u'subscription_start': datetime.datetime(2014, 11, 5, 0, 0), u'utc_date': datetime.datetime(2014, 11, 5, 0, 0)}


In [130]:
for engagement in daily_e_small:
    engagement['utc_date'] = parse_date(engagement['utc_date'])

In [50]:
for submission in project_submissions:
    submission['completion_date'] = parse_date(submission['completion_date'])
    submission['creation_date'] = parse_date(submission['creation_date'])
print (project_submissions[0])

{u'lesson_key': u'3176718735', u'processing_state': u'EVALUATED', u'account_key': u'256', u'assigned_rating': u'UNGRADED', u'completion_date': datetime.datetime(2015, 1, 16, 0, 0), u'creation_date': datetime.datetime(2015, 1, 14, 0, 0)}


## Questions:
   ### - Paid users tend to complete more project than free users?
   ### - How many users complete a course without login in every day?
   ### - Are free users most likely to drop out the course?
   ### - How many users complete lessons on more than a course in a day?
   ### - Did any user spent time on a free related course in a day?

## Investigating the data:

#### Find the number of rows of each csv:

In [11]:
len(enrollments)

1640

In [12]:
len(daily_engagements)

2309239

In [13]:
len(project_submissions)

3642

#### Find the number of unique students:

In [14]:
def get_unique_student(data):
    data_account_keys = set()
    for record in data:
        data_account_keys.add(record['account_key'])
    return data_account_keys

In [15]:
enrollment_account_keys = get_unique_student(enrollments)
engagement_account_keys = get_unique_student(daily_engagements)
submission_account_keys = get_unique_student(project_submissions)

print (len(enrollment_account_keys))
print (len(engagement_account_keys))
print (len(submission_account_keys))

1302
1237
743


#### Problems with this data: 
    - There are more unique stundents in enrollment than in engagement
    - In the course the is diference between two column names 'account_key' and 'acct' that doen's happens in this dataset

In [16]:
for engagement in daily_e_small:
    engagement['account_key'] = engagement.pop('acct')

In [57]:
for engagement in daily_engagements:
    engagement['account_pkey'] = engagement.pop('acct')

In [18]:
daily_e_small[0]['account_key']

u'0'

#### Why are students missing from daily engagements?
    - Identify surprising data points: Any enrollment record with no corresponding engagement data
    - Print some surprising data points

In [36]:
engagement_keys = set()
surprising_points = []
for engagement in daily_engagements:
    engagement_keys.add(engagement['account_pkey'])
for enrollment in enrollments:
    if enrollment['account_key'] not in engagement_keys:
        surprising_points.append(enrollment)

In [37]:
len(surprising_points)

71

In [35]:
surprising_points[0]

{u'account_key': u'448',
 u'cancel_date': datetime.datetime(2015, 1, 14, 0, 0),
 u'days_to_cancel': 65,
 u'is_canceled': True,
 u'is_udacity': True,
 u'join_date': datetime.datetime(2014, 11, 10, 0, 0),
 u'status': u'canceled'}

In [None]:
for point in surprising_points:
    print point['days_to_cancel'] 

In [None]:
for point in surprising_points:
    print point['is_canceled']

In [None]:
for point in surprising_points:
    print point['is_udacity']

In [40]:
count = 0
special_surprising_points = []
for point in surprising_points:
    if point['days_to_cancel'] > 1 or point['days_to_cancel'] == None:
        count += 1
        special_surprising_points.append(point)
print count

print special_surprising_points

3
[{u'status': u'canceled', u'is_udacity': True, u'is_canceled': True, u'join_date': datetime.datetime(2015, 1, 10, 0, 0), u'account_key': u'1304', u'cancel_date': datetime.datetime(2015, 3, 10, 0, 0), u'days_to_cancel': 59}, {u'status': u'canceled', u'is_udacity': True, u'is_canceled': True, u'join_date': datetime.datetime(2015, 3, 10, 0, 0), u'account_key': u'1304', u'cancel_date': datetime.datetime(2015, 6, 17, 0, 0), u'days_to_cancel': 99}, {u'status': u'current', u'is_udacity': True, u'is_canceled': False, u'join_date': datetime.datetime(2015, 2, 25, 0, 0), u'account_key': u'1101', u'cancel_date': None, u'days_to_cancel': None}]


###### I conclude that most of the surprising data points are from:
    - accounts that were canceled, most of them the same day of creation
    - Weren't from udacity

#### Remove dacity test accounts records

In [86]:
len(enrollments)

1622

In [84]:
udacity_keys = set([x['account_key'] for x in enrollments if x['is_udacity']])

In [85]:
udacity_keys

set()

In [71]:
print(len(udacity_keys))
print (len(daily_engagements))
daily_engagements = [x for x in daily_engagements if x['account_pkey'] not in udacity_keys]
print (len(daily_engagements))

6
2309239
2299629


In [102]:
daily_e_small = [x for x in daily_e_small if x['account_key'] not in udacity_keys]

In [73]:
print(len(project_submissions))
project_submissions = [x for x in project_submissions if x['account_key'] not in udacity_keys]
print(len(project_submissions))

3642
3634


In [83]:
print(len(enrollments))
enrollments = [x for x in enrollments if x['account_key'] not in udacity_keys]
print(len(enrollments))

1622
1622


### The Question: How do numbers in the daily engagement table differ for students who pass the first project?

##### Students that haven't canceled or have been enrolled for more than 7 days

In [123]:
paid_students = [[x['account_key'], x['join_date']] for x in enrollments if x['days_to_cancel'] == None or x['days_to_cancel'] > 7]
paid_students_final = {}
for v in paid_students:
    if v[0] not in paid_students_final or v[1] > paid_students_final[v[0]]:
        paid_students_final[v[0]] = v[1]
paid_students = paid_students_final

In [124]:
len(paid_students)

995

In [125]:
def get_paid_students(data, key='account_key'):
    return [x for x in data if x[key] in paid_students]

In [126]:
enrollments = get_paid_students(enrollments)
daily_engagements = get_paid_students(daily_engagements, key='account_pkey')
daily_e_small = get_paid_students(daily_e_small)
project_submissions = get_paid_students(project_submissions)

print(len(enrollments))
print(len(daily_engagements))
print(len(project_submissions))

1293
2281112
3618


In [127]:
def within_one_week(engagement_date, join_date):
    time_delta = engagement_date - join_date
    return time_delta.days < 7

In [132]:
engagement_paid_first_week = [x for x in daily_e_small if within_one_week(x['utc_date'], paid_students[x['account_key']])]

In [133]:
len(engagement_paid_first_week)

21508