In [2]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from datetime import datetime

from src.logger import logging

In [3]:
from winotify import Notification, audio

# Desktop notification object; it is only displayed later, when toast.show()
# is called after the `paid` column has been computed.
toast = Notification(
    app_id='365 Learning Data Challenge',
    title='merged_info_purchase',
    msg='Code execution done.',
)
toast.set_audio(audio.Default, loop=False)

In [4]:
# Course-level tables.
course_info_df = pd.read_csv('data/raw/365_course_info.csv')
course_ratings_df = pd.read_csv('data/raw/365_course_ratings.csv')

# Exam and quiz description tables.
exam_info_df = pd.read_csv('data/raw/365_exam_info.csv')
quiz_info_df = pd.read_csv('data/raw/365_quiz_info.csv')

# Student registration and purchase tables.
student_info_df = pd.read_csv('data/raw/365_student_info.csv')
student_purchases_df = pd.read_csv('data/raw/365_student_purchases.csv')

# Per-student activity tables.
student_engagement_df = pd.read_csv('data/raw/365_student_engagement.csv')
student_learning_df = pd.read_csv('data/raw/365_student_learning.csv')
student_exams_df = pd.read_csv('data/raw/365_student_exams.csv')
student_quizzes_df = pd.read_csv('data/raw/365_student_quizzes.csv')
student_hub_questions_df = pd.read_csv('data/raw/365_student_hub_questions.csv')

In [8]:
# Preview the hub-questions table (pandas shows only the first and last rows).
student_hub_questions_df

Unnamed: 0,hub_question_id,student_id,date_question_asked
0,7619,258803,2022-01-03
1,7630,258803,2022-01-04
2,7662,259342,2022-01-05
3,7667,259391,2022-01-05
4,7668,259174,2022-01-05
...,...,...,...
822,10386,287473,2022-10-19
823,10387,287473,2022-10-19
824,10388,291121,2022-10-19
825,10404,274798,2022-10-20


In [76]:
# Convert every date column that was read as text into pandas datetimes.
for frame, date_column in (
    (course_ratings_df, 'date_rated'),
    (student_engagement_df, 'date_engaged'),
    (student_exams_df, 'date_exam_completed'),
    (student_hub_questions_df, 'date_question_asked'),
    (student_info_df, 'date_registered'),
    (student_learning_df, 'date_watched'),
    (student_purchases_df, 'date_purchased'),
):
    frame[date_column] = pd.to_datetime(frame[date_column])

In [77]:
# Total minutes watched per student.
student_learning_minutes_watched = (
    student_learning_df
    .groupby('student_id', as_index=False)['minutes_watched']
    .sum()
)

# Total quiz / exam / lesson engagement per student. Aggregating each table
# on its own before merging avoids grouping on the float `minutes_watched`
# key, and the built-in column sum replaces the deprecated practice of
# passing `np.sum` to `agg`.
engagement_columns = ['engagement_quizzes', 'engagement_exams', 'engagement_lessons']
student_engagement_totals = (
    student_engagement_df
    .groupby('student_id', as_index=False)[engagement_columns]
    .sum()
)

# Inner join: only students that appear in both tables are kept.
merged_student_learning_engagement = pd.merge(
    student_learning_minutes_watched,
    student_engagement_totals,
    on='student_id',
)

merged_student_learning_engagement

Unnamed: 0,student_id,minutes_watched,engagement_quizzes,engagement_exams,engagement_lessons
0,258798,0.3,0,0,1
1,258800,531.2,10,5,29
2,258803,620.0,7,4,19
3,258804,14.2,1,0,1
4,258807,1.8,0,0,1
...,...,...,...,...,...
18159,295509,0.8,0,0,1
18160,295510,0.1,0,0,1
18161,295511,7.8,1,0,1
18162,295512,10.4,1,0,1


In [78]:
# Left join so that students without any purchase are kept; their purchase
# columns are left empty.
merged_info_purchase = pd.merge(student_info_df, student_purchases_df, on='student_id', how='left')

# Keep exactly one row per student: the most recent purchase. Sorting by
# purchase date first makes "last" mean "latest" regardless of the row order
# in the CSV. The stable sort keeps file order for same-day purchases, and
# sort_index() restores the original student ordering afterwards.
merged_info_purchase = (
    merged_info_purchase
    .sort_values('date_purchased', kind='stable')
    .drop_duplicates(subset='student_id', keep='last')
    .sort_index()
)

In [79]:
def is_paid_tier(student_id: int, purchases=None, latest_date=None) -> tuple:
    '''
    Classify a student's subscription status from their latest purchase.

    student_id:  id to look up in the `student_id` column of `purchases`.
    purchases:   DataFrame with columns `student_id`, `purchase_type` and
                 `date_purchased`; only the first row matching the student is
                 used. Defaults to the notebook-level `merged_info_purchase`.
    latest_date: reference date the purchase age is measured against.
                 Defaults to 2022-10-20.

    returns: (active_purchase, potential_stay)
        active_purchase - the purchase age in days is within the plan length
                          (Monthly 30, Quarterly 90, Annual 365).
        potential_stay  - the purchase age is within the plan length plus a
                          further 90 days.
        (False, False) is returned when the student has no purchase on record.

    raises: ValueError if the purchase type is not one of the known plans.
    '''
    plan_length_days = {'Monthly': 30, 'Quarterly': 90, 'Annual': 365}
    grace_period_days = 90

    if purchases is None:
        purchases = merged_info_purchase
    if latest_date is None:
        latest_date = datetime(2022, 10, 20)
    # Accept date, datetime, Timestamp or ISO string alike.
    latest_date = pd.Timestamp(latest_date).date()

    student_rows = purchases[purchases.student_id == student_id]

    # A missing purchase date means the student never bought anything. Looking
    # at the student's own row avoids scanning every cell of the purchases
    # table, where a purchase_id could collide with a student id.
    if student_rows.empty or pd.isna(student_rows.date_purchased.iloc[0]):
        return False, False

    latest_purchase_type = student_rows.purchase_type.iloc[0]
    if latest_purchase_type not in plan_length_days:
        # Fail loudly here instead of returning None and breaking later cells.
        raise ValueError(
            f'Unknown purchase type {latest_purchase_type!r} for student {student_id}'
        )

    latest_date_purchased = pd.Timestamp(student_rows.date_purchased.iloc[0]).date()
    difference = (latest_date - latest_date_purchased).days

    plan_days = plan_length_days[latest_purchase_type]
    active_purchase = difference <= plan_days
    potential_stay = difference <= plan_days + grace_period_days
    return active_purchase, potential_stay

In [80]:
# Classify every student; each entry of `paid` is the value returned by
# is_paid_tier for that student's id.
paid_flags = merged_info_purchase.student_id.apply(is_paid_tier)
merged_info_purchase['paid'] = paid_flags

# Show the desktop notification once the column has been computed.
toast.show()

In [81]:
# Split the `paid` pair into one boolean column per flag.
merged_info_purchase['active_purchase'] = merged_info_purchase['paid'].map(lambda flags: flags[0])
merged_info_purchase['potential_stay'] = merged_info_purchase['paid'].map(lambda flags: flags[1])

# Persist the result (without the index) for the following cells.
merged_info_purchase.to_csv('data/processed/merged_info_purchase_students.csv', index=False)

In [89]:
# Reload the processed student/purchase table from disk and preview it.
processed_students_path = 'data/processed/merged_info_purchase_students.csv'
merged_info_purchase_students = pd.read_csv(processed_students_path)
merged_info_purchase_students.head()

Unnamed: 0,student_id,student_country,date_registered,purchase_id,purchase_type,date_purchased,paid,active_purchase,potential_stay
0,258798,IN,2022-01-01,,,,"(False, False)",False,False
1,258799,CO,2022-01-01,,,,"(False, False)",False,False
2,258800,CA,2022-01-01,15781.0,Annual,2022-01-01,"(True, True)",True,True
3,258801,IN,2022-01-01,,,,"(False, False)",False,False
4,258802,US,2022-01-01,,,,"(False, False)",False,False


In [90]:
# Attach the per-student learning/engagement totals (left join keeps students
# without any activity), then drop the registration and purchase detail columns.
final_dataset = (
    merged_info_purchase_students
    .merge(merged_student_learning_engagement, on='student_id', how='left')
    .drop(columns=['date_registered', 'purchase_id', 'date_purchased', 'paid', 'purchase_type'])
)

final_dataset

Unnamed: 0,student_id,student_country,active_purchase,potential_stay,minutes_watched,engagement_quizzes,engagement_exams,engagement_lessons
0,258798,IN,False,False,0.3,0.0,0.0,1.0
1,258799,CO,False,False,,,,
2,258800,CA,True,True,531.2,10.0,5.0,29.0
3,258801,IN,False,False,,,,
4,258802,US,False,False,,,,
...,...,...,...,...,...,...,...,...
35225,295511,CM,False,False,7.8,1.0,0.0,1.0
35226,295512,CA,False,False,10.4,1.0,0.0,1.0
35227,295513,SA,False,False,0.1,0.0,0.0,1.0
35228,295514,NL,False,False,,,,


In [91]:
# Students with no learning/engagement records come out of the left merge with
# NaN in these numeric columns; treat that as zero activity. Only these columns
# are filled: a blanket fillna(0) would also write the integer 0 into the
# string column `student_country` wherever a country is missing.
activity_columns = ['minutes_watched', 'engagement_quizzes', 'engagement_exams', 'engagement_lessons']
final_dataset = final_dataset.fillna({column: 0 for column in activity_columns})
final_dataset

Unnamed: 0,student_id,student_country,active_purchase,potential_stay,minutes_watched,engagement_quizzes,engagement_exams,engagement_lessons
0,258798,IN,False,False,0.3,0.0,0.0,1.0
1,258799,CO,False,False,0.0,0.0,0.0,0.0
2,258800,CA,True,True,531.2,10.0,5.0,29.0
3,258801,IN,False,False,0.0,0.0,0.0,0.0
4,258802,US,False,False,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...
35225,295511,CM,False,False,7.8,1.0,0.0,1.0
35226,295512,CA,False,False,10.4,1.0,0.0,1.0
35227,295513,SA,False,False,0.1,0.0,0.0,1.0
35228,295514,NL,False,False,0.0,0.0,0.0,0.0
