In [None]:
from pathlib import Path
import pandas as pd
from fontTools.ttLib.tables.otTraverse import dfs_base_table

import src.MoodleLogsAlgorithms.data_preprocessing as pp
import src.MoodleLogsAlgorithms.function_utils as fu
import src.DataExploration.data_description as de
import src.MoodleLogsAlgorithms.data_cleaning as cl
import src.MoodleLogsAlgorithms.data_integration as di
import src.MoodleLogsAlgorithms.data_selection as ds

# Dataset locations, given relative to the notebook's working directory.
# PATH_JSON and PATH_PICKLE are handed to pp.read_json_file, the per-course
# pickles are read from PATH_PICKLE, and the final datasets are written to
# PATH_PREPROC.
# NOTE(review): the "Big files" cell further down points PATH_JSON at
# "../Datasets/xAPI_logs/JSON_files/" — confirm which location is correct.
PATH_JSON = Path("../Datasets/JSON_files/")
PATH_PICKLE = Path("../Datasets/xAPI_logs/PICKLE_files/")
PATH_PREPROC = Path("../Datasets/xAPI_logs/PREPROCESSED_files/")

# Course id -> course title. Keeping both in one mapping guarantees that the
# two lists derived below stay aligned position by position.
_COURSE_TITLES = {
    125: 'Modélisation Numérique en Physique - S1-23',
    141: 'Short talks - S2-23',
    153: 'Informatique pour Géosciences 1 - S1-23',
    313: 'Minéralogie - S1-23',
    1539: 'LU2IN011 - Représentation et méthodes numériques - S1-23',
    2961: 'Diversité des Interactions Marines - S1-23',
    3135: 'Label vert 2 - S1-23',
    3559: 'Modélisation Numérique en Physique - S2-23',
    3789: 'Introduction aux enjeux environnementaux IMTT-ENV-S2-23',
    3791: 'IMTT-GES-S2-23',
}

# Ids of the courses to process, in processing order.
COURSES = list(_COURSE_TITLES)

# Titles of the same courses, in the same order as COURSES.
COURSES_NAMES = list(_COURSE_TITLES.values())

# Run the whole pipeline (preprocessing, integration, selection, cleaning) for
# every course in COURSES and store one preprocessed pickle per course.
for course in COURSES:
    # ----------------------------
    # DATA PREPROCESSING
    # ----------------------------
    # NOTE(review): presumably converts the course's JSON export into
    # PATH_PICKLE/cours_<course>.pkl, which is read back right below — confirm.
    pp.read_json_file(course=course, path_json=PATH_JSON, path_pickle=PATH_PICKLE)
    print('Course: ', course)
    df = pd.read_pickle(PATH_PICKLE / f"cours_{course}.pkl")
    # fix mistakes
    # NOTE(review): prior_statements.pkl is looked up under PATH_PICKLE, while the
    # conversion cell below saves it in the JSON_files directory — confirm the
    # file is available at this location.
    df = fu.patch(df, f"{PATH_PICKLE}/prior_statements.pkl")  # TODO: remove on GitHub
    # preprocess dataframe data
    df = pp.filter_and_rename_columns(df)
    df = pp.merge_duplicates_missing_values(df)
    df = pp.add_component_event_name(df)
    df = pp.extract_ids(df)
    df = pp.add_course_area(df)
    df = pp.redefine_component(df)
    df = pp.redefine_event_name(df)
    # fix mistakes
    df = fu.patch_modified_names(df)  # TODO: remove on GitHub

    # preprocess user roles
    role_table = pp.get_role_table(df, course)
    # check for fake students
    students_to_remove = pp.detect_potential_fake_students(df, course)
    role_table = pp.remove_fake_students(role_table, students_to_remove)
    # assign roles
    df = pp.assign_roles(role_table, df, course)

    # ----------------------------
    # DATA INTEGRATION
    # ----------------------------
    # ICAP
    df = di.integrate_icap_framework(df)
    # groups
    group_table = di.get_group_table(df, course)
    df = di.assign_groups(group_table, df, course)
    # activity status
    df = di.add_activity_status(df)

    # ----------------------------
    # DATA SELECTION
    # ----------------------------
    # filter data based on the timestamp
    df = ds.filter_semester_data(df, course)

    # ----------------------------
    # DATA CLEANING
    # ----------------------------
    # clean the dataset from worthless events
    df = cl.clean_events(df)

    # save the dataset for the analysis; PATH_PREPROC is the constant defined
    # before the loop, so it is not re-assigned on every iteration
    df.to_pickle(PATH_PREPROC / f"cours_{course}.pkl")


In [None]:

import json

# Convert the JSON file of prior statements into a pickle saved next to it.
# The location is relative to the notebook's working directory, like the other
# dataset paths of this notebook, rather than an absolute path that only
# exists on one machine.
prior_statements_dir = Path("../Datasets/xAPI_logs/JSON_files")
file_path = prior_statements_dir / 'prior_statements.json'
# read the file explicitly as UTF-8 instead of relying on the platform default
with open(file_path, mode='r', encoding='utf-8') as file:
    json_file = json.load(file)

# normalise the JSON file
df = fu.normalise_json(json_file)
# save the normalised file as pickle
# NOTE(review): the pipeline cells pass "<PATH_PICKLE>/prior_statements.pkl" to
# fu.patch, which is a different directory from this one — confirm where the
# pickle is expected.
converted_file_path = file_path.with_suffix('.pkl')
df.to_pickle(converted_file_path)

In [None]:
# Location of the pickled prior statements, relative to the notebook's working
# directory (same convention as the other dataset paths) instead of an
# absolute path tied to one user's machine.
converted_file_path = Path("../Datasets/xAPI_logs/JSON_files") / 'prior_statements.pkl'

In [None]:
# Reload the pickled prior statements; the cell's output is their row count.
prior = pd.read_pickle(converted_file_path)
len(prior.index)

# Big files

In [None]:
from pathlib import Path
import pandas as pd
import src.MoodleLogsAlgorithms.data_preprocessing as pp
import src.MoodleLogsAlgorithms.function_utils as fu
import src.MoodleLogsAlgorithms.data_cleaning as cl
import src.MoodleLogsAlgorithms.data_integration as di

# Dataset locations for the courses that are split into several chunk files,
# relative to the notebook's working directory.
PATH_JSON = Path("../Datasets/xAPI_logs/JSON_files/")
PATH_PICKLE = Path("../Datasets/xAPI_logs/PICKLE_files/chunks")
PATH_PREPROC = Path("../Datasets/xAPI_logs/PREPROCESSED_files/chunks")

# Chunk identifiers of each course: the course id followed by one letter per chunk.
COURSE_1527 = ['1527A', '1527B', '1527C', '1527D', '1527E', '1527F', '1527G']  # S2 OK
COURSE_1587 = ['1587A', '1587B', '1587C', '1587D', '1587E', '1587F', '1587G']  # S1
COURSE_2781 = ['2781A', '2781B', '2781C', '2781D']  # S2 OK
COURSE_3499 = ['3499A', '3499B', '3499C', '3499D', '3499E', '3499F', '3499G', '3499H', '3499I']  # S2

# One title per course. The comma after the first title is required: without
# it Python concatenates the first two string literals into a single element.
COURSES_NAMES = ['Organisation moléculaire du vivant - S2',  # 1527
                 'Organisation cellulaire du vivant - S1-23',  # 1587
                 'Organisation et fonctions des organismes photosynthétiques - S2-23',  # 2781
                 'Mécanique - Physique 2 - S2-23',  # 3499
                 ]

# Steps applied, in this order, to each chunk once the first patch has run;
# every step receives the dataframe and returns the updated one.
_chunk_steps = (
    # preprocess dataframe data
    pp.filter_and_rename_columns,
    pp.merge_duplicates_missing_values,
    pp.add_component_event_name,
    pp.extract_ids,
    pp.add_course_area,
    pp.redefine_component,
    pp.redefine_event_name,
    # NOTE(review): the first cell calls di.add_activity_status — confirm which
    # module actually provides this function.
    pp.add_activity_status,
    # fix mistakes
    fu.patch_modified_names,  # TODO: remove on GitHub
)

for course in COURSE_3499:
    # ----------------------------
    # DATA PREPROCESSING
    # ----------------------------
    # The chunk pickles are read as they are; the step that presumably creates
    # them from the JSON files is kept disabled:
    # pp.preprocess_json_files(course=course, path_json=PATH_JSON, path_pickle=PATH_PICKLE)
    print('Course: ', course)
    df = pd.read_pickle(f"{PATH_PICKLE}/cours_{course}.pkl")
    # fix mistakes
    df = fu.patch(df, f"{PATH_PICKLE}/prior_statements.pkl")  # TODO: remove on GitHub
    for step in _chunk_steps:
        df = step(df)

    # save the dataset for the analysis
    df.to_pickle(f"{PATH_PREPROC}/cours_{course}.pkl")

In [None]:
from pathlib import Path
import pandas as pd

# Directory the preprocessed chunk pickles are read from.
PATH_PREPROC = Path("../Datasets/xAPI_logs/PREPROCESSED_files/chunks")

# Chunk identifiers of each course: the course id followed by one letter per chunk.
COURSE_1527 = [f"1527{suffix}" for suffix in "ABCDEFG"]  # S2
COURSE_1587 = [f"1587{suffix}" for suffix in "ABCDEFG"]  # S1
COURSE_2781 = [f"2781{suffix}" for suffix in "ABCD"]  # S2
COURSE_3499 = [f"3499{suffix}" for suffix in "ABCDEFGHI"]  # S2

# Empty frame that the chunks of the selected course are accumulated into.
global_course = pd.DataFrame()

# Numeric id of the course whose chunks are merged in this cell.
course_id = 3499

# Merge the preprocessed chunks of one course, drop duplicated rows, then run
# the remaining stages (roles, integration, selection, cleaning) on the merged
# data and save a single pickle for the course.
# NOTE(review): this cell imports only Path and pandas; pp, di and cl must
# already have been imported by a previously executed cell.

# Read every chunk first and concatenate once: calling pd.concat inside the
# loop would re-copy all previously accumulated rows on each iteration.
chunk_frames = []
for course in COURSE_3499:
    print('Course: ', course)
    chunk_frames.append(pd.read_pickle(f"{PATH_PREPROC}/cours_{course}.pkl"))
global_course = pd.concat(chunk_frames, axis=0)

# Rows are considered duplicates when they agree on all of these columns.
dedup_columns = ['User', 'Timestamp', 'Action_verb', 'Object_id', 'Object_type',
                 'Context', 'Description', 'Path', 'Component',
                 'Event_name', 'CourseID', 'ObjectID', 'ItemID', 'QuestionID',
                 'Course_Area', 'Status']
no_duplicate = global_course.drop_duplicates(subset=dedup_columns)

# preprocess user roles
df = no_duplicate.copy()
role_table = pp.get_role_table(df, course_id)
# check for fake students
students_to_remove = pp.detect_potential_fake_students(df, course_id)
role_table = pp.remove_fake_students(role_table, students_to_remove)

# assign roles
df = pp.assign_roles(role_table, df, course_id)

# ----------------------------
# DATA INTEGRATION
# ----------------------------
# ICAP
df = di.integrate_icap_framework(df)
# groups
group_table = di.get_group_table(df, course_id)
df = di.assign_groups(group_table, df, course_id)

# ----------------------------
# DATA SELECTION
# ----------------------------
# filter data based on the timestamp
# NOTE(review): the first cell calls ds.filter_semester_data — confirm which
# module actually provides this function.
df = pp.filter_semester_data(df, course_id)

# ----------------------------
# DATA CLEANING
# ----------------------------
# clean the dataset from worthless events
df = cl.clean_events(df)

# save the dataset for the analysis
# From here on PATH_PREPROC points at the parent directory of the chunks
# folder, so the merged course is stored next to the non-chunked courses.
PATH_PREPROC = Path("../Datasets/xAPI_logs/PREPROCESSED_files")
df.to_pickle(f"{PATH_PREPROC}/cours_{course_id}.pkl")