In [46]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

Download the data from https://www.kaggle.com/datasets/aljarah/xAPI-Edu-Data?resource=download and place `xAPI-Edu-Data.csv` in the directory that `DATA_DIR` (set below) points to.

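In line with https://arxiv.org/pdf/2012.11788.pdf, we limit the data to students born in Kuwait and Jordan (the two largest places of birth) to see how demographic shift changes the model we learn. Note: the filtering code below actually selects students on the nationality column (`NationalITy`, renamed to `Nationality`), not on `PlaceofBirth`.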

In [98]:
# Directory that holds the raw CSV (relative path).
DATA_DIR = "../../"
# Base directory under which the processed train/test files are saved (relative path).
OUTPUT_DIR = "../../../data/"

In [99]:
# Read the raw student records into a DataFrame.
raw_csv = DATA_DIR + "xAPI-Edu-Data.csv"
data = pd.read_csv(raw_csv)

In [100]:
# Integer codes for every categorical column. Each dictionary is named after
# the column it encodes (the encoding cell looks the dictionaries up by name).

# Countries other than Kuwait and Jordan all share the code 2.
_OTHER_COUNTRIES = (
    'lebanon', 'Egypt', 'SaudiArabia', 'USA', 'venzuela', 'Iran',
    'Tunis', 'Morocco', 'Syria', 'Iraq', 'Palestine', 'Lybia',
)

gender = dict(M=0, F=1)

# 0 = Kuwait, 1 = Jordan, 2 = any other country.
PlaceofBirth = dict.fromkeys(_OTHER_COUNTRIES, 2)
PlaceofBirth.update({'KuwaIT': 0, 'Jordan': 1})
NationalITy = dict.fromkeys(_OTHER_COUNTRIES, 2)
NationalITy.update({'KW': 0, 'Jordan': 1})

StageID = {stage: code for code, stage in enumerate(['lowerlevel', 'MiddleSchool', 'HighSchool'])}

# Grades map to their own number, e.g. 'G-04' -> 4.
GradeID = {'G-%02d' % grade: grade for grade in (2, 4, 5, 6, 7, 8, 9, 10, 11, 12)}

SectionID = {section: code for code, section in enumerate('ABC')}

Topic = {
    topic: code
    for code, topic in enumerate([
        'IT', 'Math', 'Arabic', 'Science', 'English', 'Quran',
        'Spanish', 'French', 'History', 'Biology', 'Chemistry', 'Geology',
    ])
}

Semester = dict(F=0, S=1)
Relation = dict(Father=0, Mum=1)
ParentAnsweringSurvey = dict(Yes=0, No=1)
ParentschoolSatisfaction = dict(Good=0, Bad=1)
StudentAbsenceDays = {'Under-7': 0, 'Above-7': 1}

# Target classes: L -> 0, M -> 1, H -> 2.
label = dict(L=0, M=1, H=2)

In [101]:
# Rename the target column from 'Class' to 'label'.
data = data.rename(columns={'Class': 'label'})

In [102]:
# Replace the string categories of each listed column with integer codes.
# The code dictionaries are referenced explicitly instead of being resolved
# through eval(), so no string is ever executed as code and the
# column -> dictionary pairing is visible in one place.
category_codes = {
    'gender': gender,
    'NationalITy': NationalITy,
    'PlaceofBirth': PlaceofBirth,
    'StageID': StageID,
    'GradeID': GradeID,
    'SectionID': SectionID,
    'Topic': Topic,
    'Semester': Semester,
    'Relation': Relation,
    'ParentAnsweringSurvey': ParentAnsweringSurvey,
    'ParentschoolSatisfaction': ParentschoolSatisfaction,
    'StudentAbsenceDays': StudentAbsenceDays,
    'label': label,
}

# Series.map() silently turns every value without a code into NaN. Validate
# all columns first so the frame is never left half-encoded; this also stops
# an accidental re-run of the cell on already-encoded data from wiping the
# columns to NaN.
for feature, codes in category_codes.items():
    unmapped = set(data[feature].dropna().unique()) - set(codes)
    if unmapped:
        raise ValueError(
            "Column %r contains values with no integer code: %s"
            % (feature, sorted(unmapped, key=str))
        )

for feature, codes in category_codes.items():
    data[feature] = data[feature].map(codes)

In [103]:
# Give the two oddly capitalised columns conventional names.
column_fixes = {'NationalITy': 'Nationality', 'VisITedResources': 'VisitedResources'}
data = data.rename(columns=column_fixes)

In [104]:
# Remove every row whose Nationality code is 2 (the "other country" code).
data = data.loc[data['Nationality'] != 2]

In [105]:
# One-hot encode the nominal columns PlaceofBirth, StageID, SectionID and Topic.
# 'label' is NOT one-hot encoded: it stays a single integer column because the
# later cells stratify on it and compare it against 0/1/2.
# dtype=int pins the indicator columns to 0/1; without it pandas >= 2.0 emits
# booleans, which would be written to the CSV files as True/False.
data = pd.get_dummies(
    data,
    columns=['PlaceofBirth', 'StageID', 'SectionID', 'Topic'],
    dtype=int,
)

In [106]:
# Preview the first five rows of the processed frame.
data.head(5)

Unnamed: 0,gender,Nationality,GradeID,Semester,Relation,raisedhands,VisitedResources,AnnouncementsView,Discussion,ParentAnsweringSurvey,...,Topic_2,Topic_3,Topic_4,Topic_5,Topic_6,Topic_7,Topic_8,Topic_9,Topic_10,Topic_11
0,0,0,4,0,0,15,16,2,20,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,4,0,0,20,20,3,25,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,4,0,0,10,7,0,30,1,...,0,0,0,0,0,0,0,0,0,0
3,0,0,4,0,0,30,25,5,35,1,...,0,0,0,0,0,0,0,0,0,0
4,0,0,4,0,0,40,50,12,50,1,...,0,0,0,0,0,0,0,0,0,0


In [107]:
# Separate the rows by Nationality code: 0 -> Kuwait subset, 1 -> Jordan subset.
is_kuwait = data['Nationality'].eq(0)
is_jordan = data['Nationality'].eq(1)
data_kuwait = data[is_kuwait]
data_jordan = data[is_jordan]

In [108]:
def _split_stratified(frame):
    """Return an 80/20 (train, test) split of `frame`, stratified on its 'label' column."""
    return train_test_split(
        frame,
        test_size=0.2,
        random_state=1129,  # fixed seed so the split is reproducible
        stratify=frame['label'],
    )


data_train_kuwait, data_test_kuwait = _split_stratified(data_kuwait)
data_train_jordan, data_test_jordan = _split_stratified(data_jordan)

In [110]:
# Sanity check: print the share of labels 0, 1 and 2 in every split
# (one line per split, in the order Kuwait train/test, Jordan train/test).
def _label_fractions(frame):
    """Return the fraction of rows in `frame` carrying label 0, 1 and 2, in that order."""
    total = len(frame)
    return [len(frame[frame.label == code]) / total for code in (0, 1, 2)]


for split in (data_train_kuwait, data_test_kuwait, data_train_jordan, data_test_jordan):
    print(*_label_fractions(split))

0.3776223776223776 0.4195804195804196 0.20279720279720279
0.3888888888888889 0.4166666666666667 0.19444444444444445
0.21897810218978103 0.4744525547445255 0.30656934306569344
0.2 0.4857142857142857 0.3142857142857143


In [111]:
# Write each split to its own CSV file, without the index column.
# NOTE(review): every file name below already starts with "data/", so the files
# land in a data/ folder beneath OUTPUT_DIR - confirm this nesting is intended.
splits_to_save = {
    "data/data_kuwait_train.csv": data_train_kuwait,
    "data/data_kuwait_test.csv": data_test_kuwait,
    "data/data_jordan_train.csv": data_train_jordan,
    "data/data_jordan_test.csv": data_test_jordan,
}
for relative_path, split in splits_to_save.items():
    split.to_csv(OUTPUT_DIR + relative_path, index=False)