In [1]:
import os
import pandas as pd
import numpy as np


main_publisher = 'OYC'

script_dir = os.path.dirname(os.path.realpath('__file__'))
path = os.path.join(script_dir, '../Data/' + main_publisher + '/')

df_chapters = pd.read_csv(path + 'chapters.csv', delimiter = '|')
df_chapters_embeddings = pd.read_csv(path + 'embeddings_chapters.csv', delimiter = '|', index_col=0)
df_concepts = pd.read_csv(path + 'concepts_bis.csv', delimiter = '|')
df_concepts_embeddings = pd.read_csv(path + 'embeddings_concepts_bis.csv', delimiter = '|', index_col=0)
df_classes = pd.read_csv(path + 'classes_bis.csv', delimiter = '|')
df_classes_embeddings = pd.read_csv(path + 'embeddings_classes_bis.csv', delimiter = '|', index_col=0)
df_precedences_episodes = pd.read_csv(path + 'precedences_episodes.csv', delimiter = '|')
df_precedences_series = pd.read_csv(path + 'precedences_series.csv', delimiter = '|')

df_concepts['Concept'] = df_concepts['Concept'].apply(lambda x : x.split('/')[-1])

df_classes = df_classes.dropna()
print(f'{df_chapters["Cid"].isna().sum().sum():04d} NaN values in chapters.')
print(f'{df_concepts.isna().sum().sum():04d} Nan values in concepts.')
print(f'{df_classes.isna().sum().sum():04d} Nan values in classes.')
print(f'{df_precedences_episodes.isna().sum().sum():04d} Nan values in episdes precedences.')
print(f'{df_precedences_series.isna().sum().sum():04d} Nan values in series precedences.')

0000 NaN values in chapters.
0000 Nan values in concepts.
0000 Nan values in classes.
0000 Nan values in episdes precedences.
0000 Nan values in series precedences.


In [2]:
from utils import *

unique_oer_id = id_mapper(df_chapters['Cid'], 'OER')
unique_concept_id =  id_mapper(df_concepts['Concept'], 'Concept')
unique_class_id =  id_mapper(df_classes['Class'], 'Class')

In [3]:
oer_covers_concept_subject = edge_construction(df1 = df_concepts, df2 = unique_oer_id, col = 'mappedID', 
                                       how = 'left', right_on = 'OER')
oer_covers_concept_pr = edge_construction(df1 = df_concepts, df2 = unique_oer_id, col = 'PR', 
                                          how = 'right', right_on = 'OER')
oer_covers_concept_object = edge_construction(df1 = df_concepts, df2 = unique_concept_id, col = 'mappedID', 
                                       how = 'left', right_on = 'Concept')

oer_before_oer_ep_subject = edge_construction(df1 = df_precedences_episodes, df2 = unique_oer_id, col = 'mappedID', 
                                   how = 'left', left_on = 'Before', right_on = 'OER')
oer_before_oer_ep_object = edge_construction(df1 = df_precedences_episodes, df2 = unique_oer_id, col = 'mappedID', 
                                   how = 'left', left_on = 'After', right_on = 'OER')
oer_before_oer_sr_subject = edge_construction(df1 = df_precedences_series, df2 = unique_oer_id, col = 'mappedID', 
                                   how = 'left', left_on = 'Before', right_on = 'OER')
oer_before_oer_sr_object = edge_construction(df1 = df_precedences_series, df2 = unique_oer_id, col = 'mappedID', 
                                   how = 'left', left_on = 'After', right_on = 'OER')

concept_belongs_class_subject = edge_construction(df1 = df_classes, df2 = unique_concept_id, col = 'mappedID', 
                                   how = 'left', left_on = 'Concept', right_on = 'Concept')
concept_belongs_class_object = edge_construction(df1 = df_classes, df2 = unique_class_id, col = 'mappedID', 
                                   how = 'left', left_on = 'Class', right_on = 'Class')

oer_covers_concept = torch.stack([oer_covers_concept_subject, oer_covers_concept_object], dim = 0).long()
oer_covers_concept_rev = torch.stack([oer_covers_concept_object, oer_covers_concept_subject], dim = 0).long()
oer_before_oer_ep = torch.stack([oer_before_oer_ep_subject, oer_before_oer_ep_object], dim = 0).long()
oer_before_oer_sr = torch.stack([oer_before_oer_sr_subject, oer_before_oer_sr_object], dim = 0).long()
concept_belongs_class = torch.stack([concept_belongs_class_subject, concept_belongs_class_object], dim = 0).long()
concept_belongs_class_rev = torch.stack([concept_belongs_class_object, concept_belongs_class_subject], dim = 0).long()
print(oer_covers_concept.shape)
print(oer_covers_concept_rev.shape)
print(oer_before_oer_ep.shape)
print(oer_before_oer_sr.shape)
print(concept_belongs_class.shape)
print(concept_belongs_class_rev.shape)

torch.Size([2, 16786])
torch.Size([2, 16786])
torch.Size([2, 2097])
torch.Size([2, 423])
torch.Size([2, 58295])
torch.Size([2, 58295])


In [4]:
chapters_embeddings_tmp = {}
concepts_embeddings_tmp = {} 
classes_embeddings_tmp = {}

chapters_r = range(len(df_chapters['Cid'].unique()))
concepts_c = range(len(df_concepts['Concept'].unique()))
classes_c = range(len(df_classes['Class'].unique()))

chapters_embeddings = np.zeros(shape=(len(chapters_r), 768))
concepts_embeddings = np.zeros(shape=(len(concepts_c), 768))
classes_embeddings = np.zeros(shape=(len(classes_c), 768))


i = 0
for r in chapters_r:
    chapters_embeddings_tmp[r] = list(filter(None, df_chapters_embeddings['BERT'][r].strip("[]\n").replace("'","").split(" ")))
    chapters_embeddings_tmp[r] = [float(f) for f in chapters_embeddings_tmp[r]]
    for a in range(len(chapters_embeddings_tmp[r])):
            chapters_embeddings[i][a] = chapters_embeddings_tmp[r][a]
    i += 1

i = 0
for r in concepts_c:
    concepts_embeddings_tmp[r] = list(filter(None, df_concepts_embeddings['BERT'][r].strip("[]\n").replace("'","").split(" ")))
    concepts_embeddings_tmp[r] = [float(f) for f in concepts_embeddings_tmp[r]]
    for a in range(len(concepts_embeddings_tmp[r])):
            concepts_embeddings[i][a] = concepts_embeddings_tmp[r][a]
    i += 1   

i = 0
for r in classes_c:
    classes_embeddings_tmp[r] = list(filter(None, df_classes_embeddings['BERT'][r].strip("[]\n").replace("'","").split(" ")))
    classes_embeddings_tmp[r] = [float(f) for f in classes_embeddings_tmp[r]]
    for a in range(len(classes_embeddings_tmp[r])):
            classes_embeddings[i][a] = classes_embeddings_tmp[r][a]
    i += 1

chapters_embeddings = torch.from_numpy(chapters_embeddings).to(torch.float32)
concepts_embeddings = torch.from_numpy(concepts_embeddings).to(torch.float32)
classes_embeddings = torch.from_numpy(classes_embeddings).to(torch.float32)

In [5]:
from torch_geometric.data import HeteroData
import torch_geometric.transforms as T

data = HeteroData()
data['OER'].node_id = torch.tensor(unique_oer_id['mappedID'].values)
data['OER'].x = chapters_embeddings
data['Concept'].node_id = torch.tensor(unique_concept_id['mappedID'].values)
data['Concept'].x = concepts_embeddings
data['Class'].node_id = torch.tensor(unique_class_id['mappedID'].values)
data['Class'].x = classes_embeddings
data['OER', 'covers', 'Concept'].edge_index = oer_covers_concept
data['Concept', 'rev_covers', 'OER'].edge_index = oer_covers_concept_rev

data['OER', 'covers', 'Concept'].edge_attr = oer_covers_concept_pr
print(oer_before_oer_ep.shape)
data['OER', 'before_sr', 'OER'].edge_index = oer_before_oer_sr
data['OER', 'before_ep', 'OER'].edge_index = oer_before_oer_ep
data['Concept', 'belongs', 'Class'].edge_index = concept_belongs_class
data['Class', 'rev_belongs', 'Concept'].edge_index = concept_belongs_class_rev

#data = T.ToUndirected()(data)
data.validate()
print(data)

  from .autonotebook import tqdm as notebook_tqdm


torch.Size([2, 2097])
HeteroData(
  [1mOER[0m={
    node_id=[2550],
    x=[2550, 768]
  },
  [1mConcept[0m={
    node_id=[6007],
    x=[6007, 768]
  },
  [1mClass[0m={
    node_id=[292],
    x=[292, 768]
  },
  [1m(OER, covers, Concept)[0m={
    edge_index=[2, 16786],
    edge_attr=[16830]
  },
  [1m(Concept, rev_covers, OER)[0m={ edge_index=[2, 16786] },
  [1m(OER, before_sr, OER)[0m={ edge_index=[2, 423] },
  [1m(OER, before_ep, OER)[0m={ edge_index=[2, 2097] },
  [1m(Concept, belongs, Class)[0m={ edge_index=[2, 58295] },
  [1m(Class, rev_belongs, Concept)[0m={ edge_index=[2, 58295] }
)


In [6]:
import random

def seed_everything(seed=0):                                                  
       random.seed(seed)                                                            
       torch.manual_seed(seed)                                                      
       torch.cuda.manual_seed_all(seed)                                             
       np.random.seed(seed)                                                         
       os.environ['PYTHONHASHSEED'] = str(seed)                                     
       torch.backends.cudnn.deterministic = True                                    
       torch.backends.cudnn.benchmark = False

seed_everything()

In [7]:
agnostic = False
if agnostic:
    num_val = 0.5
    num_test = 0.5
else:
    num_val = 0.1
    num_test = 0.1
transform = T.RandomLinkSplit(
    num_val = num_val,
    num_test = num_test,
    disjoint_train_ratio = 0.0,
    neg_sampling_ratio = 1.0,
    add_negative_train_samples = True,
    edge_types=('OER', 'before_sr', 'OER')
)

train_data, val_data, test_data = transform(data)
print(f'{len(train_data["OER", "before_sr", "OER"].edge_label.detach().numpy())}\t Edges for training')
print(f'{len(val_data["OER", "before_sr", "OER"].edge_label.detach().numpy())}\t Edges for validation')
print(f'{len(test_data["OER", "before_sr", "OER"].edge_label.detach().numpy())}\t Edges for testing')

678	 Edges for training
84	 Edges for validation
84	 Edges for testing


In [8]:
X_train_index = train_data["OER", "before_sr", "OER"].edge_label_index.numpy()
Y_train = train_data["OER", "before_sr", "OER"].edge_label.numpy()
X_val_index = val_data["OER", "before_sr", "OER"].edge_label_index.numpy()
Y_val = val_data["OER", "before_sr", "OER"].edge_label.numpy()
X_test_index = test_data["OER", "before_sr", "OER"].edge_label_index.numpy()
Y_test = test_data["OER", "before_sr", "OER"].edge_label.numpy()
print(X_train_index.shape, Y_train.shape)
print(X_val_index.shape, Y_val.shape)
print(X_test_index.shape, Y_test.shape)


(2, 678) (678,)
(2, 84) (84,)
(2, 84) (84,)


In [9]:
X_train = np.concatenate([chapters_embeddings[X_train_index[0]],chapters_embeddings[X_train_index[1]]], axis=1)
X_val = np.concatenate([chapters_embeddings[X_val_index[0]],chapters_embeddings[X_val_index[1]]], axis=1)
X_test = np.concatenate([chapters_embeddings[X_test_index[0]],chapters_embeddings[X_test_index[1]]], axis=1)
print(X_train.shape, Y_train.shape)
print(X_val.shape, Y_val.shape)
print(X_test.shape, Y_test.shape)

(678, 1536) (678,)
(84, 1536) (84,)
(84, 1536) (84,)


In [10]:
#X_val = np.concatenate([X_val, X_test], axis = 0)
#Y_val = np.concatenate([Y_val, Y_test], axis = 0)
print(X_val.shape, Y_val.shape)


(84, 1536) (84,)


In [11]:
df_results = pd.DataFrame()

In [12]:
from classification import classify_cv

classify_cv(X_train, Y_train)

Decision Tree :
0.66 mean accuracy
 0.74 max accuracy
 0.62 min accuracy
 0.04 standard deviation
{'criterion': 'gini', 'max_depth': 2, 'random_state': 0, 'splitter': 'random'}
Random Forest :
0.73 mean accuracy
 0.76 max accuracy
 0.68 min accuracy
 0.03 standard deviation
{'criterion': 'entropy', 'max_depth': 5, 'n_estimators': 200, 'random_state': 0}
SVM :
0.81 mean accuracy
 0.85 max accuracy
 0.77 min accuracy
 0.03 standard deviation
{'C': 10, 'gamma': 0.01, 'kernel': 'rbf'}
Gaussian Naive Bayes :
0.70 mean accuracy
 0.74 max accuracy
 0.66 min accuracy
 0.03 standard deviation
KNN :
0.74 mean accuracy
 0.79 max accuracy
 0.70 min accuracy
 0.03 standard deviation
{'n_neighbors': 10, 'weights': 'uniform'}


{'Decision tree': 0.66,
 'Random Forest': 0.73,
 'SVM': 0.81,
 'GNB': 0.7,
 'KNN': 0.74}

In [55]:
Y_test_pred = reg.predict(X_test)
Y_test_pred_labels = (Y_test_pred > 0.5) * 1

auc_score = roc_auc_score(Y_test, Y_test_pred_labels)
precision = precision_score(Y_test, Y_test_pred_labels, zero_division = np.nan)
accuracy = accuracy_score(Y_test, Y_test_pred_labels)
f1 = f1_score(Y_test, Y_test_pred_labels, average='macro')
recall = recall_score(Y_test, Y_test_pred_labels, average='macro')

results['Test AUC'] = auc_score
results['Test Precision'] = precision
results['Test Accuracy'] = accuracy
results['Test Recall'] = recall
results['Test F1'] = f1

print(accuracy)
print(auc_score)
print(f1)
print(precision)
print(recall)

0.5892857142857143
0.5892857142857142
0.586962377660052
0.577639751552795
0.5892857142857142


In [41]:
df_results = pd.concat([df_results, pd.DataFrame([results])], ignore_index = True)
df_results.head(20)

Unnamed: 0,Model,Validation AUC,Validation Precision,Validation Accuracy,Validation Recall,Validation F1,Test AUC,Test Precision,Test Accuracy,Test Recall,Test F1
0,KNN,0.72619,0.686275,0.72619,0.72619,0.723011,0.654762,0.614035,0.654762,0.654762,0.64339
1,GNB,0.72619,0.711111,0.72619,0.72619,0.725841,0.72619,0.72093,0.72619,0.72619,0.726152
2,SVM,0.797619,0.755102,0.797619,0.797619,0.796204,0.785714,0.76087,0.785714,0.785714,0.785227
3,Linear,0.571429,0.578947,0.571429,0.571429,0.570455,0.654762,0.632653,0.654762,0.654762,0.652348


In [42]:
df_results.to_csv('baselines.csv', sep='|')