In [1]:
import os
import pandas as pd
import numpy as np


main_publisher = 'OYC'

script_dir = os.path.dirname(os.path.realpath('__file__'))
path = os.path.join(script_dir, '../Data/' + main_publisher + '/')

df_chapters = pd.read_csv(path + 'chapters.csv', delimiter = '|')
df_chapters_embeddings = pd.read_csv(path + 'embeddings_fasttext.csv', delimiter = '|', index_col=0)
df_concepts = pd.read_csv(path + 'concepts_bis.csv', delimiter = '|')
df_concepts_embeddings = pd.read_csv(path + 'embeddings_concepts_bis.csv', delimiter = '|', index_col=0)
df_classes = pd.read_csv(path + 'classes_bis.csv', delimiter = '|')
df_classes_embeddings = pd.read_csv(path + 'embeddings_classes_bis.csv', delimiter = '|', index_col=0)
df_precedences_episodes = pd.read_csv(path + 'precedences_episodes.csv', delimiter = '|')
df_precedences_series = pd.read_csv(path + 'precedences_series.csv', delimiter = '|')

df_concepts['Concept'] = df_concepts['Concept'].apply(lambda x : x.split('/')[-1])

df_classes = df_classes.dropna()
print(f'{df_chapters["Cid"].isna().sum().sum():04d} NaN values in chapters.')
print(f'{df_concepts.isna().sum().sum():04d} Nan values in concepts.')
print(f'{df_classes.isna().sum().sum():04d} Nan values in classes.')
print(f'{df_precedences_episodes.isna().sum().sum():04d} Nan values in episdes precedences.')
print(f'{df_precedences_series.isna().sum().sum():04d} Nan values in series precedences.')

0000 NaN values in chapters.
0000 Nan values in concepts.
0000 Nan values in classes.
0000 Nan values in episdes precedences.
0000 Nan values in series precedences.


In [2]:
from utils import *

unique_oer_id = id_mapper(df_chapters['Cid'], 'OER')
unique_concept_id =  id_mapper(df_concepts['Concept'], 'Concept')
unique_class_id =  id_mapper(df_classes['Class'], 'Class')

In [3]:
oer_covers_concept_subject = edge_construction(df1 = df_concepts, df2 = unique_oer_id, col = 'mappedID', 
                                       how = 'left', right_on = 'OER')
oer_covers_concept_pr = edge_construction(df1 = df_concepts, df2 = unique_oer_id, col = 'PR', 
                                          how = 'right', right_on = 'OER')
oer_covers_concept_object = edge_construction(df1 = df_concepts, df2 = unique_concept_id, col = 'mappedID', 
                                       how = 'left', right_on = 'Concept')

oer_before_oer_ep_subject = edge_construction(df1 = df_precedences_episodes, df2 = unique_oer_id, col = 'mappedID', 
                                   how = 'left', left_on = 'Before', right_on = 'OER')
oer_before_oer_ep_object = edge_construction(df1 = df_precedences_episodes, df2 = unique_oer_id, col = 'mappedID', 
                                   how = 'left', left_on = 'After', right_on = 'OER')
oer_before_oer_sr_subject = edge_construction(df1 = df_precedences_series, df2 = unique_oer_id, col = 'mappedID', 
                                   how = 'left', left_on = 'Before', right_on = 'OER')
oer_before_oer_sr_object = edge_construction(df1 = df_precedences_series, df2 = unique_oer_id, col = 'mappedID', 
                                   how = 'left', left_on = 'After', right_on = 'OER')

concept_belongs_class_subject = edge_construction(df1 = df_classes, df2 = unique_concept_id, col = 'mappedID', 
                                   how = 'left', left_on = 'Concept', right_on = 'Concept')
concept_belongs_class_object = edge_construction(df1 = df_classes, df2 = unique_class_id, col = 'mappedID', 
                                   how = 'left', left_on = 'Class', right_on = 'Class')

oer_covers_concept = torch.stack([oer_covers_concept_subject, oer_covers_concept_object], dim = 0).long()
oer_covers_concept_rev = torch.stack([oer_covers_concept_object, oer_covers_concept_subject], dim = 0).long()
oer_before_oer_ep = torch.stack([oer_before_oer_ep_subject, oer_before_oer_ep_object], dim = 0).long()
oer_before_oer_sr = torch.stack([oer_before_oer_sr_subject, oer_before_oer_sr_object], dim = 0).long()
concept_belongs_class = torch.stack([concept_belongs_class_subject, concept_belongs_class_object], dim = 0).long()
concept_belongs_class_rev = torch.stack([concept_belongs_class_object, concept_belongs_class_subject], dim = 0).long()
print(oer_covers_concept.shape)
print(oer_covers_concept_rev.shape)
print(oer_before_oer_ep.shape)
print(oer_before_oer_sr.shape)
print(concept_belongs_class.shape)
print(concept_belongs_class_rev.shape)

torch.Size([2, 16786])
torch.Size([2, 16786])
torch.Size([2, 2097])
torch.Size([2, 423])
torch.Size([2, 58295])
torch.Size([2, 58295])


In [4]:
import fasttext

model_fasttext = fasttext.load_model("cc.en.300.bin")



In [5]:
import re

# initializing string
test_str = df_chapters.Text.values[0]

# printing original string
print("The original string is : " + test_str)

# Removing punctuations in string
# Using regex
res = re.sub(r'[^\w\s]', '', test_str)

# printing result
print("The string after punctuation filter : " + res)


The original string is : Professor Jonathan Holloway :  
 “Fellow citizens, pardon me, and allow me to ask, why am I called upon to speak here today? What have I or those I represent to do with your national independence? Are the great principles of political freedom and of natural justice, embodied in that Declaration of Independence, extended to us? And am I, therefore, called upon to bring our humble offering to the national altar, and to confess the benefits, and express devout gratitude for the blessings resulting from your independence to us?  Would to God, both for your sakes and ours, that an affirmative answer could be truthfully returned to these questions. But such is not the state of the case. I say it with a sad sense of the disparity between us. I am not included within the pale of this glorious anniversary! Your high independence only reveals the immeasurable distance between us. The blessings in which you this day rejoice are not enjoyed in common. The rich inheritance 

In [6]:
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

stop_words = set(stopwords.words('english'))

sentence_embeddings = []
for sentences in df_chapters.Text.values:
    word_embs = []
    sentences = re.sub(r'[^\w\s]', '', sentences)
    words = word_tokenize(sentences)
    words = [w for w in words if not w.lower() in stop_words]
    for word in words:
        word_embedding = model_fasttext.get_word_vector(word)
        word_embs.append(word_embedding)
    if word_embs:
        word_embs = sum(word_embs) / len(word_embs)
    else:
        # Handle the case when none of the words are in the model's vocabulary
        word_embs = None
    sentence_embeddings.append(word_embs)

v1 = sentence_embeddings[0]
v2 = sentence_embeddings[1]
cos_sim = np.dot(v1, v2) / (np.linalg.norm(v1, 2) * np.linalg.norm(v2, 2))
print(cos_sim)
print(len(sentence_embeddings))

0.94540006
2550


In [7]:
i = 0
chapters_embeddings_tmp = {}
chapters_r = range(len(df_chapters['Cid'].unique()))
chapters_embeddings_bis = np.zeros(shape=(len(chapters_r), 768))
for r in chapters_r:
    chapters_embeddings_tmp[r] = [float(f) for f in sentence_embeddings[r]]
    for a in range(len(chapters_embeddings_tmp[r])):
            chapters_embeddings_bis[i][a] = chapters_embeddings_tmp[r][a]
    i += 1
chapters_embeddings = torch.from_numpy(chapters_embeddings_bis).to(torch.float32)

In [8]:
#chapters_embeddings_tmp = {}
concepts_embeddings_tmp = {} 
classes_embeddings_tmp = {}

#chapters_r = range(len(df_chapters['Cid'].unique()))
concepts_c = range(len(df_concepts['Concept'].unique()))
classes_c = range(len(df_classes['Class'].unique()))

#chapters_embeddings = np.zeros(shape=(len(chapters_r), 768))
concepts_embeddings = np.zeros(shape=(len(concepts_c), 768))
classes_embeddings = np.zeros(shape=(len(classes_c), 768))


'''i = 0
for r in chapters_r:
    chapters_embeddings_tmp[r] = list(filter(None, df_chapters_embeddings['FastText'][r].strip("[]\n").replace("'","").split(" ")))
    chapters_embeddings_tmp[r] = [float(f) for f in chapters_embeddings_tmp[r]]
    for a in range(len(chapters_embeddings_tmp[r])):
            chapters_embeddings[i][a] = chapters_embeddings_tmp[r][a]
    i += 1'''

i = 0
for r in concepts_c:
    concepts_embeddings_tmp[r] = list(filter(None, df_concepts_embeddings['BERT'][r].strip("[]\n").replace("'","").split(" ")))
    concepts_embeddings_tmp[r] = [float(f) for f in concepts_embeddings_tmp[r]]
    for a in range(len(concepts_embeddings_tmp[r])):
            concepts_embeddings[i][a] = concepts_embeddings_tmp[r][a]
    i += 1   

i = 0
for r in classes_c:
    classes_embeddings_tmp[r] = list(filter(None, df_classes_embeddings['BERT'][r].strip("[]\n").replace("'","").split(" ")))
    classes_embeddings_tmp[r] = [float(f) for f in classes_embeddings_tmp[r]]
    for a in range(len(classes_embeddings_tmp[r])):
            classes_embeddings[i][a] = classes_embeddings_tmp[r][a]
    i += 1

#chapters_embeddings = torch.from_numpy(chapters_embeddings).to(torch.float32)
concepts_embeddings = torch.from_numpy(concepts_embeddings).to(torch.float32)
classes_embeddings = torch.from_numpy(classes_embeddings).to(torch.float32)

In [9]:
from torch_geometric.data import HeteroData
import torch_geometric.transforms as T

data = HeteroData()
data['OER'].node_id = torch.tensor(unique_oer_id['mappedID'].values)
data['OER'].x = chapters_embeddings
data['Concept'].node_id = torch.tensor(unique_concept_id['mappedID'].values)
data['Concept'].x = concepts_embeddings
data['Class'].node_id = torch.tensor(unique_class_id['mappedID'].values)
data['Class'].x = classes_embeddings
data['OER', 'covers', 'Concept'].edge_index = oer_covers_concept
data['Concept', 'rev_covers', 'OER'].edge_index = oer_covers_concept_rev

data['OER', 'covers', 'Concept'].edge_attr = oer_covers_concept_pr
print(oer_before_oer_ep.shape)
data['OER', 'before_sr', 'OER'].edge_index = oer_before_oer_sr
data['OER', 'before_ep', 'OER'].edge_index = oer_before_oer_ep
data['Concept', 'belongs', 'Class'].edge_index = concept_belongs_class
data['Class', 'rev_belongs', 'Concept'].edge_index = concept_belongs_class_rev

#data = T.ToUndirected()(data)
data.validate()
print(data)

torch.Size([2, 2097])
HeteroData(
  [1mOER[0m={
    node_id=[2550],
    x=[2550, 768]
  },
  [1mConcept[0m={
    node_id=[6007],
    x=[6007, 768]
  },
  [1mClass[0m={
    node_id=[292],
    x=[292, 768]
  },
  [1m(OER, covers, Concept)[0m={
    edge_index=[2, 16786],
    edge_attr=[16830]
  },
  [1m(Concept, rev_covers, OER)[0m={ edge_index=[2, 16786] },
  [1m(OER, before_sr, OER)[0m={ edge_index=[2, 423] },
  [1m(OER, before_ep, OER)[0m={ edge_index=[2, 2097] },
  [1m(Concept, belongs, Class)[0m={ edge_index=[2, 58295] },
  [1m(Class, rev_belongs, Concept)[0m={ edge_index=[2, 58295] }
)


  from .autonotebook import tqdm as notebook_tqdm


In [10]:
import random

def seed_everything(seed=0):                                                  
       random.seed(seed)                                                            
       torch.manual_seed(seed)                                                      
       torch.cuda.manual_seed_all(seed)                                             
       np.random.seed(seed)                                                         
       os.environ['PYTHONHASHSEED'] = str(seed)                                     
       torch.backends.cudnn.deterministic = True                                    
       torch.backends.cudnn.benchmark = False

seed_everything()

In [11]:
agnostic = False
if agnostic:
    num_val = 0.5
    num_test = 0.5
else:
    num_val = 0.1
    num_test = 0.1
transform = T.RandomLinkSplit(
    num_val = num_val,
    num_test = num_test,
    disjoint_train_ratio = 0.0,
    neg_sampling_ratio = 1.0,
    add_negative_train_samples = True,
    edge_types=('OER', 'before_sr', 'OER')
)

train_data, val_data, test_data = transform(data)
print(f'{len(train_data["OER", "before_sr", "OER"].edge_label.detach().numpy())}\t Edges for training')
print(f'{len(val_data["OER", "before_sr", "OER"].edge_label.detach().numpy())}\t Edges for validation')
print(f'{len(test_data["OER", "before_sr", "OER"].edge_label.detach().numpy())}\t Edges for testing')

678	 Edges for training
84	 Edges for validation
84	 Edges for testing


In [12]:
X_train_index = train_data["OER", "before_sr", "OER"].edge_label_index.numpy()
Y_train = train_data["OER", "before_sr", "OER"].edge_label.numpy()
X_val_index = val_data["OER", "before_sr", "OER"].edge_label_index.numpy()
Y_val = val_data["OER", "before_sr", "OER"].edge_label.numpy()
X_test_index = test_data["OER", "before_sr", "OER"].edge_label_index.numpy()
Y_test = test_data["OER", "before_sr", "OER"].edge_label.numpy()
print(X_train_index.shape, Y_train.shape)
print(X_val_index.shape, Y_val.shape)
print(X_test_index.shape, Y_test.shape)

(2, 678) (678,)
(2, 84) (84,)
(2, 84) (84,)


In [13]:
X_train = np.concatenate([chapters_embeddings[X_train_index[0]],chapters_embeddings[X_train_index[1]]], axis=1)
X_val = np.concatenate([chapters_embeddings[X_val_index[0]],chapters_embeddings[X_val_index[1]]], axis=1)
X_test = np.concatenate([chapters_embeddings[X_test_index[0]],chapters_embeddings[X_test_index[1]]], axis=1)
print(X_train.shape, Y_train.shape)
print(X_val.shape, Y_val.shape)
print(X_test.shape, Y_test.shape)

(678, 1536) (678,)
(84, 1536) (84,)
(84, 1536) (84,)


In [14]:
#X_val = np.concatenate([X_val, X_test], axis = 0)
#Y_val = np.concatenate([Y_val, Y_test], axis = 0)
print(X_val.shape, Y_val.shape)

(84, 1536) (84,)


In [15]:
df_results = pd.DataFrame()

In [16]:
from classification import classify_cv

classify_cv(X_train, Y_train)

Logistic Regression - L2 :
0.55 mean accuracy
 0.61 max accuracy
 0.49 min accuracy
 0.04 standard deviation
{'max_iter': 50, 'random_state': 0, 'solver': 'liblinear'}
Decision Tree :


In [29]:
df_results.to_csv('baselines_fasttext_oyc.csv', sep='|')