In [3]:
import pandas as pd
import re
from gensim.parsing.preprocessing import STOPWORDS, strip_tags, strip_numeric, strip_punctuation, strip_multiple_whitespaces, remove_stopwords, strip_short, stem_text
import pickle
import spacy
import en_core_web_sm
spacy.load('en_core_web_sm')
from transformers import BertTokenizer
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
nltk_stopwords = stopwords.words('english')

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Error loading stopwords: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:1006)>


$1.Preprocessing$

In [4]:
# Loading the dataset to a data frame
data_df = pd.read_csv('Data.xlsx - Data.csv')


In [5]:
# Check dataframe
data_df

Unnamed: 0,ID,entity,ent_id,sentence_original,class_ID,class_name,sentence_id
0,90812,interviews,qualitative interview,Especially for the problem of reduced identity...,1,belongs_to_article,507_6563_6594
1,90770,knowledge management,knowledge management,This makes it challenging for them to organize...,3,background_information,507_4787_4813
2,90500,reference model,reference modelling,It can be foreseen that managing such a compre...,3,background_information,505_7541_7567
3,90417,UML,unified modeling language,"In particular, redefinition in UML allows one ...",3,background_information,505_4085_4121
4,90361,reference modeling,reference modelling,"At the same time, redundancy in a reference mo...",2,related_work,505_2464_2508
...,...,...,...,...,...,...,...
995,407,privacy,privacy,"Thus, we hypothesize that users' perceptions o...",1,belongs_to_article,2_1783_1859
996,293,Privacy,privacy,Security and Privacy of Personal Health Record...,3,background_information,2_2_13
997,205,social networks,social network,Having a trusted member of their social networ...,3,background_information,1_4833_4866
998,57,hotels,hotel industry,Migrants who have strong ties to their social ...,3,background_information,1_1275_1344


In [6]:
# Remove spaces, puctuation and numbers from original senteces, 
# because they do not convey relevant information for text analysis.
data_df['sentence_preprocessed'] = data_df['sentence_original'].apply(strip_numeric)
data_df['sentence_preprocessed'] = data_df['sentence_preprocessed'].apply(strip_punctuation)
data_df['sentence_preprocessed'] = data_df['sentence_preprocessed'].apply(strip_multiple_whitespaces)

# Transform all letters to lower case -->  words with capital letter might confuse the machine.
# Exp: For machine 'What' is not the same as 'what'
data_df['sentence_preprocessed']=data_df['sentence_preprocessed'].str.lower()

In [7]:
data_df

Unnamed: 0,ID,entity,ent_id,sentence_original,class_ID,class_name,sentence_id,sentence_preprocessed
0,90812,interviews,qualitative interview,Especially for the problem of reduced identity...,1,belongs_to_article,507_6563_6594,especially for the problem of reduced identity...
1,90770,knowledge management,knowledge management,This makes it challenging for them to organize...,3,background_information,507_4787_4813,this makes it challenging for them to organize...
2,90500,reference model,reference modelling,It can be foreseen that managing such a compre...,3,background_information,505_7541_7567,it can be foreseen that managing such a compre...
3,90417,UML,unified modeling language,"In particular, redefinition in UML allows one ...",3,background_information,505_4085_4121,in particular redefinition in uml allows one t...
4,90361,reference modeling,reference modelling,"At the same time, redundancy in a reference mo...",2,related_work,505_2464_2508,at the same time redundancy in a reference mod...
...,...,...,...,...,...,...,...,...
995,407,privacy,privacy,"Thus, we hypothesize that users' perceptions o...",1,belongs_to_article,2_1783_1859,thus we hypothesize that users perceptions of ...
996,293,Privacy,privacy,Security and Privacy of Personal Health Record...,3,background_information,2_2_13,security and privacy of personal health record...
997,205,social networks,social network,Having a trusted member of their social networ...,3,background_information,1_4833_4866,having a trusted member of their social networ...
998,57,hotels,hotel industry,Migrants who have strong ties to their social ...,3,background_information,1_1275_1344,migrants who have strong ties to their social ...


In [8]:
# Sort stopwords from nltk.
# According to Chatgpt, nltk has a more comprehensive stopword list than Gensim. 
# And entity classification often involves working with a variety of text data where removing irrelevant words is crucial. Thus, we are using nltk.
sorted_nltk_stopwords = sorted(nltk_stopwords)
print("NLTK stopwords:", sorted_nltk_stopwords)



NLTK stopwords: ['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren', "aren't", 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can', 'couldn', "couldn't", 'd', 'did', 'didn', "didn't", 'do', 'does', 'doesn', "doesn't", 'doing', 'don', "don't", 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'hadn', "hadn't", 'has', 'hasn', "hasn't", 'have', 'haven', "haven't", 'having', 'he', 'her', 'here', 'hers', 'herself', 'him', 'himself', 'his', 'how', 'i', 'if', 'in', 'into', 'is', 'isn', "isn't", 'it', "it's", 'its', 'itself', 'just', 'll', 'm', 'ma', 'me', 'mightn', "mightn't", 'more', 'most', 'mustn', "mustn't", 'my', 'myself', 'needn', "needn't", 'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 're', 's', 'same', 'shan', "shan't", 'she', "she's", 'should', "should've", 'shouldn', "shouldn't

In [9]:
# Trimming stopword list, as negations are essencial, they could change the meaning of the whole sentence. 
stopwords_to_remove = ["aren't", 'isn', "isn't", 'mightn', "mightn't", 'mustn', "mustn't",
                       'needn', "needn't", 'no', 'nor', 'not', "shan't", 'shouldn', "shouldn't",
                       'wasn', "wasn't", "weren't", 'wouldn', "wouldn't"]

for word in stopwords_to_remove:
    if word in nltk_stopwords:
        nltk_stopwords.remove(word)
sorted_nltk_stopwords = sorted(nltk_stopwords)
print("NLTK stopwords:", sorted_nltk_stopwords)

NLTK stopwords: ['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren', 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can', 'couldn', "couldn't", 'd', 'did', 'didn', "didn't", 'do', 'does', 'doesn', "doesn't", 'doing', 'don', "don't", 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'hadn', "hadn't", 'has', 'hasn', "hasn't", 'have', 'haven', "haven't", 'having', 'he', 'her', 'here', 'hers', 'herself', 'him', 'himself', 'his', 'how', 'i', 'if', 'in', 'into', 'is', 'it', "it's", 'its', 'itself', 'just', 'll', 'm', 'ma', 'me', 'more', 'most', 'my', 'myself', 'now', 'o', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 're', 's', 'same', 'shan', 'she', "she's", 'should', "should've", 'so', 'some', 'such', 't', 'than', 'that', "that'll", 'the', 'their', 'theirs', 'them', 'themselves', 'then', 'there', 'these', 'they', 'th

In [10]:
# Define the function to filter stopwords from text
def filter_stopwords(text, stopwords):
    words = text.split()
    words = [word for word in words if word.lower() not in stopwords]
    return ' '.join(words)

# Apply the function to filter stopwords
data_df['sentence_preprocessed'] = data_df['sentence_preprocessed'].apply(lambda x: filter_stopwords(x, nltk_stopwords))

# Short words are not removed as we have abbreviations in our text, and that might be useful. Exp: IoT, RFID, AI, ML


In [11]:
# Check
data_df

Unnamed: 0,ID,entity,ent_id,sentence_original,class_ID,class_name,sentence_id,sentence_preprocessed
0,90812,interviews,qualitative interview,Especially for the problem of reduced identity...,1,belongs_to_article,507_6563_6594,especially problem reduced identity efforts vt...
1,90770,knowledge management,knowledge management,This makes it challenging for them to organize...,3,background_information,507_4787_4813,makes challenging organize meetings conversati...
2,90500,reference model,reference modelling,It can be foreseen that managing such a compre...,3,background_information,505_7541_7567,foreseen managing comprehensive reference mode...
3,90417,UML,unified modeling language,"In particular, redefinition in UML allows one ...",3,background_information,505_4085_4121,particular redefinition uml allows one modify ...
4,90361,reference modeling,reference modelling,"At the same time, redundancy in a reference mo...",2,related_work,505_2464_2508,time redundancy reference model typically avoi...
...,...,...,...,...,...,...,...,...
995,407,privacy,privacy,"Thus, we hypothesize that users' perceptions o...",1,belongs_to_article,2_1783_1859,thus hypothesize users perceptions security se...
996,293,Privacy,privacy,Security and Privacy of Personal Health Record...,3,background_information,2_2_13,security privacy personal health records cloud...
997,205,social networks,social network,Having a trusted member of their social networ...,3,background_information,1_4833_4866,trusted member social networks new home plays ...
998,57,hotels,hotel industry,Migrants who have strong ties to their social ...,3,background_information,1_1275_1344,migrants strong ties social network expand soc...


In [12]:
# Stemming: Convert the word to the root form
data_df['sentence_stemmed'] = data_df['sentence_preprocessed'].apply(stem_text)

In [13]:
# Lemmatization:Convert the word to the linguistic root form


#initialize en
en_core_web_sm = en_core_web_sm.load()
# Extracting the lemmas and join (chatgpt)
def extract_lemmas(text):
# Process the text with the model
    doc = en_core_web_sm(text)
# Extract lemmas for each token in the document
    lemmas = [token.lemma_ for token in doc]
# Join the lemmas into a single string
    return " ".join(lemmas)

data_df['sentence_lemmas'] = data_df['sentence_preprocessed'].apply(extract_lemmas)


In [14]:
# Tokenization (BERT): A predefined word set like BERT. Then the text is matched to a token in the word set. If a word is missing, the word will be splitted until there is a match.

#load the Bert tokenizer
bert_uncased = BertTokenizer.from_pretrained('bert-base-uncased')

data_df['sentence_tokenized'] = data_df['sentence_preprocessed'].apply(lambda text: bert_uncased.tokenize(text))

In [15]:
data_df

Unnamed: 0,ID,entity,ent_id,sentence_original,class_ID,class_name,sentence_id,sentence_preprocessed,sentence_stemmed,sentence_lemmas,sentence_tokenized
0,90812,interviews,qualitative interview,Especially for the problem of reduced identity...,1,belongs_to_article,507_6563_6594,especially problem reduced identity efforts vt...,especi problem reduc ident effort vt identifi ...,especially problem reduce identity effort vt i...,"[especially, problem, reduced, identity, effor..."
1,90770,knowledge management,knowledge management,This makes it challenging for them to organize...,3,background_information,507_4787_4813,makes challenging organize meetings conversati...,make challeng organ meet convers well perform ...,make challenge organize meeting conversation w...,"[makes, challenging, organize, meetings, conve..."
2,90500,reference model,reference modelling,It can be foreseen that managing such a compre...,3,background_information,505_7541_7567,foreseen managing comprehensive reference mode...,foreseen manag comprehens refer model challeng...,foresee manage comprehensive reference model c...,"[fore, ##see, ##n, managing, comprehensive, re..."
3,90417,UML,unified modeling language,"In particular, redefinition in UML allows one ...",3,background_information,505_4085_4121,particular redefinition uml allows one modify ...,particular redefinit uml allow on modifi data ...,particular redefinition uml allow one modify d...,"[particular, red, ##ef, ##ini, ##tion, um, ##l..."
4,90361,reference modeling,reference modelling,"At the same time, redundancy in a reference mo...",2,related_work,505_2464_2508,time redundancy reference model typically avoi...,time redund refer model typic avoid see e g co...,time redundancy reference model typically avoi...,"[time, red, ##unda, ##ncy, reference, model, t..."
...,...,...,...,...,...,...,...,...,...,...,...
995,407,privacy,privacy,"Thus, we hypothesize that users' perceptions o...",1,belongs_to_article,2_1783_1859,thus hypothesize users perceptions security se...,thu hypothes user percept secur sec privaci pr...,thus hypothesize user perception security sec ...,"[thus, h, ##yp, ##oth, ##es, ##ize, users, per..."
996,293,Privacy,privacy,Security and Privacy of Personal Health Record...,3,background_information,2_2_13,security privacy personal health records cloud...,secur privaci person health record cloud compu...,security privacy personal health record cloud ...,"[security, privacy, personal, health, records,..."
997,205,social networks,social network,Having a trusted member of their social networ...,3,background_information,1_4833_4866,trusted member social networks new home plays ...,trust member social network new home plai impo...,trust member social network new home play impo...,"[trusted, member, social, networks, new, home,..."
998,57,hotels,hotel industry,Migrants who have strong ties to their social ...,3,background_information,1_1275_1344,migrants strong ties social network expand soc...,migrant strong ti social network expand social...,migrant strong tie social network expand socia...,"[migrants, strong, ties, social, network, expa..."


In [16]:
data_df.to_csv('preprocessed_data.csv', index=False)

Next step --> see  embeddings sentence.ipynb and embeddings entity.ipynb

With embeddings from embeddings sentence.ipynb and embeddings entity.ipynb
we create a dataframe / csv (data_set_for_model_training.csv) for model training

In [17]:
# Loading embeddings
sentence_embeddings_df = pd.read_csv('sentence_embeddings.csv')
entity_embeddings_df = pd.read_csv('entity_embeddings.csv')

In [20]:
filtered_data_df = data_df[['class_ID', 'class_name']]
for_model_training_df = pd.concat([ filtered_data_df, sentence_embeddings_df, entity_embeddings_df], axis=1)

In [21]:
for_model_training_df

Unnamed: 0,class_ID,class_name,sentence_embeddings_1,sentence_embeddings_2,sentence_embeddings_3,sentence_embeddings_4,sentence_embeddings_5,sentence_embeddings_6,sentence_embeddings_7,sentence_embeddings_8,...,entity_embeddings_759,entity_embeddings_760,entity_embeddings_761,entity_embeddings_762,entity_embeddings_763,entity_embeddings_764,entity_embeddings_765,entity_embeddings_766,entity_embeddings_767,entity_embeddings_768
0,1,belongs_to_article,-0.392759,-0.160916,-0.236987,0.017865,-0.185202,-0.047439,0.304725,0.294224,...,-0.154193,-0.001004,0.246955,0.011575,0.192595,-0.444240,0.069651,-0.204989,0.597808,-0.304289
1,3,background_information,-0.446915,0.060831,0.135484,-0.070987,-0.312183,-0.161954,0.375770,0.113498,...,0.014048,0.100635,0.286525,0.060596,0.377712,-0.413099,0.120353,-0.101091,0.408687,-0.029155
2,3,background_information,-0.458058,-0.035062,0.076249,-0.175648,-0.267310,-0.133200,0.138207,0.147976,...,0.041876,0.201030,0.153229,-0.000881,0.230243,-0.232192,0.060281,-0.094907,0.490895,0.094150
3,3,background_information,-0.490693,-0.300984,-0.054388,-0.211093,-0.005908,-0.158410,0.255847,0.303913,...,-0.156720,0.290073,0.214583,-0.193672,0.426431,-0.621425,0.149215,-0.127817,0.514537,-0.273813
4,2,related_work,-0.236580,0.074777,0.081607,-0.124781,-0.228186,-0.334135,0.245351,0.184129,...,-0.002138,0.202907,0.136389,0.057703,0.229738,-0.305836,0.028529,-0.203359,0.449917,-0.028422
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,1,belongs_to_article,-0.336847,0.030665,0.035846,-0.009195,-0.038633,-0.021274,-0.058902,0.115932,...,-0.067549,0.174661,0.244805,-0.157010,0.277910,-0.622183,0.050523,-0.080757,0.651381,-0.170470
996,3,background_information,-0.487982,-0.325495,0.213906,0.040076,-0.150274,0.061100,0.082486,0.150475,...,-0.067549,0.174661,0.244805,-0.157010,0.277910,-0.622183,0.050523,-0.080757,0.651381,-0.170470
997,3,background_information,-0.242651,0.007893,0.132302,-0.216239,0.201438,0.183189,0.205393,0.111767,...,-0.121257,0.100536,0.228765,0.053457,0.394366,-0.529538,0.120893,-0.100373,0.436331,-0.098676
998,3,background_information,-0.469185,0.167338,-0.004530,-0.293314,-0.088417,0.203098,0.707114,0.249263,...,-0.050234,0.148283,0.247628,-0.220014,0.352250,-0.642989,0.074952,-0.083675,0.745693,-0.333353


In [22]:
for_model_training_df.to_csv('data_set_for_model_training.csv', index=False)