In [1]:
import pandas as pd
import re
from gensim.parsing.preprocessing import STOPWORDS, strip_tags, strip_numeric, strip_punctuation, strip_multiple_whitespaces, remove_stopwords, strip_short, stem_text
import pickle
import spacy
import en_core_web_sm
spacy.load('en_core_web_sm')
from transformers import BertTokenizer
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
nltk_stopwords = stopwords.words('english')

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Error loading stopwords: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:1006)>


$1.Preprocessing$

In [2]:
# Loading the dataset to a data frame
data_df = pd.read_csv('Data.xlsx - Data.csv')

In [3]:
# Check dataframe
data_df

Unnamed: 0,ID,entity,ent_id,sentence_original,class_ID,class_name,sentence_id
0,90812,interviews,qualitative interview,Especially for the problem of reduced identity...,1,belongs_to_article,507_6563_6594
1,90770,knowledge management,knowledge management,This makes it challenging for them to organize...,3,background_information,507_4787_4813
2,90500,reference model,reference modelling,It can be foreseen that managing such a compre...,3,background_information,505_7541_7567
3,90417,UML,unified modeling language,"In particular, redefinition in UML allows one ...",3,background_information,505_4085_4121
4,90361,reference modeling,reference modelling,"At the same time, redundancy in a reference mo...",2,related_work,505_2464_2508
...,...,...,...,...,...,...,...
995,407,privacy,privacy,"Thus, we hypothesize that users' perceptions o...",1,belongs_to_article,2_1783_1859
996,293,Privacy,privacy,Security and Privacy of Personal Health Record...,3,background_information,2_2_13
997,205,social networks,social network,Having a trusted member of their social networ...,3,background_information,1_4833_4866
998,57,hotels,hotel industry,Migrants who have strong ties to their social ...,3,background_information,1_1275_1344


In [4]:
# Remove spaces, puctuation and numbers from original senteces, 
# because they do not convey relevant information for text analysis.
data_df['sentence_preprocessed'] = data_df['sentence_original'].apply(strip_numeric)
data_df['sentence_preprocessed'] = data_df['sentence_preprocessed'].apply(strip_punctuation)
data_df['sentence_preprocessed'] = data_df['sentence_preprocessed'].apply(strip_multiple_whitespaces)

# Transform all letters to lower case -->  words with capital letter might confuse the machine.
# Exp: For machine 'What' is not the same as 'what'
data_df['sentence_preprocessed']=data_df['sentence_preprocessed'].str.lower()

In [5]:
data_df

Unnamed: 0,ID,entity,ent_id,sentence_original,class_ID,class_name,sentence_id,sentence_preprocessed
0,90812,interviews,qualitative interview,Especially for the problem of reduced identity...,1,belongs_to_article,507_6563_6594,especially for the problem of reduced identity...
1,90770,knowledge management,knowledge management,This makes it challenging for them to organize...,3,background_information,507_4787_4813,this makes it challenging for them to organize...
2,90500,reference model,reference modelling,It can be foreseen that managing such a compre...,3,background_information,505_7541_7567,it can be foreseen that managing such a compre...
3,90417,UML,unified modeling language,"In particular, redefinition in UML allows one ...",3,background_information,505_4085_4121,in particular redefinition in uml allows one t...
4,90361,reference modeling,reference modelling,"At the same time, redundancy in a reference mo...",2,related_work,505_2464_2508,at the same time redundancy in a reference mod...
...,...,...,...,...,...,...,...,...
995,407,privacy,privacy,"Thus, we hypothesize that users' perceptions o...",1,belongs_to_article,2_1783_1859,thus we hypothesize that users perceptions of ...
996,293,Privacy,privacy,Security and Privacy of Personal Health Record...,3,background_information,2_2_13,security and privacy of personal health record...
997,205,social networks,social network,Having a trusted member of their social networ...,3,background_information,1_4833_4866,having a trusted member of their social networ...
998,57,hotels,hotel industry,Migrants who have strong ties to their social ...,3,background_information,1_1275_1344,migrants who have strong ties to their social ...


In [6]:
# Sort stopwords from nltk.
# According to Chatgpt, nltk has a more comprehensive stopword list than Gensim. 
# And entity classification often involves working with a variety of text data where removing irrelevant words is crucial. Thus, we are using nltk.
sorted_nltk_stopwords = sorted(nltk_stopwords)
print("NLTK stopwords:", sorted_nltk_stopwords)



NLTK stopwords: ['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren', "aren't", 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can', 'couldn', "couldn't", 'd', 'did', 'didn', "didn't", 'do', 'does', 'doesn', "doesn't", 'doing', 'don', "don't", 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'hadn', "hadn't", 'has', 'hasn', "hasn't", 'have', 'haven', "haven't", 'having', 'he', 'her', 'here', 'hers', 'herself', 'him', 'himself', 'his', 'how', 'i', 'if', 'in', 'into', 'is', 'isn', "isn't", 'it', "it's", 'its', 'itself', 'just', 'll', 'm', 'ma', 'me', 'mightn', "mightn't", 'more', 'most', 'mustn', "mustn't", 'my', 'myself', 'needn', "needn't", 'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 're', 's', 'same', 'shan', "shan't", 'she', "she's", 'should', "should've", 'shouldn', "shouldn't

In [7]:
# Trimming stopword list, as negations are essencial, they could change the meaning of the whole sentence. 
stopwords_to_remove = ["aren't", 'isn', "isn't", 'mightn', "mightn't", 'mustn', "mustn't",
                       'needn', "needn't", 'no', 'nor', 'not', "shan't", 'shouldn', "shouldn't",
                       'wasn', "wasn't", "weren't", 'wouldn', "wouldn't"]

for word in stopwords_to_remove:
    if word in nltk_stopwords:
        nltk_stopwords.remove(word)
sorted_nltk_stopwords = sorted(nltk_stopwords)
print("NLTK stopwords:", sorted_nltk_stopwords)

NLTK stopwords: ['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren', 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can', 'couldn', "couldn't", 'd', 'did', 'didn', "didn't", 'do', 'does', 'doesn', "doesn't", 'doing', 'don', "don't", 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'hadn', "hadn't", 'has', 'hasn', "hasn't", 'have', 'haven', "haven't", 'having', 'he', 'her', 'here', 'hers', 'herself', 'him', 'himself', 'his', 'how', 'i', 'if', 'in', 'into', 'is', 'it', "it's", 'its', 'itself', 'just', 'll', 'm', 'ma', 'me', 'more', 'most', 'my', 'myself', 'now', 'o', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 're', 's', 'same', 'shan', 'she', "she's", 'should', "should've", 'so', 'some', 'such', 't', 'than', 'that', "that'll", 'the', 'their', 'theirs', 'them', 'themselves', 'then', 'there', 'these', 'they', 'th

In [8]:
# Define the function to filter stopwords from text
def filter_stopwords(text, stopwords):
    words = text.split()
    words = [word for word in words if word.lower() not in stopwords]
    return ' '.join(words)

# Apply the function to filter stopwords
data_df['sentence_preprocessed'] = data_df['sentence_preprocessed'].apply(lambda x: filter_stopwords(x, nltk_stopwords))

# Short words are not removed as we have abbreviations in our text, and that might be useful. Exp: IoT, RFID, AI, ML


In [9]:
# Check
data_df

Unnamed: 0,ID,entity,ent_id,sentence_original,class_ID,class_name,sentence_id,sentence_preprocessed
0,90812,interviews,qualitative interview,Especially for the problem of reduced identity...,1,belongs_to_article,507_6563_6594,especially problem reduced identity efforts vt...
1,90770,knowledge management,knowledge management,This makes it challenging for them to organize...,3,background_information,507_4787_4813,makes challenging organize meetings conversati...
2,90500,reference model,reference modelling,It can be foreseen that managing such a compre...,3,background_information,505_7541_7567,foreseen managing comprehensive reference mode...
3,90417,UML,unified modeling language,"In particular, redefinition in UML allows one ...",3,background_information,505_4085_4121,particular redefinition uml allows one modify ...
4,90361,reference modeling,reference modelling,"At the same time, redundancy in a reference mo...",2,related_work,505_2464_2508,time redundancy reference model typically avoi...
...,...,...,...,...,...,...,...,...
995,407,privacy,privacy,"Thus, we hypothesize that users' perceptions o...",1,belongs_to_article,2_1783_1859,thus hypothesize users perceptions security se...
996,293,Privacy,privacy,Security and Privacy of Personal Health Record...,3,background_information,2_2_13,security privacy personal health records cloud...
997,205,social networks,social network,Having a trusted member of their social networ...,3,background_information,1_4833_4866,trusted member social networks new home plays ...
998,57,hotels,hotel industry,Migrants who have strong ties to their social ...,3,background_information,1_1275_1344,migrants strong ties social network expand soc...


In [10]:
# Stemming: Convert the word to the root form
data_df['sentence_stemmed'] = data_df['sentence_preprocessed'].apply(stem_text)

In [11]:
# Lemmatization:Convert the word to the linguistic root form


#initialize en
en_core_web_sm = en_core_web_sm.load()
# Extracting the lemmas and join (chatgpt)
def extract_lemmas(text):
# Process the text with the model
    doc = en_core_web_sm(text)
# Extract lemmas for each token in the document
    lemmas = [token.lemma_ for token in doc]
# Join the lemmas into a single string
    return " ".join(lemmas)

data_df['sentence_lemmas'] = data_df['sentence_preprocessed'].apply(extract_lemmas)


In [12]:
# Tokenization (BERT): A predefined word set like BERT. Then the text is matched to a token in the word set. If a word is missing, the word will be splitted until there is a match.

#load the Bert tokenizer
bert_uncased = BertTokenizer.from_pretrained('bert-base-uncased')

data_df['sentence_tokenized'] = data_df['sentence_preprocessed'].apply(lambda text: bert_uncased.tokenize(text))

In [13]:
data_df

Unnamed: 0,ID,entity,ent_id,sentence_original,class_ID,class_name,sentence_id,sentence_preprocessed,sentence_stemmed,sentence_lemmas,sentence_tokenized
0,90812,interviews,qualitative interview,Especially for the problem of reduced identity...,1,belongs_to_article,507_6563_6594,especially problem reduced identity efforts vt...,especi problem reduc ident effort vt identifi ...,especially problem reduce identity effort vt i...,"[especially, problem, reduced, identity, effor..."
1,90770,knowledge management,knowledge management,This makes it challenging for them to organize...,3,background_information,507_4787_4813,makes challenging organize meetings conversati...,make challeng organ meet convers well perform ...,make challenge organize meeting conversation w...,"[makes, challenging, organize, meetings, conve..."
2,90500,reference model,reference modelling,It can be foreseen that managing such a compre...,3,background_information,505_7541_7567,foreseen managing comprehensive reference mode...,foreseen manag comprehens refer model challeng...,foresee manage comprehensive reference model c...,"[fore, ##see, ##n, managing, comprehensive, re..."
3,90417,UML,unified modeling language,"In particular, redefinition in UML allows one ...",3,background_information,505_4085_4121,particular redefinition uml allows one modify ...,particular redefinit uml allow on modifi data ...,particular redefinition uml allow one modify d...,"[particular, red, ##ef, ##ini, ##tion, um, ##l..."
4,90361,reference modeling,reference modelling,"At the same time, redundancy in a reference mo...",2,related_work,505_2464_2508,time redundancy reference model typically avoi...,time redund refer model typic avoid see e g co...,time redundancy reference model typically avoi...,"[time, red, ##unda, ##ncy, reference, model, t..."
...,...,...,...,...,...,...,...,...,...,...,...
995,407,privacy,privacy,"Thus, we hypothesize that users' perceptions o...",1,belongs_to_article,2_1783_1859,thus hypothesize users perceptions security se...,thu hypothes user percept secur sec privaci pr...,thus hypothesize user perception security sec ...,"[thus, h, ##yp, ##oth, ##es, ##ize, users, per..."
996,293,Privacy,privacy,Security and Privacy of Personal Health Record...,3,background_information,2_2_13,security privacy personal health records cloud...,secur privaci person health record cloud compu...,security privacy personal health record cloud ...,"[security, privacy, personal, health, records,..."
997,205,social networks,social network,Having a trusted member of their social networ...,3,background_information,1_4833_4866,trusted member social networks new home plays ...,trust member social network new home plai impo...,trust member social network new home play impo...,"[trusted, member, social, networks, new, home,..."
998,57,hotels,hotel industry,Migrants who have strong ties to their social ...,3,background_information,1_1275_1344,migrants strong ties social network expand soc...,migrant strong ti social network expand social...,migrant strong tie social network expand socia...,"[migrants, strong, ties, social, network, expa..."


In [14]:
data_df.to_csv('preprocessed_data.csv', index=False)

Next step see embeddings entity.ipynb and embeddings sentence.ipynb

Load embeddings from embeddings entity.ipynb and embeddings sentence.ipynb.
Create data frame/ csv for model training

In [15]:
entity_embeddings_df = pd.read_csv('entity_embeddings.csv')
sentence_embeddings_df = pd.read_csv('sentence_embeddings.csv')

In [16]:
filtered_data_df = data_df[['class_ID', 'class_name']]
for_model_training_df = pd.concat([filtered_data_df, entity_embeddings_df, sentence_embeddings_df], axis=1)

In [17]:
for_model_training_df

Unnamed: 0,class_ID,class_name,entity_embeddings_1,entity_embeddings_2,entity_embeddings_3,entity_embeddings_4,entity_embeddings_5,entity_embeddings_6,entity_embeddings_7,entity_embeddings_8,...,sentence_embeddings_759,sentence_embeddings_760,sentence_embeddings_761,sentence_embeddings_762,sentence_embeddings_763,sentence_embeddings_764,sentence_embeddings_765,sentence_embeddings_766,sentence_embeddings_767,sentence_embeddings_768
0,1,belongs_to_article,-0.493046,-0.127504,0.356902,0.213740,-0.282353,0.044531,0.379340,-0.124128,...,0.017093,-0.022737,0.154470,-0.062244,0.018911,0.027818,0.030458,-0.326121,0.005032,0.635024
1,3,background_information,-0.618523,-0.097255,0.311899,0.086365,-0.167940,0.105449,0.252365,-0.019911,...,0.217384,-0.187295,0.311242,0.138686,0.105849,-0.209970,-0.040666,-0.159178,0.217096,0.299105
2,3,background_information,-0.658833,-0.184234,0.145825,0.208594,-0.222928,0.187397,0.152150,-0.052557,...,0.087419,-0.325568,-0.086733,-0.164657,-0.044922,0.176228,-0.106638,-0.419674,0.036499,0.447414
3,3,background_information,-0.640721,-0.255062,0.486257,0.048884,-0.352733,0.011589,0.224444,-0.191657,...,0.428795,-0.181079,0.330739,0.126934,-0.027351,-0.176352,0.148738,-0.554723,-0.036231,0.812043
4,2,related_work,-0.625892,-0.109920,0.290425,0.179737,-0.143044,0.105934,0.187089,-0.116364,...,0.215896,-0.316871,-0.150677,-0.105422,0.026321,0.287165,-0.136124,-0.674244,-0.115754,0.823650
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,1,belongs_to_article,-0.645166,-0.155494,0.348832,0.127759,-0.200844,-0.147906,0.446472,-0.003472,...,0.160017,-0.254421,0.144746,-0.065655,0.040110,0.247874,-0.064700,-0.326328,-0.010185,0.934169
996,3,background_information,-0.645166,-0.155494,0.348832,0.127759,-0.200844,-0.147906,0.446472,-0.003472,...,0.229470,0.025018,0.137159,0.063406,0.278236,-0.192878,-0.004302,-0.127661,0.478828,0.311341
997,3,background_information,-0.373015,-0.055043,0.339640,0.184498,-0.205556,0.056685,0.196215,0.111421,...,0.095540,-0.293471,0.070165,0.059316,0.231394,-0.103824,-0.211086,-0.327272,-0.147619,0.448026
998,3,background_information,-0.663387,-0.171357,0.443749,0.074684,-0.298812,-0.271282,0.444449,-0.104071,...,0.279231,-0.450037,0.101883,-0.068950,0.211745,-0.147089,-0.405728,-0.567095,-0.239641,0.677701


In [None]:
# for_model_training_df.to_csv('data_set_for_model_training.csv', index=False)