In [1]:
import pandas as pd
import re
from gensim.parsing.preprocessing import STOPWORDS, strip_tags, strip_numeric, strip_punctuation, strip_multiple_whitespaces, remove_stopwords, strip_short, stem_text
import pickle
import spacy
import en_core_web_sm
spacy.load('en_core_web_sm')
from transformers import BertTokenizer
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
nltk_stopwords = stopwords.words('english')

  from .autonotebook import tqdm as notebook_tqdm
None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.
[nltk_data] Error loading stopwords: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:1006)>


$1.Preprocessing$

In [2]:
# Loading the dataset to a data frame
data_df = pd.read_csv('Dataset part/Data.xlsx - Data.csv')

In [3]:
# Check dataframe
data_df

Unnamed: 0,ID,entity,ent_id,sentence_original,class_ID,class_name,sentence_id
0,90812,interviews,qualitative interview,Especially for the problem of reduced identity...,1,belongs_to_article,507_6563_6594
1,90770,knowledge management,knowledge management,This makes it challenging for them to organize...,3,background_information,507_4787_4813
2,90500,reference model,reference modelling,It can be foreseen that managing such a compre...,3,background_information,505_7541_7567
3,90417,UML,unified modeling language,"In particular, redefinition in UML allows one ...",3,background_information,505_4085_4121
4,90361,reference modeling,reference modelling,"At the same time, redundancy in a reference mo...",2,related_work,505_2464_2508
...,...,...,...,...,...,...,...
995,407,privacy,privacy,"Thus, we hypothesize that users' perceptions o...",1,belongs_to_article,2_1783_1859
996,293,Privacy,privacy,Security and Privacy of Personal Health Record...,3,background_information,2_2_13
997,205,social networks,social network,Having a trusted member of their social networ...,3,background_information,1_4833_4866
998,57,hotels,hotel industry,Migrants who have strong ties to their social ...,3,background_information,1_1275_1344


In [4]:
# Remove spaces, puctuation and numbers from original senteces, 
# because they do not convey relevant information for text analysis.
data_df['sentence_preprocessed'] = data_df['sentence_original'].apply(strip_numeric)
data_df['sentence_preprocessed'] = data_df['sentence_preprocessed'].apply(strip_punctuation)
data_df['sentence_preprocessed'] = data_df['sentence_preprocessed'].apply(strip_multiple_whitespaces)

# Transform all letters to lower case -->  words with capital letter might confuse the machine.
# Exp: For machine 'What' is not the same as 'what'
data_df['sentence_preprocessed']=data_df['sentence_preprocessed'].str.lower()

In [5]:
data_df

Unnamed: 0,ID,entity,ent_id,sentence_original,class_ID,class_name,sentence_id,sentence_preprocessed
0,90812,interviews,qualitative interview,Especially for the problem of reduced identity...,1,belongs_to_article,507_6563_6594,especially for the problem of reduced identity...
1,90770,knowledge management,knowledge management,This makes it challenging for them to organize...,3,background_information,507_4787_4813,this makes it challenging for them to organize...
2,90500,reference model,reference modelling,It can be foreseen that managing such a compre...,3,background_information,505_7541_7567,it can be foreseen that managing such a compre...
3,90417,UML,unified modeling language,"In particular, redefinition in UML allows one ...",3,background_information,505_4085_4121,in particular redefinition in uml allows one t...
4,90361,reference modeling,reference modelling,"At the same time, redundancy in a reference mo...",2,related_work,505_2464_2508,at the same time redundancy in a reference mod...
...,...,...,...,...,...,...,...,...
995,407,privacy,privacy,"Thus, we hypothesize that users' perceptions o...",1,belongs_to_article,2_1783_1859,thus we hypothesize that users perceptions of ...
996,293,Privacy,privacy,Security and Privacy of Personal Health Record...,3,background_information,2_2_13,security and privacy of personal health record...
997,205,social networks,social network,Having a trusted member of their social networ...,3,background_information,1_4833_4866,having a trusted member of their social networ...
998,57,hotels,hotel industry,Migrants who have strong ties to their social ...,3,background_information,1_1275_1344,migrants who have strong ties to their social ...


In [6]:
# Sort stopwords from nltk.
# According to Chatgpt, nltk has a more comprehensive stopword list than Gensim. 
# And entity classification often involves working with a variety of text data where removing irrelevant words is crucial. Thus, we are using nltk.
sorted_nltk_stopwords = sorted(nltk_stopwords)
print("NLTK stopwords:", sorted_nltk_stopwords)



NLTK stopwords: ['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren', "aren't", 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can', 'couldn', "couldn't", 'd', 'did', 'didn', "didn't", 'do', 'does', 'doesn', "doesn't", 'doing', 'don', "don't", 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'hadn', "hadn't", 'has', 'hasn', "hasn't", 'have', 'haven', "haven't", 'having', 'he', 'her', 'here', 'hers', 'herself', 'him', 'himself', 'his', 'how', 'i', 'if', 'in', 'into', 'is', 'isn', "isn't", 'it', "it's", 'its', 'itself', 'just', 'll', 'm', 'ma', 'me', 'mightn', "mightn't", 'more', 'most', 'mustn', "mustn't", 'my', 'myself', 'needn', "needn't", 'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 're', 's', 'same', 'shan', "shan't", 'she', "she's", 'should', "should've", 'shouldn', "shouldn't

In [7]:
# Trimming stopword list, as negations are essencial, they could change the meaning of the whole sentence. 
stopwords_to_remove = ["aren't", 'isn', "isn't", 'mightn', "mightn't", 'mustn', "mustn't",
                       'needn', "needn't", 'no', 'nor', 'not', "shan't", 'shouldn', "shouldn't",
                       'wasn', "wasn't", "weren't", 'wouldn', "wouldn't"]

for word in stopwords_to_remove:
    if word in nltk_stopwords:
        nltk_stopwords.remove(word)
sorted_nltk_stopwords = sorted(nltk_stopwords)
print("NLTK stopwords:", sorted_nltk_stopwords)

NLTK stopwords: ['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren', 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can', 'couldn', "couldn't", 'd', 'did', 'didn', "didn't", 'do', 'does', 'doesn', "doesn't", 'doing', 'don', "don't", 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'hadn', "hadn't", 'has', 'hasn', "hasn't", 'have', 'haven', "haven't", 'having', 'he', 'her', 'here', 'hers', 'herself', 'him', 'himself', 'his', 'how', 'i', 'if', 'in', 'into', 'is', 'it', "it's", 'its', 'itself', 'just', 'll', 'm', 'ma', 'me', 'more', 'most', 'my', 'myself', 'now', 'o', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 're', 's', 'same', 'shan', 'she', "she's", 'should', "should've", 'so', 'some', 'such', 't', 'than', 'that', "that'll", 'the', 'their', 'theirs', 'them', 'themselves', 'then', 'there', 'these', 'they', 'th

In [8]:
# Define the function to filter stopwords from text
def filter_stopwords(text, stopwords):
    words = text.split()
    words = [word for word in words if word.lower() not in stopwords]
    return ' '.join(words)

# Apply the function to filter stopwords
data_df['sentence_preprocessed'] = data_df['sentence_preprocessed'].apply(lambda x: filter_stopwords(x, nltk_stopwords))

# Short words are not removed as we have abbreviations in our text, and that might be useful. Exp: IoT, RFID, AI, ML


In [9]:
# Check
data_df

Unnamed: 0,ID,entity,ent_id,sentence_original,class_ID,class_name,sentence_id,sentence_preprocessed
0,90812,interviews,qualitative interview,Especially for the problem of reduced identity...,1,belongs_to_article,507_6563_6594,especially problem reduced identity efforts vt...
1,90770,knowledge management,knowledge management,This makes it challenging for them to organize...,3,background_information,507_4787_4813,makes challenging organize meetings conversati...
2,90500,reference model,reference modelling,It can be foreseen that managing such a compre...,3,background_information,505_7541_7567,foreseen managing comprehensive reference mode...
3,90417,UML,unified modeling language,"In particular, redefinition in UML allows one ...",3,background_information,505_4085_4121,particular redefinition uml allows one modify ...
4,90361,reference modeling,reference modelling,"At the same time, redundancy in a reference mo...",2,related_work,505_2464_2508,time redundancy reference model typically avoi...
...,...,...,...,...,...,...,...,...
995,407,privacy,privacy,"Thus, we hypothesize that users' perceptions o...",1,belongs_to_article,2_1783_1859,thus hypothesize users perceptions security se...
996,293,Privacy,privacy,Security and Privacy of Personal Health Record...,3,background_information,2_2_13,security privacy personal health records cloud...
997,205,social networks,social network,Having a trusted member of their social networ...,3,background_information,1_4833_4866,trusted member social networks new home plays ...
998,57,hotels,hotel industry,Migrants who have strong ties to their social ...,3,background_information,1_1275_1344,migrants strong ties social network expand soc...


In [10]:
# Stemming: Convert the word to the root form
data_df['sentence_stemmed'] = data_df['sentence_preprocessed'].apply(stem_text)

In [12]:
# Lemmatization:Convert the word to the linguistic root form


#initialize en
en_core_web_sm = en_core_web_sm.load()
# Extracting the lemmas and join (chatgpt)
def extract_lemmas(text):
# Process the text with the model
    doc = en_core_web_sm(text)
# Extract lemmas for each token in the document
    lemmas = [token.lemma_ for token in doc]
# Join the lemmas into a single string
    return " ".join(lemmas)

data_df['sentence_lemmas'] = data_df['sentence_preprocessed'].apply(extract_lemmas)


In [None]:
# Tokenization: A predefined word set like BERT. Then the text is matched to a token in the word set. If a word is missing, the word will be splitted until there is a match.
