## Relation Extraction using StackingClassifier Model

### Importing necessary libraries for data manipulation, visualization, and natural language processing

In [1]:
# NLTK library is used for natural language processing because of its specific functionalities available

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
!pip install nltk
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer



### Libraries for extracting features from text for machine learning algorithms and for encoding categorical labels

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder

### Downloading the necessary NLTK data packages for tokenization, part-of-speech tagging, lemmatization, and stopwords.

In [3]:
nltk.download('punkt',  quiet=True)
nltk.download('averaged_perceptron_tagger',  quiet=True)
nltk.download('wordnet',  quiet=True)
nltk.download('stopwords',  quiet=True)

True

### Loading the "sem_eval_2010_task_8" dataset from the Hugging Face `datasets` library for NLP tasks

In [4]:
!pip install datasets

Collecting datasets
  Downloading datasets-2.18.0-py3-none-any.whl (510 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m510.5/510.5 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m17.7 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m23.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: dill, multiprocess, datasets
Successfully installed datasets-2.18.0 dill-0.3.8 multiprocess-0.70.16


In [5]:
from datasets import load_dataset
dataset = load_dataset("sem_eval_2010_task_8")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading data:   0%|          | 0.00/673k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/231k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/8000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/2717 [00:00<?, ? examples/s]

### Converting the dataset into pandas DataFrames for training and testing, facilitating easier data manipulation

In [6]:
train_df = pd.DataFrame(dataset["train"])
test_df = pd.DataFrame(dataset["test"])

In [46]:
train_df

Unnamed: 0,sentence,relation,e1,e2,e1_index,e2_index,cleaned_sentence,tokens,pos_tags,lemmatized_tokens
0,The system as described above has its greatest...,3,configuration,elements,73,107,the system as described above has its greatest...,"[the, system, as, described, above, has, its, ...","[(the, DT), (system, NN), (as, IN), (described...","[the, system, a, described, above, ha, it, gre..."
1,The <e1>child</e1> was carefully wrapped and b...,18,child,cradle,4,60,the child was carefully wrapped and bound into...,"[the, child, was, carefully, wrapped, and, bou...","[(the, DT), (child, NN), (was, VBD), (carefull...","[the, child, wa, carefully, wrapped, and, boun..."
2,The <e1>author</e1> of a keygen uses a <e2>dis...,11,author,disassembler,4,39,the author of a keygen uses a disassembler to ...,"[the, author, of, a, keygen, uses, a, disassem...","[(the, DT), (author, NN), (of, IN), (a, DT), (...","[the, author, of, a, keygen, us, a, disassembl..."
3,A misty <e1>ridge</e1> uprises from the <e2>su...,18,ridge,surge,8,40,a misty ridge uprises from the surge,"[a, misty, ridge, uprises, from, the, surge]","[(a, DT), (misty, JJ), (ridge, NN), (uprises, ...","[a, misty, ridge, uprises, from, the, surge]"
4,The <e1>student</e1> <e2>association</e2> is t...,12,student,association,4,21,the student association is the voice of the un...,"[the, student, association, is, the, voice, of...","[(the, DT), (student, NN), (association, NN), ...","[the, student, association, is, the, voice, of..."
...,...,...,...,...,...,...,...,...,...,...
7995,When the <e1>notice</e1> is sent by <e2>fax</e...,18,notice,fax,9,36,when the notice is sent by fax the notice is n...,"[when, the, notice, is, sent, by, fax, the, no...","[(when, WRB), (the, DT), (notice, NN), (is, VB...","[when, the, notice, is, sent, by, fax, the, no..."
7996,The <e1>herbicide</e1> is derived from a natur...,8,herbicide,antibiotic,4,49,the herbicide is derived from a natural antibi...,"[the, herbicide, is, derived, from, a, natural...","[(the, DT), (herbicide, NN), (is, VBZ), (deriv...","[the, herbicide, is, derived, from, a, natural..."
7997,"To test this, we placed a kitchen <e1>match</e...",6,match,jar,34,56,to test this we placed a kitchen match in the ...,"[to, test, this, we, placed, a, kitchen, match...","[(to, TO), (test, VB), (this, DT), (we, PRP), ...","[to, test, this, we, placed, a, kitchen, match..."
7998,The farmers and city officials in the region h...,18,farmers,market,87,104,the farmers and city officials in the region h...,"[the, farmers, and, city, officials, in, the, ...","[(the, DT), (farmers, NNS), (and, CC), (city, ...","[the, farmer, and, city, official, in, the, re..."


In [47]:
relation_labels_dict = {
    '0': 'Cause-Effect(e1,e2)',
    '1': 'Cause-Effect(e2,e1)',
    '2': 'Component-Whole(e1,e2)',
    '3': 'Component-Whole(e2,e1)',
    '4': 'Content-Container(e1,e2)',
    '5': 'Content-Container(e2,e1)',
    '6': 'Entity-Destination(e1,e2)',
    '7': 'Entity-Destination(e2,e1)',
    '8': 'Entity-Origin(e1,e2)',
    '9': 'Entity-Origin(e2,e1)',
    '10': 'Instrument-Agency(e1,e2)',
    '11': 'Instrument-Agency(e2,e1)',
    '12': 'Member-Collection(e1,e2)',
    '13': 'Member-Collection(e2,e1)',
    '14': 'Message-Topic(e1,e2)',
    '15': 'Message-Topic(e2,e1)',
    '16': 'Product-Producer(e1,e2)',
    '17': 'Product-Producer(e2,e1)',
    '18': 'Other'
}

### The function "preprocess_data" preprocesses the text data by performing several NLP tasks:
1. Extracting entities : Extracts named entities (e1 and e2) from a sentence and identifies their indices. This is useful for relation classification tasks where the relationship between specific entities in a sentence is examined.
2. Cleaning text : Cleans the sentence by removing HTML tags, converting to lowercase, and removing punctuation. This standardizes the text data, making it more amenable to processing and analysis.   
3. Tokenizing : Tokenizes the sentence into individual words. This is the first step in processing text, as it transforms a string (sentence) into a list of tokens (words).
4. Part-of-Speech Tagging : Applies part-of-speech tagging to tokens. This is important for understanding the grammatical structure of sentences and for specific processing tasks such as lemmatization, which require knowledge of a word's part of speech.
5. Lemmatizing : Lemmatizes the tokens, converting them to their base or dictionary form. Unlike stemming, lemmatization considers the context and converts the word to its meaningful base form, which is a valid word itself.

In [7]:
def preprocess_data(dataframe):

    # Extracting entities and their indices
    def extract_entities(sentence):
        e1_match = re.search(r'<e1>(.*?)<\/e1>', sentence)
        e2_match = re.search(r'<e2>(.*?)<\/e2>', sentence)

        e1 = e1_match.group(1) if e1_match else ''
        e2 = e2_match.group(1) if e2_match else ''

        e1_index = e1_match.start() if e1_match else -1
        e2_index = e2_match.start() if e2_match else -1

        return e1, e2, e1_index, e2_index

    dataframe['e1'], dataframe['e2'], dataframe['e1_index'], dataframe['e2_index'] = zip(*dataframe['sentence'].apply(extract_entities))

    # Cleaning the sentence
    def clean_data(sentence):
        sentence = re.sub(r'<\/?e[12]>', '', sentence)
        sentence = sentence.lower()
        sentence = re.sub(r'[^a-zA-Z0-9\s]', '', sentence)
        return sentence

    dataframe['cleaned_sentence'] = dataframe['sentence'].apply(clean_data)

    # Tokenize, POS tag, and lemmatize
    def tokenize_text(sentence):
        return nltk.word_tokenize(sentence)

    def pos_tagging(tokens):
        return nltk.pos_tag(tokens)

    def lemmatization(tokens):
        lemmatizer = WordNetLemmatizer()
        return [lemmatizer.lemmatize(token) for token in tokens]

    dataframe['tokens'] = dataframe['cleaned_sentence'].apply(tokenize_text)
    dataframe['pos_tags'] = dataframe['tokens'].apply(pos_tagging)
    dataframe['lemmatized_tokens'] = dataframe['tokens'].apply(lemmatization)

    return dataframe

### Preprocessing training and testing datasets

In [8]:
train_df = preprocess_data(train_df)
test_df = preprocess_data(test_df)

### Vectorizing the text data using TF-IDF to convert text to a matrix of TF-IDF features

Here we transform the text into numerical vectors by measuring the importance of each term and the frequency in the document (TF) adjusted by its rarity across all documents (IDF). The TF-IDF value increases proportionally to the number of times a word appears in the document

1. TF(t,d) = (Number of times term t appears in a document d) / (Total number of terms in the document d)
2. IDF(t,D) = log_e(Total number of documents in the corpus D / Number of documents with term t in them)
3. TF-IDF(t, d, D) = TF(t, d) × IDF(t, D)



In [9]:
vectorizer = TfidfVectorizer()
x_train = vectorizer.fit_transform(train_df['cleaned_sentence'])
x_test = vectorizer.transform(test_df['cleaned_sentence'])

### Encoding the labels into a format suitable for classification models and transforming the labels for test data.

In [10]:
le = LabelEncoder()
y_train = le.fit_transform(train_df['relation'])
y_test = le.transform(test_df['relation'])

### Importing necessary modules from scikit-learn for the SVC model and evaluation metrics

In [11]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score

#### SVC Model Training

#### Instantiate the SVC model with specific hyperparameters:
1. kernel = 'linear' : Specifies the use of a linear kernel. This is suitable for text classification tasks where the feature space is high-dimensional. A linear kernel helps in finding a linear decision boundary in this space.

2. C = 10 : The regularization parameter. A larger value of C implies a smaller margin. Here, it is set to 10 to penalize misclassifications more, which can be useful for imbalanced datasets.

3. gamma = 0.0001 : Kernel coefficient for 'rbf', 'poly', and 'sigmoid'. For a linear kernel, it's not used but specified for completeness.

4. class_weight = 'balanced': Adjusts weights inversely proportional to class frequencies in the input data. This is important for handling imbalanced datasets, ensuring that the model does not bias towards the majority class.

In [12]:
svc = SVC(kernel='linear', C=10, gamma=0.0001, class_weight='balanced')

#### Fit the model on the training data.
1. x_train: Feature vectors of the training data.
2. y_train: Target values (class labels) for the training samples.

In [13]:
svc.fit(x_train, y_train)

#### Model Prediction and Evaluation

1. Predict the class labels for the test set.
2. x_test: Feature vectors of the test data.

In [14]:
y_pred = svc.predict(x_test)
accuracy = accuracy_score(y_test, y_pred)

#printing the performance metric scores.
print("Accuracy:", accuracy)
print(classification_report(y_test, y_pred, zero_division=0))
print(confusion_matrix(y_test, y_pred))

Accuracy: 0.548767022451233
              precision    recall  f1-score   support

           0       0.89      0.72      0.79       134
           1       0.78      0.82      0.80       194
           2       0.44      0.54      0.48       162
           3       0.43      0.31      0.36       150
           4       0.68      0.71      0.70       153
           5       0.63      0.62      0.62        39
           6       0.74      0.83      0.79       291
           7       0.00      0.00      0.00         1
           8       0.62      0.73      0.67       211
           9       0.90      0.38      0.54        47
          10       0.43      0.27      0.33        22
          11       0.56      0.43      0.49       134
          12       0.75      0.19      0.30        32
          13       0.53      0.58      0.55       201
          14       0.62      0.50      0.55       210
          15       0.65      0.39      0.49        51
          16       0.44      0.40      0.42       108

#### Calculate and print the F1 score:
1. The F1 score is the harmonic mean of precision and recall, providing a balance between them.
2. It's particularly useful in the evaluation of binary classification systems, where the class distribution is imbalanced.

In [15]:
f1score = f1_score(y_test, y_pred, average='micro', labels=np.unique(y_train))
print(f1score)

0.548767022451233


#### Calculate F1 score for each relation:
1. This step calculates the F1 score for each class individually, providing insights into the model's performance on a per-class basis. This can highlight which classes are well-predicted by the model and which are not.

In [16]:
f1_score_per_relation = f1_score(y_test, y_pred, average=None, labels=np.unique(y_train))
# Print F1 score for each relation
relations = le.inverse_transform(range(len(le.classes_)))
for relation, score in zip(relations, f1_score_per_relation):
    print(f"Relation: {relation}, F1 Score: {score}")

Relation: 0, F1 Score: 0.793388429752066
Relation: 1, F1 Score: 0.8040201005025125
Relation: 2, F1 Score: 0.481994459833795
Relation: 3, F1 Score: 0.362934362934363
Relation: 4, F1 Score: 0.6964856230031949
Relation: 5, F1 Score: 0.6233766233766234
Relation: 6, F1 Score: 0.7857142857142857
Relation: 7, F1 Score: 0.0
Relation: 8, F1 Score: 0.6666666666666667
Relation: 9, F1 Score: 0.5373134328358209
Relation: 10, F1 Score: 0.33333333333333326
Relation: 11, F1 Score: 0.4851063829787234
Relation: 12, F1 Score: 0.3
Relation: 13, F1 Score: 0.5510688836104513
Relation: 14, F1 Score: 0.5488126649076518
Relation: 15, F1 Score: 0.4878048780487804
Relation: 16, F1 Score: 0.41951219512195115
Relation: 17, F1 Score: 0.3128491620111732
Relation: 18, F1 Score: 0.3289473684210526


#### Import the necessary modules from scikit-learn for creating ensemble models

In [17]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import StackingClassifier

#### Stacking Classifier Setup

1. The Stacking Classifier is an ensemble learning technique that combines multiple classification models via a final estimator. Here, we use RandomForest and SVC as base models and another SVC as the final estimator.
2. Define base models for the stacking classifier.
3. Each model is defined as a tuple consisting of a unique name and the model instance.

In [18]:
base_models = [
    ('rf', RandomForestClassifier(n_estimators=100, random_state=151)),
    ('svc', SVC(probability=True, kernel='linear', random_state=151))
]

### Defining the final model.

1. The final model (meta-learner) takes in the outputs of the base models as input and makes the final prediction.
2. Here, we are using another SVC, but with a radial basis function (rbf) kernel.

In [19]:
final_model = SVC(kernel='rbf', probability=True, C=1.0, random_state=42)
stacking_model = StackingClassifier(estimators=base_models, final_estimator=final_model, cv=5)
stacking_model.fit(x_train, y_train)



#### Model Prediction and Evaluation

1. Predict the class labels for the test set using the trained stacking model.
2. The predictions are based on the combined strategy of the base models followed by the final model's decision.

In [20]:
y_pred_stack = stacking_model.predict(x_test)

In [21]:
accuracy = accuracy_score(y_test, y_pred_stack)
print("Accuracy:", accuracy)
print(classification_report(y_test, y_pred_stack, zero_division=0))
print(confusion_matrix(y_test, y_pred_stack))

Accuracy: 0.5800515274199485
              precision    recall  f1-score   support

           0       0.92      0.73      0.82       134
           1       0.77      0.85      0.81       194
           2       0.45      0.54      0.49       162
           3       0.59      0.31      0.41       150
           4       0.68      0.78      0.72       153
           5       0.65      0.72      0.68        39
           6       0.73      0.88      0.80       291
           7       0.00      0.00      0.00         1
           8       0.60      0.79      0.68       211
           9       0.74      0.66      0.70        47
          10       0.37      0.45      0.41        22
          11       0.62      0.49      0.55       134
          12       0.36      0.25      0.30        32
          13       0.55      0.66      0.60       201
          14       0.64      0.58      0.61       210
          15       0.69      0.49      0.57        51
          16       0.42      0.47      0.45       10

In [22]:
# Micro - F1 score calculation.
f1scores = f1_score(y_test, y_pred_stack, average='micro', labels=np.unique(y_train))
print(f1scores)

0.5800515274199485


In [23]:
# Print F1 score for each relation
f1_scores_per_relation = f1_score(y_test, y_pred_stack, average=None, labels=np.unique(y_train))
relations = le.inverse_transform(range(len(le.classes_)))
for relation, score in zip(relations, f1_scores_per_relation):
    print(f"Relation: {relation}, F1 Score: {score}")

Relation: 0, F1 Score: 0.8166666666666667
Relation: 1, F1 Score: 0.8078817733990147
Relation: 2, F1 Score: 0.4915254237288136
Relation: 3, F1 Score: 0.40869565217391307
Relation: 4, F1 Score: 0.723404255319149
Relation: 5, F1 Score: 0.6829268292682927
Relation: 6, F1 Score: 0.7975077881619937
Relation: 7, F1 Score: 0.0
Relation: 8, F1 Score: 0.6789366053169734
Relation: 9, F1 Score: 0.6966292134831461
Relation: 10, F1 Score: 0.40816326530612246
Relation: 11, F1 Score: 0.55
Relation: 12, F1 Score: 0.2962962962962963
Relation: 13, F1 Score: 0.5990990990990992
Relation: 14, F1 Score: 0.606516290726817
Relation: 15, F1 Score: 0.5747126436781609
Relation: 16, F1 Score: 0.4473684210526316
Relation: 17, F1 Score: 0.32608695652173914
Relation: 18, F1 Score: 0.30665163472378804


#User Testing

In [50]:
#classify_relation_svm function is being imported from the 'classifier' module and is then used for relation classification.

from classifier import classify_relation_svm

In [None]:
####################
""" USER TESTING """
####################

# Prompt the user for input in the specified format
print("Enter a sentence with entities in the format: \"The software <e1>company</e1> addressed the problem with the <e2>publication</e2> of a fix on Saturday.\"")




user_input = input("Enter a sentence for relation classification: ")




predicted_relation = classify_relation_svm(user_input,vectorizer, stacking_model)
print(f"Prediction Relation: {predicted_relation}")

# Get the predicted relation label
predicted_relation_label = relation_labels_dict[str(predicted_relation)]

# Print the result
print(f"Predicted Relation Class Label: {predicted_relation_label}")