In [None]:
# Fetch the training corpus from Google Drive by its file id. According to the
# recorded cell output, the file is saved as /content/TRAIN_FILE.TXT, which is
# the path the training cell reads.
!gdown 1pfEk77qsvzQKtUYi1h-eG--zbQcONcLH

Downloading...
From: https://drive.google.com/uc?id=1pfEk77qsvzQKtUYi1h-eG--zbQcONcLH
To: /content/TRAIN_FILE.TXT
  0% 0.00/1.40M [00:00<?, ?B/s]100% 1.40M/1.40M [00:00<00:00, 134MB/s]


In [None]:
import re
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report


def preprocess_data(file_path):
    """Read a relation-extraction corpus and return normalised sentences and labels.

    The file is consumed in records of four lines: the first line of a record
    is the numbered, quoted sentence (``<id>\\t"<text>"``), the second is its
    relation label, and the remaining two lines are ignored.
    (Presumably the SemEval-2010 Task 8 layout -- confirm against the data.)

    Normalisation applied to each sentence:
      * the leading id, tab and surrounding double quotes are removed;
      * ``<e1>``/``</e1>``/``<e2>``/``</e2>`` are rewritten to bracketed markers;
      * the text is lower-cased and every run of non-word characters or
        underscores is collapsed to a single space.  This also strips the
        brackets and slash, so opening and closing markers both end up as the
        bare tokens ``e1`` / ``e2``.

    Args:
        file_path: Path to the UTF-8 encoded corpus file.

    Returns:
        A ``(sentences, labels)`` tuple of equal-length lists of strings.
        Incomplete trailing records and records whose sentence or label line
        is blank are skipped.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.readlines()

    sentences = []
    labels = []

    # Stop at len(content) - 1 so that content[i + 1] always exists; a stray
    # trailing line would otherwise raise IndexError.
    for i in range(0, len(content) - 1, 4):
        sentence_line = content[i].strip()
        label_line = content[i + 1].strip()

        # A record without a sentence or label (e.g. extra blank lines at the
        # end of the file) cannot be used for training.
        if not sentence_line or not label_line:
            continue

        sentence = re.sub(r'^\d+\t\"(.*)\"$', r'\1', sentence_line)
        sentence = sentence.replace('<e1>', ' [E1] ').replace('</e1>', ' [/E1] ')
        sentence = sentence.replace('<e2>', ' [E2] ').replace('</e2>', ' [/E2] ')
        sentence = re.sub(r'[\W_]+', ' ', sentence.lower())

        sentences.append(sentence)
        labels.append(label_line)

    return sentences, labels

# --- Load the corpus and build TF-IDF features -------------------------------
train_file_path = '/content/TRAIN_FILE.TXT'
sentences, labels = preprocess_data(train_file_path)

# NOTE(review): the vectorizer is fitted on *all* sentences before the
# train/test split below, so IDF statistics include the held-out rows.
vectorizer = TfidfVectorizer(max_features=5000)
X = vectorizer.fit_transform(sentences).toarray()  # dense matrix

# --- Encode string labels as integer ids (alphabetical order) ----------------
unique_labels = sorted(set(labels))
label_to_id = dict(zip(unique_labels, range(len(unique_labels))))
id_to_label = dict(enumerate(unique_labels))
y = np.array([label_to_id[name] for name in labels])

# --- Hold out 20% of the rows, train a random forest, and report -------------
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
classifier = RandomForestClassifier(
    n_estimators=100, class_weight='balanced', random_state=42
)
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)

# Pass every label id explicitly so classes missing from the test split still
# get a row; zero_division=0 reports their undefined scores as 0.
report = classification_report(
    y_test,
    y_pred,
    labels=list(range(len(unique_labels))),
    target_names=unique_labels,
    zero_division=0,
)
print(report)

def predict_relationship(sentence, top_k=3):
    """Predict the most likely relation labels for the two entities in a sentence.

    Relies on the module-level ``vectorizer`` (fitted TF-IDF vectorizer),
    ``classifier`` (fitted classifier exposing ``predict_proba`` and
    ``classes_``) and ``id_to_label`` (integer id -> label string).

    Entity selection:
      * If the sentence contains both ``<e1>`` and ``<e2>`` tags, the text
        inside each tag pair is used as the entity ("Unknown" when the
        closing tag is missing).
      * Otherwise the two whitespace-separated words with the highest TF-IDF
        weight are picked as e1 and e2 (ties keep sentence order); words that
        are not in the vocabulary score 0.  "Unknown" is returned for an
        entity that cannot be picked.

    In both cases the entity words are kept in the text and surrounded by
    markers, mirroring the normalisation done by ``preprocess_data`` so that
    inference features match the training features.

    Args:
        sentence: Raw input sentence, optionally with ``<e1>``/``<e2>`` tags.
        top_k: Number of highest-probability labels to return.

    Returns:
        A dict with keys ``"e1"``, ``"e2"`` (entity strings) and
        ``"predictions"`` (list of ``(label, probability)`` tuples, most
        probable first, at most ``top_k`` entries).
    """
    if '<e1>' in sentence and '<e2>' in sentence:
        # Recover the tagged entity text so the result dict can report it.
        e1_match = re.search(r'<e1>(.*?)</e1>', sentence, flags=re.DOTALL)
        e2_match = re.search(r'<e2>(.*?)</e2>', sentence, flags=re.DOTALL)
        e1_text = e1_match.group(1) if e1_match else "Unknown"
        e2_text = e2_match.group(1) if e2_match else "Unknown"

        normalized_sentence = sentence.replace('<e1>', ' [E1] ').replace('</e1>', ' [/E1] ')
        normalized_sentence = normalized_sentence.replace('<e2>', ' [E2] ').replace('</e2>', ' [/E2] ')
    else:
        words = sentence.split()
        tfidf_scores = vectorizer.transform([sentence]).toarray()[0]

        # Score each distinct word; an out-of-vocabulary word must score 0
        # rather than borrowing the weight of feature column 0.
        word_scores = {}
        for word in words:
            column = vectorizer.vocabulary_.get(word.lower())
            word_scores[word] = tfidf_scores[column] if column is not None else 0.0
        sorted_words = sorted(word_scores.items(), key=lambda x: x[1], reverse=True)

        e1_text = sorted_words[0][0] if len(sorted_words) > 0 else "Unknown"
        e2_text = sorted_words[1][0] if len(sorted_words) > 1 else "Unknown"

        # Wrap the first occurrence of each chosen word in markers.  Working
        # on whole tokens avoids matching inside other words or inside an
        # already inserted marker.  The "done" flags start as True when the
        # entity is only the "Unknown" placeholder.
        e1_done = len(sorted_words) < 1
        e2_done = len(sorted_words) < 2
        marked_words = []
        for word in words:
            if not e1_done and word == e1_text:
                marked_words.append(f'[E1] {word} [/E1]')
                e1_done = True
            elif not e2_done and word == e2_text:
                marked_words.append(f'[E2] {word} [/E2]')
                e2_done = True
            else:
                marked_words.append(word)
        normalized_sentence = ' '.join(marked_words)

    normalized_sentence = re.sub(r'[\W_]+', ' ', normalized_sentence.lower())
    features = vectorizer.transform([normalized_sentence]).toarray()
    probabilities = classifier.predict_proba(features)[0]

    # Reverse first, then slice, so that top_k=0 yields an empty list.
    top_k_indices = probabilities.argsort()[::-1][:top_k]
    # predict_proba columns are ordered like classifier.classes_, which can
    # differ from 0..n-1 when a label id never occurred in the training split.
    top_k_labels = [
        (id_to_label[classifier.classes_[idx]], probabilities[idx])
        for idx in top_k_indices
    ]

    return {
        "e1": e1_text,
        "e2": e2_text,
        "predictions": top_k_labels
    }

# Demo: run the predictor on an untagged sentence and print a short report.
example_sentence = "Cats don't taste sweetness"
result = predict_relationship(example_sentence)

# One print call with a newline separator emits the same lines as individual
# print statements would.
print(
    "Example sentence:",
    example_sentence,
    " ",
    "Extracted Entities:",
    sep="\n",
)
print("e1:", result["e1"])
print("e2:", result["e2"])
print(" ")
print("Predicted Relationships:")
for label, prob in result["predictions"]:
    print("{}: {:.2f}".format(label, prob))


                           precision    recall  f1-score   support

      Cause-Effect(e1,e2)       0.91      0.64      0.75        61
      Cause-Effect(e2,e1)       0.72      0.51      0.60       138
   Component-Whole(e1,e2)       0.37      0.21      0.27       100
   Component-Whole(e2,e1)       0.56      0.09      0.16        97
 Content-Container(e1,e2)       0.70      0.62      0.66        88
 Content-Container(e2,e1)       0.71      0.52      0.60        42
Entity-Destination(e1,e2)       0.67      0.87      0.75       158
Entity-Destination(e2,e1)       0.00      0.00      0.00         0
     Entity-Origin(e1,e2)       0.34      0.72      0.46       107
     Entity-Origin(e2,e1)       0.83      0.17      0.29        29
 Instrument-Agency(e1,e2)       0.33      0.07      0.12        14
 Instrument-Agency(e2,e1)       0.67      0.26      0.37        85
 Member-Collection(e1,e2)       1.00      0.12      0.21        17
 Member-Collection(e2,e1)       0.34      0.45      0.39     