In [None]:
import os
from xml.dom.minidom import parse
import nltk
import pandas as pd
import numpy as np
import gc
from collections import defaultdict

In [144]:
datadir = "../data/train"
rows = []

# Walk the corpus in sorted order so the row order does not depend on the
# arbitrary ordering returned by os.listdir. Entries that are not XML files
# (hidden files, sub-directories, ...) are skipped instead of being handed to
# the XML parser.
for filename in sorted(os.listdir(datadir)):
    if not filename.lower().endswith(".xml"):
        continue

    # parse XML file, obtaining a DOM tree
    xml_tree = parse(os.path.join(datadir, filename))

    # process each sentence in the file
    for xml_sentence in xml_tree.getElementsByTagName("sentence"):
        # a pair needs two entities: skip sentences with fewer than two
        xml_entities = xml_sentence.getElementsByTagName("entity")
        if len(xml_entities) <= 1:
            continue

        # nothing to emit (and nothing worth tokenizing) without pairs
        xml_pairs = xml_sentence.getElementsByTagName("pair")
        if not xml_pairs:
            continue

        sentence_text = xml_sentence.attributes["text"].value  # get sentence text

        # texts of every entity mentioned in the sentence
        entities: set[str] = {
            xml_entity.attributes["text"].value for xml_entity in xml_entities
        }

        # Words in the sentence, excluding entities. This only depends on the
        # sentence, so it is computed once and the same set object is shared
        # (read-only) by every pair of the sentence.
        # NOTE(review): entity texts are compared against single tokens, so a
        # mention that word_tokenize splits into several tokens is presumably
        # not removed here -- confirm whether that is intended.
        words = set(nltk.word_tokenize(sentence_text)) - entities

        # for each pair in the sentence, record whether it is a DDI and its type
        for xml_pair in xml_pairs:
            pair_id = xml_pair.attributes["id"].value  # get pair id
            # ground truth: the "type" attribute is only read for true interactions
            if xml_pair.attributes["ddi"].value == "true":
                interaction_type = xml_pair.attributes["type"].value
            else:
                interaction_type = "null"

            rows.append({"pair_id": pair_id, "interaction": interaction_type, "words": words})

# create pandas dataframe; naming the columns keeps the frame well-formed
# (and the astype below valid) even when no pair was collected
df = pd.DataFrame(rows, columns=["pair_id", "interaction", "words"])
df["interaction"] = df["interaction"].astype("category")

# the list of dicts is no longer needed once the frame exists
del rows
gc.collect()

86478

In [145]:
# Histogram of word occurrences per interaction type:
# counts[word][interaction] = number of rows of df whose word set contains
# `word` and whose label is `interaction`.
counts = defaultdict(lambda: defaultdict(int))
for pair_words, pair_label in zip(df["words"], df["interaction"]):
    for token in pair_words:
        counts[token][pair_label] += 1

# One row per word, one column per interaction type; combinations that were
# never observed are filled with 0.
counts = pd.DataFrame.from_dict(counts, orient="index").fillna(0)
counts

Unnamed: 0,null,int,effect,mechanism,advise
problems,45.0,3.0,1.0,0.0,0.0
of,12789.0,117.0,1087.0,874.0,460.0
treatment,303.0,0.0,26.0,16.0,41.0
typical,2.0,1.0,0.0,0.0,0.0
.,18155.0,217.0,1409.0,964.0,661.0
...,...,...,...,...,...
anhydrase,0.0,0.0,0.0,0.0,1.0
tachypnea,0.0,0.0,0.0,0.0,1.0
anorexia,0.0,0.0,0.0,0.0,1.0
PCP,0.0,0.0,0.0,0.0,1.0


In [146]:
# Marginal probability of each word being present (prob_yes) or absent
# (prob_no) in a row of df: total occurrences over all classes / number of rows.
word_probs = (
    (counts.sum(axis=1) / len(df))
    .to_frame(name="prob_yes")
    .assign(prob_no=lambda frame: 1 - frame["prob_yes"])
)
word_probs.sort_values("prob_yes", ascending=False)

Unnamed: 0,prob_yes,prob_no
.,0.924745,0.075255
",",0.849749,0.150251
of,0.662131,0.337869
and,0.635908,0.364092
the,0.555728,0.444272
...,...,...
non-enzyme,0.000043,0.999957
fertility,0.000043,0.999957
~7,0.000043,0.999957
females,0.000043,0.999957


In [147]:
# Divide each column by the number of pairs (rows of df) labelled with that
# interaction type. The class sizes do not change inside the loop, so they are
# computed once instead of once per column.
class_sizes = df["interaction"].value_counts()
conditional_probabilities = counts.copy()
for inter in counts.columns:
    conditional_probabilities[inter] = counts[inter] / class_sizes[inter]

# At this point, conditional_probabilities is a dataframe the index of which is
# all the words in the corpus, and the columns are the interaction types.
# Each cell contains the conditional probability of the word being present
# given the interaction type. (P(word=yes|interaction))
conditional_probabilities

Unnamed: 0,null,int,effect,mechanism,advise
problems,0.002278,0.012987,0.000690,0.000000,0.000000
of,0.647544,0.506494,0.749655,0.856863,0.659971
treatment,0.015342,0.000000,0.017931,0.015686,0.058824
typical,0.000101,0.004329,0.000000,0.000000,0.000000
.,0.919241,0.939394,0.971724,0.945098,0.948350
...,...,...,...,...,...
anhydrase,0.000000,0.000000,0.000000,0.000000,0.001435
tachypnea,0.000000,0.000000,0.000000,0.000000,0.001435
anorexia,0.000000,0.000000,0.000000,0.000000,0.001435
PCP,0.000000,0.000000,0.000000,0.000000,0.001435


At this point, the matrix holds the following conditional probabilities:
$$
P(W_{i}=1| I=\mathcal{I}_j)
$$
Where $W_{i}$ is a random variable that indicates whether the $i$-th word is present in the document, having $W_{i}=1$ if it is present and $W_{i}=0$ otherwise; $I$ is the random variable that indicates the class of the document, and $\mathcal{I}_j$ is the $j$-th class.

Thus, we can use this matrix to calculate the probability of the co-occurrence of a word $i$ and a class $j$:
$$
P(W_{i}=1, I=\mathcal{I}_j) = P(W_{i}=1| I=\mathcal{I}_j) \cdot P(I=\mathcal{I}_j)
$$
And the same for the negative case:
$$
P(W_{i}=0, I=\mathcal{I}_j) = P(W_{i}=0| I=\mathcal{I}_j) \cdot P(I=\mathcal{I}_j) = (1 - P(W_{i}=1| I=\mathcal{I}_j)) \cdot P(I=\mathcal{I}_j)
$$

In [148]:
# Prior P(interaction): number of rows with each label over the total number of rows
prior_probabilities = df["interaction"].value_counts().div(len(df))
prior_probabilities

interaction
null         0.853205
effect       0.062640
mechanism    0.044064
advise       0.030111
int          0.009979
Name: count, dtype: float64

In [149]:
# Joint probabilities, obtained from the conditionals and the class priors:
#   P(w=yes, interaction) = P(word=yes|interaction) * P(interaction)
#   P(w=no,  interaction) = (1 - P(word=yes|interaction)) * P(interaction)
# The priors are looked up by label in the column order of
# conditional_probabilities, then broadcast across the rows.
class_priors = np.array(
    [prior_probabilities[inter] for inter in conditional_probabilities.columns]
)
probabilities_yes = conditional_probabilities * class_priors
probabilities_no = (1 - conditional_probabilities) * class_priors
probabilities_yes

Unnamed: 0,null,int,effect,mechanism,advise
problems,0.001944,0.000130,0.000043,0.000000,0.000000
of,0.552488,0.005054,0.046959,0.037757,0.019872
treatment,0.013090,0.000000,0.001123,0.000691,0.001771
typical,0.000086,0.000043,0.000000,0.000000,0.000000
.,0.784301,0.009374,0.060869,0.041645,0.028555
...,...,...,...,...,...
anhydrase,0.000000,0.000000,0.000000,0.000000,0.000043
tachypnea,0.000000,0.000000,0.000000,0.000000,0.000043
anorexia,0.000000,0.000000,0.000000,0.000000,0.000043
PCP,0.000000,0.000000,0.000000,0.000000,0.000043


We're going to select as features the words that have the highest expected information gain with respect to the class labels.

To understand this, let's first imagine two categorical variables, $Y$ and $X$, with a joint probability distribution function $P(X, Y)$. 

In this scenario, the entropy of $Y$ is defined as the expected information of its values, and can be calculated as:
$$
H(Y) = -\sum_{i=1}^{k} P(y_i) \log_2 P(y_i)
$$
Where $k$ is the number of classes, and $P(y_i)$ is the probability of class $y_i$.

Now, let's say we perform a measurement of $X$ and obtain a value $x_j$. We can calculate the information gained about $Y$ from this result by computing the reduction in the entropy of $Y$:
$$
IG(Y|X=x_j) = H(Y) - H(Y|X=x_j)
$$
Where $H(Y|X=x_j)$ is the conditional entropy of $Y$ given $X=x_j$, and can be calculated as:
$$
H(Y|X=x_j) = -\sum_{i=1}^{k} P(y_i|x_j) \log_2 P(y_i|x_j)
$$
Where $P(y_i|x_j)$ is the conditional probability of class $y_i$ given the value $x_j$, which can be expressed in terms of the joint probability:
$$
P(y_i|x_j) = \frac{P(x_j, y_i)}{P(x_j)}
$$
Thus, we can express the information gain of the result $x_j$ as:
$$
IG(Y|X=x_j) = H(Y) + \sum_{i=1}^{k} \frac{P(y_i, x_j)}{P(x_j)} \log_2 \left( \frac{P(y_i, x_j)}{P(x_j)} \right)
$$

Finally, let's say that we know that $X$ has $m$ possible values. Then, we can compute the expected information gain of $Y$ given $X$ as:
$$
E_X[IG(Y|X=x)] = IG(Y|X) = \sum_{j=1}^{m} P(x_j) IG(Y|X=x_j)
$$
Where $P(x_j)$ is the marginal probability of $X=x_j$.

Expanding this expression, we can see that the expected information gain of $Y$ given $X$ is:
$$
IG(Y|X) = H(Y) + \sum_{j=1}^{m} \sum_{i=1}^{k} P(y_i, x_j) \log_2 \left( \frac{P(y_i, x_j)}{P(x_j)} \right)
$$




Applying this to our case, we want to find the words $W_i$ that maximize the expected information gain of the class label $I$. Each $W_i$ is a binary random variable, so its number of possible values $m_i$ is equal to 2 (the word is either present or not). Thus, we can express the expected information gain of $I$ given $W_i$ as:
$$
IG(I|W_i) = H(I) + \sum_{j=1}^{k} \left( P(I=\mathcal{I}_j, W_i=1) \log_2 \left( \frac{P(I=\mathcal{I}_j, W_i=1)}{P(W_i=1)} \right) + P(I=\mathcal{I}_j, W_i=0) \log_2 \left( \frac{P(I=\mathcal{I}_j, W_i=0)}{P(W_i=0)} \right)\right)
$$
Where:
- $I$ is the random variable that indicates the class of interaction of the document and $\mathcal{I}_j$ is the $j$-th class.
- $W_i$ is the random variable that indicates whether the $i$-th word is present in the document, having $W_i=1$ if it is present and $W_i=0$ otherwise.

In [150]:
# H(I): entropy (in bits) of the interaction-class priors; classes with zero
# probability are left out of the sum.
i_entropy = sum(
    prior * -np.log2(prior)
    for prior in prior_probabilities
    if prior > 0
)
print(f"Entropy of the interaction classes: {i_entropy:.4f}")

Entropy of the interaction classes: 0.8627


In [153]:
# Expected information gain of the interaction class given each word:
#   IG(I|W) = H(I) + sum_j [ P(I_j, W=1) * log2(P(I_j, W=1) / P(W=1))
#                          + P(I_j, W=0) * log2(P(I_j, W=0) / P(W=0)) ]
vocabulary = probabilities_yes.index
interaction_types = probabilities_yes.columns

# Pull the data into NumPy arrays once, aligned by label on the vocabulary and
# interaction types, instead of doing scalar .loc lookups for every
# (word, interaction) combination inside the loops.
joint_yes_matrix = probabilities_yes.to_numpy()
joint_no_matrix = probabilities_no.loc[vocabulary, interaction_types].to_numpy()
marginal_yes = word_probs.loc[vocabulary, "prob_yes"].to_numpy()
marginal_no = word_probs.loc[vocabulary, "prob_no"].to_numpy()

gain_by_word = {}
for word, joint_yes_row, joint_no_row, prob_yes, prob_no in zip(
    vocabulary, joint_yes_matrix, joint_no_matrix, marginal_yes, marginal_no
):
    s = 0
    # Terms are accumulated class by class (present term, then absent term);
    # zero joint probabilities are skipped so log2 is never evaluated at 0.
    for joint_prob_yes, joint_prob_no in zip(joint_yes_row, joint_no_row):
        if joint_prob_yes > 0:
            s += joint_prob_yes * np.log2(joint_prob_yes / prob_yes)
        if joint_prob_no > 0:
            s += joint_prob_no * np.log2(joint_prob_no / prob_no)
    gain_by_word[word] = i_entropy + s

# One row per word, most informative words first.
word_expected_information_gain = pd.DataFrame.from_dict(
    gain_by_word, orient="index", columns=["expected_information_gain"]
)
word_expected_information_gain = word_expected_information_gain.sort_values(
    by="expected_information_gain", ascending=False
)
word_expected_information_gain

Unnamed: 0,expected_information_gain
should,0.050831
be,0.038628
following,0.021063
plasma,0.020463
interact,0.020253
...,...
drug-metab-olizing,0.000010
generated,0.000010
conclusions,0.000010
sample,0.000010


In [154]:
# Write the ranked table (index = word) to a CSV file under ../data
word_expected_information_gain.to_csv(
    path_or_buf="../data/word_expected_information_gain.csv"
)