# Experiment: Use classic machine learning approach (Random Forest) for relation classification

In [1]:
import pandas as pd
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
pd.set_option("max_colWidth", 300)

  from pandas.core import (


# Read the datasets

In [2]:
train_set = pd.read_json("../../data/subtask3/train_prepro.json")
valid_set = pd.read_json("../../data/subtask3/valid_prepro.json")
test_set = pd.read_json("../../data/subtask3/test_prepro.json")

In [3]:
labels = [
    'Version_of',
    'Developer_of',
    'Citation_of',
    'URL_of',
    'PlugIn_of',
    'Specification_of',
    'Release_of',
    'Extension_of',
    'License_of',    
    'Abbreviation_of',
    'AlternativeName_of',
]

# Feature Selection and Preparation

In [4]:
features_text = [
      # 'context',
       'sentence',
       'subject_text',
       'object_text'
]
features_cat = [
       #'target_label_binary',
       'subject_label',
       'subject_intention',
       'object_label',
       'object_intention',
       # Maybe other labels? 'entities',        
]
features_int = [
    "subject_begin",
    "subject_end",
    "object_begin",
    "object_end",
    'subject_object_distance',
    'max_distance_train',
    'left_to_right', # binary
    'unprobable_relation_direction', # binary
    'n_alternative_candidates_by_subject',
    'n_alternative_candidates_by_subject_filtered',
    'rank_by_candidate_distance', 'rank_by_candidate_distance_filtered'
]

In [5]:
## Not giving better results (kept, maybe helpfull later)
def text_between(rel):
    token = rel["sentence"].split()
    if rel.subject_end < rel.object_begin:
        return " ".join(token[rel.subject_end + 1: rel.object_begin])
    if rel.object_end < rel.subject_begin:
        return " ".join(token[rel.object_end + 1 : rel.subject_begin])
train_set["text_between_ents"] = train_set[["sentence", "subject_begin", "subject_end", "object_begin", "object_end"]].apply(text_between, axis=1)


### Build tokenizer

In [6]:
punctuation = "\.\(\)\[\]\,\:\;\"\'\/_/+"
token_pattern = f'(?u)(?:\\b\\w+\\b|[{punctuation}])'
vect = TfidfVectorizer(min_df=10, ngram_range=(1,2), token_pattern=token_pattern)
vect_subj = TfidfVectorizer(min_df=1, analyzer="word", ngram_range=(1,1), token_pattern=token_pattern, lowercase=False)
vect_obj = TfidfVectorizer(min_df=1, analyzer="word", ngram_range=(1,1), token_pattern=token_pattern, lowercase=False)
hih = vect.fit_transform(train_set.text_between_ents)
hah = vect_subj.fit_transform(train_set.subject_text.unique())
huh = vect_obj.fit_transform(train_set.object_text.unique())
hah.shape, hih.shape, huh.shape

((1150, 1183), (4450, 1275), (636, 808))

In [7]:
enc = OneHotEncoder(sparse_output=False)
enc.fit(train_set[features_cat]);

In [8]:
def get_features(dataset):
    """ function to transform features for classifier
    """
    dataset["rank_by_candidate_distance_filtered"] = dataset.rank_by_candidate_distance_filtered.apply(lambda x: x if x != -1 else 100)#.value_counts()
    X = dataset[features_int].astype(int)
    X_cat = pd.DataFrame(enc.transform(dataset[features_cat]), columns = enc.get_feature_names_out(), index=X.index)
    X = X.join(X_cat)
    ## Text features
    dataset["text_between_ents"] = train_set[["sentence", "subject_begin", "subject_end", "object_begin", "object_end"]].apply(text_between, axis=1)
    X_text_between = vect.transform(dataset["text_between_ents"])
    X_text_between = pd.DataFrame(X_text_between.toarray(), columns=[f"text_between_{w}" for w in vect.get_feature_names_out()])
    #X = X.join(X_text_between)
    X_text_subject = vect_subj.transform(dataset["subject_text"])
    X_text_subject = pd.DataFrame(X_text_subject.toarray(), columns=[f"text_subject_{w}" for w in vect_subj.get_feature_names_out()])
    X = X.join(X_text_subject)
    X_text_object = vect_obj.transform(dataset["object_text"])
    X_text_object = pd.DataFrame(X_text_object.toarray(), columns=[f"text_object_{w}" for w in vect_obj.get_feature_names_out()])
    X = X.join(X_text_object)
    return X.copy()

### Correction method to force for specific labels: one prediction per subject
 * (All but PlugIn_of, Specification_of, Extension_of)

In [9]:
labels_with_one_relation = {
    'Version_of',
    'Developer_of',
    'Citation_of',
    'URL_of',
     #'PlugIn_of',
     #'Specification_of',
     'Release_of',
     #'Extension_of',
     'License_of',
     'Abbreviation_of',
     'AlternativeName_of'}
def force_one(pred):
    if pred.iloc[0].target_label_binary in labels_with_one_relation:
        pred = pred.sort_values("proba", ascending=False)
        pred["prediction_force_one"] = 0
        index_first = pred.index[0]
        pred.loc[index_first, "prediction_force_one"] = 1
    else:
        pred["prediction_force_one"] = (pred.proba >= 0.5).astype(int)
    return pred.set_index("object_begin").prediction_force_one

def force_one_label(dataset, prediction, prediction_proba):
    data = dataset.copy()
    data["prediction"] = prediction
    data["proba"] = prediction_proba[:,1]
    subject_key = ["sentence_id", "subject_begin"]
    pred_force_one = data.groupby(subject_key)[["target_label_binary", "object_begin", "proba"]].apply(force_one)
    data = data.set_index(subject_key + ["object_begin"]).join(pred_force_one).reset_index()
    prediction_label_force_one = data[["target_label_binary", "prediction_force_one"]]\
        .apply(lambda x: x.target_label_binary if x.prediction_force_one == 1 else 'nil', axis=1)
    return data.prediction_force_one, prediction_label_force_one

## Prepare data

In [10]:
X_train = get_features(train_set)
y_train = train_set.label_binary

X_valid = get_features(valid_set)
y_valid_label_pred = valid_set.target_label_binary.to_numpy()
y_valid = valid_set.label_binary
y_valid_label = valid_set.relation.to_numpy()

X_test = get_features(test_set)
y_test_label_pred = test_set.target_label_binary.to_numpy()
y_test = test_set.label_binary
y_test_label = test_set.relation.to_numpy()

len(y_valid_label), len(y_test_label)

(1446, 960)

# Train Random Forest (use 5 runs)

In [11]:
%%time
model = RandomForestClassifier(
    #n_estimators=1000,
    #max_depth=10,
    #min_samples_split=10, #2
    #min_samples_leaf=10, #1 
)
eval_results = []
for run in range(5):
    model.fit(X_train, y_train)
    y_valid_pred = model.predict(X_valid)
    y_valid_pred_label = ["nil" if p == 0 else y_valid_label_pred[idx]  for idx, p in enumerate(y_valid_pred)] 
    y_valid_pred_proba = model.predict_proba(X_valid)
    y_valid_pred_force_one, y_valid_pred_label_force_one = force_one_label(valid_set, y_valid_pred, y_valid_pred_proba)
    eval_result = pd.DataFrame(classification_report(y_valid_label, y_valid_pred_label_force_one, zero_division=0., labels=labels, output_dict=True)).T
    eval_result = eval_result.reset_index().rename(columns=dict(index="label"))
    eval_results.append(eval_result) 
    y_test_pred = model.predict(X_test)
    y_test_pred_proba = model.predict_proba(X_test)
    y_test_pred_label = ["nil" if p == 0 else y_test_label_pred[idx]  for idx, p in enumerate(y_test_pred)]
    y_test_pred_force_one, y_test_pred_label_force_one = force_one_label(test_set, y_test_pred, y_test_pred_proba)
eval_results = pd.concat(eval_results)
eval_results = eval_results.groupby("label").describe()


CPU times: total: 30.9 s
Wall time: 31.6 s


# Evaluate
## Validation Set (5 runs)

In [12]:
show = [
    ("precision", "mean"),
    ("precision", "std"),
    ("recall", "mean"),
    ("recall", "std"),
    ("f1-score", "mean"),
    ("f1-score", "std"),
    ("support", "mean"),
]
eval_results[show]

Unnamed: 0_level_0,precision,precision,recall,recall,f1-score,f1-score,support
Unnamed: 0_level_1,mean,std,mean,std,mean,std,mean
label,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
Abbreviation_of,1.0,0.0,1.0,0.0,1.0,0.0,6.0
AlternativeName_of,1.0,0.0,1.0,0.0,1.0,0.0,3.0
Citation_of,0.933333,0.0,0.913043,0.0,0.923077,0.0,92.0
Developer_of,0.964706,1.241267e-16,0.942529,0.0,0.953488,1.241267e-16,174.0
Extension_of,1.0,0.0,1.0,0.0,1.0,0.0,13.0
License_of,0.9,0.0,0.9,0.0,0.9,0.0,10.0
PlugIn_of,0.831378,0.01491982,0.95,0.019562,0.886654,0.01401586,28.0
Release_of,1.0,0.0,1.0,0.0,1.0,0.0,16.0
Specification_of,1.0,0.0,0.488889,0.024845,0.65641,0.02293403,18.0
URL_of,0.918033,0.01159191,0.918033,0.011592,0.918033,0.01159191,61.0


## Test Set

In [13]:
pd.DataFrame(classification_report(y_test_label, y_test_pred_label_force_one, zero_division=0., labels=labels, output_dict=True)).T

Unnamed: 0,precision,recall,f1-score,support
Version_of,0.981651,0.981651,0.981651,109.0
Developer_of,0.939759,0.939759,0.939759,83.0
Citation_of,0.977778,0.977778,0.977778,45.0
URL_of,0.897436,0.897436,0.897436,39.0
PlugIn_of,0.705882,0.923077,0.8,13.0
Specification_of,1.0,0.375,0.545455,8.0
Release_of,1.0,1.0,1.0,7.0
Extension_of,1.0,0.909091,0.952381,11.0
License_of,0.727273,0.888889,0.8,9.0
Abbreviation_of,1.0,1.0,1.0,5.0


### Best performance: 0.927

## Look into wrong predictions (on test set)

In [14]:
ha = valid_set.copy()
ha["pred"] = y_valid_pred_label
cols = ["sentence", "subject_text", "target_label_binary", "object_text", "relation", "pred"]
ha = ha[list(y_valid_label != y_valid_pred_label_force_one)][cols]#.sample(3)
len(ha)

61

In [25]:
ha.target_label_binary.value_counts()

target_label_binary
Developer_of        16
Citation_of         14
Specification_of    10
URL_of              10
PlugIn_of            7
License_of           2
Name: count, dtype: int64

In [26]:
#pd.Series(y_test).value_counts(), pd.Series(y_test_pred).value_counts(), y_test_pred_force_one.value_counts()

In [27]:
ha

Unnamed: 0,sentence,subject_text,target_label_binary,object_text,relation,pred
9,PyPhi is open - source and licensed under the GPLv 3 ; the source code is hosted on GitHub at https://github.com/wmayner/pyphi .,PyPhi,PlugIn_of,GitHub,nil,PlugIn_of
16,CoXpress is written in the native R language and has been fully tested on both windows and linux .,CoXpress,PlugIn_of,R,nil,PlugIn_of
30,"Mindboggle is a freely downloadable , open source software package written in Matlab ( version 6 , release 13 , with the Image Processing Toolbox , The Mathworks Inc . , USA ) and has been tested on different models of desktop and laptop computers running different distributions of Linux , as we...",The Mathworks Inc .,Developer_of,Matlab,Developer_of,nil
39,"Mindboggle is a freely downloadable , open source software package written in Matlab ( version 6 , release 13 , with the Image Processing Toolbox , The Mathworks Inc . , USA ) and has been tested on different models of desktop and laptop computers running different distributions of Linux , as we...",The Mathworks Inc .,Developer_of,Image Processing Toolbox,nil,Developer_of
54,Availability : FunciSNP is available from Bioconductor ( bioconductor.org ) .,Bioconductor,PlugIn_of,FunciSNP,nil,PlugIn_of
94,More details : Software name : CBFA plugin for Optflux Project home page : http://www.optflux.org/ Methods details and application tutorial : http://www.optflux.org/cbfa Operating system ( s ) : Platform independent Programming languages : Java Other requirements : Java JRE 1.7 .x ( for Mac OS u...,CBFA,PlugIn_of,Optflux,nil,PlugIn_of
96,More details : Software name : CBFA plugin for Optflux Project home page : http://www.optflux.org/ Methods details and application tutorial : http://www.optflux.org/cbfa Operating system ( s ) : Platform independent Programming languages : Java Other requirements : Java JRE 1.7 .x ( for Mac OS u...,http://www.optflux.org/cbfa,URL_of,CBFA,URL_of,nil
106,More details : Software name : CBFA plugin for Optflux Project home page : http://www.optflux.org/ Methods details and application tutorial : http://www.optflux.org/cbfa Operating system ( s ) : Platform independent Programming languages : Java Other requirements : Java JRE 1.7 .x ( for Mac OS u...,http://www.optflux.org/cbfa,URL_of,Optflux,nil,nil
157,""" • Project name : SSPACE - LongRead • Project home page : http://www.baseclear.com/bioinformatics-tools/ • Operating systems : All major Linux platforms • Programming languages : Perl , C + + ( the latter is required for BLASR , see below ) • Other requirements : BLASR for the alignment of long...",BaseTools,License_of,SSPACE - LongRead,License_of,nil
172,""" • Project name : SSPACE - LongRead • Project home page : http://www.baseclear.com/bioinformatics-tools/ • Operating systems : All major Linux platforms • Programming languages : Perl , C + + ( the latter is required for BLASR , see below ) • Other requirements : BLASR for the alignment of long...",BaseTools,License_of,BLASR,nil,nil
