In [25]:
from snorkel.preprocess.nlp import SpacyPreprocessor
from snorkel.labeling import PandasLFApplier,filter_unlabeled_dataframe,LFAnalysis ,labeling_function
from snorkel.labeling.model import MajorityClassVoter,MajorityLabelVoter,RandomVoter ,LabelModel


from snorkel.analysis import metric_score , get_label_buckets

from snorkel.utils import probs_to_preds

from sklearn.model_selection import train_test_split

import pandas as pd
import re
import os
from collections import OrderedDict

#importing self-defined helped modules
from snorkel_preprocessing_example import make_source_target_preprocessor,make_text_between_preprocessor

# 1. Load the data

In [2]:
#'https://raw.githubusercontent.com/covid19kg/covid19kg/master/supplement/terminology.csv'
url = 'https://raw.githubusercontent.com/CoronaWhy/task-vt/kaleidoescape_kg/immunology_kg/relations/covid19_frauenhofer_annotations.csv'
pybel_pd = pd.read_csv(url)
pybel_pd.head(7)

Unnamed: 0.1,Unnamed: 0,text,source,relation,target,link,pmc_id,doi_id
0,0,"While blocking TPC2 activity by tetrandrine, a...","{'(+)-Tetrandrine': {'namespace': 'chebi', 'na...",negativeCorrelation,"{'Tpcn2': {'namespace': 'mgi', 'name': 'Tpcn2'...","{'annotations': {}, 'citation': {'authors': ['...",32221306.0,
1,1,Chemoinformatics searches yielded 15 approved ...,"{'(S)-verapamil': {'namespace': 'chebi', 'name...",negativeCorrelation,"{'hypertension': {'namespace': 'doid', 'name':...","{'annotations': {}, 'citation': {'db': 'DOI', ...",,https://doi.org/10.1101/2020.03.22.002386
2,2,Thyroid stimulating hormone and free triiodoth...,"{""3,3',5'-triiodothyronine"": {'namespace': 'ch...",negativeCorrelation,"{'COVID-19': {'namespace': 'doid', 'name': 'CO...","{'annotations': {'mesh': {'D044967': True}}, '...",32217556.0,
3,3,"Based on these results, we performed virtual d...","{""4'-epidoxorubicin"": {'namespace': 'chebi', '...",decreases,"{'3.4.22.69': {'namespace': 'eccode', 'name': ...","{'annotations': {}, 'citation': {'authors': ['...",32173287.0,
4,4,Doctors can also use a clinically approved bil...,{'4-methylumbelliferone': {'namespace': 'chebi...,decreases,"{'HAS2': {'namespace': 'hgnc', 'name': 'HAS2',...","{'annotations': {'mesh': {'D008168': True}}, '...",32205856.0,
5,5,"Since Vitamin B3 is highly lung protective, it...",{'4-methylumbelliferone': {'namespace': 'chebi...,decreases,"{'HAS2': {'namespace': 'hgnc', 'name': 'HAS2',...","{'annotations': {}, 'citation': {'authors': ['...",32205856.0,
6,6,Doctors can also use a clinically approved bil...,{'4-methylumbelliferone': {'namespace': 'chebi...,decreases,"{'inflammatory response': {'namespace': 'go', ...","{'annotations': {'mesh': {'D008168': True}}, '...",32205856.0,


In [3]:
#list all types of relations
relation_categories = pybel_pd['relation'].unique()
relation_categories

array(['negativeCorrelation', 'decreases', 'regulates', 'increases',
       'positiveCorrelation', 'association', 'isA', 'biomarkerFor',
       'prognosticBiomarkerFor', 'causesNoChange'], dtype=object)

# 2. Snorkel Example

For the purpose of this example we'll only focus on rows with 'negativeCorrelation' and 'positiveCorrelation' as their relations.

In [4]:
example_data = pybel_pd[(pybel_pd['relation']=='negativeCorrelation') | (pybel_pd['relation']=='positiveCorrelation') ]
example_data['relation'] = example_data['relation']=='negativeCorrelation'
example_data.reset_index(inplace=True,drop=True)
example_data.drop('Unnamed: 0',inplace=True,axis=1)
example_data.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


Unnamed: 0,text,source,relation,target,link,pmc_id,doi_id
0,"While blocking TPC2 activity by tetrandrine, a...","{'(+)-Tetrandrine': {'namespace': 'chebi', 'na...",True,"{'Tpcn2': {'namespace': 'mgi', 'name': 'Tpcn2'...","{'annotations': {}, 'citation': {'authors': ['...",32221306.0,
1,Chemoinformatics searches yielded 15 approved ...,"{'(S)-verapamil': {'namespace': 'chebi', 'name...",True,"{'hypertension': {'namespace': 'doid', 'name':...","{'annotations': {}, 'citation': {'db': 'DOI', ...",,https://doi.org/10.1101/2020.03.22.002386
2,Thyroid stimulating hormone and free triiodoth...,"{""3,3',5'-triiodothyronine"": {'namespace': 'ch...",True,"{'COVID-19': {'namespace': 'doid', 'name': 'CO...","{'annotations': {'mesh': {'D044967': True}}, '...",32217556.0,
3,The administration of methylprednisolone appea...,{'6-methylprednisolone': {'namespace': 'chebi'...,True,"{'Death': {'namespace': 'mesh', 'name': 'Death...","{'annotations': {'doid': {'11394': True}}, 'ci...",32167524.0,
4,Adverse reactions of IFN-α mainly include low-...,"{'Interferon alfa-2a': {'namespace': 'chebi', ...",False,"{'Low-grade fever': {'namespace': 'hp', 'name'...","{'annotations': {}, 'citation': {'authors': ['...",32166483.0,


### 2.1 Split the data into training and testing 

Ideally should have training , validation and testing set. Also, here , I'm using a fixed testing period but k-fold cross validation techniques are a more robust way of determining the accuracy of the generated labels.

In [50]:
df_train,df_test,y_train,y_test = train_test_split(example_data[['text']],
                                                   example_data[['relation']],
                                                   test_size=0.20,
                                                   shuffle=False,random_state=1)

### 2.2 Reading sentences to understand syntactic differences betweem negative and positive correlation sentences

The utility of snorkel is that it allows you to create multiple labelling functions which (try to) mimic the rules that a human annotator of data would apply while deciding how to label unlabelled data. For instance a human annotator looking to identify negative correlation is sentences will follow the following rules:

##### A. Positive rules: rules which tell you what a sentences with negative correlation looks like

1) does the sentence contain the reduction related words like words 'decreased','reduced','lowered'

2) does the sentence contain the expression 'negatively correlated' or 'negative correlation' or 'negatively related' or 'inversely related' or 'inverse relation'

3) does the syntax follow the structure 'increase in x is associated with a decrease in y' (and vice versa)

4) does the sentence contain the expression 'negative effect'

5) does the sentence contain the expression 'move in opposite directions'

##### B. Negative rules: rules which tell you what a sentences without negative correlation looks like

1) does the sentence contain the increase related words like words 'increased','improved'

2) does the sentence contain the expression 'positively correlated' or 'positive correlation' or 'positively related'

3) does the syntax follow the structure 'increase in x is associated with a increase in y' (and vice versa)

4) does the sentence contain the expression 'positive effect'

5) does the sentence contain the expression 'move in the same direction'

These rules can be coded using snorkel. Importantly it requires both positive and negative rules. 

#### 2.2.1 Examining Negative correlation sentences to understand their syntactic structure and then define labelling functions accordingly

In [51]:
neg_correl_df = example_data[example_data['relation']==1]
neg_correl_df.reset_index(inplace=True,drop=True)
neg_correl_df.head(8)

Unnamed: 0,text,source,relation,target,link,pmc_id,doi_id
0,"While blocking TPC2 activity by tetrandrine, a...","{'(+)-Tetrandrine': {'namespace': 'chebi', 'na...",True,"{'Tpcn2': {'namespace': 'mgi', 'name': 'Tpcn2'...","{'annotations': {}, 'citation': {'authors': ['...",32221306.0,
1,Chemoinformatics searches yielded 15 approved ...,"{'(S)-verapamil': {'namespace': 'chebi', 'name...",True,"{'hypertension': {'namespace': 'doid', 'name':...","{'annotations': {}, 'citation': {'db': 'DOI', ...",,https://doi.org/10.1101/2020.03.22.002386
2,Thyroid stimulating hormone and free triiodoth...,"{""3,3',5'-triiodothyronine"": {'namespace': 'ch...",True,"{'COVID-19': {'namespace': 'doid', 'name': 'CO...","{'annotations': {'mesh': {'D044967': True}}, '...",32217556.0,
3,The administration of methylprednisolone appea...,{'6-methylprednisolone': {'namespace': 'chebi'...,True,"{'Death': {'namespace': 'mesh', 'name': 'Death...","{'annotations': {'doid': {'11394': True}}, 'ci...",32167524.0,
4,"In our opinion, during the COVID-19 pandemic, ...",{'adrenergic antagonist': {'namespace': 'chebi...,True,"{'COVID-19': {'namespace': 'doid', 'name': 'CO...","{'annotations': {}, 'citation': {'authors': ['...",32220710.0,
5,"Consistent with previous reports, 20mM NH4Cl a...","{'ammonium chloride': {'namespace': 'chebi', '...",True,"{'G protein, vesicular stomatitis virus': {'na...","{'annotations': {}, 'citation': {'authors': ['...",32221306.0,
6,If the latter percentage would be found to be ...,{'angiotensin receptor antagonist': {'namespac...,True,"{'COVID-19': {'namespace': 'doid', 'name': 'CO...","{'annotations': {}, 'citation': {'authors': ['...",32129518.0,
7,"Consistent with previous reports, 20mM NH4Cl a...","{'bafilomycin A1': {'namespace': 'chebi', 'nam...",True,"{'G protein, vesicular stomatitis virus': {'na...","{'annotations': {}, 'citation': {'authors': ['...",32221306.0,


In [52]:
neg_correl_df['text'][0]

'While blocking TPC2 activity by tetrandrine, an inhibitor for TPC237, decreased entry of SARS-CoV-2 S pseudovirions (Fig. 3f), treatment of cells with 130, a TRPML1 inhibitor, had no effect (Supplementary Fig. 1), indicating that TPC2, not TRPML1, is important for SARS-CoV-2 entry.'

#### 2.2.2 Examining positive correlation sentences to understand their syntactic structure and then define labelling funtions accordingly

In [53]:
positive_relation_df = example_data[example_data['relation']==0]
positive_relation_df.reset_index(inplace=True,drop=True)
positive_relation_df.head(8)

Unnamed: 0,text,source,relation,target,link,pmc_id,doi_id
0,Adverse reactions of IFN-α mainly include low-...,"{'Interferon alfa-2a': {'namespace': 'chebi', ...",False,"{'Low-grade fever': {'namespace': 'hp', 'name'...","{'annotations': {}, 'citation': {'authors': ['...",32166483.0,
1,Adverse reactions of IFN-α mainly include low-...,"{'Interferon alfa-2a': {'namespace': 'chebi', ...",False,"{'influenza': {'namespace': 'doid', 'name': 'i...","{'annotations': {}, 'citation': {'authors': ['...",32166483.0,
2,This may be accounted for by two complementary...,"{'angiotensin II': {'namespace': 'chebi', 'nam...",False,"{'COVID-19': {'namespace': 'doid', 'name': 'CO...","{'annotations': {}, 'citation': {'authors': ['...",32129518.0,
3,ACE2 can also antagonize cardiac fibrosis and ...,"{'angiotensin II': {'namespace': 'chebi', 'nam...",False,{'Ventricular Remodeling': {'namespace': 'mesh...,"{'annotations': {}, 'citation': {'authors': ['...",32221983.0,
4,ACE2 can also antagonize cardiac fibrosis and ...,"{'angiotensin II': {'namespace': 'chebi', 'nam...",False,"{'Myocardial fibrosis': {'namespace': 'hp', 'n...","{'annotations': {}, 'citation': {'authors': ['...",32221983.0,
5,The existence of significantly increased fibri...,"{'Fibrin': {'namespace': 'chebi', 'name': 'Fib...",False,"{'Hyperfibrinolysis': {'namespace': 'hp', 'nam...","{'annotations': {}, 'citation': {'authors': ['...",32216698.0,
6,This opinion is supported by the presence of h...,"{'Fibrin': {'namespace': 'chebi', 'name': 'Fib...",False,"{'Hemorrhage': {'namespace': 'mesh', 'name': '...","{'annotations': {}, 'citation': {'authors': ['...",32216698.0,
7,"In the influenza virus model, it was reported ...","{'chloroquine': {'namespace': 'chebi', 'name':...",False,{'dendritic cell antigen processing and presen...,"{'annotations': {'mesh': {'D007251': True}}, '...",32171740.0,


In [54]:
positive_relation_df['text'][0]

'Adverse reactions of IFN-α mainly include low-grade fever and flu-like symptoms (both in children with intramuscularly injection) [11].'

### 2.3 Source-Target dictionary

A simple but clean rule for identifying negative correlation sentences would be if negative tokens occured in the words between the source and the target. So, a source-target dictonary is created for some of the examples (in the final pipeline the source target dictonary will be obtained from the spacy pipeline).

In [55]:
example_sources = ['tetrandrine','triiodothyronine','methylprednisolone','IFN-α','angiotensin','ACE2','fibrin','hemorrhage','chloroquine']

example_targets = ['TPC2','recovered','death','fever','vasodilator','Ang','COVID-19','fibrinolysis','dendritic'] #low-grade fever

example_source_target_dict = list(OrderedDict.fromkeys(zip(example_sources,example_targets)))
example_source_target_dict

[('tetrandrine', 'TPC2'),
 ('triiodothyronine', 'recovered'),
 ('methylprednisolone', 'death'),
 ('IFN-α', 'fever'),
 ('angiotensin', 'vasodilator'),
 ('ACE2', 'Ang'),
 ('fibrin', 'COVID-19'),
 ('hemorrhage', 'fibrinolysis'),
 ('chloroquine', 'dendritic')]

### 2.4 Labeling functions for RE

In [56]:
spacy = SpacyPreprocessor(text_field="text", doc_field="doc", memoize=True)

ABSTAIN = -1
NOT_FOUND = 0
FOUND = 1

#### 2.4.1 Preprocessing

In [57]:
get_source_target = make_source_target_preprocessor(spacy, example_sources, example_targets)

candidate = example_data.loc[0]
candidate_with_function_applied = get_source_target(candidate) 

print("Sentence: ", candidate["text"],'\n',)
print("Source-target pair: ", candidate_with_function_applied.source_target)

Sentence:  While blocking TPC2 activity by tetrandrine, an inhibitor for TPC237, decreased entry of SARS-CoV-2 S pseudovirions (Fig. 3f), treatment of cells with 130, a TRPML1 inhibitor, had no effect (Supplementary Fig. 1), indicating that TPC2, not TRPML1, is important for SARS-CoV-2 entry. 

Source-target pair:  ('tetrandrine', 'TPC2')


In [58]:
get_text_between = make_text_between_preprocessor(spacy, example_sources, example_targets)

############ function example ###############################
candidate = example_data.loc[2]

candidate_with_function_applied = get_text_between(candidate)

print("Sentence: ", candidate["text"],'\n')
print("Text Between: ", candidate_with_function_applied.text_between,'\n')
print("Text to the left: ", candidate_with_function_applied.text_to_source_left,'\n')

Sentence:  Thyroid stimulating hormone and free triiodothyronine concentrations were significantly lower in deceased patients (0.7 mIU/mL and 2.8 pmol/L) than in recovered patients (1.4 mIU/mL and 4.3 pmol/L). 

Text Between:  triiodothyronine concentrations were significantly lower in deceased patients (0.7 mIU/mL and 2.8 pmol/L) than in 

Text to the left:  Thyroid stimulating hormone and free 



#### 2.4.2 Defining the labelling functions (lf)

For the final labelling model to work, at least 3 rules are needed.

##### A. Positive rules: rules which tell you what a sentences with negative correlation looks like
    
1) does the sentence contain the reduction related words like words 'decreased','reduced','lowered'

In [59]:
reduction_tokens = {'decreased',
                            'lower',
                            'reduced',
}

In [60]:
@labeling_function(pre=[spacy],resources=dict(reduction_tokens=reduction_tokens))
def contains_reduction_tokens(x,reduction_tokens):
    
    tokens = [str(token) for token in x.doc]
    return FOUND if len(reduction_tokens.intersection(set(tokens))) > 0 else ABSTAIN

#positive rule - version 2
@labeling_function(pre=[spacy,get_text_between],resources=dict(reduction_tokens=reduction_tokens))
def contains_reduction_tokens_text_between(x,reduction_tokens):
    relation_text = x.text_between
    relation_text_tokens = [str(token) for token in relation_text]
    return FOUND if len(reduction_tokens.intersection(set(relation_text_tokens))) > 0 else ABSTAIN

2) does the sentence contain the expression 'negatively correlated' or 'negative correlation' or 'negatively related' or 'inversely related' or 'inverse relation'

In [61]:
negative_correlation_regex_1 = 'negative correlation'
negative_correlation_regex_2 = 'negatively correlated'
negative_correlation_regex_3 = 'negatively related'
negative_correlation_regex_4 = 'inversely related'
negative_correlation_regex_5 = 'inverse relation'
negative_correlation_regex_6 = 'negative effect'
negative_correlation_regex_7 = 'move in opposite directions'

@labeling_function()
def contains_negative_corrrelation_regex(x):
    if re.search(negative_correlation_regex_1, x.text, flags=re.I):
        return FOUND
    elif re.search(negative_correlation_regex_2, x.text, flags=re.I):
        return FOUND
    elif re.search(negative_correlation_regex_3, x.text, flags=re.I):
        return FOUND
    elif re.search(negative_correlation_regex_4, x.text, flags=re.I):
        return FOUND
    elif re.search(negative_correlation_regex_5, x.text, flags=re.I):
        return FOUND
    elif re.search(negative_correlation_regex_6, x.text, flags=re.I):
        return FOUND
    elif re.search(negative_correlation_regex_7, x.text, flags=re.I):
        return FOUND

    else: 
        return ABSTAIN
    

3) does the syntax follow the structure 'increase in x is associated with a decrease in y' (and vice versa)

In [62]:
@labeling_function(pre=[spacy,get_text_between])
def contains_increase_decrease_pattern(x):
    if ('increase' in [str(token) for token in x.text_to_source_left]) & ('decrease' in [str(token) for token in x.text_between]):
        return FOUND
    elif ('decrease' in [str(token) for token in x.text_to_source_left]) & ('increase' in [str(token) for token in x.text_between]):
        return FOUND
    else:
        return ABSTAIN

##### B. Negative rules: rules which tell you what a sentences without negative correlation looks like

1) does the sentence contain the increase related words like words 'increased','higher'

In [63]:
increase_tokens = {'increased',
                            'higher',
}

In [64]:
@labeling_function(pre=[spacy],resources=dict(increase_tokens=increase_tokens))
def contains_increase_tokens(x,increase_tokens):
    tokens = [str(token) for token in x.doc]
    return NOT_FOUND if len(increase_tokens.intersection(set(tokens))) > 0 else ABSTAIN


@labeling_function(pre=[spacy,get_text_between],resources=dict(increase_tokens=increase_tokens))
def contains_increase_tokens_text_between(x, increase_tokens):
    relation_text = x.text_between
    relation_text_tokens = [str(token) for token in relation_text]
    return NOT_FOUND if len(increase_tokens.intersection(set(relation_text_tokens))) > 0 else ABSTAIN

2) does the sentence contain the expression 'positively correlated' or 'positive correlation' or 'positively related' or 'positive effect' or 'move in the same direction'

In [65]:
#regex
positive_correlation_regex_1 = 'positive correlation'
positive_correlation_regex_2 = 'positively correlated'
positive_correlation_regex_3 = 'positively related'
positive_correlation_regex_4 = 'positive effect'
positive_correlation_regex_5 = 'move in the same direction'

@labeling_function()
def contains_positive_corrrelation_regex(x):
    if re.search(positive_correlation_regex_1, x.text, flags=re.I):
        return FOUND
    elif re.search(positive_correlation_regex_2, x.text, flags=re.I):
        return FOUND
    elif re.search(positive_correlation_regex_3, x.text, flags=re.I):
        return FOUND
    elif re.search(positive_correlation_regex_4, x.text, flags=re.I):
        return FOUND
    elif re.search(positive_correlation_regex_5, x.text, flags=re.I):
        return FOUND    
    else:
        return ABSTAIN

3) does the syntax follow the structure 'increase in x is associated with a increase in y' (and vice versa)

In [66]:
@labeling_function(pre=[spacy,get_text_between])
def contains_increase_increase_pattern(x):
    if ('increase' in [str(token) for token in x.text_to_source_left]) & ('increase' in [str(token) for token in x.text_between]):
        return FOUND
    elif ('decrease' in [str(token) for token in x.text_to_source_left]) & ('decrease' in [str(token) for token in x.text_between]):
        return FOUND
    else:
        return ABSTAIN

### 2.5 Creating all the labels for the different rules

In [67]:
label_functions_list = [contains_reduction_tokens,
                        contains_reduction_tokens_text_between,
                        contains_negative_corrrelation_regex,
                        contains_increase_decrease_pattern,
                        contains_increase_tokens,
                        contains_increase_tokens_text_between,
                        contains_positive_corrrelation_regex,
                        contains_increase_increase_pattern
                       ]

applier = PandasLFApplier(label_functions_list)

label_matrix_train = applier.apply(df_train)

  from pandas import Panel
100%|█████████████████████████████████████████████████████████████████████████████| 2227/2227 [00:10<00:00, 213.19it/s]


### 2.6 Examining the quality of the labels

In [68]:
#in the absence of a benchmark to compare against
LFAnalysis(L=label_matrix_train, lfs=label_functions_list).lf_summary()

Unnamed: 0,j,Polarity,Coverage,Overlaps,Conflicts
contains_reduction_tokens,0,[1],0.057027,0.019308,0.016165
contains_reduction_tokens_text_between,1,[1],0.004041,0.004041,0.000898
contains_negative_corrrelation_regex,2,[1],0.008532,0.0,0.0
contains_increase_decrease_pattern,3,[],0.0,0.0,0.0
contains_increase_tokens,4,[0],0.13022,0.016165,0.016165
contains_increase_tokens_text_between,5,[],0.0,0.0,0.0
contains_positive_corrrelation_regex,6,[1],0.002245,0.0,0.0
contains_increase_increase_pattern,7,[],0.0,0.0,0.0


In [69]:
#examining the quality of the labels in the presence of a benchmark to compare against
LFAnalysis(L=label_matrix_train, lfs=label_functions_list).lf_summary(y_train.values)

Unnamed: 0,j,Polarity,Coverage,Overlaps,Conflicts,Correct,Incorrect,Emp. Acc.
contains_reduction_tokens,0,[1],0.057027,0.019308,0.016165,69,58,0.543307
contains_reduction_tokens_text_between,1,[1],0.004041,0.004041,0.000898,8,1,0.888889
contains_negative_corrrelation_regex,2,[1],0.008532,0.0,0.0,18,1,0.947368
contains_increase_decrease_pattern,3,[],0.0,0.0,0.0,0,0,0.0
contains_increase_tokens,4,[0],0.13022,0.016165,0.016165,263,27,0.906897
contains_increase_tokens_text_between,5,[],0.0,0.0,0.0,0,0,0.0
contains_positive_corrrelation_regex,6,[1],0.002245,0.0,0.0,0,5,0.0
contains_increase_increase_pattern,7,[],0.0,0.0,0.0,0,0,0.0


In [70]:
#examine which sentences were picked up as showing negative correlation by each label function
df_train.iloc[label_matrix_train[:, 0] == FOUND]

Unnamed: 0,text
0,"While blocking TPC2 activity by tetrandrine, a..."
2,Thyroid stimulating hormone and free triiodoth...
3,The administration of methylprednisolone appea...
7,"Consistent with previous reports, 20mM NH4Cl a..."
12,"Consistent with previous reports, 20mM NH4Cl a..."
...,...
1644,Actual bicarbonate and total carbon dioxide co...
1655,Albumin concentrations were significantly lowe...
1657,"Moreover, the frequencies of regulatory T cell..."
1658,The reduced expressions of interferon-γ (IFN-γ...


### 2.7 Predict the final label

Different models can be used to create the final model that aggrateges the different label functions to perdict the final lebel.

In [71]:
#testing data
label_matrix_test = applier.apply(df_test)

  from pandas import Panel
100%|███████████████████████████████████████████████████████████████████████████████| 557/557 [00:01<00:00, 282.64it/s]


In [72]:
#Model 1 : majority model (mm)
majority_model = MajorityLabelVoter()

#training data
mm_preds_class_train = majority_model.predict(L=label_matrix_train)
mm_preds_proba_train = majority_model.predict_proba(L=label_matrix_train)

#testing data
mm_preds_class_test = majority_model.predict(L=label_matrix_test)
mm_preds_proba_test = majority_model.predict_proba(L=label_matrix_test)

In [73]:
mm_preds_class_train # only the 1s and 0s are labels. T-1s are abstains i.e. unlabeled data points

array([ 1, -1,  1, ..., -1, -1, -1])

In [74]:
mm_preds_proba_train

array([[0. , 1. ],
       [0.5, 0.5],
       [0. , 1. ],
       ...,
       [0.5, 0.5],
       [0.5, 0.5],
       [0.5, 0.5]])

In [75]:
# Model 2:label model (lm)

#call the model
label_model = LabelModel(cardinality=2, verbose=True)

#fit the model
num_epochs = 1000
log_frequency = 100
random_seed = 1
label_model.fit(L_train=label_matrix_train, n_epochs=num_epochs, log_freq=log_frequency, seed=random_seed)

#generate lables for training data
lm_preds_proba_train = label_model.predict_proba(label_matrix_train)
lm_preds_class_train = probs_to_preds(lm_preds_proba_train)

#generate labels for testing data
lm_preds_proba_test = label_model.predict_proba(label_matrix_test)
lm_preds_class_test = probs_to_preds(lm_preds_proba_test)

In [76]:
# Model 3 : Random Voter (rv)

random_voter = RandomVoter()

#training data
rv_preds_class_train = random_voter.predict(L=label_matrix_train)
rv_preds_proba_train = random_voter.predict_proba(L=label_matrix_train)

#testing data
rv_preds_class_test = random_voter.predict(L=label_matrix_test)
rv_preds_proba_test = random_voter.predict_proba(L=label_matrix_test)

### 2.8 Comparing different models

In [77]:
all_models = {'Majority Model':majority_model,
              'Label Model':label_model,
              'Random Voter Model':random_voter}

for model_name,model in all_models.items():
    
    #accuracy
    train_acc = model.score(L=label_matrix_train, Y=y_train, tie_break_policy="random")["accuracy"]
    test_acc = model.score(L=label_matrix_test, Y=y_test, tie_break_policy="random")["accuracy"]
    
    #auc
    train_auc = metric_score(y_train, probs=model.predict_proba(L=label_matrix_train), metric='roc_auc')
    test_auc = metric_score(y_test, probs=model.predict_proba(L=label_matrix_test), metric='roc_auc')
    
    print(f'{model_name}','\n',
          'Accuracy:','\n','train->',train_acc,'\n','test->',test_acc,'\n',
          'AUC:','\n','train->',train_auc,'\n','test->',test_auc,'\n')
    
    
    

Majority Model 
 Accuracy: 
 train-> 0.5635383924562192 
 test-> 0.4703770197486535 
 AUC: 
 train-> 0.6010212548732079 
 test-> 0.5227059436913452 

Label Model 
 Accuracy: 
 train-> 0.5527615626403233 
 test-> 0.49012567324955114 
 AUC: 
 train-> 0.524345344386498 
 test-> 0.4437434827945777 

Random Voter Model 
 Accuracy: 
 train-> 0.5024696901661428 
 test-> 0.5008976660682226 
 AUC: 
 train-> 0.5126309212678816 
 test-> 0.5023114355231144 



The label model has the highest test AUC so that's the  best model.

# 3. Filter out unlabeled points

In [79]:
#training labels
df_train_filtered, probs_train_filtered = filter_unlabeled_dataframe(
    X=df_train['text'], 
    y=mm_preds_proba_train, 
    L=label_matrix_train
)

#testing labels
df_test_filtered, probs_test_filtered = filter_unlabeled_dataframe(
    X=df_test['text'], 
    y=mm_preds_proba_test, 
    L=label_matrix_test
)

print('Total points labelled in training data:',len(df_train_filtered))
print('Total points labelled in testing data:',len(df_test_filtered))

Total points labelled in training data: 405
Total points labelled in testing data: 91


# References

https://www.snorkel.org/use-cases/spouse-demo
    
https://github.com/snorkel-team/snorkel-tutorials/blob/master/spouse/spouse_demo.ipynb
    
https://www.snorkel.org/use-cases/01-spam-tutorial
    
https://readthedocs.org/projects/snorkel/downloads/pdf/master/