<a href="https://colab.research.google.com/github/CDU-data-science-team/zero-shot/blob/feature-Huggingface_transformer/Patient_Feedback_with_Hugging_Transformer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.9.0-py3-none-any.whl (2.6 MB)
[K     |████████████████████████████████| 2.6 MB 32.7 MB/s 
[?25hCollecting sacremoses
  Downloading sacremoses-0.0.45-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 54.7 MB/s 
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 41.3 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-5.4.1-cp37-cp37m-manylinux1_x86_64.whl (636 kB)
[K     |████████████████████████████████| 636 kB 50.2 MB/s 
[?25hCollecting huggingface-hub==0.0.12
  Downloading huggingface_hub-0.0.12-py3-none-any.whl (37 kB)
Installing collected packages: tokenizers, sacremoses, pyyaml, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
    Uninstalling PyYAML-3.13:
      Success

# Load important libraries

In [2]:
from transformers import pipeline
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
from time import time
import datetime
from warnings import filterwarnings
# Show each matching DeprecationWarning only the first time it occurs
# (e.g. the zero-shot pipeline's multi_class -> multi_label rename).
filterwarnings("once", category=DeprecationWarning)

# Load Data

In [3]:
# Path to the extracted patient-feedback data. Missing values were filled with
# 'Nothing' when this file was extracted from the main patient dataset.
# Alternative load restricted to the two columns used below:
#df = pd.read_csv(filepath, usecols=['label', 'feedback'])
filepath = 'new_data.csv'

# Load the full dataset. To try different NLI models for zero-shot on a small
# sample first, pass nrows=1000 to read_csv or uncomment the slice below.
df = pd.read_csv(filepath)
#df = df.iloc[:500, :]; df.reset_index(inplace=True, drop=True)

In [4]:
# Replace any remaining missing values with the placeholder text 'Nothing'.
# Reassigning (instead of inplace=True) keeps the cell's input -> output lineage explicit.
df = df.fillna('Nothing')
print(df.shape)
df.head()

(9291, 2)


Unnamed: 0,label,feedback
0,Couldn't be improved,Nothing.
1,Environment/ facilities,Temperature in theatre a little low.
2,Access,Same service available at Bingham Health Centre.
3,Communication,Appointment details given over phone - no phys...
4,Communication,On one occasion I was not made aware that my a...


# Prepare data for prediction

In [5]:
# Texts to classify, and the label set the zero-shot models choose from.
sequence = df.feedback.values
candidate_labels = df.label.unique()  # distinct labels, in order of first appearance

# Report how many candidate labels there are, then display them.
print(len(candidate_labels))
candidate_labels


9


array(["Couldn't be improved", 'Environment/ facilities', 'Access',
       'Communication', 'Dignity', 'Staff', 'Care received',
       'Transition/coordination', 'Miscellaneous'], dtype=object)

# Prediction

In [10]:
# Pretrained NLI checkpoints to benchmark as zero-shot classifiers.
# For a quick single-model run use instead:
#models = ['cross-encoder/nli-MiniLM2-L6-H768']
models = [
    'roberta-large-mnli',
    'facebook/bart-large-mnli',
    'typeform/distilbert-base-uncased-mnli',
    'cross-encoder/nli-MiniLM2-L6-H768',
    'cross-encoder/nli-distilroberta-base',
    'cross-encoder/nli-roberta-base',
    'cross-encoder/nli-deberta-base',
    'valhalla/distilbart-mnli-12-1',
    'valhalla/distilbart-mnli-12-9',
    'valhalla/distilbart-mnli-12-6',
    'Recognai/bert-base-spanish-wwm-cased-xnli',
    'typeform/roberta-large-mnli',
]

# Checkpoints currently left out of the benchmark:
#   'typeform/mobilebert-uncased-mnli', 'typeform/squeezebert-mnli',
#   'digitalepidemiologylab/covid-twitter-bert-v2-mnli'
#     -> giving RuntimeError: The size of tensor a (540) must match the size of
#        tensor b (512) at non-singleton dimension 1
#   'vicgalle/xlm-roberta-large-xnli-anli', 'joeddav/xlm-roberta-large-xnli'


In [7]:
# start = time()
# x = 7*88899000
# end = time()
# a = end-start
# a
# print(prediction_time)
# initialising_time

Any NLI model can be used, but the id of the 'entailment' label must be included in the model config's `label2id` mapping (`transformers.PretrainedConfig.label2id`).

In [12]:
def time_convert(seconds):
    """Format a duration given in seconds as an ``H:MM:SS`` string.

    Minutes and seconds are zero-padded to two digits; hours are neither
    padded nor wrapped at 24. Fractional values are truncated by the integer
    formatting.
    """
    total_minutes, secs = divmod(seconds, 60)
    hours, minutes = divmod(total_minutes, 60)
    return "%(h)d:%(m)02d:%(s)02d" % {"h": hours, "m": minutes, "s": secs}

In [13]:
%%time
Model_performance_df = pd.DataFrame(columns = ['model_name', 'model_score', 'model_initialising_time',
                                               'model_prediction_time'])
model_name = []
model_score = []
model_initialising_time = []
model_prediction_time = []
model_average_feedback_score = []

for model in models:    #for model in [m for m in models if m not in model_name]:
  start = time()
  classifier = pipeline("zero-shot-classification", model=model, device=0) # utilize GPU)
  end = time()
  initialising_time =  round(end-start) #tracks model initialisation time
  
  feedback_predict_score = []; feedback_predict = []  #initialise attribute to hold the predicted class and its score
  
  #Actual prediction (model prediction time is tracked)
  start2 = time()
  for i in range(len(df)):
    result = classifier(sequence[i], candidate_labels) # to do multiclass classification set <multi_class=True>
    feedback_predict.append(result['labels'][0])
    feedback_predict_score.append(result['scores'][0])       #  or (float('%.2f' %result['scores'][0]))
  end2 = time()
  prediction_time = round(end2-start2) #tracks model prediction time   

  #Model evaluation
  y_true = df.label.values
  y_pred = feedback_predict
  score = accuracy_score(y_true, y_pred)
  
  #update model performance table for model comparism
  model_name.append(model)
  model_score.append(round(score*100, 2))     # or append(float(f'{(score*100):.2f}'))
  model_initialising_time.append(time_convert(initialising_time))
  model_prediction_time.append(time_convert(prediction_time))
  model_average_feedback_score.append(round(np.mean(feedback_predict_score), 2)) 

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1425744429.0, style=ProgressStyle(descr…




Some weights of the model checkpoint at roberta-large-mnli were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=898823.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=456318.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1355863.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1154.0, style=ProgressStyle(description…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1629486723.0, style=ProgressStyle(descr…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=26.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=898822.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=456318.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1355863.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=776.0, style=ProgressStyle(description_…

The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.
The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.





HBox(children=(FloatProgress(value=0.0, description='Downloading', max=267866263.0, style=ProgressStyle(descri…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=258.0, style=ProgressStyle(description_…

The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.





HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=112.0, style=ProgressStyle(description_…




The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.
The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=701.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=328532073.0, style=ProgressStyle(descri…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=25.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=898822.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=456318.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=772.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=702.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=498682313.0, style=ProgressStyle(descri…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=25.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=898822.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=456318.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=772.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=975.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=556867008.0, style=ProgressStyle(descri…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1135.0, style=ProgressStyle(description…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=798296.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=456356.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1356443.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=778.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1389.0, style=ProgressStyle(description…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=890410947.0, style=ProgressStyle(descri…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=26.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=898822.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=456318.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=772.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1389.0, style=ProgressStyle(description…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1427984707.0, style=ProgressStyle(descr…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=26.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=898822.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=456318.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=772.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1389.0, style=ProgressStyle(description…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1226394703.0, style=ProgressStyle(descr…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=26.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=898822.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=456318.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=772.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=834.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=439502089.0, style=ProgressStyle(descri…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=528.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=242120.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=112.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=861.0, style=ProgressStyle(description_…

404 Client Error: Not Found for url: https://huggingface.co/typeform/roberta-large-mnli/resolve/main/pytorch_model.bin





HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1421972232.0, style=ProgressStyle(descr…




All model checkpoint layers were used when initializing TFRobertaForSequenceClassification.

All the layers of TFRobertaForSequenceClassification were initialized from the model checkpoint at typeform/roberta-large-mnli.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaForSequenceClassification for predictions without further training.


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=261.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=798293.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=456356.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=239.0, style=ProgressStyle(description_…


CPU times: user 1h 40min 47s, sys: 2min 37s, total: 1h 43min 24s
Wall time: 1h 42min 59s


In [14]:
# Assemble the per-model results into the comparison table and export it.
performance_columns = {
    'model_name': model_name,
    'model_score': model_score,
    'model_initialising_time': model_initialising_time,
    'model_prediction_time': model_prediction_time,
    'model_average_feedback_score': model_average_feedback_score,
}
for column_name, column_values in performance_columns.items():
    Model_performance_df[column_name] = column_values

# Written via to_csv (despite the .txt extension) and without the row index.
Model_performance_df.to_csv('Models_performance.txt', index=False)

In [15]:
# Display the model comparison table.
Model_performance_df

Unnamed: 0,model_name,model_score,model_initialising_time,model_prediction_time,model_average_feedback_score
0,roberta-large-mnli,43.78,0:00:36,0:09:01,0.4
1,facebook/bart-large-mnli,52.51,0:00:44,0:10:50,0.54
2,typeform/distilbert-base-uncased-mnli,37.14,0:00:08,0:01:31,0.57
3,cross-encoder/nli-MiniLM2-L6-H768,43.01,0:00:10,0:01:38,0.57
4,cross-encoder/nli-distilroberta-base,40.54,0:00:11,0:01:38,0.39
5,cross-encoder/nli-roberta-base,36.95,0:00:14,0:03:06,0.5
6,cross-encoder/nli-deberta-base,51.39,0:00:16,0:03:42,0.48
7,valhalla/distilbart-mnli-12-1,54.07,0:00:24,0:05:12,0.5
8,valhalla/distilbart-mnli-12-9,57.09,0:00:37,0:09:25,0.54
9,valhalla/distilbart-mnli-12-6,52.9,0:00:36,0:07:50,0.48


# Possibly useful code snippets

## Model evaluation

In [None]:
# #Model evaluation
# from sklearn.metrics import accuracy_score

# y_true = df.label.values
# y_pred = feedback_predict
# score = accuracy_score(y_true, y_pred)
# score


## Track the performance of the various models

In [None]:
# Model_performance = f'{model}: {score:.2f}'
# with open('Models_performance.txt', 'a') as file:
#   file.write(Model_performance+'\n')

In [None]:
# #investigating the importance of the probability score
# df_above60 = df1[df1['feedback_predict_score' ]>0.70]
# df_below60 = df1[df1['feedback_predict_score' ]<0.70]

# y_true_above60 = df_above60.label.values
# y_pred_above60 = df_above60.feedback_predict.values

# y_true_below60 = df_below60.label.values
# y_pred_below60 = df_below60.feedback_predict.values
# print (f'Entries with below 60% probability score: No = {len(df_below60)}, Accuracy = {accuracy_score(y_true_below60, y_pred_below60)}')
# print (f'Entries with over 60% probability score: No = {len(df_above60)}, Accuracy = {accuracy_score(y_true_above60, y_pred_above60)}')

In [None]:
# df['feedback_predict'] = feedback_predict
# df['feedback_predict_score'] = feedback_predict_score
# df1 = df.loc[:, ['feedback','label','feedback_predict', 'feedback_predict_score']]
# df1.head()

In [None]:
# #check for missing values
# print(df.isna().sum())

# #fill missing values, as they cause a runtime error while fitting the model
# df['feedback'].fillna('Nothing', inplace = True)
# df.isna().sum()

In [None]:
# #testing 
# #trainer.train()
# hypothesis_template = "This text is about {}."

# b = 212
# a = classifier(sequence[b], candidate_labels, hypothesis_template=hypothesis_template) # for multi-label classification pass multi_label=True (formerly multi_class=True)
# print (a['labels'], '\n', a['scores'], '\n', sequence[b], '\n', df.label[b])