<a href="https://colab.research.google.com/github/CDU-data-science-team/zero-shot/blob/feature-Huggingface_transformer/Patient_Feedback_with_Hugging_Transformer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# Install the Hugging Face `transformers` package; the imports cell below takes `pipeline` from it.
!pip install transformers

Collecting transformers
  Downloading transformers-4.9.1-py3-none-any.whl (2.6 MB)
[K     |████████████████████████████████| 2.6 MB 5.3 MB/s 
[?25hCollecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 34.7 MB/s 
Collecting huggingface-hub==0.0.12
  Downloading huggingface_hub-0.0.12-py3-none-any.whl (37 kB)
Collecting sacremoses
  Downloading sacremoses-0.0.45-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 64.4 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-5.4.1-cp37-cp37m-manylinux1_x86_64.whl (636 kB)
[K     |████████████████████████████████| 636 kB 61.2 MB/s 
Installing collected packages: tokenizers, sacremoses, pyyaml, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
    Uninstalling PyYAML-3.13:
      Successfully uninsta

# Load important libraries

In [12]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

from transformers import pipeline
from sklearn.metrics import accuracy_score

from time import time
import datetime
from warnings import filterwarnings
filterwarnings("once", category=DeprecationWarning) # Display just the first matching deprecation warnings (multi_class changing to multi_label).

# Load Data

In [5]:
filepath = 'new_data.csv'
# Number of rows to read from the file. None (the pandas default) loads everything;
# set a small value such as 1000 to quickly try different NLI models for zero-shot.
sample_rows = None
# Only the true label and the free-text feedback columns are needed.
df = pd.read_csv(filepath, usecols=['label', 'feedback'], nrows=sample_rows)

In [6]:
# Replace every missing value with the placeholder string 'Nothing',
# then show the table dimensions and the first few rows.
df = df.fillna('Nothing')
print(df.shape)
df.head()

(9291, 2)


Unnamed: 0,label,feedback
0,Couldn't be improved,Nothing.
1,Environment/ facilities,Temperature in theatre a little low.
2,Access,Same service available at Bingham Health Centre.
3,Communication,Appointment details given over phone - no phys...
4,Communication,On one occasion I was not made aware that my a...


# Prepare data for prediction

In [7]:
# Texts to classify: the feedback column as an array
sequence = df['feedback'].values
# Candidate classes offered to the classifier: the distinct values of the label column
candidate_labels = df['label'].unique()
print(len(candidate_labels))
candidate_labels


9


array(["Couldn't be improved", 'Environment/ facilities', 'Access',
       'Communication', 'Dignity', 'Staff', 'Care received',
       'Transition/coordination', 'Miscellaneous'], dtype=object)

# Prediction

In [16]:
# Pretrained NLI models to evaluate as zero-shot classifiers.
# To benchmark every model, uncomment the full list below and comment out
# the single-model assignment that follows it.
# models = ['roberta-large-mnli', 'facebook/bart-large-mnli', 'typeform/distilbert-base-uncased-mnli',
#           'cross-encoder/nli-MiniLM2-L6-H768', 'cross-encoder/nli-distilroberta-base',
#           'cross-encoder/nli-roberta-base', 'cross-encoder/nli-deberta-base',
#           'valhalla/distilbart-mnli-12-1', 'valhalla/distilbart-mnli-12-9', 'valhalla/distilbart-mnli-12-6',
#           'Recognai/bert-base-spanish-wwm-cased-xnli', 'typeform/roberta-large-mnli']
models = ['cross-encoder/nli-deberta-base']   #for testing the pipeline

In [8]:
def time_convert(seconds):
    """Format a duration given in seconds as an "H:MM:SS" string.

    Parameters:
        seconds: duration in seconds (int or float; fractions are truncated
            when formatted).

    Returns:
        str: hours (not zero-padded), then minutes and seconds, each padded
        to two digits, e.g. 3661 -> "1:01:01".
    """
    # Peel off the seconds first, then split the remaining minutes into hours.
    total_minutes, secs = divmod(seconds, 60)
    hours, minutes = divmod(total_minutes, 60)
    return "%d:%02d:%02d" % (hours, minutes, secs)

In [None]:
%%time
model_name = []
model_score = []
model_initialising_time = []
model_prediction_time = []
model_average_feedback_score = []

# Loop through the list of models to try
for model in models:
  feedback_predict_score = []; feedback_predict = []  #list to hold the predicted class and its score

  # time the model initialization period
  start = time()
  # instantiate a zeroshot classifier object
  classifier = pipeline("zero-shot-classification", model=model)#, device=0) # utilize GPU)
  end = time()
  initialising_time =  round(end-start) #tracks model initialisation time
    
  # Actual prediction
  # track model prediction time
  # to do multiclass classification set <multi_class=True>
  start2 = time()
  for i in range(len(df)):
    result = classifier(sequence[i], candidate_labels) 
    feedback_predict.append(result['labels'][0])
    feedback_predict_score.append(result['scores'][0]) 
  end2 = time()
  prediction_time = round(end2-start2) #tracks model prediction time   

  # Model evaluation
  y_true = df.label.values
  y_pred = feedback_predict
  score = accuracy_score(y_true, y_pred)
  
  # Update model performance tracking lists
  model_name.append(model)
  model_score.append(round(score*100, 2))    
  model_initialising_time.append(time_convert(initialising_time))
  model_prediction_time.append(time_convert(prediction_time))
  model_average_feedback_score.append(round(np.mean(feedback_predict_score)*100, 2)) 

In [None]:
# Build the model performance table used for model comparison:
# one row per evaluated model, one column per tracked metric.
Model_performance_df = pd.DataFrame({
    'model_name': model_name,
    '%_Accuracy_score': model_score,
    'initialising_time': model_initialising_time,
    'prediction_time': model_prediction_time,
    '%average_feedback_score': model_average_feedback_score,
})

# Rank the models from most to least accurate (descending order).
# The accuracy values live in the '%_Accuracy_score' column; 'model_score' is only
# the name of the source list, and sorting by it raises KeyError.
Model_performance_df = Model_performance_df.sort_values('%_Accuracy_score', ascending=False)


# Visualize the data and Export it as csv

In [None]:
# Index the table by model name (keeping the column) so each group of bars is labelled with its model
model_df = Model_performance_df.set_index('model_name', drop=False)

# Bar chart of the table's numeric columns, one group per model
bar_options = dict(rot=45, figsize=(15, 10), fontsize=20)
model_df.plot.bar(**bar_options)

# Write the performance table to disk in CSV format, without the row index
Model_performance_df.to_csv('Models_performance.txt', index=False)