# Set Up



In [1]:
!pip -q install git+https://github.com/huggingface/transformers # need to install from github
!pip install -q datasets loralib sentencepiece
!pip -q install bitsandbytes accelerate
!pip -q install langchain
!pip install einops
!pip install tensorflow_probability>=0.13.0


  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone


In [2]:
import warnings
warnings.filterwarnings("ignore")

import nltk
nltk.download('all')
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag

import pandas as pd

from transformers import AutoTokenizer, AutoModelForTokenClassification, TokenClassificationPipeline

from Utils import *

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /root/nltk_data...
[nltk_data]    |   Package abc is already up-to-date!
[nltk_data]    | Downloading package alpino to /root/nltk_data...
[nltk_data]    |   Package alpino is already up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger is already up-
[nltk_data]    |       to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger_ru is already
[nltk_data]    |       up-to-date!
[nltk_data]    | Downloading package basque_grammars to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Package basque_grammars is already up-to-date!
[nltk_data]    | Downloading package bcp47 to /root/nltk_data...
[nltk_data]    |   Package bcp47 is already up-to-dat

# Load Data

In [3]:
df = pd.read_csv('/content/Train_Data.csv')

In [4]:
df.head()

Unnamed: 0,ID,Number,Description,Keywords
0,699,71988,A man in a wheelchair and another sitting on a...,"['man', 'wheelchair', 'another', 'bench', 'wat..."
1,701,193622,A man sits with a traditionally decorated cow,"['man', 'traditionally decorated cow']"
2,827,52087,A man getting a drink from a water fountain th...,"['man', 'drink', 'water fountain', 'toilet']"
3,891,119964,A person holding a skateboard overlooks a dead...,"['person', 'skateboard', 'dead field of crops']"
4,1221,382406,A woman is walking a dog in the city.,"['woman', 'dog', 'city']"


# BERT BASE Model

In [5]:
model_name = "QCRI/bert-base-multilingual-cased-pos-english"

# tokenize
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Token Classification
model = AutoModelForTokenClassification.from_pretrained(model_name)

# Pipeline
pipeline = TokenClassificationPipeline(model=model, tokenizer=tokenizer)

Some weights of the model checkpoint at QCRI/bert-base-multilingual-cased-pos-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [6]:
def Predict_Keywords(text):
  word_list= [i['word'] for i in pipeline(text) if i['entity'] in ['NN']]

  result = []
  if len(word_list) > 1:
    for index, word in enumerate(word_list):
        if "#" in word:
            try:
              result[-1] += word[2:]
            except:
              pass
        else:
            result.append(word)

  return result

In [7]:
df['BERT_Keywords']=df['Description'].apply(Predict_Keywords)

In [8]:
df.to_csv("BERT_Baseline_Data_Predicted.csv")

In [9]:
df.head()

Unnamed: 0,ID,Number,Description,Keywords,BERT_Keywords
0,699,71988,A man in a wheelchair and another sitting on a...,"['man', 'wheelchair', 'another', 'bench', 'wat...","[man, wheelchair, bench, water]"
1,701,193622,A man sits with a traditionally decorated cow,"['man', 'traditionally decorated cow']","[man, cow]"
2,827,52087,A man getting a drink from a water fountain th...,"['man', 'drink', 'water fountain', 'toilet']","[man, drink, water, fountain, toilet]"
3,891,119964,A person holding a skateboard overlooks a dead...,"['person', 'skateboard', 'dead field of crops']","[person, skateboard, field]"
4,1221,382406,A woman is walking a dog in the city.,"['woman', 'dog', 'city']","[woman, dog, city]"


In [10]:
processed_df = Data_Mapping(df,'Description','Keywords','BERT_Keywords')

In [11]:
processed_df

Unnamed: 0,ID,Token,Class,Predicted_Class
0,827,A,O,O
1,827,man,Noun,Noun
2,827,getting,O,O
3,827,a,O,O
4,827,drink,Noun,Noun
5,827,from,O,O
6,827,a,O,O
7,827,water,O,Noun
8,827,fountain,O,Noun
9,827,that,O,O


In [13]:
# Define mapping dictionary
label_map = {'Noun': 1, 'O': 0}

# Map values in 'Labels' column
processed_df['Class'] = processed_df['Class'].map(label_map)
processed_df['Predicted_Class'] = processed_df['Predicted_Class'].map(label_map)

In [14]:
from sklearn.metrics import precision_score, recall_score, f1_score,accuracy_score
#accuracy, precision, recall and F1

metrics_df = pd.DataFrame()

# Compute accuracy,precision, recall, and F1 score
accuracy = accuracy_score(processed_df['Class'], processed_df['Predicted_Class'])
precision = precision_score(processed_df['Class'], processed_df['Predicted_Class'])
recall = recall_score(processed_df['Class'], processed_df['Predicted_Class'])
f1 = f1_score(processed_df['Class'], processed_df['Predicted_Class'])

# Print the results
print("Accuracy:", accuracy*100)
print("Precision:", precision*100)
print("Recall:", recall*100)
print("F1 Score:", f1*100)

pred_noun_count = processed_df[processed_df['Predicted_Class']==1].shape[0]
tokens_noun_count = processed_df[processed_df['Class']==1].shape[0]
covered_area = pred_noun_count/tokens_noun_count
print("the percentage of covered area:", covered_area)
from collections import defaultdict
results = defaultdict(list)
results['Model'].append("BERT_Baseline")
results['Accuracy'].append(round(accuracy*100,2))
results['Precision'].append(round(precision*100,2))
results['Recall'].append(round(recall*100,2))
results['F1 Score'].append(round(f1*100,2))
results['Covered Area'].append(round(covered_area*100,2))

metrics_df = metrics_df.append(results, ignore_index=True)

metrics_df


Accuracy: 85.71428571428571
Precision: 60.0
Recall: 100.0
F1 Score: 74.99999999999999
the percentage of covered area: 1.6666666666666667


Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score,Covered Area
0,[BERT_Baseline],[85.71],[60.0],[100.0],[75.0],[166.67]
