# Set Up



In [None]:
!pip -q install -r requirements.txt

  Installing build dependencies ... done
  Getting requirements to build wheel ... done
  Preparing metadata (pyproject.toml) ... done


In [None]:
import warnings
warnings.filterwarnings("ignore")

import nltk
nltk.download('all')
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag

import pandas as pd
import re
from langchain.llms import HuggingFacePipeline
from langchain import PromptTemplate, LLMChain
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification,AutoModelForCausalLM, TokenClassificationPipeline, \
GenerationConfig, pipeline, BitsAndBytesConfig , CodeGenTokenizer

from Utils import *

from sklearn.metrics import precision_score, recall_score, f1_score,accuracy_score
metrics_df = pd.DataFrame()


# Load Data

In [None]:
df = pd.read_csv('/content/Train_Data.csv')

In [None]:
df.shape

In [None]:
#df = df.sample(n=100)

In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

In [None]:
df.head()

In [None]:
#!pip install --upgrade tensorflow


In [None]:
# Environment probe: report whether a TPU can be resolved and list the devices
# TensorFlow can see. Nothing below is consumed by later cells except the
# printed output.
# NOTE(review): the models in this notebook are loaded through torch /
# transformers, so this TensorFlow check looks informational only - confirm.
import tensorflow as tf

# Detect TPU if available
try:
    # A ValueError raised anywhere in this try body is reported as "no TPU".
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()  # TPU detection
    print('Running on TPU ', tpu.cluster_spec().as_dict()['worker'])
except ValueError:
    print("TPU not found")

# Check available devices
print(tf.config.list_physical_devices())


# Phi-2 Base Model

In [None]:
# Load the tokenizer and causal-LM weights for the "microsoft/phi-2" checkpoint.
# Both names are reused by the pipeline cell that follows.
tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-2")

# Quantization is currently switched off; the config is kept for reference only.
#quantization_config = BitsAndBytesConfig(llm_int8_enable_fp32_cpu_offload=True)

base_model = AutoModelForCausalLM.from_pretrained(
    "microsoft/phi-2",
    # Full float32 weights, since no quantization config is passed.
    torch_dtype=torch.float32,
    # Device placement is delegated to the loader rather than fixed here.
    device_map='auto'
    #quantization_config=quantization_config
)

In [None]:
# Text-generation pipeline around the Phi-2 base model, wrapped for LangChain.
pipe = pipeline(
    "text-generation",
    model=base_model,
    tokenizer=tokenizer,
    # Budget only the *generated* tokens. The previous `max_length=256` also
    # counted the prompt, so a long description left little or no room for
    # the answer.
    max_new_tokens=256,
    # temperature / top_p are only applied when sampling is enabled; without
    # this flag decoding is greedy and both settings are silently ignored.
    do_sample=True,
    temperature=0.6,
    top_p=0.95,
    repetition_penalty=1.2,
)
local_llm = HuggingFacePipeline(pipeline=pipe)
# Reuse the EOS id as the pad id so generation has a pad token configured.
pipe.model.config.pad_token_id = pipe.model.config.eos_token_id

In [None]:
# Baseline prompt: asks the model to list the nouns found in the supplied text.
# The wording is part of the experiment and is left untouched.
template = """A virtual assistant answers questions from a user based on the provided text.
             identify and generate list of all NOUN type word in the text.
              USER: Text: {input_text}
              ASSISTANT:"""
# The template's only placeholder is {input_text}, so that is the variable to
# declare; the previous value ("instruction") did not appear in the template.
prompt = PromptTemplate(template=template, input_variables=["input_text"])

In [None]:
def generate_prompt(text, llm_chain):
    """Run ``text`` through ``llm_chain`` and return the quoted terms in its reply.

    Args:
        text: Input handed unchanged to ``llm_chain.invoke``.
        llm_chain: Object whose ``invoke`` returns a mapping with a ``'text'``
            entry holding the model reply.

    Returns:
        list[str]: Every run of letters/spaces that the reply wraps in single
        quotes, with the quotes removed, in order of appearance. Empty when
        the reply contains no such term.
    """
    result = llm_chain.invoke(text)['text']
    # The earlier pattern began with a `.` wildcard after the opening quote.
    # That required at least two characters (so 'a' was missed) and let the
    # wildcard swallow a separator, turning "', '" into a bogus match and
    # skipping the real term that followed. Capturing only the letters and
    # spaces between the quotes avoids both problems.
    return re.findall(r"'([a-zA-Z ]+)'", result)

# Chain that feeds the current `prompt` into the local Phi-2 pipeline wrapper.
llm_chain = LLMChain(llm=local_llm, prompt=prompt)


In [None]:
df['Phi2_Keywords']=df['Description'].apply(generate_prompt,llm_chain=llm_chain)

In [None]:
df.to_csv("Phi2_Baseline_Data_Predicted.csv")

In [None]:
df.head()

In [None]:
processed_phi2_base_df = Data_Mapping(df,'Description','Keywords','Phi2_Keywords')

In [None]:
processed_phi2_base_df = Data_Labelling(processed_phi2_base_df)

In [None]:
#accuracy, precision, recall and F1
model_name = "Phi-2 Base"
metrics_df = compute_performance_metrics(metrics_df,processed_phi2_base_df,model_name)

# Phi-2 IOB Annotation

In [None]:
# IOB-flavoured prompt: same task as the baseline, with the extra
# (Inside, Outside, Beginning) hint. The wording is part of the experiment
# and is left untouched.
template = """A virtual assistant answers questions from a user based on the provided text.
             identify and generate list of all NOUN type word consider (Inside, Outside, Beginning) using in the text.
              USER: Text: {input_text}
              ASSISTANT:"""
# The template's only placeholder is {input_text}, so that is the variable to
# declare; the previous value ("instruction") did not appear in the template.
prompt = PromptTemplate(template=template, input_variables=["input_text"])

In [None]:
# Rebuild the chain so it picks up the IOB-flavoured `prompt` defined above.
llm_chain = LLMChain(llm=local_llm, prompt=prompt)


In [None]:
df['Phi2_IOB_Keywords']=df['Description'].apply(generate_prompt,llm_chain=llm_chain)

In [None]:
df.to_csv("Phi2_IOB_Annotation_Predicted.csv")

In [None]:
processed_Phi2_IOB_df = Data_Mapping(df,'Description','Keywords','Phi2_IOB_Keywords')

In [None]:
processed_Phi2_IOB_df = Data_Labelling(processed_Phi2_IOB_df)

In [None]:
# Compute accuracy,precision, recall, and F1 score
model_name = "Phi-2 IOB Annotation"
metrics_df = compute_performance_metrics(metrics_df,processed_Phi2_IOB_df,model_name)

# BERT BASE Model

In [None]:
# Checkpoint id of the POS-tagging model used for the BERT baseline.
# NOTE(review): `model_name` is reassigned later in this notebook as a display
# label for the metrics table, so it only holds the checkpoint id in this cell.
model_name = "QCRI/bert-base-multilingual-cased-pos-english"

# tokenize
# NOTE(review): rebinds the `tokenizer` name that the Phi-2 section also assigns.
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Token Classification
model = AutoModelForTokenClassification.from_pretrained(model_name)

# Pipeline
# NOTE(review): this rebinds `pipeline`, which the setup cell imports from
# transformers and the Phi-2 section calls as a factory. Re-running that
# section after this cell would call this tagger object instead; a distinct
# name (updated together with Predict_Keywords) would avoid the clash.
pipeline = TokenClassificationPipeline(model=model, tokenizer=tokenizer)

In [None]:
def Predict_Keywords(text, tagger=None):
    """Return the words in ``text`` that the tagger labels ``'NN'``.

    Pieces that start with ``"##"`` are treated as continuations and are glued
    (prefix removed) onto the noun collected just before them.

    Args:
        text: Input handed unchanged to the tagger.
        tagger: Optional callable returning an iterable of dicts with
            ``'word'`` and ``'entity'`` entries. Defaults to the notebook-level
            ``pipeline`` object, which is what the function always used before.

    Returns:
        list[str]: Reassembled nouns in order of appearance; empty when the
        tagger reports none.
    """
    # Look the default up at call time so the notebook-level object is used
    # exactly as before when no tagger is supplied.
    if tagger is None:
        tagger = pipeline
    nouns = [token['word'] for token in tagger(text) if token['entity'] == 'NN']

    # The previous `len(word_list) > 1` guard threw away the result whenever
    # exactly one noun was found; a single noun is a valid answer.
    result = []
    for word in nouns:
        if word.startswith("##"):
            # A continuation with no preceding noun has nothing to attach to;
            # skip it explicitly instead of hiding the IndexError behind a
            # bare `except`.
            if result:
                result[-1] += word[2:]
        else:
            result.append(word)

    return result

In [None]:
df['BERT_Keywords']=df['Description'].apply(Predict_Keywords)

In [None]:
df.to_csv("BERT_Baseline_Data_Predicted.csv")

In [None]:
df.head()

In [None]:
processed_BERT_df = Data_Mapping(df,'Description','Keywords','BERT_Keywords')

In [None]:
processed_BERT_df

In [None]:
processed_BERT_df = Data_Labelling(processed_BERT_df)

In [None]:
# Compute accuracy,precision, recall, and F1 score
model_name = "BERT Base"
metrics_df = compute_performance_metrics(metrics_df,processed_BERT_df,model_name)

# BERT with IOB tagging Annotation

In [None]:
df['BIO_tags'] = df['Description'].apply(DataAnnotate)

In [None]:
df.head()

In [None]:
df.to_csv("IOB_Annotated_Data.csv",index=False)

In [None]:
processed_BERT_iob_df = Data_Mapping_IOB(df,'Description','Keywords','BIO_tags')

In [None]:
processed_BERT_iob_df.shape

In [None]:
processed_BERT_iob_df = Data_Labelling(processed_BERT_iob_df)

In [None]:
#accuracy, precision, recall and F1
model_name = "BERT IOB Annotation"
metrics_df = compute_performance_metrics(metrics_df,processed_BERT_iob_df,model_name)

In [None]:
metrics_df