# Set Up

In [1]:
!pip -q install git+https://github.com/huggingface/transformers # need to install from github
!pip install -q datasets loralib sentencepiece
!pip -q install bitsandbytes accelerate
!pip -q install langchain
!pip install einops

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone


In [2]:
import warnings
warnings.filterwarnings("ignore")

import re
from transformers import LlamaTokenizer, LlamaForCausalLM, GenerationConfig, pipeline, BitsAndBytesConfig , CodeGenTokenizer
from langchain.llms import HuggingFacePipeline
from langchain import PromptTemplate, LLMChain
from transformers import AutoTokenizer , AutoModelForCausalLM
import torch

#from Util import *

# Load Data

In [3]:
import pandas as pd
df = pd.read_csv("/content/Train_Data.csv")

# Phi-2 Base Model

In [4]:
tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-2")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [5]:
quantization_config = BitsAndBytesConfig(llm_int8_enable_fp32_cpu_offload=True)

In [6]:
base_model = AutoModelForCausalLM.from_pretrained(
    "microsoft/phi-2",
    torch_dtype=torch.float32,
    device_map='auto',
    quantization_config=quantization_config
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [7]:
pipe = pipeline(
    "text-generation",
    model=base_model,
    tokenizer=tokenizer,
    max_length=256,
    temperature=0.8,
    top_p=0.95,
    repetition_penalty=1.2
)
local_llm = HuggingFacePipeline(pipeline=pipe)
pipe.model.config.pad_token_id = pipe.model.config.eos_token_id

In [8]:
template = """A virtual assistant answers questions from a user based on the provided text.
             identify and generate list of all NOUN type word consider (Inside, Outside, Beginning) using in the text.
              USER: Text: {input_text}
              ASSISTANT:"""
prompt = PromptTemplate(template=template, input_variables=["instruction"])

In [9]:
def generate_prompt(text):
    result = llm_chain.run(text)
    answer = [i.replace("\'","") for i in re.findall(r"\'.[a-zA-Z ]+?\'", result)]
    return answer

llm_chain = LLMChain(prompt=prompt,
                     llm=local_llm
                     )


In [10]:
df['Phi2_Keywords']=df['Description'].apply(generate_prompt)

  warn_deprecated(
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:5025

KeyboardInterrupt: 

In [None]:
df.to_csv("Phi2_Baseline_Data_Predicted.csv")

In [None]:
processed_df = Data_Mapping(df,'Description','Keywords')

In [None]:
processed_df['Predicted_Class'] = Data_Mapping(df,'Description','Phi2_Keywords')['Class']
processed_df.shape

In [None]:
processed_df.to_csv("Phi2_Baseline_Data_Predicted_token.csv",index=False)

In [None]:
processed_df.head()

In [None]:
# Define mapping dictionary
label_map = {'Noun': 1, 'O': 0}

# Map values in 'Labels' column
processed_df['Class'] = processed_df['Class'].map(label_map)
processed_df['Predicted_Class'] = processed_df['Predicted_Class'].map(label_map)

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score,accuracy_score
#accuracy, precision, recall and F1

metrics_df = pd.DataFrame()

# Compute accuracy,precision, recall, and F1 score
accuracy = accuracy_score(processed_df['Class'], processed_df['Predicted_Class'])
precision = precision_score(processed_df['Class'], processed_df['Predicted_Class'])
recall = recall_score(processed_df['Class'], processed_df['Predicted_Class'])
f1 = f1_score(processed_df['Class'], processed_df['Predicted_Class'])

# Print the results
print("Accuracy:", accuracy*100)
print("Precision:", precision*100)
print("Recall:", recall*100)
print("F1 Score:", f1*100)

pred_noun_count = processed_df[processed_df['Predicted_Class']==1].shape[0]
tokens_noun_count = processed_df[processed_df['Class']==1].shape[0]
covered_area = pred_noun_count/tokens_noun_count
print("the percentage of covered area:", covered_area)
from collections import defaultdict
results = defaultdict(list)
results['Model'].append("Phi2_Baseline")
results['Accuracy'].append(round(accuracy*100,2))
results['Precision'].append(round(precision*100,2))
results['Recall'].append(round(recall*100,2))
results['F1 Score'].append(round(f1*100,2))
results['Covered Area'].append(round(covered_area*100,2))

metrics_df = metrics_df.append(results, ignore_index=True)

metrics_df
