# Importing Libraries

In [1]:
import json
import os
import re

import pandas as pd
import torch
from huggingface_hub import InferenceClient
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers import BertTokenizerFast, DataCollatorForTokenClassification, AutoModelForTokenClassification, TrainingArguments, Trainer, pipeline

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Instruction sent to the chat model. The pieces below are joined by implicit
# string concatenation into one single-line prompt.
prompt = (
    'Generate 100 customer responses regarding a banking experience, '
    'make sure to have parts describing convenience, speed and informative. '
    'Format each response into a single-line message. '
    'Do not need index like "1.".'
)

# 1. Generate Transcripts
- Model: `microsoft/Phi-3-mini-4k-instruct`, queried through the Hugging Face Inference API

In [3]:
# Generate transcripts using the Hugging Face Inference API.
# The access token is read from the HF_TOKEN environment variable so that no
# credential is stored in the notebook; an unset variable fails fast with a
# KeyError. Any token that was ever committed in a notebook should be revoked.
client = InferenceClient(api_key=os.environ["HF_TOKEN"])
data = client.chat_completion(
    model="microsoft/Phi-3-mini-4k-instruct",
    messages=[{"role": "user", "content": prompt}],
    max_tokens=2048,
    stream=True,
)

# Collect the streamed text chunks. `delta.content` is optional in the
# streaming schema, so None/empty chunks are skipped: ''.join() would raise
# TypeError on a None entry.
messages = []
for message in data:
    content = message.choices[0].delta.content
    if content:
        messages.append(content)

# Join the chunks back into the full reply.
full_response = ''.join(messages)

# One response per line: trim whitespace, remove a leading "12. " style index,
# and only then drop empty entries, so whitespace-only or number-only lines do
# not survive as '' in the result.
stripped_lines = (line.strip() for line in full_response.split('\n'))
unnumbered_lines = (re.sub(r'^\d+\.\s*', '', line) for line in stripped_lines)
response_lst = [line for line in unnumbered_lines if line]
response_lst

['"I appreciate the mobile app making banking so convenient and the quick transactions."',
 '"The ATM\'s availability is exceptional, offering round-the-clock banking."',
 '"Speedy loan disbursement was a pleasant surprise, saving time and paperwork."',
 '"The online portal’s ease of use made my last billing very efficient."',
 '"Kudos for the swift resolution of my query by the customer service team."',
 '"Convenienced by free and immediate ATM cash withdrawals."',
 '"Mobile deposits are faster and just as reliable as going to a branch."',
 '"Online banking interface is user-friendly and quick to navigate."',
 '"Instant confirmation for online transfers eased my worries of pending transactions."',
 '"The customer service representative\'s prompt and detailed information helped greatly."',
 '...',
 '"Savings in terms of time and fees with the home equity line of credit application process."',
 '(Note: Due to space constraints this list is not complete with all 100 generated responses.)

# 2. Predict Entity using BERT-NER

In [4]:
# Tag set used by the token classifier: 'O' plus B-/I- tags for three entity
# types (CON, SPD, INF -- presumably convenience, speed and informative, to
# match the generation prompt; confirm against the training data).
label_list = ['O', 'B-CON', "I-CON", "B-SPD", "I-SPD", "B-INF", "I-INF"]

# Index <-> label lookup tables, keyed by position in label_list.
# NOTE(review): label2id values are written as strings here; Hugging Face
# configs conventionally store integer ids -- confirm nothing downstream
# depends on the string form before changing it.
id2label = {str(i): label for i, label in enumerate(label_list)}
label2id = {label: str(i) for i, label in enumerate(label_list)}

# Patch the mappings into the saved model config. Context managers guarantee
# both file handles are closed (and the write is flushed to disk) before any
# later cell reads ner_model/config.json again.
with open("ner_model/config.json", encoding="utf-8") as config_file:
    config = json.load(config_file)

config["id2label"] = id2label
config["label2id"] = label2id

with open("ner_model/config.json", "w", encoding="utf-8") as config_file:
    json.dump(config, config_file)

In [5]:
# Tokenizer comes from the stock uncased BERT checkpoint; the classifier
# weights come from the local "ner_model" directory.
tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")
model_fine_tuned = AutoModelForTokenClassification.from_pretrained("ner_model")



In [None]:
# Smoke test: build the NER pipeline and tag one hand-written sentence.
nlp = pipeline(task="ner", model=model_fine_tuned, tokenizer=tokenizer)

example = (
    "Absolutely thrilled with the fast online banking service! "
    "Effortless transactions anytime."
)

ner_results = nlp(example)

In [7]:
# Tag every generated response individually; `res` holds one list of entity
# dicts per response (empty when nothing was tagged).
res = list(map(nlp, response_lst))
res

[[{'entity': 'B-SPD',
   'score': 0.8178127,
   'index': 13,
   'word': 'quick',
   'start': 66,
   'end': 71}],
 [],
 [{'entity': 'B-SPD',
   'score': 0.47681487,
   'index': 2,
   'word': 'speedy',
   'start': 1,
   'end': 7}],
 [{'entity': 'I-CON',
   'score': 0.5821327,
   'index': 9,
   'word': 'use',
   'start': 29,
   'end': 32}],
 [{'entity': 'I-INF',
   'score': 0.72931993,
   'index': 3,
   'word': '##dos',
   'start': 3,
   'end': 6}],
 [{'entity': 'B-CON',
   'score': 0.5148403,
   'index': 2,
   'word': 'convenience',
   'start': 1,
   'end': 12},
  {'entity': 'B-SPD',
   'score': 0.88783306,
   'index': 7,
   'word': 'immediate',
   'start': 26,
   'end': 35}],
 [{'entity': 'B-CON',
   'score': 0.75440663,
   'index': 5,
   'word': 'faster',
   'start': 21,
   'end': 27}],
 [{'entity': 'B-CON',
   'score': 0.953115,
   'index': 10,
   'word': 'quick',
   'start': 47,
   'end': 52},
  {'entity': 'I-CON',
   'score': 0.7873198,
   'index': 11,
   'word': 'to',
   'start': 5

# 3. Label reviews

In [15]:
# Load the bank reviews table from the data directory.
df = pd.read_csv(filepath_or_buffer='../data/bank_reviews3.csv')

In [16]:
# Keep only the review text column for tagging.
reviews = df.loc[:, 'review']

In [18]:
# Trial run on a real review: build the NER pipeline, tag the review stored
# under index label 1, and print each detected entity on its own line.
nlp = pipeline(task="ner", model=model_fine_tuned, tokenizer=tokenizer)

example = reviews[1]
ner_results = nlp(example)

for entity in ner_results:
    print(entity)

{'entity': 'B-CON', 'score': 0.5293347, 'index': 40, 'word': 'convenient', 'start': 179, 'end': 189}
{'entity': 'I-INF', 'score': 0.47595707, 'index': 47, 'word': 'offer', 'start': 213, 'end': 218}
