# Importing Libraries

In [44]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from huggingface_hub import InferenceClient
import pandas as pd
import json
from transformers import BertTokenizerFast, DataCollatorForTokenClassification, AutoModelForTokenClassification, TrainingArguments, Trainer, pipeline
import re

In [45]:
import re

In [46]:
prompt = 'Generate 100 customer responses regarding a banking experience, make sure to have parts describing convenience, speed and informative. Format each response into a single-line message. Do not need index like "1.".'

# 1. Generate Transcripts
- microsoft/phi-3-mini-4k-instruct

In [47]:
# client = InferenceClient(api_key="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")

# data = client.chat_completion(
# 	model="microsoft/Phi-3-mini-4k-instruct",
# 	messages=[{"role": "user", "content": prompt}],
# 	max_tokens=2048,
# 	stream=True,
# )

# for message in data:
#     print(message.choices[0].delta.content, end="")

In [48]:
# Generate Transcripts using API
client = InferenceClient(api_key="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")
data = client.chat_completion(
    model="microsoft/Phi-3-mini-4k-instruct",
    messages=[{"role": "user", "content": prompt}],
    max_tokens=2048,
    stream=True,
)

messages = []
for message in data:
    content = message.choices[0].delta.content
    messages.append(content)

# Join messages
full_response = ''.join(messages)
response_lst = list(full_response.split('\n'))

# Remove '' and whitespaces
response_lst = [response.strip() for response in response_lst if response != '']
response_lst

# Remove numbering using regular expressions
response_lst = [re.sub(r'^\d+\.\s*', '', response) for response in response_lst]
response_lst
# some_lst = [response.split('.')[1] for response in response_lst]
# some_lst

['"Loved the quick fund transfers, made their little bear very happy."',
 '"Super quick and convenient online banking, it saved us a trip downtown!"',
 '"The customer service was very informative and helped us understand our mortgage options quickly."',
 '"Impressed by how fast they processed our rental application, turns out to be a reliable landlord service!"',
 '"The branch on 5th was so conveniently located – saved us time every day!"',
 '"I appreciate the informative brochure about personal loans sent to our address."',
 '"Customer support answered my questions so quickly, saved my day!"',
 '"So convenient to fill out a balance inquiry online, thanks for the smooth experience!"',
 '"The call center representative was very informative during my application processing inquiry."',
 '"Decent savings account intro – I learned a lot and got great rates."',
 '"Virtual meeting with bank rep about my account—time-saver and clear info presented!"',
 '"The ATM\'s easy use and location made m

# 2. Predict Entity using BERT-NER

In [49]:
label_list = ['O', 'B-CON', "I-CON", "B-SPD", "I-SPD", "B-INF", "I-INF"]

id2label = {
    str(i): label for i,label in enumerate(label_list)
}
label2id = {
    label: str(i) for i,label in enumerate(label_list)
}

config = json.load(open("ner_model/config.json"))
config["id2label"] = id2label
config["label2id"] = label2id

json.dump(config, open("ner_model/config.json","w"))

In [50]:
model_fine_tuned = AutoModelForTokenClassification.from_pretrained("ner_model")
tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")



In [63]:
# Some trial run
nlp = pipeline("ner", model=model_fine_tuned, tokenizer=tokenizer)

example = "Absolutely thrilled with the fast online banking service! Effortless transactions anytime."

ner_results = nlp(example)

for i in ner_results:
	print(i)

In [61]:
res = [nlp(response) for response in response_lst]
res

[[{'entity': 'B-SPD',
   'score': 0.96340996,
   'index': 4,
   'word': 'quick',
   'start': 11,
   'end': 16}],
 [{'entity': 'B-SPD',
   'score': 0.3913223,
   'index': 3,
   'word': 'quick',
   'start': 7,
   'end': 12},
  {'entity': 'B-CON',
   'score': 0.89510155,
   'index': 5,
   'word': 'convenient',
   'start': 17,
   'end': 27}],
 [],
 [],
 [],
 [{'entity': 'B-INF',
   'score': 0.8577528,
   'index': 6,
   'word': '##ative',
   'start': 24,
   'end': 29},
  {'entity': 'I-INF',
   'score': 0.9431742,
   'index': 9,
   'word': '##re',
   'start': 36,
   'end': 38}],
 [],
 [{'entity': 'B-CON',
   'score': 0.84389293,
   'index': 3,
   'word': 'convenient',
   'start': 4,
   'end': 14}],
 [],
 [{'entity': 'B-CON',
   'score': 0.38207504,
   'index': 2,
   'word': 'decent',
   'start': 1,
   'end': 7},
  {'entity': 'I-INF',
   'score': 0.6005461,
   'index': 14,
   'word': 'rates',
   'start': 62,
   'end': 67}],
 [{'entity': 'B-CON',
   'score': 0.6322961,
   'index': 2,
   'word'

# 3. Rough Work
## 3.1 Using Phi-3 Model to predict the entity

In [54]:
# Label which parts are informative
prompt_add = "The ATM at Bank XYZ was super convenient and had English instructions. Label which parts of the text describing informative in string format. Do not add explanation behind."

client = InferenceClient(api_key="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")

data = client.chat_completion(
	model="microsoft/Phi-3-mini-4k-instruct",
	messages=[{"role": "user", "content": prompt_add}],
	max_tokens=2048,
	stream=True,
)

for message in data:
    print(message.choices[0].delta.content, end="")

 "ATM at Bank XYZ", "super", "convenient", "English instructions"

In [55]:
# Label which parts are speed
prompt_add = "The ATM at Bank XYZ was super convenient and had English instructions. Label which parts of the text describing speed in string format. Do not add explanation behind."

client = InferenceClient(api_key="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")

data = client.chat_completion(
	model="microsoft/Phi-3-mini-4k-instruct",
	messages=[{"role": "user", "content": prompt_add}],
	max_tokens=2048,
	stream=True,
)

for message in data:
    print(message.choices[0].delta.content, end="")

 The parts of the text describing speed in string format are "convenient", "English", and "instructions". As they are not numerical values, but descriptions, they cannot be quantified directly in terms of speed.

In [56]:
# Label which parts are convenience
prompt_add = "The ATM at Bank XYZ was super convenient and had English instructions. Label which parts of the text describing convenience. Do not add explanation behind."

client = InferenceClient(api_key="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")

data = client.chat_completion(
	model="microsoft/Phi-3-mini-4k-instruct",
	messages=[{"role": "user", "content": prompt_add}],
	max_tokens=2048,
	stream=True,
)

for message in data:
    print(message.choices[0].delta.content, end="")

 Convenience factors: 
- English instructions
- ATM availability

the responses are the same???

## 3.2 GPT-2

In [57]:
prompt1 = "I have just made a banking transaction. It was"

In [58]:
from transformers import pipeline, set_seed
generator = pipeline('text-generation', model='gpt2')
set_seed(42)
generated_text = generator(prompt1, 
        max_length=100, 
        num_return_sequences=100)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


KeyboardInterrupt: 

In [12]:
for i, text in enumerate(generated_text):
    print(f"Generated text {i+1}:\n{text['generated_text']}\n")

Generated text 1:
I have just made a banking transaction. It was my only deposit in the state bank. I also got a deposit in the state bank of New Castle County. I was told to pay $2,700 in deposits with the state police and this was a cashier's deposit. I contacted the bank. She said, "I'll be taking your money."

Barry Dickson Jr.

Barry is a very private lawyer. He always gives a big smile and usually he will

Generated text 2:
I have just made a banking transaction. It was to a mutual friend and is currently trading at.01 BTC on the NYSE. As I go to my local ATM my wallet has been hacked. The customer is very helpful and told me when I can get my coins back. I will call and have their refund done. I am confident my bill will be paid for next time I check this page.

http://www.coinshive.co.uk/

It can't

Generated text 3:
I have just made a banking transaction. It was a transaction involving a small amount of $1,000 and we'd like to buy some stuff that has no interest in a bank."

G