# Importing Libraries

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from huggingface_hub import InferenceClient
import pandas as pd
import json
from transformers import BertTokenizerFast, DataCollatorForTokenClassification, AutoModelForTokenClassification, TrainingArguments, Trainer, pipeline
import re

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
prompt = 'Generate 100 customer responses regarding a banking experience, make sure to have parts describing convenience, speed and informative. Format each response into a single-line message. Do not need index like "1.".'

# 1. Generate Transcripts
microsoft/phi-3-mini-4k-instruct

In [None]:
# Generate Transcripts using API
client = InferenceClient(api_key="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")
data = client.chat_completion(
    model="microsoft/Phi-3-mini-4k-instruct",
    messages=[{"role": "user", "content": prompt}],
    max_tokens=2048,
    stream=True,
)

messages = []
for message in data:
    content = message.choices[0].delta.content
    messages.append(content)

# Join messages
full_response = ''.join(messages)
response_lst = list(full_response.split('\n'))

# Remove '' and whitespaces
response_lst = [response.strip() for response in response_lst if response != '']
# response_lst

# Remove numbering using regular expressions
response_lst = [re.sub(r'^\d+\.\s*', '', response) for response in response_lst]
response_lst

['Quick services at the branch, incredibly convenient.',
 'Checked my account balance - so fast!',
 'Very informative chat support, love it!',
 'Gained clarity about my investment options - real help.',
 '...',
 'Felt rushed at the counter, just wanted to be more friendly.',
 'Waited 15 minutes for ATM service - not worth it.',
 "Had no personal device consultation, wished I'd asked for it."]

# 2. Predict Entity

In [10]:
label_list = ['O', 'B-CON', "I-CON", "B-SPD", "I-SPD", "B-INF", "I-INF"]

id2label = {
    str(i): label for i,label in enumerate(label_list)
}
label2id = {
    label: str(i) for i,label in enumerate(label_list)
}

config = json.load(open("ner_model/config.json"))
config["id2label"] = id2label
config["label2id"] = label2id

json.dump(config, open("ner_model/config.json","w"))

In [11]:
model_fine_tuned = AutoModelForTokenClassification.from_pretrained("ner_model")
tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")

In [None]:
# # Some trial run
nlp = pipeline("ner", model=model_fine_tuned, tokenizer=tokenizer)

# example = "Absolutely thrilled with the fast online banking service! Effortless transactions anytime."

# ner_results = nlp(example)

# # for i in ner_results:
# # 	print(i)

In [13]:
res = [nlp(response) for response in response_lst]
res

[[{'entity': 'B-SPD',
   'score': 0.99470574,
   'index': 1,
   'word': 'quick',
   'start': 0,
   'end': 5},
  {'entity': 'I-SPD',
   'score': 0.99729013,
   'index': 2,
   'word': 'services',
   'start': 6,
   'end': 14}],
 [],
 [{'entity': 'B-CON',
   'score': 0.466963,
   'index': 3,
   'word': '##ative',
   'start': 11,
   'end': 16},
  {'entity': 'I-CON',
   'score': 0.5576583,
   'index': 4,
   'word': 'chat',
   'start': 17,
   'end': 21},
  {'entity': 'I-INF',
   'score': 0.697609,
   'index': 5,
   'word': 'support',
   'start': 22,
   'end': 29}],
 [{'entity': 'B-SPD',
   'score': 0.6985686,
   'index': 8,
   'word': 'real',
   'start': 45,
   'end': 49}],
 [],
 [],
 [{'entity': 'B-SPD',
   'score': 0.99768984,
   'index': 2,
   'word': '15',
   'start': 7,
   'end': 9},
  {'entity': 'I-SPD',
   'score': 0.9985863,
   'index': 3,
   'word': 'minutes',
   'start': 10,
   'end': 17}],
 [{'entity': 'B-INF',
   'score': 0.4580407,
   'index': 4,
   'word': 'device',
   'start': 

# 3. Reviews

In [None]:
df = pd.read_csv('../data/bank_reviews3.csv')
reviews = df['review']
n, p = df.shape

In [36]:
res = []
for i in range(n):
    example = reviews[i]
    ner_results = nlp(example)
    for j in ner_results:
        res.append(j)

In [38]:
from collections import defaultdict
summary = defaultdict(lambda: {'count': 0, 'total_score': 0.0, 'words': set()})

# Populate summary dictionary
for result in res:
    entity_type = result['entity']
    summary[entity_type]['count'] += 1
    summary[entity_type]['total_score'] += result['score']
    summary[entity_type]['words'].add(result['word'])

# Calculate averages and prepare final summary
for entity, stats in summary.items():
    stats['average_score'] = stats['total_score'] / stats['count']
    stats['words'] = list(stats['words'])  # Convert set to list for readability

# Display summary
for entity, stats in summary.items():
    print(f"Entity Type: {entity}")
    print(f"  - Total Count: {stats['count']}")
    print(f"  - Average Confidence Score: {stats['average_score']:.2f}")
    print(f"  - Example Words: {', '.join(stats['words'])}")
    print()

Entity Type: B-CON
  - Total Count: 598
  - Average Confidence Score: 0.72
  - Example Words: moderate, 81, any, staggered, prompt, great, 1000, normal, internet, mandatory, pathetic, 850, clear, more, fast, 2, friendly, come, proper, yes, quarterly, good, customized, maintain, perfect, need, comfortable, 2500, awesome, separate, convenient, fair, 100, convenience, instantly, minimum, secure, average, smooth, non, simple, protocol, right, high, active, attractive, organized, easier, difficult, ##1, quick, ##able, mobile, fake, limit, difficulties, hopeless, usa, safe, improve, supportive, best, differ, okay, excellent, avail, free, long, slow, ##icate, better, ##sle, annual, yearly, option, nice, ##tive, easy, reliable, monthly, near, online, fine, net

Entity Type: I-INF
  - Total Count: 692
  - Average Confidence Score: 0.72
  - Example Words: offer, update, ##dation, details, bounce, en, 1000, ##que, notification, rate, ##ment, id, 50, de, sign, ##ig, response, que, ##unt, pay, 5000