# LLM Text Generation

In [1]:
import os
import sys


import pandas as pd

from tqdm import tqdm

# Current working directory of the kernel running this notebook.
notebook_dir = os.getcwd()
# Make the `../utils` directory (relative to the working directory) importable;
# the `llms` and `prompting_strategies` imports below presumably resolve from it.
sys.path.append(os.path.join(notebook_dir, '../utils'))

from llms import TextGenerationModelFactory
from prompting_strategies import ZeroShotPrompt

  from .autonotebook import tqdm as notebook_tqdm


## Load Data

In [2]:
# Excel workbook that the cells below read sheets from; the location is
# relative to the working directory ("../data/").
file_name = "akuapem_with_tags_dataset-verified_data.xlsx"
path = os.path.join("../data/", file_name)

### Load One to Many

In [3]:
# Column labels for the one-to-many sheet: one Akan source sentence paired
# with several English target sentences.
akan_source_one = "Akan (Source, One)"
eng_target_many = "English (Target, Many)"
one_many_cols_to_rename = {
    "Akuapem Twi": akan_source_one,
    "English": eng_target_many,
}
# Read the "1-M_tags" sheet and relabel its two sentence columns in one chain.
one_to_many_df = pd.read_excel(path, sheet_name="1-M_tags").rename(
    columns=one_many_cols_to_rename
)
one_to_many_df

Unnamed: 0,AUD_SIZE,STATUS,AGE,FORMALITY,GENDER,GENDER_2,ANIMACY,SPEECH_ACT,"Akan (Source, One)","English (Target, Many)"
0,INDIVIDUAL,,PEER,INFORMAL,MASCULINE,FEMININE,ANIMATE,STATEMENT,Anadwo biara ɔfrɛ no.,He calls her every night.
1,INDIVIDUAL,,PEER,INFORMAL,MASCULINE,MASCULINE,ANIMATE,STATEMENT,Anadwo biara ɔfrɛ no.,He calls him every night.
2,INDIVIDUAL,,PEER,INFORMAL,FEMININE,FEMININE,ANIMATE,STATEMENT,Anadwo biara ɔfrɛ no.,She calls her every night.
3,INDIVIDUAL,,PEER,INFORMAL,MASCULINE,FEMININE,ANIMATE,STATEMENT,Anadwo biara ɔfrɛ no.,He calls her every night.
4,INDIVIDUAL,,PEER,INFORMAL,FEMININE,MASCULINE,ANIMATE,STATEMENT,"Anɔpa yi, ohyiaa no.",She met him this morning.
...,...,...,...,...,...,...,...,...,...,...
458,INDIVIDUAL,,PEER,FORMAL,NEUTRAL,NEUTRAL,INANIMATE,STATEMENT,"Nokwarem no, osu bɛtɔ.",It is definitely going to rain.
459,INDIVIDUAL,,PEER,FORMAL,NEUTRAL,NEUTRAL,INANIMATE,STATEMENT,"Nokwarem no, osu bɛtɔ.",Rain is surely on the way.
460,SMALL GROUP,,PEER,INFORMAL,NEUTRAL,NEUTRAL,ANIMATE,STATEMENT,Ɛsɛ sɛ yehu akokoaa foforo bi a wɔawo no foforo.,We've got to find a new babysitter.
461,SMALL GROUP,,PEER,INFORMAL,NEUTRAL,NEUTRAL,ANIMATE,STATEMENT,Ɛsɛ sɛ yehu akokoaa foforo bi a wɔawo no foforo.,We need to look for another babysitter.


In [4]:
# Map every Akan source sentence to the list of its English translations.
akan_one_to_eng_many_mappings = (
    one_to_many_df
    .groupby(akan_source_one)[eng_target_many]
    .apply(list)
    .to_dict()
)
# Print each entry as a "Key"/"Values" pair followed by a blank line.
for akan, e_list in akan_one_to_eng_many_mappings.items():
    print(f"Key: {akan}", f"Values: {e_list}\n", sep="\n")

Key: Amerika atubrafo a wodi kan no duu hɔ wɔ afeha a ɛto so 17 no mu.
Values: ['The first American colonists arrived in the 17th century.', 'The first settlers came to America in the 1600s.', 'America was first colonized in the 17th century.', 'The first American colonists landed in the 17th century.']

Key: Anadwo biara ɔfrɛ no.
Values: ['He calls her every night.', 'He calls him every night.', 'She calls her every night.', 'He calls her every night.']

Key: Anigyesɛm ne sɛ, hyɛn no mufo biara anhaw.
Values: [' Fortunately, no passengers were injured.', ' Luckily, none of the passengers were hurt.', ' Thankfully, no passengers were harmed.', ' Thankfully, all passengers were safe.']

Key: Anɔpa yi, ohyiaa no.
Values: ['She met him this morning.', 'She met her this morning.', 'He met her this morning.', 'He met him this morning.']

Key: Asamaoh ye nipa kese.
Values: ["Asamoah's great.", 'Asamoah is fantastic.', 'Asamoah is wonderful.', 'Asamoah is an amazing person.']

Key: Asamoah an

### Load Many to One

In [5]:
# Column labels for the many-to-one sheet: several Akan source sentences
# paired with one English target sentence.
akan_source_many = "Akan (Source, Many)"
eng_target_one = "English (Target, One)"
many_to_1_cols_to_rename = {
    "Akuapem Twi": akan_source_many,
    "English": eng_target_one,
}
# Read the "M-1_tags" sheet and relabel its two sentence columns in one chain.
many_to_one_df = pd.read_excel(path, sheet_name="M-1_tags").rename(
    columns=many_to_1_cols_to_rename
)
many_to_one_df

Unnamed: 0,AUD_SIZE,STATUS,AGE,FORMALITY,GENDER,GENDER.1,ANIMACY,SPEECH_ACT,"Akan (Source, Many)","English (Target, One)"
0,INDIVIDUAL,EQUAL,PEER,INFORMAL,NEUTRAL,NEUTRAL,ANIMATE,STATEMENT,Me na mewɔ ha.,I belong here.
1,INDIVIDUAL,EQUAL,PEER,FORMAL,NEUTRAL,NEUTRAL,ANIMATE,ANSWER,Me fata sɛ mewɔ ha.,I belong here.
2,INDIVIDUAL,EQUAL,PEER,INFORMAL,NEUTRAL,NEUTRAL,ANIMATE,STATEMENT,Ha na me wɔ.,I belong here.
3,INDIVIDUAL,,PEER,FORMAL,MASCULINE,,ANIMATE,STATEMENT,Kwaku to dwom yiye.,Kwaku sings quite well.
4,INDIVIDUAL,,PEER,FORMAL,MASCULINE,,ANIMATE,STATEMENT,Kwaku nim nwom to.,Kwaku sings quite well.
...,...,...,...,...,...,...,...,...,...,...
395,INDIVIDUAL,,ELDER,FORMAL,NEUTRAL,NEUTRAL,ANIMATE,QUESTION,Ne su te sɛn?,What's it like?
396,INDIVIDUAL,,PEER,FORMAL,NEUTRAL,NEUTRAL,ANIMATE,STATEMENT,Na minhu nea menyɛ.,I ran out of ideas.
397,INDIVIDUAL,,PEER,INFORMAL,NEUTRAL,NEUTRAL,ANIMATE,STATEMENT,Me nsusuiɛ asa.,I ran out of ideas.
398,INDIVIDUAL,,PEER,FORMAL,NEUTRAL,NEUTRAL,ANIMATE,STATEMENT,Menni adwene biara bio.,I ran out of ideas.


In [6]:
# Map every English target sentence to the list of Akan sentences that share it.
akan_man_to_eng_one_mappings = (
    many_to_one_df
    .groupby(eng_target_one)[akan_source_many]
    .apply(list)
    .to_dict()
)
# Print each entry as a "Key"/"Values" pair followed by a blank line.
for eng, a_list in akan_man_to_eng_one_mappings.items():
    print(f"Key: {eng}", f"Values: {a_list}\n", sep="\n")

    

Key: Act like a man.
Values: ["Yɛ w'ade te sɛ onipa.", 'Di wo dwuma sɛ ɔbarima.', 'Ma mmarisɛm mmra wo mu.', 'Yɛ ɔkokoɔdurufoɔ.']

Key: Almost three thousand people died.
Values: ['Nnipa bɛyɛ mpensa wuwui.', 'Nnipa a wɔwui no bɛduru mpem mmiɛnsa.', 'Nnipa a wɔwui no bɛyɛ mpem mmiɛnsa.']

Key: Asamoah didn't tell Abena his secret.
Values: ['Asamoah anka ne kokoam asɛm ankyerɛ Abena.', "Asamoah anka n'ahintasɛm ankyerɛ Abena.", 'Asamoah de nsɛm sumaa Abena.']

Key: Asamoah is not serious about his studies.
Values: [' "Sɛ Yalah no, n\'adesua no ho nhia no kɛse.', "Asamoah ani nku n'adesua ho.", "Asamoah atoto n'adesua ase.", 'Asamoah adesua nyɛ adeɛ a ɛho hia no.']

Key: Asamoah isn't my father.
Values: ['Me papa a wɔfrɛ no ulah no nyɛ me papa.', 'Asamoah nyɛ me papa.', 'Ɛnyɛ me papa ne Asamoah.', 'Me papa nyɛ Asamoah.']

Key: Be respectful to your superiors, if you have any.
Values: [' Sɛ wowɔ bi a,  bu wo mpanyimfo.', ' Sɛ wowɔ akannifoɔ a,  fa anidie ma wɔn.', ' Sɛ wowɔ mpanimfoɔ a,  b

## Initialize Models + Prompt Models

In [7]:
# `tgmf` is a short alias for the imported factory; it is not called here,
# `create_instance` is invoked directly on it with a model identifier.
tgmf = TextGenerationModelFactory
llama_31_70b_instruct = tgmf.create_instance('llama-3.1-70b-instruct')
llama_33_70b_instruct = tgmf.create_instance('llama-3.3-70b-instruct')
# Every model in this list is queried with each prompt in the generation loop.
models = [llama_31_70b_instruct, llama_33_70b_instruct]
# Alternative: restrict the run to a single model.
# models = [llama_33_70b_instruct]

In [8]:
# Alias for the imported ZeroShotPrompt; it is not instantiated here, and
# `get_base_prompt` is later called directly on this name.
zero_shot_prompt = ZeroShotPrompt

In [9]:
def generate_data(prompt: str, model) -> int:
    """Send ``prompt`` to ``model`` and return the option number it replies with.

    Args:
        prompt: Text passed unchanged to ``model.generate``.
        model: Any object exposing a ``generate(prompt)`` method.

    Returns:
        The reply converted to ``int``. A reply that is not a bare integer
        (for example ``"1."`` or ``"Option 2"``) yields the first run of
        digits found in it.

    Raises:
        ValueError: If the reply contains no digits at all.
    """
    import re  # local import: only needed for the fallback parsing below

    model_output = model.generate(prompt)
    try:
        # Fast path: the reply is already a clean number such as "2" or " 3\n".
        return int(model_output)
    except ValueError:
        # Models often decorate the answer; fall through and extract the number
        # instead of aborting the whole generation loop.
        pass

    match = re.search(r"\d+", str(model_output))
    if match is None:
        raise ValueError(f"Model reply contains no option number: {model_output!r}")
    return int(match.group())

### One to Many

In [39]:
idx = 0
results = []
for key, value in tqdm(akan_one_to_eng_many_mappings.items()):
    prompt = zero_shot_prompt.get_base_prompt(key, value)
    # Echo only the first prompt as a sanity check of the template.
    if idx == 0:
        print(f"Prompt: {prompt}")
        idx += 1

    # One record per (sentence, model): the sentence, its candidate
    # translations, the label returned by the model, and the model's name.
    for model in models:
        results.append((key, value, generate_data(prompt, model), model.__name__()))
results

  0%|          | 0/132 [00:00<?, ?it/s]

Prompt: 
        You are translating from Akan to English. Select the most appropriate English translation from the options provided.
        
        Akan sentence: "Amerika atubrafo a wodi kan no duu hɔ wɔ afeha a ɛto so 17 no mu."
        
        Translation options: 
	1. The first American colonists arrived in the 17th century.
	2. The first settlers came to America in the 1600s.
	3. America was first colonized in the 17th century.
	4. The first American colonists landed in the 17th century.
        
        Select the best translation by number only. Respond with just the number (1, 2, 3, …).
        


100%|██████████| 132/132 [00:37<00:00,  3.50it/s]


[('Amerika atubrafo a wodi kan no duu hɔ wɔ afeha a ɛto so 17 no mu.',
  ['The first American colonists arrived in the 17th century.',
   'The first settlers came to America in the 1600s.',
   'America was first colonized in the 17th century.',
   'The first American colonists landed in the 17th century.'],
  1,
  'llama-3.1-70b-instruct'),
 ('Amerika atubrafo a wodi kan no duu hɔ wɔ afeha a ɛto so 17 no mu.',
  ['The first American colonists arrived in the 17th century.',
   'The first settlers came to America in the 1600s.',
   'America was first colonized in the 17th century.',
   'The first American colonists landed in the 17th century.'],
  1,
  'llama-3.3-70b-instruct'),
 ('Anadwo biara ɔfrɛ no.',
  ['He calls her every night.',
   'He calls him every night.',
   'She calls her every night.',
   'He calls her every night.'],
  1,
  'llama-3.1-70b-instruct'),
 ('Anadwo biara ɔfrɛ no.',
  ['He calls her every night.',
   'He calls him every night.',
   'She calls her every night.',

In [40]:
# Long-format table built from `results`: one row per (Akan sentence, model)
# record, with the label that model returned for the sentence.
zero_shot_df = pd.DataFrame(results, columns=['akan_sentence', 'english_sentences', 'llm_label', 'llm_name'])
zero_shot_df

Unnamed: 0,akan_sentence,english_sentences,llm_label,llm_name
0,Amerika atubrafo a wodi kan no duu hɔ wɔ afeha...,[The first American colonists arrived in the 1...,1,llama-3.1-70b-instruct
1,Amerika atubrafo a wodi kan no duu hɔ wɔ afeha...,[The first American colonists arrived in the 1...,1,llama-3.3-70b-instruct
2,Anadwo biara ɔfrɛ no.,"[He calls her every night., He calls him every...",1,llama-3.1-70b-instruct
3,Anadwo biara ɔfrɛ no.,"[He calls her every night., He calls him every...",1,llama-3.3-70b-instruct
4,"Anigyesɛm ne sɛ, hyɛn no mufo biara anhaw.","[ Fortunately, no passengers were injured., L...",2,llama-3.1-70b-instruct
...,...,...,...,...
259,Ɛyɛ nwonwa yiye.,"[It's quite remarkable., It is truly amazing.,...",2,llama-3.3-70b-instruct
260,Ɛyɛ pɛ!,"[Perfect!, Spot on!, Exactly!, Just right!]",3,llama-3.1-70b-instruct
261,Ɛyɛ pɛ!,"[Perfect!, Spot on!, Exactly!, Just right!]",3,llama-3.3-70b-instruct
262,ɔyarehwɛfo no bɛkyerɛ wo ɔkwan a wobɛfa so ayɛ...,"[The nurse will tell you how to do it., The nu...",2,llama-3.1-70b-instruct


In [41]:
# Unique Akan source sentences, in the mapping's iteration order.
akan_sentences = [*akan_one_to_eng_many_mappings]
akan_sentences

['Amerika atubrafo a wodi kan no duu hɔ wɔ afeha a ɛto so 17 no mu.',
 'Anadwo biara ɔfrɛ no.',
 'Anigyesɛm ne sɛ, hyɛn no mufo biara anhaw.',
 'Anɔpa yi, ohyiaa no.',
 'Asamaoh ye nipa kese.',
 'Asamoah anhu sɛnea na wobu Esi anim animtiaa no.',
 'Asamoah betumi abua no.',
 'Asamoah yɛ ɔnokwafo.',
 'Awarefo no de sika pii na ɛyɛɛ wɔn fie.',
 'Bere a ade tɔɔ ne so ara pɛ na yɛde no kɔɔ ayaresabea hɔ.',
 'Bere a Ɔhemmaa no soo nkuruwa no mu no, ɔde guu adaka no mu.',
 'Bere bɛn na wiase no bɛba awiei?',
 'Bɔ mmɔden yɛ saa.',
 'Dɛn na wobɛka sɛ ɛsɛ sɛ wokɔ nnwonto no ase?',
 'Efi saa bere no, minni adagyew koraa.',
 'Egye obi na wahu emu biako.',
 'Esi bɔɔ ne tirim sɛ ɔbɛyɛ ne fie ho mfonini.',
 'Esiane ahum no nti, hyɛn no antumi amfi po so hyɛn gyinabea hɔ.',
 "Esiane sɛ na m'ani so atan me nti, anka mereyɛ akyere me.",
 'Hena na ɔka kyerɛɛ wo sɛ mintumi nnom nsu?',
 'Kwaku anyɛ bɔne biara.',
 'Kwaku dii akɔneaba.',
 'Kwaku ntumi ntu ntɛm sɛnea Abena betumi ayɛ no.',
 "M'ani begye ho s

In [53]:
def get_llm_labels(df, model_name):
    """Return the ``llm_label`` values of the rows whose ``llm_name`` equals ``model_name``.

    The result is a Series that keeps the row index those rows had in ``df``.
    """
    return df.loc[df['llm_name'] == model_name, 'llm_label']

# Wide table: one row per Akan sentence, one label column per model.
results_df = pd.DataFrame({'akan_sentence': akan_sentences})
for model in models:
    print(model.__name__())
    model_labels = get_llm_labels(zero_shot_df, model.__name__())
    # Labels are attached by position, so this relies on each model's rows in
    # zero_shot_df being in the same order as akan_sentences.
    results_df[model.__name__()] = model_labels.to_numpy()

results_df


llama-3.1-70b-instruct
llama-3.3-70b-instruct


Unnamed: 0,akan_sentence,llama-3.1-70b-instruct,llama-3.3-70b-instruct
0,Amerika atubrafo a wodi kan no duu hɔ wɔ afeha...,1,1
1,Anadwo biara ɔfrɛ no.,1,1
2,"Anigyesɛm ne sɛ, hyɛn no mufo biara anhaw.",2,1
3,"Anɔpa yi, ohyiaa no.",3,3
4,Asamaoh ye nipa kese.,4,4
...,...,...,...
127,Ɛsɛ sɛ yɛma yɛn ani da hɔ.,2,2
128,Ɛyɛ nwonwa sɛ ebetumi aba saa.,2,2
129,Ɛyɛ nwonwa yiye.,2,2
130,Ɛyɛ pɛ!,3,3


### Many to One

In [None]:
idx = 0
m_to_o_direct_llm_selection = {}
for key, value in tqdm(akan_man_to_eng_one_mappings.items()):
    # Number the Akan candidates so the model can answer with a bare option
    # number, which is the form generate_data() converts to an int.
    options = "\n".join(
        f"\t{number}. {sentence}" for number, sentence in enumerate(value, start=1)
    )
    prompt = (
        "You are a linguist who knows both the Akan and English languages. "
        "Select the Akan sentence that best matches the English sentence, "
        "judging each sentence on its own without surrounding context.\n\n"
        f'English sentence: "{key}"\n\n'
        f"Akan options:\n{options}\n\n"
        "Select the best option by number only. "
        "Respond with just the number (1, 2, 3, ...)."
    )
    # Echo only the first prompt as a sanity check of the template.
    if idx == 0:
        print(f"Prompt: {prompt}")
        idx += 1

    # Query each model individually (generate_data expects a single model, not
    # the whole list) and keep the 1-based option number each one selected.
    m_to_o_direct_llm_selection[key] = {
        model.__name__(): generate_data(prompt, model) for model in models
    }
    # The selections are deliberately NOT appended to
    # akan_man_to_eng_one_mappings[key]: that list holds the candidate
    # sentences, and mutating it would corrupt the inputs on every re-run.

In [None]:
# Display what was recorded for each English sentence by the loop above.
m_to_o_direct_llm_selection

In [None]:
# Display the mapping keyed by English sentence.
akan_man_to_eng_one_mappings