In [None]:
import kagglehub

# Download the latest version of the Llama-2 7B chat (HF format) model files
# through kagglehub and remember the local directory it reports.
path = kagglehub.model_download(
    "metaresearch/llama-2/pyTorch/7b-chat-hf"
)

print(f"Path to model files: {path}")

In [None]:
import transformers
import torch
from transformers import AutoTokenizer

# Directory the Llama-2 7B chat checkpoint is loaded from.
# NOTE(review): the `path` value produced by the download cell is not used
# here; confirm that this hard-coded location is the intended one.
model = "/kaggle/input/llama-2/pytorch/7b-chat-hf/1"

# The tokenizer is loaded separately; later cells read its EOS token id.
tokenizer = AutoTokenizer.from_pretrained(model)

# Text-generation pipeline with float16 weights and automatic device mapping.
pipeline = transformers.pipeline(
    task="text-generation",
    model=model,
    torch_dtype=torch.float16,
    device_map="auto",
)

In [None]:
import pandas as pd

def get_response(word: str, example_sentence: str, eos_token_id: int, do_sample=True, top_k=10, num_return_sequences=1, max_length=4000):
    """Ask the text-generation pipeline to explain ``word`` in its context.

    The prompt consists of a system instruction, one worked example (the word
    "misapprehension") and the actual request built from ``word`` and
    ``example_sentence``.

    Args:
        word: The word to be explained.
        example_sentence: Sentence showing the word in context.
        eos_token_id: End-of-sequence token id forwarded to the pipeline.
        do_sample, top_k, num_return_sequences, max_length: Generation
            options forwarded unchanged to the pipeline call.

    Returns:
        The ``'generated_text'`` entry of the first sequence the pipeline
        yields, or ``None`` when it yields nothing. Any further sequences
        are ignored. The text is returned raw (presumably still containing
        the prompt - see ``clean_response``).
    """
    few_shot_prompt = f"""<s>[INST] <<SYS>>
        Explain meaning of the given word. Respond directly and formally to each question. Do not use the phrase 'Sure!' at the beginning of your response.
        If a question does not make any sense or is not factually coherent, explain why instead of answering something not correct.
        If you don't know the answer, please don't share false information.
        <</SYS>>
        Explain the meaning of the word misapprehension basing on context “I’m afraid you are acting under a misapprehension, Governor." [/INST]
        The word misapprehension means a misunderstanding or a wrong interpretation of something.

        In the context of the sentence "I’m afraid you are acting under a misapprehension, Governor," it implies that the Governor is acting based on a false or incorrect understanding of a situation or fact. The speaker is politely indicating that the Governor's actions or decisions might be misguided due to this misunderstanding.

        Synonyms: misconception, mistaken belief, misunderstanding

        Usage example: He refused the offer due to a misapprehension about the terms of the contract
        </s>


        <s>[INST]  
        Explain the meaning of the word {word} basing on context "{example_sentence}"
        [/INST]"""

    generation_options = {
        "do_sample": do_sample,
        "top_k": top_k,
        "num_return_sequences": num_return_sequences,
        "eos_token_id": eos_token_id,
        "max_length": max_length,
    }
    sequences = pipeline(few_shot_prompt, **generation_options)

    # Only the first generated sequence is ever used.
    first_sequence = next(iter(sequences), None)
    if first_sequence is None:
        return None
    return first_sequence['generated_text']

def clean_response(response: str):
    """Strip prompt scaffolding from raw generated text.

    Steps, in order:
      1. remove every ``<s>`` and ``</s>`` marker;
      2. keep only what follows the last ``[INST]``, then the last
         ``[/INST]``, then the last ``<<SYS>>``, then the last ``<</SYS>>``
         (a marker that does not occur leaves the text unchanged);
      3. delete all remaining ``[`` and ``]`` characters;
      4. trim surrounding whitespace.

    Args:
        response: Raw text as returned by the generation pipeline.

    Returns:
        The cleaned text.
    """
    text = response
    for sentence_marker in ('<s>', '</s>'):
        text = text.replace(sentence_marker, '')

    # rpartition(...)[2] is the tail after the last occurrence of the marker,
    # or the whole string when the marker is absent.
    for prompt_marker in ('[INST]', '[/INST]', '<<SYS>>', '<</SYS>>'):
        text = text.rpartition(prompt_marker)[2]

    without_brackets = text.translate(str.maketrans('', '', '[]'))
    return without_brackets.strip()

def get_final_response(word: str, example_sentence: str, eos_token_id: int):
    """Generate an explanation for ``word`` and return it cleaned up.

    Calls ``get_response`` with its default generation options and passes
    the raw text through ``clean_response``.

    Args:
        word: The word to be explained.
        example_sentence: Sentence showing the word in context.
        eos_token_id: End-of-sequence token id forwarded to ``get_response``.

    Returns:
        The cleaned explanation text.
    """
    raw_text = get_response(word, example_sentence, eos_token_id)
    return clean_response(raw_text)

def prepare_data_from_csv(input_dir: str):
    """Load the vocabulary CSV and reduce it to the columns used downstream.

    Args:
        input_dir: Location of the CSV file; passed straight to
            ``pd.read_csv`` (despite the name it is the file itself, not a
            directory).

    Returns:
        A DataFrame holding whichever of ``word``, ``sentence``,
        ``source_author`` and ``source_title`` exist in the file, plus an
        ``output`` column filled with empty strings. ``<b>`` and ``</b>``
        markers are removed from ``sentence``.
    """
    wanted_columns = ['word', 'sentence', 'source_author', 'source_title']

    table = pd.read_csv(input_dir).filter(items=wanted_columns)
    table['output'] = ""

    # Drop the bold markers literally (no regex interpretation).
    for bold_tag in ('<b>', '</b>'):
        table['sentence'] = table['sentence'].str.replace(bold_tag, '', regex=False)

    return table

def df_col_to_dict(df: pd.DataFrame):
    """Convert the prepared DataFrame into a dictionary of column lists.

    Args:
        df: DataFrame with the columns ``word``, ``sentence``,
            ``source_author``, ``source_title`` and ``output``.

    Returns:
        A dictionary with the keys ``word``, ``sentence``, ``author``,
        ``title`` and ``output`` (in that order), each mapped to the values
        of the matching column as a plain list.
    """
    # Result key -> name of the DataFrame column that feeds it.
    column_for_key = {
        'word': 'word',
        'sentence': 'sentence',
        'author': 'source_author',
        'title': 'source_title',
        'output': 'output',
    }
    return {key: df[column].tolist() for key, column in column_for_key.items()}

def prepare_sentence_author_title(sentence, author, title):
    """Build the citation block appended to a generated explanation.

    Args:
        sentence: The example sentence (string).
        author: Author of the source (string).
        title: Title of the source (string).

    Returns:
        A blank line, the sentence in double quotes, then a new line with
        ``author - title``.
    """
    pieces = ['\n\n"', sentence, '"\n', author, " - ", title]
    return ''.join(pieces)
    

def getMultipleResponses(input_dict, tokenizer):
    """Generate an explanation for every word in ``input_dict``.

    For each position ``i`` the word and its sentence are sent to
    ``get_final_response``; the cleaned answer, followed by the citation
    block from ``prepare_sentence_author_title``, is stored under
    ``'output'`` at the same position.

    Args:
        input_dict: Dictionary of equally long lists under the keys
            ``word``, ``sentence``, ``author``, ``title`` and ``output``.
            It is not modified.
        tokenizer: Object whose ``eos_token_id`` attribute is forwarded to
            the generation call.

    Returns:
        A new dictionary with the same keys as ``input_dict`` whose
        ``'output'`` list holds the generated texts.
    """
    # Work on a copy: a plain `output_dict = input_dict` would alias the
    # caller's dictionary and overwrite its 'output' list in place. Only the
    # 'output' list is written to, so only that list needs its own copy.
    output_dict = dict(input_dict)
    output_dict['output'] = list(input_dict['output'])

    for i in range(len(input_dict['word'])):
        sentence = input_dict['sentence'][i]
        word = input_dict['word'][i]
        author = input_dict['author'][i]
        title = input_dict['title'][i]

        response = get_final_response(
            word=word,
            example_sentence=sentence,
            eos_token_id=tokenizer.eos_token_id,
        )
        citation = prepare_sentence_author_title(sentence=sentence, author=author, title=title)
        output_dict['output'][i] = response + citation

    return output_dict

In [None]:
# Load the vocabulary export and reshape it into per-column lists.
input_dataframe = prepare_data_from_csv(
    '/kaggle/input/dictionary-2024-07-30-21-05/dictionary_2024_07_30_21_05.csv'
)
input_dict = df_col_to_dict(input_dataframe)

# Run the model once per word; this is the expensive step of the notebook.
output_dict = getMultipleResponses(input_dict, tokenizer)

# Persist the results. Note the file is tab-separated despite its .csv name.
output_df = pd.DataFrame(output_dict)
output_df.to_csv(
    "generated_responses.csv",
    sep='\t',
    encoding='utf-8',
    index=False,
    header=True,
)

In [None]:
# genanki is required by the deck-building cells below; uncomment to install it:
#pip install git+https://github.com/kerrickstaley/genanki.git

In [None]:
import genanki
import random

# Note model shared by every note; built lazily on the first create_note call.
_simple_model = None


def _get_simple_model():
    """Return the shared 'Simple Model', creating it on first use."""
    global _simple_model
    if _simple_model is None:
        css = """
    .card {
        font-family: Arial, sans-serif;
        font-size: 20px;
        text-align: center;
    }
    """

        # Create the model
        _simple_model = genanki.Model(
            generate_random_number(),
            'Simple Model',
            fields=[
                {'name': 'Question'},
                {'name': 'Answer'},
            ],
            templates=[
                {
                    'name': 'Card 1',
                    'qfmt': '{{Question}}',
                    'afmt': '{{FrontSide}}<hr id="answer">{{Answer}}',
                },
            ],
            css=css
        )
    return _simple_model


def create_note(question, answer):
    """Create a question/answer note based on the shared 'Simple Model'.

    Building a new ``genanki.Model`` with a fresh random id for every note
    gives each note its own model, so the exported package would carry one
    note type per card. The model is therefore created once and reused.

    Args:
        question: Text for the ``Question`` field (front of the card).
        answer: Text for the ``Answer`` field (shown below the question).

    Returns:
        A ``genanki.Note`` with the two fields filled in.
    """
    return genanki.Note(
        model=_get_simple_model(),
        fields=[question, answer]
    )

def create_deck(deck_name, input_df):
    """Build a genanki deck with one note per row of ``input_df``.

    Args:
        deck_name: Name given to the deck.
        input_df: DataFrame with a ``word`` column (used as the question)
            and an ``output`` column (used as the answer).

    Returns:
        A ``genanki.Deck`` with a randomly generated id containing the notes.
    """
    my_deck = genanki.Deck(generate_random_number(), deck_name)

    for position in range(len(input_df)):
        row = input_df.iloc[position]
        question = row["word"]
        # Line breaks become <br> tags (presumably because the card fields
        # are rendered as HTML).
        answer = row["output"].replace('\n', '<br>')
        my_deck.add_note(create_note(question, answer))

    return my_deck


def generate_random_number(start=1 << 30, end=1 << 31):
    """Return a random integer ``n`` with ``start <= n < end``.

    Used to produce ids for genanki decks and models.

    Args:
        start: Inclusive lower bound (default ``1 << 30``).
        end: Exclusive upper bound (default ``1 << 31``).
    """
    lower_bound, upper_bound = start, end
    return random.randrange(lower_bound, upper_bound)

In [None]:
# Turn the generated explanations into a deck and export it as an Anki package.
deck = create_deck(deck_name='lingatest', input_df=output_df)
genanki.Package(deck).write_to_file('outputAFTERBREAK.apkg')