In [None]:
!pip install hf_transfer

In [None]:
import os
# Switch the HF_HUB_ENABLE_HF_TRANSFER flag on for this process and echo the
# value back as a sanity check.
# NOTE(review): presumably read by huggingface_hub to enable the hf_transfer
# download backend installed above - confirm.
hf_transfer_flag = "HF_HUB_ENABLE_HF_TRANSFER"
os.environ[hf_transfer_flag] = "1"
print(os.environ.get(hf_transfer_flag))

In [1]:
import pandas as pd
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, GenerationConfig, Conversation, pipeline
import torch

In [2]:
# Alternative checkpoints (inactive, kept for reference):
#   "meta-llama/Llama-2-7b-chat-hf"
#   "RomanOrac/llama-2-7b-slovenian"
#   "mistralai/Mixtral-8x7B-Instruct-v0.1"
# Active checkpoint: this is the name the tokenizer and model are loaded from.
model_name = "mistralai/Mistral-7B-Instruct-v0.2"

In [3]:
# Pick the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
device

device(type='cuda')

In [4]:
# Load the tokenizer that belongs to the selected checkpoint.
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path=model_name,
)
#tokenizer.pad_token_id = tokenizer.eos_token_id    # for open-ended generation

In [5]:
# Custom Jinja chat template that overrides the tokenizer's own template.
# It renders an [INST] ... [/INST] conversation, optionally folding a leading
# system message into the first user turn, and appends the cue
# "Let's think step by step:" after every user turn.
tokenizer.chat_template = (
            # Split an optional leading system message off from the turns.
            "{% if messages[0]['role'] == 'system' %}"
            "{% set loop_messages = messages[1:] %}"
            "{% set system_message = messages[0]['content'] %}"
            "{% else %}"
            "{% set loop_messages = messages %}"
            "{% set system_message = false %}"
            "{% endif %}"
            "{% for message in loop_messages %}"
            # Turns must strictly alternate, starting with a user turn.
            "{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}"
            "{{ raise_exception('Conversation roles must alternate "
            "user/assistant/user/assistant/...') }}"
            "{% endif %}"
            # The system message is wrapped in <<SYS>> tags and prepended to
            # the content of the first turn only.
            "{% if loop.index0 == 0 and system_message != false %}"
            "{% set content = '<<SYS>>\n' + system_message + '\n<</SYS>>\n\n' + "
            "message['content'] %}"
            "{% else %}"
            "{% set content = message['content'] %}"
            "{% endif %}"
            # User turn: BOS + [INST] content [/INST], then a newline and the
            # step-by-step cue that the model's reply continues from.
            "{% if message['role'] == 'user' %}"
            '{{ bos_token + "[INST] " + content.strip() + " [/INST]" + "\n" + "Let\'s think step by step:" }}'
            # Assistant turn: the stripped reply surrounded by spaces, closed
            # with the EOS token.
            "{% elif message['role'] == 'assistant' %}"
            "{{ ' '  + content.strip() + ' ' + eos_token }}"
            "{% endif %}"
            "{% endfor %}"
            )

In [6]:
# 4-bit NF4 quantisation settings (double quantisation, float16 compute)
# that are handed to the model loader.
quantization_options = {
    "load_in_4bit": True,
    "bnb_4bit_quant_type": "nf4",
    "bnb_4bit_compute_dtype": torch.float16,
    "bnb_4bit_use_double_quant": True,
}
bnb_config = BitsAndBytesConfig(**quantization_options)

In [None]:
!pip install accelerate

In [7]:
# Load the causal-LM checkpoint with the 4-bit quantisation config;
# device_map="auto" leaves device placement to the loader.
model_load_options = dict(
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True,
)
model = AutoModelForCausalLM.from_pretrained(model_name, **model_load_options)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

In [8]:
# Replace the model's generation settings with a sampling configuration.
sampling_settings = {
    "do_sample": True,
    "temperature": 0.7,  #0.01 > 1.0
    #top_k=
}
model.generation_config = GenerationConfig(**sampling_settings)

In [9]:
# Wrap the already-loaded model and tokenizer in a "conversational" pipeline.
generation_pipe = pipeline(
    task="conversational",
    tokenizer=tokenizer,
    model=model,
    device_map="auto",    # finds GPU
    trust_remote_code=True,
)

In [10]:
# Read the labelled sentences; the path is relative to the working directory.
violence_csv_path = "violence.csv"
violence_df = pd.read_csv(violence_csv_path)

In [11]:
# Peek at the sentence stored under index label 0.
violence_df.loc[0, "Sentence"]

'Ker sicer nas boste prisilili, da bomo posegli s svoje rezervne višine, toda ne v volilni boj. ampak v vaša ušesa ter vam jih navili.'

In [12]:
# Rows whose "Example" flag is True.
is_example = violence_df["Example"].eq(True)
examples_df = violence_df.loc[is_example]

In [13]:
# Show the rows flagged as examples.
examples_df

Unnamed: 0,Date,Sentence,Violent,Description,Example
9,21. 11. 1925.,Princip naš bodi: odkrit in viteški boj s pošt...,True,Implied violence or violent methods,True
15,07. 02. 1925.,"Boj, ki smo ga začeli, pa bomo vodili dalje: Z...",True,Implied violence or violent methods,True
23,03. 09. 1925.,"Pripraviti vse, da bo to jutro čimpreje napoči...",True,Implied violence or violent methods,True
28,14. 02. 1925.,O tem si moramo biti vsi Orjunaši na jasnem da...,True,Implied violence or violent methods,True
33,24. 12. 1925.,"Zbrisal ga bo, kakor besen orkan z zemlje, tak...",True,Implied violence or violent methods,True
60,05. 01. 1926.,"Borba je pred nami in kri za nami, kri ki nas ...",True,Threat against other political parties,True


In [14]:
# Start from an empty frame; columns are attached to it further down.
text_df = pd.DataFrame(data=None)

In [15]:
# Rows that are NOT flagged as examples.
ex_violence = violence_df[violence_df["Example"] == False]
# Draw five preview rows with a fixed seed so that the same rows come back on
# every Restart & Run All instead of a different random draw each time.
sample_val = ex_violence.sample(5, random_state=42)

In [None]:
# Rows whose "Violent" flag is False.
is_not_violent = violence_df["Violent"].eq(False)
no_violence = violence_df.loc[is_not_violent]

# Text generation

In [146]:
# System message sent as the first turn of every conversation.
system_prompt = (
    "You are an expert classifier in Slovenian 1920s history, "
    "rightwing organizations, nationalism, and sociology. "
    "You always answer in English."
)

In [147]:
# Labelling instruction that heads each few-shot example.
prompt_base_few_shot = (
    "## Label this Slovenian text, according to wether it contains "
    "alusions to expansionism, contains words connected to harming other "
    "persons or damaging property, calls to potentially violent or "
    "subversive actions or sublevation (both explictly or figuratively), "
    "or not, indicating 'True' if it does and 'False' if it doesn't:\n"
)

In [148]:
# Template for one few-shot example: the labelling instruction followed by
# the three fields that are filled in through str.format.
FEW_SHOT = "".join(
    (
        prompt_base_few_shot,
        "# Text: {text}\n",
        "# Violent: {label}\n",
        "# Description: {description}\n\n",
    )
)

In [149]:
# Labelling instruction for the single text that is actually being classified.
prompt_base = (
    "## Label JUST THE FOLLOWING Slovenian text, according to wether it "
    "contains alusions to expansionism, contains words connected to harming "
    "other persons or damaging property, calls to potentially violent or "
    "subversive actions or sublevation (both explictly or figuratively), "
    "or not, indicating 'True' if it does and 'False' if it doesn't:\n"
)

# Chain of thought

In [158]:
# Chain-of-thought instructions: the labelling instruction followed by a
# six-step worked procedure that ends in a 'Query: True' / 'Query: False'
# verdict plus a justification.
# NOTE(review): the example list in Step 2 contains a stray quote before
# "or" ('or 'osvojiti meje'); it is prompt text, so it is left untouched here.
COT = prompt_base + """
## Answer:
Let's think step by step to label this text according to wether the whole text contains language that can point to violence, includes alusions to expansionism, calls to violent or subversive actions or sublevation (explictly or figuratively), or not.

Step 1: Recognize words or locutions related to any context of militarism, literal, metaphorical, or implied.
- Identify key words or locutions related to any militaristic context, or metaphors and alusions about the violent behaviour such as 'žrtva', 'akcija', 'otpraviti', 'konačni obračun', or 'boj'.

Step 2: Recognize words or locutions related to expansionist rhetoric, or words related to foreign conquest, invading or attacking state borders or war between two nations, literal, metaphorical, or implied.
- Identify key words or locutions related to expansionist rhetoric, or words related to foreign conquest, invading or attacking state borders or war such as 'mrtvi straži', 'vpasti preko meje', 'or 'osvojiti meje'.

Step 3: Recognize words or locutions related to signs of bodily harm, wounds or mutilation, literal, metaphorical, or implied.
- Identify key words or locutions related to signs of bodily harm, such as 'krv', 'rana', or 'smrt'.

Step 4: Recognize words or locutions related to signs of weapon use, literal, metaphorical, or implied.
- Identify key words or locutions related to weapon use, such as 'bombe', 'revolver', 'nož'.

Step 5: Recognize words or locutions related to self-defence or opposition, literal, metaphorical, or implied.
- Identify key words or locutions related to defense, such as 'samoobramba', 'braniti se', 'ščit'.

Step 6: Label the text based on the previous steps.
- If even one previous step yields a result, label the whole text as 'Query: True'.
- Otherwise, label the text as 'Query: False'.
- Include a description of why the text has been labeled as such.
"""

In [151]:
# Render one few-shot prompt per example row; every field is stringified
# first so that it can be dropped straight into the template.
example_fields = examples_df[["Sentence", "Violent", "Description"]].astype(str)
text_df["prompts"] = example_fields.apply(
    lambda row: FEW_SHOT.format(
        text=row['Sentence'],
        label=row['Violent'],
        description=row['Description'],
    ),
    axis=1,
)

In [152]:
# Few-shot section of the prompt: intro line, the rendered examples separated
# by newlines, and a closing reminder not to label the examples themselves.
few_shot_intro = '## I present you here six (6) examples of correctly labeled texts based on similar problems:\n'
few_shot_examples = "\n".join(text_df["prompts"].to_list())
few_shot_outro = "\n\nDo not take into account the previous examples, just use them as guidelines without labelling them.\n\n"
few_shots = few_shot_intro + few_shot_examples + few_shot_outro

In [153]:
def label(texts):
    """Ask the chat model to classify a single text.

    The user turn is assembled from the few-shot section (`few_shots`), the
    chain-of-thought instructions (`COT`), the labelling instruction
    (`prompt_base`) and the text itself; `system_prompt` is sent as the
    system turn.

    Args:
        texts: The text to classify. It is concatenated into the prompt with
            `+`, so it has to be a single string despite the plural name.

    Returns:
        Whatever `generation_pipe` returns for the conversation; the results
        loop in this notebook reads `.generated_responses[0]` from it.
    """
    conversation = Conversation()
    user_content = few_shots+COT+prompt_base+"# Text: " + texts + "\n# Let's think step by step: "
    turns = (
        ('system', system_prompt),
        ('user', user_content),
    )
    for role, content in turns:
        conversation.add_message({'role': role, 'content': content})
    # The EOS token doubles as the padding token for generation.
    return generation_pipe(
        conversation,
        max_new_tokens=1280,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.eos_token_id,
    )

In [154]:
# Show the five rows sampled from the non-example rows.
sample_val

Unnamed: 0,Date,Sentence,Violent,Description,Example
55,27. 02. 1926.,Orjuna sprejema historično vloga organiziranja...,True,Attacking another country,False
76,25. 09. 1926.,Nacionalizem je na pohodu in zajema vedno glob...,True,Implied violence or violent methods,False
90,08. 07. 1926.,Za izpremembo so sam o sviraj trobači in peli ...,True,Implied violence or violent methods,False
98,03. 04. 1926.,"S svojo močjo in smelostjo, s svojo ljubeznijo...",True,Implied violence or violent methods,False
48,24. 07. 1926.,Niti en Italijan ne sme prestopiti v tej zemlj...,True,Implied violence or violent methods,False


In [155]:
# Smoke-test the classifier on the sentence stored under index label 75.
label(ex_violence.loc[75, "Sentence"])  # test with 65



Conversation id: c09772b9-d0ff-454a-89dd-d58748597784
system: You are an expert classifier in Slovenian 1920s historical propaganda, nationalism, and sociology. You always answer in English.
user: ## I present you here six (6) examples of correctly labeled texts based on similar problems:
## Label this Slovenian text, according to wether the whole text is violent, includes alusions to expansionism, calls to violent or subversive actions or sublevation (both explictly or figuratively), or not, indicating 'True' if it does and 'False' if it doesn't:
# Text: Princip naš bodi: odkrit in viteški boj s poštenimi sredstvi, ki morajo služiti vedno samo naciji i ideji!
# Violent: True
# Description: Implied violence or violent methods


## Label this Slovenian text, according to wether the whole text is violent, includes alusions to expansionism, calls to violent or subversive actions or sublevation (both explictly or figuratively), or not, indicating 'True' if it does and 'False' if it doesn't

In [156]:
# Keep the result for the sentence stored under index label 8.
ans = label(ex_violence.loc[8, "Sentence"])  # test with 65

Conversation id: eaf00383-3775-4fe8-acaa-c3a8f1797629
system: You are an expert classifier in Slovenian 1920s historical propaganda, nationalism, and sociology. You always answer in English.
user: ## I present you here six (6) examples of correctly labeled texts based on similar problems:
## Label this Slovenian text, according to wether the whole text is violent, includes alusions to expansionism, calls to violent or subversive actions or sublevation (both explictly or figuratively), or not, indicating 'True' if it does and 'False' if it doesn't:
# Text: Princip naš bodi: odkrit in viteški boj s poštenimi sredstvi, ki morajo služiti vedno samo naciji i ideji!
# Violent: True
# Description: Implied violence or violent methods


## Label this Slovenian text, according to wether the whole text is violent, includes alusions to expansionism, calls to violent or subversive actions or sublevation (both explictly or figuratively), or not, indicating 'True' if it does and 'False' if it doesn't

In [None]:
# Saving results 

# Classify every non-example row, keeping the row labels and the model's
# first generated response in two parallel lists.
idx_list, answer_list = [], []
for row_id, _date, sentence, _violent, _description, _example in ex_violence.itertuples():
    # Generate first, so a failed call leaves both lists the same length.
    reply = label(sentence).generated_responses[0]
    idx_list.append(row_id)
    answer_list.append(reply)

In [None]:
# Show the non-example rows that were just classified.
ex_violence

In [None]:
# Assemble the results table in one go; the key order is the column order of
# the exported files.
results_df = pd.DataFrame(
    {
        "id": idx_list,
        # NOTE(review): the key is "Date " with a trailing space - confirm it
        # matches the header of the CSV.
        "date": ex_violence["Date "].to_list(),
        "sentence": ex_violence.Sentence.to_list(),
        "label": ex_violence.Violent.to_list(),
        "answer": answer_list,
    }
)

# Export the same table as CSV and as Excel, without the index column.
results_df.to_csv("results.csv", index=False)
results_df.to_excel("results.xlsx", index=False)

In [None]:
# Nevermind this block, it's just for easier assessment of results

# Drop the gold label and pull the model answers out as a plain list.
file = results_df.drop(columns=['label'])

file_list = file['answer'].tolist()



def list_to_string(s):
    """Concatenate the strings in `s` into one string, with no separator.

    Args:
        s: Iterable of strings.

    Returns:
        The elements of `s` joined end to end; an empty string for an empty
        iterable.

    Raises:
        TypeError: If an element is not a string.
    """
    # str.join builds the result in one pass instead of re-copying the
    # accumulated string on every `+=`.
    return ''.join(s)

# Put a space after every answer before concatenating. Without it the last
# word of one answer fuses with the first word of the next (for example
# "True." + "Step" becomes "True.Step"), and a 'True' token sitting at an
# answer boundary is silently missed by the count below.
string_column1 = list_to_string(answer + " " for answer in file_list)
# Strip quote characters so that 'True' and True are counted alike.
string_column = string_column1.replace("'", "")

final_list = string_column.split()

# Count the verdict both with and without a trailing full stop.
result1 = final_list.count('True.')

result2 = final_list.count('True')

print('The number of queries marked as True is:', result1 + result2)


In [None]:
# label("Iz tega mesta pojdi preko cele Jugoslavije in povej vsem neustrašno, da se v svojem pohodu, ki ga započeniaš s tega mesta, ne bojiš niti smrti.")

In [None]:
# Tested, 55 True, 44 False