In [17]:
import pandas as pd
import os
from tqdm import tqdm

from itertools import islice

In [3]:
raw_df = pd.read_table("/root/autodl-tmp/datasets/offenseval/datasets/training-v1/offenseval-training-v1.tsv")

raw_df.head()

Unnamed: 0,id,tweet,subtask_a,subtask_b,subtask_c
0,86426,@USER She should ask a few native Americans wh...,OFF,UNT,
1,90194,@USER @USER Go home you’re drunk!!! @USER #MAG...,OFF,TIN,IND
2,16820,Amazon is investigating Chinese employees who ...,NOT,,
3,62688,"@USER Someone should'veTaken"" this piece of sh...",OFF,UNT,
4,43605,@USER @USER Obama wanted liberals &amp; illega...,NOT,,


In [4]:
def bi_split_dataframe(df, target_column, target_value):
    grouped = df.groupby(df[target_column] == target_value)
    
    split_1 = grouped.get_group(True).reset_index().drop("index", axis=1)
    split_2 = grouped.get_group(False).reset_index().drop("index", axis=1)
    
    return split_1, split_2

In [5]:
off_df, not_off_df = bi_split_dataframe(raw_df, "subtask_a", "OFF")

In [6]:
off_df.head()

Unnamed: 0,id,tweet,subtask_a,subtask_b,subtask_c
0,86426,@USER She should ask a few native Americans wh...,OFF,UNT,
1,90194,@USER @USER Go home you’re drunk!!! @USER #MAG...,OFF,TIN,IND
2,62688,"@USER Someone should'veTaken"" this piece of sh...",OFF,UNT,
3,97670,@USER Liberals are all Kookoo !!!,OFF,TIN,OTH
4,77444,@USER @USER Oh noes! Tough shit.,OFF,UNT,


In [7]:
print(off_df.iloc[0]["tweet"])

@USER She should ask a few native Americans what their take on this is.


In [66]:
print(not_off_df.iloc[0]["tweet"])

Amazon is investigating Chinese employees who are selling internal data to third-party sellers looking for an edge in the competitive marketplace. URL #Amazon #MAGA #KAG #CHINA #TCOT


In [8]:
import spacy

nlp = spacy.load("en_core_web_sm")

In [21]:
doc = nlp("@USER Canada")

for ent in doc.ents:
    print(ent.text, ent.start_char, ent.end_char, ent.label_)

@USER Canada 0 12 ORG


In [10]:
target_labels = ["GPE", "NORP"]

off_tweet_list = off_df["tweet"].tolist()

In [20]:
off_tweet_national_related_dict = {}

for idx, tweet in enumerate(tqdm(off_tweet_list[:10])):
    ents = nlp(tweet).ents
    if len(ents) <= 0:
        continue
    else:
        for ent in ents:
            print(ent.label_, ent.text)
            if ent.label_ in target_labels:
                if idx not in off_tweet_national_related_dict:
                    off_tweet_national_related_dict[idx] = {
                        "nationality_entities": [ent.text],
                        "tweet": tweet,
                    }
                else:
                    off_tweet_national_related_dict[idx]["nationality_entities"].append(ent.text)

100%|██████████| 10/10 [00:00<00:00, 78.38it/s]

NORP Americans
ORG @USER Liberals
ORG @USER
ORG @USER Canada
GPE n’t
MONEY #LooneyLeft #Liberals
MONEY #Qproofs #
MONEY #DeepStateCorruption
NORP PutUpOrShutUp
ORG MAGA





In [13]:
len(off_tweet_national_related_dict)

885

In [18]:
list(islice(off_tweet_national_related_dict.items(), 5))


[(0,
  {'nationality_entities': ['Americans'],
   'tweet': '@USER She should ask a few native Americans what their take on this is.'}),
 (6,
  {'nationality_entities': ['n’t'],
   'tweet': '@USER Canada doesn’t need another CUCK! We already have enough #LooneyLeft #Liberals f**king up our great country! #Qproofs #TrudeauMustGo'}),
 (9,
  {'nationality_entities': ['PutUpOrShutUp'],
   'tweet': '@USER @USER @USER @USER LOL!!!   Throwing the BULLSHIT Flag on such nonsense!!  #PutUpOrShutUp   #Kavanaugh   #MAGA   #CallTheVoteAlready URL'}),
 (10,
  {'nationality_entities': ['communist'],
   'tweet': '@USER @USER Kind of like when conservatives wanna associate everyone to their left as communist antifa members?'}),
 (13,
  {'nationality_entities': ['Democrat', 'n’t bash'],
   'tweet': '@USER @USER @USER She?  To whom are you referring to?  Hillary?  You know what is tiresome?  Bernie supporters bashing Hillary Clinton.  She is an actual Democrat who raises money for the Democratic Party and

## Use LLM to Find Nationality NERs

In [75]:
from llm_client.pipeline import Pipeline
from typing import List
import asyncio
import time

config_yaml = "/root/llm_client/config_yamls/llama2-hf.yaml"

pipeline = Pipeline(config_yaml, verbose=1)

2023-10-25 23:53:26,643 - [32mINFO[0m - pipeline.py:20 - pipeline.__init__ - 5563 - parameters for every request: {'do_sample': False, 'max_new_tokens': 64, 'repetition_penalty': None, 'return_full_text': False, 'seed': None, 'temperature': None, 'top_k': None, 'top_p': None, 'truncate': None, 'typical_p': None, 'best_of': None, 'watermark': False, 'decoder_input_details': False, 'stop_sequences': ['</s>', '[/INST]', '[/SYS>>', 'Question']}


In [76]:
example=(
    "\n[INST] Detect the nationality-related entity in the following quoted text: "
    "\"My name is Wolfgang and I come from Germany""[/INST]"
    "\nAnswer: The entity is \"Germeny\""
    "\n[INST] Detect the nationality-related entity in the following quoted text: "
    "\"My name is Wolfgang and I'm German""[/INST]"
    "\nAnswer: The entity is \"German\""
    "\n[INST] Detect the nationality-related entity in the following quoted text: "
    "\"Amazon is investigating employees who are selling internal data to third-party sellers looking for an edge in the competitive marketplace.""[/INST]"
    "\nAnswer: The entity is \"NONE\""
)

prompt_list = []
for tweet in off_tweet_list:
    prompt_list.append(
        (
            f"{example}\n[INST] Detect the nationality-related entity in the following quoted text: \"{tweet}\"[/INST]"
            # f"return NONE if you can't find any: \"{tweet}\"[/INST]"
        )
    )


In [77]:
prompt_list[:5]

['\n[INST] Detect the nationality-related entity in the following quoted text: "My name is Wolfgang and I come from Germany[/INST]\nAnswer: The entity is "Germeny"\n[INST] Detect the nationality-related entity in the following quoted text: "My name is Wolfgang and I\'m German[/INST]\nAnswer: The entity is "German"\n[INST] Detect the nationality-related entity in the following quoted text: "Amazon is investigating employees who are selling internal data to third-party sellers looking for an edge in the competitive marketplace.[/INST]\nAnswer: The entity is "NONE"\n[INST] Detect the nationality-related entity in the following quoted text: "@USER She should ask a few native Americans what their take on this is."[/INST]',
 '\n[INST] Detect the nationality-related entity in the following quoted text: "My name is Wolfgang and I come from Germany[/INST]\nAnswer: The entity is "Germeny"\n[INST] Detect the nationality-related entity in the following quoted text: "My name is Wolfgang and I\'m Ge

In [78]:
async def main(input_list: List, pipeline: Pipeline):
    tasks = [pipeline.model_predict(input) for input in input_list]
    results = await asyncio.gather(*tasks)
    return results

In [79]:
sample_list = prompt_list[:5]

result_list = await main(sample_list, pipeline)
for result in result_list:
    print(result.split("\n[INST]")[0])


The entity is "Native Americans"
The entity is "NONE"
The entity is "NONE"
The entity is "NONE"
The entity is "NONE"


In [81]:
train_result_list = []
chunk_size = 50

question_list = prompt_list.copy()
pbar = tqdm(total = len(question_list))
while len(question_list) > chunk_size:
    current_chunk = question_list[:chunk_size]
    question_list = question_list[chunk_size:]
    
    result_list = await main(current_chunk, pipeline)
    train_result_list.extend([result_text.split("\n[INST]")[0] for result_text in result_list])
    
    pbar.update(chunk_size)
    time.sleep(2)

result_list = await main(question_list, pipeline)
train_result_list.extend([result_text.split("\n[INST]")[0] for result_text in result_list])
pbar.update(len(question_list))

100%|██████████| 4400/4400 [12:54<00:00,  5.68it/s]

True

In [83]:
with open("temp_off_nationality.txt", "w") as output_file:
    for result_text in train_result_list:
        output_file.write(f"{result_text}\n")