In [1]:
import pandas as pd
import json
import matplotlib.pyplot as plt

from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
import re

import requests
import json
from tqdm import tqdm
from emoji import demojize
from nltk.tokenize import TweetTokenizer
import os
from datetime import datetime

import numpy as np

import pandas as pd
from collections import Counter

# Define a list of filenames to load
filenames = ["../data/labeled_data/generic_test_0.json"]

# One list of per-file DataFrames for each split. They are concatenated after
# the loop so that every listed file contributes rows; assigning the frames
# directly inside the loop would keep only the last file's data.
split_frames = {"train": [], "test": [], "valid": []}

# Load all JSON data, collecting the "train" / "test" / "valid" entries per file
for filename in filenames:
    with open(filename) as f:
        data = json.load(f)
    for split_name, frames in split_frames.items():
        frames.append(pd.DataFrame(data[split_name]))

# Concatenate into one DataFrame per split (rows are re-indexed 0..n-1).
# pd.concat raises on an empty sequence, so an empty `filenames` list falls
# back to empty DataFrames.
df_train, df_test, df_valid = (
    pd.concat(split_frames[split_name], ignore_index=True)
    if split_frames[split_name]
    else pd.DataFrame()
    for split_name in ("train", "test", "valid")
)

def normalizeToken(token):
    """Map a single token to its normalized form.

    Rules, applied in order:
      * tokens starting with "@" become "@USER";
      * tokens starting with "http" or "www" (case-insensitive) become "[url]";
      * the typographic apostrophe "’" becomes "'" and the ellipsis "…"
        becomes "...";
      * any other single-character token is passed through ``demojize``;
      * everything else is returned unchanged.

    The punctuation rule is checked before the single-character rule because
    both "’" and "…" have length 1: testing ``len(token) == 1`` first would
    send them to ``demojize`` and make the replacements unreachable.

    NOTE(review): tokens equal to "…" are now rewritten to "..."; confirm this
    is acceptable for any data that was normalized before this ordering fix.
    """
    if token.startswith("@"):
        return "@USER"

    lowercased_token = token.lower()
    if lowercased_token.startswith(("http", "www")):
        return "[url]"

    # Must run before the length-1 check below (see docstring).
    if token == "’":
        return "'"
    if token == "…":
        return "..."

    if len(token) == 1:
        return demojize(token)
    return token
    
def normalizeTweet(tweet):
    """Tokenize a tweet, normalize every token and return the re-joined text.

    The tweet has "’" replaced by "'" before it is split with
    ``TweetTokenizer``; each token goes through ``normalizeToken`` and the
    results are joined with single spaces. Split contractions ("n 't") and
    spaced-out "a . m ." / "p . m ." are then glued back together, and runs of
    whitespace are collapsed.
    """
    # Ordered (needle, replacement) pairs: each one operates on the output of
    # the pairs before it, so the order must not change.
    rewrites = (
        ("n 't", "n't"),
        ("ca n't", "can't"),
        ("ai n't", "ain't"),
        ("p . m .", "pm"),
        ("p . m", "pm"),
        ("a . m .", "am"),
        ("a . m", "am"),
    )

    straightened = tweet.replace("’", "'")
    normalized_tokens = [
        normalizeToken(piece) for piece in TweetTokenizer().tokenize(straightened)
    ]

    text = " ".join(normalized_tokens)
    for needle, replacement in rewrites:
        text = text.replace(needle, replacement)

    return " ".join(text.split())

def api(prompt):
    """Placeholder that does no work and returns ``None``.

    ``prompt`` is accepted but not used. The body previously consisted only of
    a function-scope ``import requests``; that module is already imported at
    the top of the file, so the redundant import was dropped.

    NOTE(review): this looks like a leftover stub - confirm whether it can be
    deleted or was meant to wrap the request helpers defined below it.
    """
    return None

# Local endpoint: served over plain http:// (no ssl)
HOST = 'http://127.0.0.1:5000'
URI = HOST + '/api/v1/generate'

# A reverse-proxied remote will likely be served with ssl (https://) instead:
# URI = 'https://your-uri-here.trycloudflare.com/api/v1/generate'

def get_response(request_params, prompt, context):
    """POST a generation request to ``URI`` and return the generated text.

    ``request_params`` is modified in place: its 'prompt' and 'context'
    entries are overwritten with the given arguments before it is sent as the
    JSON body.

    Returns the 'text' field of the first item in the reply's 'results' list
    when the server answers with HTTP 200. For any other status code the
    response object is printed and ``None`` is returned.
    """
    request_params['prompt'] = prompt
    request_params['context'] = context

    reply = requests.post(URI, json=request_params)

    # Anything other than 200 is reported and signalled to the caller as None
    if reply.status_code != 200:
        print(reply)
        return None

    return reply.json()['results'][0]['text']

def get_base_request_params(max_new_tokens = 200, stopping_strings = []):
    """Return a fresh dict of default generation parameters.

    Args:
        max_new_tokens: value stored under 'max_new_tokens' (default 200).
        stopping_strings: strings stored under 'stopping_strings'. The list is
            copied, so neither the caller's list nor the shared default list
            is aliased by the returned dict.

    Returns:
        A new dict on every call. 'prompt' and 'context' are ``None``
        placeholders for the caller to fill in.
    """
    return {
        'prompt': None,
        'context': None,
        # Honour the argument; this used to be hard-coded to 200.
        'max_new_tokens': max_new_tokens,
        'do_sample': True,
        'temperature': 0.7,
        'top_p': 0.1,
        'typical_p': 1,
        'repetition_penalty': 1.2,
        'encoder_repetition_penalty': 1.0,
        'top_k': 40,
        'min_length': 0,
        'no_repeat_ngram_size': 0,
        'num_beams': 1,
        'penalty_alpha': 0,
        'length_penalty': 1,
        'early_stopping': False,
        'seed': -1,
        #'add_bos_token': True,
        #'truncation_length': 2048,
        #'ban_eos_token': False,
        #'skip_special_tokens': True,
        # Copy so that mutating the returned list cannot leak into the default
        # argument (which is shared between calls) or into the caller's list.
        'stopping_strings': list(stopping_strings)
    }

def get_vicuna_multi_label_v01(tweet_text):
    """Build the v01 multi-label classification prompt for one tweet.

    The instruction lists the candidate labels (including "Others") and is
    embedded, together with ``tweet_text``, in an Instruction / Input /
    Response template.

    Returns:
        A ``(prompt, context)`` tuple; the context is always the empty string.
    """
    labels = (
        "War/Terror",
        "Conspiracy Theory",
        "Education",
        "Election Campaign",
        "Environment",
        "Government/Public",
        "Health",
        "Immigration/Integration",
        "Justice/Crime",
        "Labor/Employment",
        "Macroeconomics/Economic Regulation",
        "Media/Journalism",
        "Religion",
        "Science/Technology",
        "Others",
    )
    # Rendered as: "War/Terror", "Conspiracy Theory", ...
    quoted_labels = ", ".join(f'"{name}"' for name in labels)

    instruction = (
        "Assign multilabel labels to the following tweet. "
        'Choose out of the following list of labels where "Others" is only assigned if no other label fits: '
        f"[{quoted_labels}]"
    )
    prompt = (
        "Below is an instruction that describes a task, paired with an input that provides further context. "
        "Write a response that appropriately completes the request.\n\n"
        f"### Instruction:\n{instruction}\n\n"
        f"### Input:\n{tweet_text}\n\n"
        "### Response:\n"
    )
    return prompt, ""

#TODO: retrain with "Others"
def get_vicuna_multi_label_v02(tweet_text):
    """Build the v02 multi-label (topic selection) prompt for one tweet.

    The instruction states the labeling rules followed by the list of topics
    and is embedded, together with ``tweet_text``, in an Instruction / Input /
    Response template.

    Returns:
        A ``(prompt, context)`` tuple; the context is always the empty string.
    """
    topics = (
        "War/Terror",
        "Conspiracy Theory",
        "Education",
        "Election Campaign",
        "Environment",
        "Government/Public",
        "Health",
        "Immigration/Integration",
        "Justice/Crime",
        "Labor/Employment",
        "Macroeconomics/Economic Regulation",
        "Media/Journalism",
        "Religion",
        "Science/Technology",
    )
    # Rendered as: "War/Terror", "Conspiracy Theory", ...
    quoted_topics = ", ".join(f'"{name}"' for name in topics)

    instruction = (
        "Out of the following list of topics, choose topics that best fit the text in your opinion. "
        "Here are the rules:\n"
        "- Use clear indicators for labeling.\n"
        '- If unclear, output "Others". No speculation.\n'
        "- Don't mix content from different operations.\n"
        "- Refer to Oxford definitions for topics.\n"
        "\n"
        f"Topics: {quoted_topics}"
    )
    prompt = (
        "Below is an instruction that describes a task, paired with an input that provides further context. "
        "Write a response that appropriately completes the request.\n\n"
        f"### Instruction:\n{instruction}\n\n"
        f"### Input:\n{tweet_text}\n\n"
        "### Response:\n"
    )
    return prompt, ""

def get_vicuna_binary_v01(label, tweet_text):
    """Build the binary (1/0) classification prompt for one label and tweet.

    The instruction asks whether the input is about ``label`` and is embedded,
    together with ``tweet_text``, in an Instruction / Input / Response
    template.

    Returns:
        A ``(prompt, context)`` tuple; the context is always the empty string.
    """
    instruction = (
        "Classify the input based on if it's about {}. "
        "Use 1 (True) or 0 (False) as output."
    ).format(label)

    prompt = (
        "Below is an instruction that describes a task, paired with an input that provides further context. "
        "Write a response that appropriately completes the request.\n\n"
        f"### Instruction:\n{instruction}\n\n"
        f"### Input:\n{tweet_text}\n\n"
        "### Response:\n"
    )
    return prompt, ""

In [6]:
# Evaluation configuration: each model name is paired with the prompt builder
# at the same position in model_funcs, and each DataFrame with the name at the
# same position in dataframes_names.
models_to_test_names = ["vicuna_binary_war_v01"]
model_funcs = [get_vicuna_binary_v01]
label = "War/Terror"
dataframes = [df_test]
dataframes_names = ["test"]

for i, df in enumerate(dataframes):

    for model_name, model_func in zip(models_to_test_names, model_funcs):
        print("Starting with model: " + model_name)
        print("----------------------------------")
        df_tmp = df.copy()
        df_tmp["prompt"] = ""
        df_tmp["context"] = ""
        new_column_name = dataframes_names[i] + model_name
        # Create the per-row result columns up front so they can be filled by
        # index label with .at below. The previous per-row boolean mask on
        # 'id' scanned the whole frame for every row and wrote to every row
        # sharing that id.
        df_tmp["normalized_tweet"] = ""
        df_tmp[new_column_name] = None

        output_folder = f"../data/vicuna_4bit/lora/{model_name}/"
        os.makedirs(output_folder, exist_ok=True)
        # The CSV path does not change between rows, so build it once.
        output_path = os.path.join(output_folder, f'{dataframes_names[i]}_generic_test_0.csv')

        # Request settings are identical for every row. Building them before
        # the row loop also keeps request_params defined for the JSON dump
        # below when the DataFrame has no rows. get_response overwrites the
        # 'prompt' and 'context' entries on each call.
        request_params = get_base_request_params()
        request_params["stopping_strings"] = ["\n", "### Human:", "Human:", "###"]
        request_params["max_new_tokens"] = 10

        for idx, row in tqdm(df_tmp.iterrows(), total=df_tmp.shape[0]):

            tweet_text = normalizeTweet(row["text"])
            df_tmp.at[idx, "normalized_tweet"] = tweet_text

            prompt, context = model_func(label, tweet_text)
            df_tmp.at[idx, "prompt"] = prompt
            df_tmp.at[idx, "context"] = context

            response = get_response(request_params, prompt, context)

            # Save the response in the model-specific result column
            # (None when the request did not return HTTP 200)
            df_tmp.at[idx, new_column_name] = response

            # Checkpoint every 100 rows; assumes an integer index
            if (idx + 1) % 100 == 0:
                df_tmp.to_csv(output_path, index=False)
                print(f"Saved progress at index {idx}")
                print("Sample Tweet: ", tweet_text)
                print("Sample Annotation: ", response)

        df_tmp.to_csv(output_path, index=False)
        # Save the request_params as a JSON file in the output folder
        with open(os.path.join(output_folder, 'request_params.json'), 'w') as f:
            json.dump(request_params, f, indent=4)

Starting with model: vicuna_binary_war_v01
----------------------------------


 10%|█         | 100/1000 [01:22<13:39,  1.10it/s]

Saved progress at index 99
Sample Tweet:  RT @USER : When you fight all your battles on your kness ( prayer ) , you will always win . #MenGather2020
Sample Annotation:  ### Assistant: Based on the provided


 20%|██        | 200/1000 [02:46<12:31,  1.06it/s]

Saved progress at index 199
Sample Tweet:  RT @USER : #TweetRequest | What music videos would you like to watch on MTV Base ? Tweet us NOW with the hashtag & name of vid :front-facing_baby_chick: htt …
Sample Annotation:  ### Assistant: As an AI


 30%|███       | 300/1000 [04:10<07:50,  1.49it/s]

Saved progress at index 299
Sample Tweet:  However , the #Syrian troops repelled the offensive and retained the occupied lines . #Militants suffered losses and retreated .
Sample Annotation:  1


 40%|████      | 400/1000 [05:32<07:35,  1.32it/s]

Saved progress at index 399
Sample Tweet:  Militants of terrorist groups constantly arrange provocations by shelling the positions of Syrian government forces in western Aleppo . [url]
Sample Annotation:  1


 50%|█████     | 500/1000 [06:52<06:40,  1.25it/s]

Saved progress at index 499
Sample Tweet:  RT @USER : A working person in Iraq is a working person is Texas is a working person in Berlin ; war is the desire to kill this tr …
Sample Annotation:  ### Assistant: Based on the given


 60%|██████    | 600/1000 [08:13<05:33,  1.20it/s]

Saved progress at index 599
Sample Tweet:  The #US is completing the training of one more terrorist group at #AlTanf military base . #Syria heading for the next series of terrorist attacks very soon [url]
Sample Annotation:  ### Assistant: Based on the information


 70%|███████   | 700/1000 [09:32<04:04,  1.22it/s]

Saved progress at index 699
Sample Tweet:  RT @USER : #SleepunderthenetUg Door to door distribution of mosquito nets in Buwambwa sub county , Namisindwa District . kick malaria …
Sample Annotation:  ### Assistant: Based on the information


 80%|████████  | 800/1000 [10:53<02:46,  1.20it/s]

Saved progress at index 799
Sample Tweet:  RT @USER =: 🇺🇸 #US military is being deployed to the #Greek Section ! Op-ed by @USER [url] #Turkey #Russia # U …
Sample Annotation:  ### Assistant: Based on the provided


 90%|█████████ | 900/1000 [12:12<01:27,  1.14it/s]

Saved progress at index 899
Sample Tweet:  @USER Smashing it up Tomorrow Again @USER @USER #PremiereWednesday THE DAWN Kasanga [url]
Sample Annotation:  ### Assistant: Based on the given


100%|██████████| 1000/1000 [13:36<00:00,  1.22it/s]

Saved progress at index 999
Sample Tweet:  VIDEO : #Women'sMarch Linda Sarsour Told Maddow “ Islamic Children Are Executed In The U . S . ” [url] [url]
Sample Annotation:  ### Assistant: Based on the given





In [15]:
# Count the rows in df_tmp whose "testvicuna_binary_war_v01" column contains "0".
# na=False treats missing responses (None/NaN, e.g. when get_response returned
# None) as non-matches; without it the mask contains NA values and boolean
# indexing raises. regex=False makes the match a plain substring test.
df_tmp[df_tmp["testvicuna_binary_war_v01"].str.contains("0", regex=False, na=False)].shape[0]

0

In [5]:
# Display the current value of `prompt`, which is assigned inside the
# evaluation loop (the most recently built prompt).
prompt

"Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n### Instruction:\nClassify the input based on if it's about War/Terror. Use 1 (True) or 0 (False) as output.\n\n### Input:\n#Crimea : What to Expect from #Ukraine and the West [url] #russiainvadesukraine\n\n### Response:\n"

In [4]:
# Display the current value of `response`, which is assigned inside the
# evaluation loop (the most recent return value of get_response).
response

'1'

In [7]:
# Write df_tmp and request_params to disk again, reusing output_folder,
# dataframes_names and i as left behind by the evaluation loop.
if not os.path.exists(output_folder):
    os.makedirs(output_folder)

csv_name = f'{dataframes_names[i]}_generic_test_0.csv'
output_path = os.path.join(output_folder, csv_name)
df_tmp.to_csv(output_path, index=False)

# Save the request_params as a JSON file in the output folder
params_path = os.path.join(output_folder, 'request_params.json')
with open(params_path, 'w') as f:
    json.dump(request_params, f, indent=4)