In [2]:
import pandas as pd
import json
import matplotlib.pyplot as plt

from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
import re

import requests
import json
from tqdm import tqdm
from emoji import demojize
from nltk.tokenize import TweetTokenizer
import os
from datetime import datetime

import numpy as np

import pandas as pd
from collections import Counter

# Define a list of filenames to load
filenames = ["../data/labeled_data/generic_test_0.json"]

# Load every JSON file and collect its "train" / "test" / "valid" sections.
# The frames are gathered per split and concatenated once at the end, so that
# adding a second filename extends the splits instead of silently replacing
# the data read from the previous file.
split_frames = {"train": [], "test": [], "valid": []}
for filename in filenames:
    # Explicit UTF-8 so the result does not depend on the platform's default
    # locale encoding.
    with open(filename, encoding="utf-8") as f:
        data = json.load(f)
    for split_name, frames in split_frames.items():
        frames.append(pd.DataFrame(data[split_name]))

# ignore_index=True gives each combined split a fresh 0..n-1 index, so row
# labels stay unique when several files are combined. An empty `filenames`
# list yields empty frames, as before.
df_train, df_test, df_valid = (
    pd.concat(split_frames[split_name], ignore_index=True)
    if split_frames[split_name]
    else pd.DataFrame()
    for split_name in ("train", "test", "valid")
)

def normalizeToken(token):
    """Return the normalised form of a single token.

    Rules, applied in order:

    * the typographic apostrophe ``’`` becomes ``'`` and the ellipsis
      character ``…`` becomes ``...``;
    * a token starting with ``@`` becomes ``@USER``;
    * a token starting with ``http`` or ``www`` (case-insensitive) becomes
      ``[url]``;
    * any other one-character token is passed through ``demojize``;
    * everything else is returned unchanged.

    Args:
        token: The token string to normalise.

    Returns:
        The normalised token string.
    """
    # These two checks must come before the single-character rule: both
    # characters have length 1, so testing them afterwards (as the previous
    # version did) left the branches unreachable.
    if token == "’":
        return "'"
    if token == "…":
        return "..."

    if token.startswith("@"):
        return "@USER"

    lowercased_token = token.lower()
    if lowercased_token.startswith(("http", "www")):
        return "[url]"

    if len(token) == 1:
        return demojize(token)

    return token
    
def normalizeTweet(tweet):
    """Tokenise a tweet, normalise each token, and tidy up the joined text.

    The tweet is split with ``TweetTokenizer`` (after replacing ``’`` with
    ``'``), every token goes through ``normalizeToken``, and the tokens are
    joined with single spaces. A fixed sequence of substring replacements then
    re-attaches split contractions and collapses spaced-out "a.m."/"p.m."
    forms, and finally runs of whitespace are squeezed to one space.

    Args:
        tweet: Raw tweet text.

    Returns:
        The normalised tweet as a single string.
    """
    tokenizer = TweetTokenizer()
    raw_tokens = tokenizer.tokenize(tweet.replace("’", "'"))
    text = " ".join(map(normalizeToken, raw_tokens))

    # Applied strictly in this order; later entries see the output of earlier ones.
    rewrites = (
        ("n 't", "n't"),
        ("ca n't", "can't"),
        ("ai n't", "ain't"),
        ("p . m .", "pm"),
        ("p . m", "pm"),
        ("a . m .", "am"),
        ("a . m", "am"),
    )
    for needle, replacement in rewrites:
        text = text.replace(needle, replacement)

    return " ".join(text.split())

def api(prompt):
    """Placeholder that ignores ``prompt`` and returns ``None``.

    The body only performs a function-local ``import requests``; no request
    is sent here.

    NOTE(review): looks like an unfinished stub, with the actual request
    logic living in ``get_response`` -- confirm before relying on or
    removing it.
    """
    import requests

# Base address of the locally hosted generation server (plain http://, no SSL).
HOST = 'http://127.0.0.1:5000'
# Endpoint that get_response() POSTs its generation requests to.
URI = HOST + '/api/v1/generate'

# A reverse-proxied remote will likely be served over SSL (https://); use instead:
# URI = 'https://your-uri-here.trycloudflare.com/api/v1/generate'

def get_response(request_params, prompt, context, timeout=None):
    """POST a generation request to ``URI`` and return the generated text.

    Args:
        request_params: Dict of generation parameters. It is updated in
            place: its ``'prompt'`` and ``'context'`` entries are overwritten
            with the values passed here before the dict is sent as the JSON
            body.
        prompt: Prompt text to send.
        context: Context text to send.
        timeout: Optional timeout in seconds forwarded to ``requests.post``.
            The default ``None`` waits indefinitely, matching the previous
            behaviour.

    Returns:
        The ``text`` field of the first entry in the response's ``results``
        list when the server answers with HTTP 200; otherwise ``None`` (after
        printing the status code and response body).
    """
    request_params['prompt'] = prompt
    request_params['context'] = context

    response = requests.post(URI, json=request_params, timeout=timeout)

    if response.status_code != 200:
        # Report what the server said rather than just the Response repr, and
        # make the "no result" return value explicit for callers.
        print(f"Generation request failed with HTTP {response.status_code}: {response.text}")
        return None

    return response.json()['results'][0]['text']

def get_base_request_params(max_new_tokens=200, stopping_strings=None):
    """Build a fresh dict of default generation parameters.

    Args:
        max_new_tokens: Value stored under ``'max_new_tokens'``. Previously
            this argument was accepted but ignored in favour of a hard-coded
            200; the default keeps that value.
        stopping_strings: Optional iterable of strings stored (as a new list)
            under ``'stopping_strings'``. ``None`` yields an empty list.

    Returns:
        A new dict on every call. ``'prompt'`` and ``'context'`` are ``None``
        placeholders for the caller to fill in.
    """
    # Copy into a new list so returned dicts never share (and cannot mutate)
    # a default list object or the caller's own list.
    stop_list = [] if stopping_strings is None else list(stopping_strings)

    return {
        'prompt': None,
        'context': None,
        'max_new_tokens': max_new_tokens,
        'do_sample': True,
        'temperature': 0.7,
        'top_p': 0.1,
        'typical_p': 1,
        'repetition_penalty': 1.2,
        'encoder_repetition_penalty': 1.0,
        'top_k': 40,
        'min_length': 0,
        'no_repeat_ngram_size': 0,
        'num_beams': 1,
        'penalty_alpha': 0,
        'length_penalty': 1,
        'early_stopping': False,
        'seed': -1,
        #'add_bos_token': True,
        #'truncation_length': 2048,
        #'ban_eos_token': False,
        #'skip_special_tokens': True,
        'stopping_strings': stop_list
    }

def get_vicuna_multi_label_v01(tweet_text):
    """Build the multi-label classification prompt for one tweet.

    The prompt has a fixed preamble followed by ``### Instruction:``,
    ``### Input:`` and ``### Response:`` sections; the instruction lists the
    allowed labels and the input section carries ``tweet_text``.

    Args:
        tweet_text: Text placed in the ``### Input:`` section.

    Returns:
        A ``(prompt, context)`` tuple; the context is always the empty string.
    """
    label_names = (
        "War/Terror",
        "Conspiracy Theory",
        "Education",
        "Election Campaign",
        "Environment",
        "Government/Public",
        "Health",
        "Immigration/Integration",
        "Justice/Crime",
        "Labor/Employment",
        "Macroeconomics/Economic Regulation",
        "Media/Journalism",
        "Religion",
        "Science/Technology",
        "Others",
    )
    quoted_labels = ", ".join('"' + name + '"' for name in label_names)
    instruction = (
        "Assign multilabel labels to the following tweet. "
        "Choose out of the following list of labels where \"Others\" is only assigned if no other label fits: ["
        + quoted_labels
        + "]"
    )

    # Sections are separated by one blank line; the prompt ends right after
    # the "### Response:" header line so the model continues from there.
    sections = [
        "Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.",
        "### Instruction:\n" + instruction,
        "### Input:\n{}".format(tweet_text),
        "### Response:\n",
    ]
    return "\n\n".join(sections), ""

In [3]:
# Display the validation split to sanity-check the columns that were loaded.
df_valid

Unnamed: 0,id,campaign_name,text,annotations
0,962251540617670661,GRU_202012,RT @Kasman62: After his injury for fifth time ...,[War/Terror]
1,898277432922263552,VENEZUELA_201901_2,RT BGEEZ: dncchuckschumernancypelosimsnbccnn h...,[Others]
2,966214616601841664,GRU_202012,RT @LinaArabii: Russian presence in the Black ...,[Media/Journalism]
3,1303856635668987905,UGANDA_0621,RT @brianmixologist: Am disappointed in the an...,[Election Campaign]
4,462250037972840449,IRA_202012,RT @CMCL1979: Why do those those ghastly nativ...,[Others]
...,...,...,...,...
795,847754640972070912,VENEZUELA_201901_2,Stocks: 5 things to know before the bell https...,[Others]
796,1038043813267296256,GRU_202012,"September 4, 2018 #Syrian air defense units re...",[War/Terror]
797,1030464264748728320,UGANDA_0621,RT @xJ57jjSHWvX9mAMmhv7fVaVzxe13bBfCZuGZaBNucL...,[Others]
798,1020843977669586945,UGANDA_0621,RT @HowweEnt: Dj Shiru To Thrill His Fans http...,[Others]


In [4]:
models_to_test_names = ["multilabel_without_context_v01"]
model_funcs = [get_vicuna_multi_label_v01]
dataframes = [df_test]
dataframes_names = ["test"]

# Strings sent as 'stopping_strings' with every request.
STOPPING_STRINGS = ["\n", "### Human:", "Human:", "###"]
# Write the partial results to disk after this many processed rows.
CHECKPOINT_EVERY = 100

for i, df in enumerate(dataframes):

    for model_name, model_func in zip(models_to_test_names, model_funcs):
        print("Starting with model: " + model_name)
        print("----------------------------------")
        new_column_name = dataframes_names[i] + model_name

        # Work on a copy and create every result column up front, so rows that
        # have not been processed yet are simply empty in a checkpoint file.
        df_tmp = df.copy()
        df_tmp["prompt"] = ""
        df_tmp["context"] = ""
        df_tmp["normalized_tweet"] = ""
        df_tmp[new_column_name] = None

        # Create the output folder before the first write; previously nothing
        # in this cell created it and to_csv failed on a missing directory.
        output_folder = f"../data/vicuna_4bit/lora/{model_name}/"
        os.makedirs(output_folder, exist_ok=True)
        output_path = os.path.join(output_folder, f'{dataframes_names[i]}_generic_test_0.csv')

        # Built once per model (get_response overwrites 'prompt'/'context' on
        # every call), so it is also defined when the frame has no rows.
        request_params = get_base_request_params()
        request_params["stopping_strings"] = list(STOPPING_STRINGS)

        # row_number counts processed rows from 1. The checkpoint test used to
        # look at the dataframe index `i`, which never triggered a save.
        for row_number, (idx, row) in enumerate(
            tqdm(df_tmp.iterrows(), total=df_tmp.shape[0]), start=1
        ):
            tweet_text = normalizeTweet(row["text"])
            prompt, context = model_func(tweet_text)
            response = get_response(request_params, prompt, context)

            # Write by row label: O(1) per row, and each row receives its own
            # result even if the 'id' column contains duplicates.
            df_tmp.at[idx, "normalized_tweet"] = tweet_text
            df_tmp.at[idx, "prompt"] = prompt
            df_tmp.at[idx, "context"] = context
            df_tmp.at[idx, new_column_name] = response

            if row_number % CHECKPOINT_EVERY == 0:
                df_tmp.to_csv(output_path, index=False)
                print(f"Saved progress at index {idx}")
                print("Sample Tweet: ", tweet_text)
                print("Sample Annotation: ", response)

        # Final save of the complete frame.
        df_tmp.to_csv(output_path, index=False)
        # Save the request_params as a JSON file in the output folder
        with open(os.path.join(output_folder, 'request_params.json'), 'w') as f:
            json.dump(request_params, f, indent=4)

Starting with model: generic_prompt_few_shot_prompt_only_classification_3_random_example
----------------------------------


  0%|          | 1/1000 [00:08<2:15:57,  8.17s/it]


KeyboardInterrupt: 

In [19]:
# Make sure the output directory exists before anything is written into it.
# exist_ok=True replaces the separate exists()-then-create check, which could
# fail if the directory appeared between the two calls.
# `output_folder` is assigned inside the inference loop cell, so that cell
# must have been executed before this one.
os.makedirs(output_folder, exist_ok=True)

In [20]:

# Write the annotated frame as CSV (without the index column) into the output folder.
csv_name = f'{dataframes_names[i]}_generic_test_0.csv'
output_path = os.path.join(output_folder, csv_name)
df_tmp.to_csv(output_path, index=False)

# Store the request parameters next to the results as indented JSON.
params_path = os.path.join(output_folder, 'request_params.json')
with open(params_path, 'w') as f:
    json.dump(request_params, f, indent=4)