In [11]:
import pandas as pd
import json
import matplotlib.pyplot as plt

from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
import re

import requests
import json
from tqdm import tqdm
from emoji import demojize
from nltk.tokenize import TweetTokenizer
import os
from datetime import datetime

import numpy as np

import pandas as pd
from collections import Counter

# Define a list of filenames to load
filenames = ["../data/labeled_data/generic_test_0.json"]

# Each file is expected to hold a JSON object with "train", "test" and "valid"
# entries; the per-file frames are collected first and concatenated once, so
# adding more filenames extends the splits instead of overwriting them.
train_frames, test_frames, valid_frames = [], [], []
for filename in filenames:
    # JSON is UTF-8 by specification; do not rely on the platform default encoding.
    with open(filename, encoding="utf-8") as f:
        data = json.load(f)
    train_frames.append(pd.DataFrame(data["train"]))
    test_frames.append(pd.DataFrame(data["test"]))
    valid_frames.append(pd.DataFrame(data["valid"]))

# pd.concat raises on an empty list, so fall back to an empty frame when no
# filenames were given.
df_train = pd.concat(train_frames, ignore_index=True) if train_frames else pd.DataFrame()
df_test = pd.concat(test_frames, ignore_index=True) if test_frames else pd.DataFrame()
df_valid = pd.concat(valid_frames, ignore_index=True) if valid_frames else pd.DataFrame()

def normalizeToken(token):
    """Map a single tokenizer token to its normalized form.

    Args:
        token: One token string.

    Returns:
        "@USER" for tokens starting with "@", "[url]" for tokens starting with
        "http" or "www" (case-insensitive), "'" for a typographic apostrophe,
        "..." for a horizontal ellipsis, the result of ``demojize`` for any
        other single-character token, and the token unchanged otherwise.
    """
    if token.startswith("@"):
        return "@USER"

    lowercased_token = token.lower()
    if lowercased_token.startswith(("http", "www")):
        return "[url]"

    # These two punctuation marks are single characters, so they must be
    # handled before the generic one-character branch; otherwise that branch
    # swallows them and the replacements below are never reached.
    if token == "’":
        return "'"
    if token == "…":
        return "..."

    if len(token) == 1:
        return demojize(token)
    return token
    
def normalizeTweet(tweet):
    """Tokenize a tweet, normalize every token and re-join the text.

    Typographic apostrophes are replaced by "'" before tokenizing with
    ``TweetTokenizer``; each token goes through ``normalizeToken``; a fixed
    list of textual rewrites then re-attaches split contractions and
    "a.m."/"p.m." fragments, and whitespace runs are collapsed to one space.

    Args:
        tweet: Raw tweet text.

    Returns:
        The normalized tweet as a single string.
    """
    straightened = tweet.replace("’", "'")
    pieces = [normalizeToken(piece) for piece in TweetTokenizer().tokenize(straightened)]
    text = " ".join(pieces)

    # Applied strictly in this order: later rewrites rely on earlier ones
    # (e.g. "ca n 't" -> "ca n't" -> "can't").
    rewrites = (
        ("n 't", "n't"),
        ("ca n't", "can't"),
        ("ai n't", "ain't"),
        ("p . m .", "pm"),
        ("p . m", "pm"),
        ("a . m .", "am"),
        ("a . m", "am"),
    )
    for old, new in rewrites:
        text = text.replace(old, new)

    return " ".join(text.split())

def api(prompt):
    """Stub that does not use ``prompt`` and always returns ``None``.

    The body only performs a function-local import of ``requests``.
    """
    # NOTE(review): looks like an unfinished wrapper around the request code
    # defined below it — confirm whether it can be removed.
    import requests

# Base address of the generation server. This default targets an instance on
# the local machine (port 5000) that is served over plain http://, without SSL.
HOST = 'http://127.0.0.1:5000'
# Endpoint that get_response() sends its POST requests to.
URI = f'{HOST}/api/v1/generate'

# For a reverse-proxied remote server, the endpoint will likely be served with SSL - https://
# URI = 'https://your-uri-here.trycloudflare.com/api/v1/generate'

def get_response(request_params, prompt, context):
    """POST one generation request to ``URI`` and return the generated text.

    Note that ``request_params`` is modified in place: its 'prompt' and
    'context' entries are overwritten with the given values before sending.

    Args:
        request_params: Dict of generation settings, sent as the JSON body.
        prompt: Prompt text stored under 'prompt'.
        context: Context text stored under 'context'.

    Returns:
        ``results[0]['text']`` from the JSON reply when the status code is
        200; otherwise the response object is printed and ``None`` is returned.
    """
    request_params.update(prompt=prompt, context=context)

    reply = requests.post(URI, json=request_params)

    # Anything other than a 200 is only reported, not raised.
    if reply.status_code != 200:
        print(reply)
        return None

    return reply.json()['results'][0]['text']

def get_base_request_params(max_new_tokens = 200, stopping_strings = None):
    """Build a fresh dict of default generation settings for one request.

    Args:
        max_new_tokens: Value stored under 'max_new_tokens' (default 200).
        stopping_strings: Iterable of strings stored under 'stopping_strings'.
            ``None`` (the default) yields an empty list.

    Returns:
        A new dict on every call. 'prompt' and 'context' are ``None``
        placeholders that the caller is expected to fill in; the
        'stopping_strings' list is a copy, so mutating the returned dict never
        affects later calls or the caller's own list.
    """
    return {
        'prompt': None,
        'context': None,
        # Honor the argument; this used to be hard-coded to 200.
        'max_new_tokens': max_new_tokens,
        'do_sample': True,
        'temperature': 0.7,
        'top_p': 0.1,
        'typical_p': 1,
        'repetition_penalty': 1.2,
        'encoder_repetition_penalty': 1.0,
        'top_k': 40,
        'min_length': 0,
        'no_repeat_ngram_size': 0,
        'num_beams': 1,
        'penalty_alpha': 0,
        'length_penalty': 1,
        'early_stopping': False,
        'seed': -1,
        # Currently not sent:
        #'add_bos_token': True,
        #'truncation_length': 2048,
        #'ban_eos_token': False,
        #'skip_special_tokens': True,
        'stopping_strings': [] if stopping_strings is None else list(stopping_strings)
    }

def get_vicuna_prompt_without_context_only_classification(tweet_text, label):
    """Build the binary-classification prompt for one tweet and one label.

    Args:
        tweet_text: Tweet text inserted after "Tweet: ".
        label: Topic name inserted into the instruction sentence.

    Returns:
        A ``(prompt, context)`` tuple; the context is always an empty string.
    """
    instruction = f"### Human: Classify the Tweet based on if it's about {label}. Use 1 or 0 as class."
    tweet_section = f"Tweet: {tweet_text}"
    # The prompt ends with "Class: " so the model's continuation is the class itself.
    answer_stub = "### Assistant:\nClass: "
    return instruction + "\n\n" + tweet_section + "\n" + answer_stub, ''

# Topic labels; the model is queried once per label for every tweet.
all_labels = [
    "War/Terror",
    "Conspiracy Theory",
    "Education",
    "Election Campaign",
    "Environment",
    "Government/Public",
    "Health",
    "Immigration/Integration",
    "Justice/Crime",
    "Labor/Employment",
    "Macroeconomics/Economic Regulation",
    "Media/Journalism",
    "Religion",
    "Science/Technology",
]

In [None]:
# Query the model once per (tweet, label) pair for every configured prompt
# builder and data split, and store the raw replies in one CSV per split.
models_to_test_names = ["get_vicuna_prompt_without_context_only_classification_full_classification"]
model_funcs = [get_vicuna_prompt_without_context_only_classification]
dataframes = [df_test]
dataframes_names = ["test"]

# Number of processed rows between two checkpoint writes of the partial CSV.
CHECKPOINT_EVERY = 100

for i, df in enumerate(dataframes):

    for model_name, model_func in zip(models_to_test_names, model_funcs):
        print("Starting with model: " + model_name)
        print("----------------------------------")
        df_tmp = df.copy()
        df_tmp["prompt"] = ""
        df_tmp["context"] = ""

        # One column per label for the raw reply text, plus the normalized tweet.
        for label in all_labels:
            df_tmp[f'{label}_pred'] = None
        df_tmp["normalized_tweet"] = None

        output_folder = f"../data/vicuna_4bit/{model_name}/"
        os.makedirs(output_folder, exist_ok=True)
        output_path = os.path.join(output_folder, f'{dataframes_names[i]}_generic_test_0.csv')

        # Built once per run: get_response() overwrites 'prompt' and 'context'
        # on every call, and the dict must exist for the JSON dump below even
        # when the split has no rows.
        request_params = get_base_request_params()
        request_params["stopping_strings"] = ["\n", "### Human:", "Human:", "###"]

        rows = tqdm(df_tmp.iterrows(), total=df_tmp.shape[0])
        for row_number, (idx, row) in enumerate(rows, start=1):

            tweet_text = normalizeTweet(row["text"])
            # Write by index label instead of scanning the whole frame for the id.
            df_tmp.at[idx, "normalized_tweet"] = tweet_text

            # The stored prompt is the template with the literal placeholder
            # "label"; the per-label prompts actually sent are built below.
            prompt, context = model_func(tweet_text, "label")
            df_tmp.at[idx, "prompt"] = prompt
            df_tmp.at[idx, "context"] = context

            for label in all_labels:
                label_prompt, label_context = model_func(tweet_text, label)
                response = get_response(request_params, label_prompt, label_context)
                df_tmp.at[idx, f'{label}_pred'] = response

            # Checkpoint on the number of processed rows (the split index `i`
            # never reaches the threshold).
            if row_number % CHECKPOINT_EVERY == 0:
                df_tmp.to_csv(output_path, index=False)
                print(f"Saved progress at index {idx}")
                print("Sample Tweet: ", tweet_text)
                print("Sample Annotation: ", response)

        df_tmp.to_csv(output_path, index=False)
        # Save the request_params as a JSON file in the output folder
        with open(os.path.join(output_folder, 'request_params.json'), 'w') as f:
            json.dump(request_params, f, indent=4)

In [8]:
import sys
import os
sys.path.append("../src")
import llm_utils
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, classification_report
import ast
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer

def classification_reports_to_df(classification_reports, binary):
    """Flatten per-label classification reports into one summary DataFrame.

    Args:
        classification_reports: Mapping of label -> report dict. Each report
            is read at ['macro avg']['f1-score'] and at ['0'] / ['1'] for
            'f1-score' and 'support'.
        binary: When falsy, nothing is built and ``None`` is returned.

    Returns:
        A DataFrame with one row per label and the columns 'label',
        'f1_score_macro', 'f1_score_class_0', 'support_class_0',
        'f1_score_class_1' and 'support_class_1'. A report that lacks one of
        the expected entries is reported on stdout and yields a row whose
        metric cells are empty. Returns ``None`` when ``binary`` is falsy.
    """
    if not binary:
        return None

    columns = ['label', 'f1_score_macro',
               'f1_score_class_0', 'support_class_0',
               'f1_score_class_1', 'support_class_1']

    # Rows are collected as plain dicts and turned into a frame once;
    # DataFrame.append no longer exists in pandas 2.x.
    records = []
    for label, cr in classification_reports.items():
        try:
            record = {
                'label': label,
                'f1_score_macro': cr['macro avg']['f1-score'],
                'f1_score_class_0': cr['0']['f1-score'],
                'support_class_0': cr['0']['support'],
                'f1_score_class_1': cr['1']['f1-score'],
                'support_class_1': cr['1']['support']
            }
        except (LookupError, TypeError) as e:
            # Missing class/metric entry or a malformed report: keep the label
            # so the gap stays visible in the summary.
            print(f"Error for {label}: {e}")
            record = dict.fromkeys(columns)
            record['label'] = label
        records.append(record)

    return pd.DataFrame(records, columns=columns)

# Function to convert a reply such as 'Classification: [0 or 1]' to its int class value
def extract_nth_character(classification_str, n, strip = False):
    """Read the binary class digit at position ``n`` of a reply string.

    Args:
        classification_str: Raw reply. Strings are parsed; real numbers are
            passed through; anything else counts as missing.
        n: Index of the character to read, applied after surrounding
            whitespace has been removed.
        strip: Kept for backward compatibility. Strings are always stripped
            before indexing, so this flag does not change the result.

    Returns:
        0 or 1 when the character at ``n`` is that digit. ``None`` when the
        input is missing/NaN, the string is too short for ``n``, the character
        is not a digit, or the digit is neither 0 nor 1 (that last case is
        also reported on stdout). Numeric inputs that are not NaN are returned
        unchanged.
    """
    import numbers

    if isinstance(classification_str, str):
        text = classification_str.strip()
        try:
            class_value = int(text[n])
        except (ValueError, IndexError):
            # Not a digit, or the reply is shorter than n + 1 characters.
            return None
        if class_value != 0 and class_value != 1:
            print("Class value not 0 or 1")
            print("---------------------")
            print(text)
            print("----------------------")
            return None
        return class_value

    if isinstance(classification_str, numbers.Real):
        # Already-numeric cells are passed through; NaN means "no value".
        if pd.isna(classification_str):
            return None
        return classification_str

    # None, pd.NA and any other non-text value carry no class.
    return None

# Function to flag a row as 'Others' when no label prediction column holds a 1
def assign_others_to_row(row, classes):
    """Return 1 if none of the row's ``<label>_pred`` values equals 1, else 0.

    Args:
        row: Mapping-like object indexed by column name.
        classes: Iterable of label names; ``f"{label}_pred"`` is looked up for each.
    """
    for label in classes:
        if row[f"{label}_pred"] == 1:
            # A single positive prediction is enough to rule out 'Others'.
            return 0
    return 1

def to_dataframe(average_reports, classes):
    """Collect the ``"0"`` metrics of every report entry into one DataFrame.

    Args:
        average_reports: Sequence of dicts. Entries keyed 'micro avg',
            'macro avg', 'weighted avg' or 'accuracy' are skipped; every other
            value is read at ``["0"]`` for precision, recall, f1-score and
            support.
        classes: Sequence of names; a row is labelled with ``classes[i]``,
            where ``i`` is the position of its report in ``average_reports``.

    Returns:
        A DataFrame with the columns 'precision', 'recall', 'f1-score' and
        'support', one row per kept entry.
    """
    skipped_keys = {'micro avg', 'macro avg', 'weighted avg', 'accuracy'}
    metric_names = ("precision", "recall", "f1-score", "support")

    columns = {name: [] for name in metric_names}
    row_labels = []

    for position, report in enumerate(average_reports):
        for class_name, metrics in report.items():
            if class_name not in skipped_keys:
                # The row label comes from the report's position, not from class_name.
                row_labels.append(classes[position])
                class_zero = metrics["0"]
                for name in metric_names:
                    columns[name].append(class_zero[name])

    return pd.DataFrame(columns, index=row_labels)

# Topic labels for which the model produced one prediction column each.
classes = ["War/Terror", "Conspiracy Theory", "Education", "Election Campaign", "Environment",
              "Government/Public", "Health", "Immigration/Integration",
              "Justice/Crime", "Labor/Employment",
              "Macroeconomics/Economic Regulation", "Media/Journalism", "Religion", "Science/Technology"]

vicuna_full_classification_df = pd.read_csv("../data/vicuna_4bit/get_vicuna_prompt_without_context_only_classification_full_classification/test_generic_test_0.csv")

# Parse every raw reply with the project helper.
# NOTE(review): presumably yields 1/0, or a missing value for unparsable replies — confirm in llm_utils.
for class_ in classes:
    vicuna_full_classification_df[f"{class_}_pred"] = vicuna_full_classification_df[f"{class_}_pred"].apply(llm_utils.extract_using_class_token)

# Show only the first rows; the full frame is large.
print(vicuna_full_classification_df.head())

# 'Others' is predicted exactly when no topic column holds a 1. This must run
# before "Others" is added to `classes`.
vicuna_full_classification_df["Others_pred"] = [
    assign_others_to_row(row, classes)
    for _, row in vicuna_full_classification_df.iterrows()
]

classes.append("Others")
mlb = MultiLabelBinarizer(classes=classes)
# The CSV stores each annotation list as its string representation.
vicuna_full_classification_df['annotations'] = vicuna_full_classification_df['annotations'].apply(ast.literal_eval)
y_true = mlb.fit_transform(vicuna_full_classification_df['annotations'])

# Compare against 1 explicitly: a missing prediction (NaN) is truthy in Python
# and would otherwise be counted as a positive label.
vicuna_full_classification_df['response'] = vicuna_full_classification_df.apply(
    lambda row: [class_ for class_ in classes if row[f"{class_}_pred"] == 1], axis=1)
y_pred = mlb.transform(vicuna_full_classification_df['response'])

report = classification_report(y_true, y_pred, output_dict=True, target_names=classes)
df_report = pd.DataFrame(report).transpose()
print(df_report)

                      id       campaign_name                                               text                                        annotations                                             prompt  context  War/Terror_pred  ...  Justice/Crime_pred  Labor/Employment_pred  Macroeconomics/Economic Regulation_pred  Media/Journalism_pred  Religion_pred  Science/Technology_pred                                   normalized_tweet
0    1144169368227635200            REA_0621  The Automobile Association said it is expectin...             ['Macroeconomics/Economic Regulation']  ### Human: Classify the Tweet based on if it's...      NaN                0  ...                 0.0                    0.0                                        0                      0              0                        0  The Automobile Association said it is expectin...
1    1187637103850610688            REA_0621  A severe flooding, triggered by heavy rains an...                                    ['Environment']