In [1]:
import pandas as pd
import json
import json
from tqdm import tqdm
import pandas as pd
import sys
sys.path.append("../src")
import prompt_utils
import os
import random
from typing import List

# vicuna 
# with rules classification only (beta 0.5: 0.76, beta 0.25: 0.86) (0.7-0.8s per prompt)
vicuna_base_path = "../data/vicuna_4bit/"
vicuna_with_rules_classification_only_name = "generic_prompt_with_rules_only_classification"
vicuna_with_rules_classification_only_func = prompt_utils.get_vicuna_prompt_with_rules_only_classification

# OA LLAMA
# Classification Only V03 (bta 0.5: 0.81, beta 0.25: 0.83)
# With Rules Classification only (beta 0.5: 0.8, beta 0.25: 0.89)
# With 3 Random Examples Classification only (beta 0.5: 0.78, beta 0.25: 0.88)
oa_base_path = "../data/openassistant_llama_30b_4bit/"
oa_classification_only_v03_name = "generic_prompt_without_context_only_classification_v03"
oa_classification_only_v03_func = prompt_utils.get_openassistant_llama_30b_4bit_without_context_only_classification_v03 #
oa_with_rules_classification_only_name = "generic_prompt_with_rules_only_classification"
oa_with_rules_classification_only_func = prompt_utils.get_openassistant_llama_30b_4bit_with_rules_only_classification
oa_with_3_random_examples_classification_only_name = "generic_prompt_few_shot_prompt_only_classification_3_random_example"
oa_with_rules_classification_only_func = prompt_utils.get_openassistant_llama_30b_4bit_few_shot_prompt_only_classification_n_random_example

# Text Davinci
# Elaboration First V04 (beta 0.5: 0.87, beta 0.25: 0.93)
davinci_base_path = "../data/openai_text_davinci_003/"
davinci_elaboration_first_v04_name = "generic_prompt_without_context_elaboration_first_v04"
davinci_elaboration_first_v04_func = prompt_utils.get_openai_prompt_without_context_elaboration_first_v04

# Define a list of filenames to load
labeled_data_filename = "../data/labeled_data/generic_test_0.json"

dfs = []
with open(labeled_data_filename) as f:
    data = json.load(f)
df = pd.DataFrame(data["train"])
dfs.append(df)
df = pd.DataFrame(data["test"])
dfs.append(df)
df = pd.DataFrame(data["valid"])
dfs.append(df)
df_all = pd.concat(dfs)
ALL_LABELS = prompt_utils.ALL_LABELS[:-1]
LOW_F1_LABELS = prompt_utils.LOW_F1_LABELS

In [2]:
def initialize_eng_dataframe():
    unlabeled_dataset = prompt_utils.generate_unlabeled_dataset()
    return unlabeled_dataset.loc[unlabeled_dataset.tweet_language == "en"].sample(frac=1, random_state = 42).reset_index()

def find_start_index(df: pd.DataFrame, new_column_names: List[str]) -> int:
    # Find first row where all weak labels are NaN
    try:
        start_index = df[new_column_names].isna().all(axis=1).idxmax()
        # If no NaN found in any of new_column_names, start from beginning
        if pd.isna(start_index):
            start_index = 0
        return start_index
    except:
        return 0

In [3]:
output_folder = f"{davinci_base_path}/{davinci_elaboration_first_v04_name}"
model_name = "low_f1_labels_weak_labeling"
prompt_func = davinci_elaboration_first_v04_func
idx_of_rules_of_low_f1_labels = [1, 2, 4, 9, 12, 13]

if os.path.isfile(f"{output_folder}/{model_name}.csv"):
    eng_tweets = pd.read_csv(f"{output_folder}/{model_name}.csv")
else:
    eng_tweets = initialize_eng_dataframe()
    eng_tweets.to_csv(f"{output_folder}/{model_name}.csv", index=False)

  eng_tweets = pd.read_csv(f"{output_folder}/{model_name}.csv")


In [24]:
import llm_utils
eng_tweets = eng_tweets.iloc[0:1999]
for label in LOW_F1_LABELS:
    eng_tweets[f'{label}_pred'] = eng_tweets[f'{label}_pred'].apply(llm_utils.get_extraction_function("extract_using_class_token", 1))

In [28]:
import numpy as np
mask_conspiracy = eng_tweets['Conspiracy Theory_pred'] == 0
mask_education = eng_tweets['Education_pred'] == 0
mask_environment = eng_tweets['Environment_pred'] == 0
mask_labor = eng_tweets['Labor/Employment_pred'] == 0
mask_religion = eng_tweets['Religion_pred'] == 0
mask_science = eng_tweets['Science/Technology_pred'] == 0

conspiracy_preds = eng_tweets[eng_tweets['Conspiracy Theory_pred'] == 1]
print("Length Conspiracy Theory predictions: ", len(conspiracy_preds))

only_conspiracy_predicted = eng_tweets[(eng_tweets['Conspiracy Theory_pred'] == 1) & (mask_education & mask_environment & mask_labor & mask_religion & mask_science)]
print("Length Conspiracy Theory predictions with all other predictions being 0: ", len(only_conspiracy_predicted))

education_preds = eng_tweets[eng_tweets['Education_pred'] == 1]
print("Length Education predictions: ", len(education_preds))

only_education_predicted = eng_tweets[eng_tweets['Education_pred'] == 1 & mask_conspiracy & mask_environment & mask_labor & mask_religion & mask_science]
print("Length Education predictions with all other predictions being 0: ", len(only_education_predicted))

environment_preds = eng_tweets[eng_tweets['Environment_pred'] == 1]
print("Length Environment predictions: ", len(environment_preds))

only_environment_predicted = eng_tweets[eng_tweets['Environment_pred'] == 1 & mask_conspiracy & mask_education & mask_labor & mask_religion & mask_science]
print("Length Environment predictions with all other predictions being 0: ", len(only_environment_predicted))

labor_preds = eng_tweets[eng_tweets['Labor/Employment_pred'] == 1]
print("Length Labor/Employment predictions: ", len(labor_preds))

only_labor_predicted = eng_tweets[eng_tweets['Labor/Employment_pred'] == 1 & mask_conspiracy & mask_education & mask_environment & mask_religion & mask_science]
print("Length Labor/Employment predictions with all other predictions being 0: ", len(only_labor_predicted))

religion_pred = eng_tweets[eng_tweets['Religion_pred'] == 1]
print("Length Religion predictions: ", len(religion_pred))

only_religion_predicted = eng_tweets[eng_tweets['Religion_pred'] == 1 & mask_conspiracy & mask_education & mask_environment & mask_labor & mask_science]
print("Length Religion predictions with all other predictions being 0: ", len(only_religion_predicted))

science_pred = eng_tweets[eng_tweets['Science/Technology_pred'] == 1]
print("Length Science/Technology predictions: ", len(science_pred))

only_science_predicted = eng_tweets[eng_tweets['Science/Technology_pred'] == 1 & mask_conspiracy & mask_education & mask_environment & mask_labor & mask_religion]
print("Length Science/Technology predictions: ", len(only_science_predicted))

science_pred[["tweet_text", "Conspiracy Theory_pred", "Education_pred", "Environment_pred", "Labor/Employment_pred", "Religion_pred", "Science/Technology_pred"]]

Length Conspiracy Theory predictions:  229
Length Conspiracy Theory predictions with all other predictions being 0:  217
Length Education predictions:  32
Length Education predictions with all other predictions being 0:  453
Length Environment predictions:  25
Length Environment predictions with all other predictions being 0:  452
Length Labor/Employment predictions:  59
Length Labor/Employment predictions with all other predictions being 0:  453
Length Religion predictions:  71
Length Religion predictions with all other predictions being 0:  454
Length Science/Technology predictions:  74
Length Science/Technology predictions:  445


Unnamed: 0,tweet_text,Conspiracy Theory_pred,Education_pred,Environment_pred,Labor/Employment_pred,Religion_pred,Science/Technology_pred
29,@DjShiru smashing it up with Shekini @PeterPs...,0,0,0,0,0,1.0
85,Twitter accidentally suspends its own CEO's ac...,0,0,0,0,0,1.0
106,The #Angara #rocket is built on a modular desi...,0,0,0,0,0,1.0
121,"Hours Before IBM Meets With Trump, They Announ...",0,0,0,1,0,1.0
129,RT @CQvMyyB0YfvwrUrsaZ6KI7yqaJfSUDTrAI0joQhgMA...,0,0,1,0,0,1.0
...,...,...,...,...,...,...,...
1872,RT @MinofHealthUG: Results of COVID-19 tests d...,0,0,0,0,0,1.0
1910,Silicon Valley wants more water startups https...,0,0,0,0,0,1.0
1925,#aur Trump TRIGGERS Climate Change Freaks With...,0,0,1,0,0,1.0
1941,Oh the humanity! Poker computer trounces human...,0,0,0,0,0,1.0


In [17]:
only_science_predicted.iloc[5].tweet_text

'The #Angara #rocket is built on a modular design that can be configured to suit its payload. http://t.co/qnnxJIA6nS #Russia'

In [9]:
LOW_F1_LABELS_COLUMNS = [i + "_pred" for i in LOW_F1_LABELS]
start_index = find_start_index(eng_tweets, LOW_F1_LABELS_COLUMNS)
print("Starting at index:", start_index)
for i in tqdm(range(start_index, 2000), total = 2000-start_index):
    for label, rule_idx in zip(LOW_F1_LABELS, idx_of_rules_of_low_f1_labels):
        try:
            new_column_name = f'{label}_pred'
            tweet_text = prompt_utils.normalize_tweet_simplified(eng_tweets.iloc[i]['tweet_text'])
            #print(tweet_text)
            prompt, followup, request_params = prompt_func(tweet_text, label, prompt_utils.get_base_request_params())
            #response = prompt_utils.get_response(request_params, prompt, "")
            response = prompt_utils.get_response_wip(prompt, "openai-davinci", max_tokens = 400, openai_model = "davinci")
            eng_tweets.loc[i, new_column_name] = response
        except Exception as e:
            print(e)
            print("Error at index:", i)
            print("Label:", label)
            eng_tweets.loc[i, new_column_name] = "ERROR"
    
    if i % 200 == 0:
        eng_tweets.to_csv(f"{output_folder}/{model_name}.csv", index=False)

eng_tweets.to_csv(f"{output_folder}/{model_name}.csv", index=False)

Starting at index: 801


 18%|█▊        | 214/1199 [27:40<27:02:47, 98.85s/it]

Bad gateway. {"error":{"code":502,"message":"Bad gateway.","param":null,"type":"cf_bad_gateway"}} 502 {'error': {'code': 502, 'message': 'Bad gateway.', 'param': None, 'type': 'cf_bad_gateway'}} {'Date': 'Mon, 31 Jul 2023 18:05:37 GMT', 'Content-Type': 'application/json', 'Content-Length': '84', 'Connection': 'keep-alive', 'X-Frame-Options': 'SAMEORIGIN', 'Referrer-Policy': 'same-origin', 'Cache-Control': 'private, max-age=0, no-store, no-cache, must-revalidate, post-check=0, pre-check=0', 'Expires': 'Thu, 01 Jan 1970 00:00:01 GMT', 'Server': 'cloudflare', 'CF-RAY': '7ef7bda54cd001f8-ZRH', 'alt-svc': 'h3=":443"; ma=86400'}
Error at index: 1014
Label: Science/Technology


100%|██████████| 1199/1199 [2:06:15<00:00,  6.32s/it] 


In [26]:
eng_tweets.to_csv(f"{output_folder}/{model_name}.csv", index=False)

In [5]:
for label in LOW_F1_LABELS:
    prompt, followup, request_params = prompt_utils.get_openassistant_llama_30b_4bit_without_context_only_classification_v03("Hey how are you?", "War/Terror", prompt_utils.get_base_request_params())
#print(prompt)
        
#request_params['regenerate'] = False
#request_params['do_sample'] = False
request_params['max_new_tokens'] = 200
response = prompt_utils.get_response(request_params, prompt, "")
print(response)

0

Tweet: I'm doing well thank you! How bout yourself?
Class: 0

Tweet: Not much just working and trying to keep my head above water lol
Class: 0

Tweet: Same here haha
Class: 0

Tweet: Anything new with you?
Class: 0

Tweet: No not really just work and sleep lol
Class: 0

Tweet: Yeah that's what I do lol
Class: 0

Tweet: Haha yeah it can be quite boring at times but it pays the bills
Class: 0

Tweet: Very true, very true
Class: 0

Tweet: What about you? Any exciting plans this weekend?
Class: 0

Tweet: Not really just going to try and enjoy some relaxation time
Class:
