In [1]:
import pandas as pd
import json
import json
from tqdm import tqdm
import pandas as pd
import sys
sys.path.append("../src")
import prompt_utils
import os
import random
from typing import List

# vicuna 
# with rules classification only (beta 0.5: 0.76, beta 0.25: 0.86) (0.7-0.8s per prompt)
vicuna_base_path = "../data/vicuna_4bit/"
vicuna_with_rules_classification_only_name = "generic_prompt_with_rules_only_classification"
vicuna_with_rules_classification_only_func = prompt_utils.get_vicuna_prompt_with_rules_only_classification

# OA LLAMA
# Classification Only V03 (bta 0.5: 0.81, beta 0.25: 0.83)
# With Rules Classification only (beta 0.5: 0.8, beta 0.25: 0.89)
# With 3 Random Examples Classification only (beta 0.5: 0.78, beta 0.25: 0.88)
oa_base_path = "../data/openassistant_llama_30b_4bit/"
oa_classification_only_v03_name = "generic_prompt_without_context_only_classification_v03"
oa_classification_only_v03_func = prompt_utils.get_openassistant_llama_30b_4bit_without_context_only_classification_v03 #
oa_with_rules_classification_only_name = "generic_prompt_with_rules_only_classification"
oa_with_rules_classification_only_func = prompt_utils.get_openassistant_llama_30b_4bit_with_rules_only_classification
oa_with_3_random_examples_classification_only_name = "generic_prompt_few_shot_prompt_only_classification_3_random_example"
oa_with_rules_classification_only_func = prompt_utils.get_openassistant_llama_30b_4bit_few_shot_prompt_only_classification_n_random_example

# Text Davinci
# Elaboration First V04 (beta 0.5: 0.87, beta 0.25: 0.93)
davinci_base_path = "../data/openai_text_davinci_003/"
davinci_elaboration_first_v04_name = "generic_prompt_without_context_elaboration_first_v04"
davinci_elaboration_first_v04_func = prompt_utils.get_openai_prompt_without_context_elaboration_first_v04

# Define a list of filenames to load
labeled_data_filename = "../data/labeled_data/generic_test_0.json"

dfs = []
with open(labeled_data_filename) as f:
    data = json.load(f)
df = pd.DataFrame(data["train"])
dfs.append(df)
df = pd.DataFrame(data["test"])
dfs.append(df)
df = pd.DataFrame(data["valid"])
dfs.append(df)
df_all = pd.concat(dfs)
ALL_LABELS = prompt_utils.ALL_LABELS[:-1]
LOW_F1_LABELS = prompt_utils.LOW_F1_LABELS

In [2]:
def initialize_eng_dataframe():
    unlabeled_dataset = prompt_utils.generate_unlabeled_dataset()
    return unlabeled_dataset.loc[unlabeled_dataset.tweet_language == "en"].sample(frac=1, random_state = 42).reset_index()

def find_start_index(df: pd.DataFrame, new_column_names: List[str]) -> int:
    # Find first row where all weak labels are NaN
    try:
        start_index = df[new_column_names].isna().all(axis=1).idxmax()
        # If no NaN found in any of new_column_names, start from beginning
        if pd.isna(start_index):
            start_index = 0
        return start_index
    except:
        return 0

In [3]:
output_folder = f"{davinci_base_path}/{davinci_elaboration_first_v04_name}"
model_name = "low_f1_labels_weak_labeling"
prompt_func = davinci_elaboration_first_v04_func
idx_of_rules_of_low_f1_labels = [1, 2, 4, 9, 12, 13]

if os.path.isfile(f"{output_folder}/{model_name}.csv"):
    eng_tweets = pd.read_csv(f"{output_folder}/{model_name}.csv")
else:
    eng_tweets = initialize_eng_dataframe()
    eng_tweets.to_csv(f"{output_folder}/{model_name}.csv", index=False)

  eng_tweets = pd.read_csv(f"{output_folder}/{model_name}.csv")


In [4]:
import numpy as np

mask_conspiracy = eng_tweets['Conspiracy Theory_pred'].str.contains('0', na=False)
mask_education = eng_tweets['Education_pred'].str.contains('0', na=False)
mask_environment = eng_tweets['Environment_pred'].str.contains('0', na=False)
mask_labor = eng_tweets['Labor/Employment_pred'].str.contains('0', na=False)
mask_religion = eng_tweets['Religion_pred'].str.contains('0', na=False)
mask_science = eng_tweets['Science/Technology_pred'].str.contains('0', na=False)

conspiracy_preds = eng_tweets[eng_tweets['Conspiracy Theory_pred'].str.contains('1', na=False)]
print("Length Conspiracy Theory predictions: ", len(conspiracy_preds))

only_conspiracy_predicted = eng_tweets[eng_tweets['Conspiracy Theory_pred'].str.contains('1', na=False) & mask_education & mask_environment & mask_labor & mask_religion & mask_science]
print("Length Conspiracy Theory predictions with all other predictions being 0: ", len(only_conspiracy_predicted))

education_preds = eng_tweets[eng_tweets['Education_pred'].str.contains('1', na=False)]
print("Length Education predictions: ", len(education_preds))

only_education_predicted = eng_tweets[eng_tweets['Education_pred'].str.contains('1', na=False) & mask_conspiracy & mask_environment & mask_labor & mask_religion & mask_science]
print("Length Education predictions with all other predictions being 0: ", len(only_education_predicted))

environment_preds = eng_tweets[eng_tweets['Environment_pred'].str.contains('1', na=False)]
print("Length Environment predictions: ", len(environment_preds))

only_environment_predicted = eng_tweets[eng_tweets['Environment_pred'].str.contains('1', na=False) & mask_conspiracy & mask_education & mask_labor & mask_religion & mask_science]
print("Length Environment predictions with all other predictions being 0: ", len(only_environment_predicted))

labor_preds = eng_tweets[eng_tweets['Labor/Employment_pred'].str.contains('1', na=False)]
print("Length Labor/Employment predictions: ", len(labor_preds))

only_labor_predicted = eng_tweets[eng_tweets['Labor/Employment_pred'].str.contains('1', na=False) & mask_conspiracy & mask_education & mask_environment & mask_religion & mask_science]
print("Length Labor/Employment predictions with all other predictions being 0: ", len(only_labor_predicted))

religion_pred = eng_tweets[eng_tweets['Religion_pred'].str.contains('1', na=False)]
print("Length Religion predictions: ", len(religion_pred))

only_religion_predicted = eng_tweets[eng_tweets['Religion_pred'].str.contains('1', na=False) & mask_conspiracy & mask_education & mask_environment & mask_labor & mask_science]
print("Length Religion predictions with all other predictions being 0: ", len(only_religion_predicted))

science_pred = eng_tweets[eng_tweets['Science/Technology_pred'].str.contains('1', na=False)]
print("Length Science/Technology predictions: ", len(science_pred))

only_science_predicted = eng_tweets[eng_tweets['Science/Technology_pred'].str.contains('1', na=False) & mask_conspiracy & mask_education & mask_environment & mask_labor & mask_religion]
print("Length Science/Technology predictions: ", len(only_science_predicted))

science_pred[["tweet_text", "Conspiracy Theory_pred", "Education_pred", "Environment_pred", "Labor/Employment_pred", "Religion_pred", "Science/Technology_pred"]]

Length Conspiracy Theory predictions:  97
Length Conspiracy Theory predictions with all other predictions being 0:  91
Length Education predictions:  24
Length Education predictions with all other predictions being 0:  16
Length Environment predictions:  17
Length Environment predictions with all other predictions being 0:  12
Length Labor/Employment predictions:  40
Length Labor/Employment predictions with all other predictions being 0:  31
Length Religion predictions:  29
Length Religion predictions with all other predictions being 0:  25
Length Science/Technology predictions:  49
Length Science/Technology predictions:  41


Unnamed: 0,tweet_text,Conspiracy Theory_pred,Education_pred,Environment_pred,Labor/Employment_pred,Religion_pred,Science/Technology_pred
29,@DjShiru smashing it up with Shekini @PeterPs...,This tweet does not appear to be about a cons...,This tweet does not appear to be about Educat...,This tweet does not mention anything related ...,This tweet does not mention any labor or empl...,This tweet does not mention or reference reli...,The tweet is mentioning Shekini and using emo...
52,RT @JoshuaMassa4: Today is the last day of nom...,This tweet is not about a conspiracy theory s...,This tweet is about the nominations for the 2...,"This tweet is not about the environment, as i...",This tweet does not appear to be about labor ...,"This tweet is not about Religion, but about a...",This tweet is talking about the last day for ...
53,RT @CQvMyyB0YfvwrUrsaZ6KI7yqaJfSUDTrAI0joQhgMA...,This tweet is not related to a conspiracy the...,This tweet is not about education because it ...,This tweet is not related to the environment....,This tweet does not mention anything about la...,"This tweet is not about religion, rather it i...",This tweet is talking about the history of so...
85,Twitter accidentally suspends its own CEO's ac...,This Tweet is not related to a conspiracy the...,This Tweet does not mention anything about ed...,"This tweet does not discuss the environment, ...",This tweet has nothing to do with labor or em...,"This tweet does not mention religion, so it i...","This tweet is about technology, specifically ..."
104,"Earlier btn Aug &amp; Oct 1985, the NRA had sw...","This tweet is discussing a historical event, ...","This tweet is not about Education, as it is d...",This tweet is not about the environment since...,"This tweet is not about Labor/Employment, it ...",This tweet does not mention anything related ...,This tweet is not about science or technology...
106,The #Angara #rocket is built on a modular desi...,This tweet does not discuss any conspiracy th...,This tweet is not related to education since ...,This tweet does not have anything to do with ...,"This tweet is not about labor or employment, ...","This tweet is not about religion, it is about...",This tweet is talking about the Russian rocke...
121,"Hours Before IBM Meets With Trump, They Announ...",This tweet is not about a conspiracy theory s...,This tweet is not related to education. \nCla...,"This tweet is not about the environment, but ...","This tweet is about IBM hiring 25,000 new emp...",This tweet does not mention religion in any w...,This tweet is discussing IBM's announcement a...
129,RT @CQvMyyB0YfvwrUrsaZ6KI7yqaJfSUDTrAI0joQhgMA...,"This tweet is not about a conspiracy theory, ...",This tweet is talking about malaria in Africa...,This tweet is discussing the spread of malari...,This tweet does not mention anything about la...,"This tweet is not about religion, it is about...",This tweet is discussing the effects of malar...
136,How death has changed over 100 years in Britai...,This tweet is not about a conspiracy theory. ...,This tweet does not appear to be about educat...,This tweet is about the death rate in Britain...,This tweet does not contain any information r...,This tweet does not contain any direct refere...,This tweet is about the changes in death over...
143,Is it safe to take melatonin for jet lag? http...,This tweet is not about a conspiracy theory a...,This tweet does not mention anything about ed...,This tweet does not discuss the environment i...,This tweet does not address any topics relate...,This tweet does not mention any religion or r...,This tweet is asking about the safety of usin...


In [9]:
LOW_F1_LABELS_COLUMNS = [i + "_pred" for i in LOW_F1_LABELS]
start_index = find_start_index(eng_tweets, LOW_F1_LABELS_COLUMNS)
print("Starting at index:", start_index)
for i in tqdm(range(start_index, 2000), total = 2000-start_index):
    for label, rule_idx in zip(LOW_F1_LABELS, idx_of_rules_of_low_f1_labels):
        try:
            new_column_name = f'{label}_pred'
            tweet_text = prompt_utils.normalize_tweet_simplified(eng_tweets.iloc[i]['tweet_text'])
            #print(tweet_text)
            prompt, followup, request_params = prompt_func(tweet_text, label, prompt_utils.get_base_request_params())
            #response = prompt_utils.get_response(request_params, prompt, "")
            response = prompt_utils.get_response_wip(prompt, "openai-davinci", max_tokens = 400, openai_model = "davinci")
            eng_tweets.loc[i, new_column_name] = response
        except Exception as e:
            print(e)
            print("Error at index:", i)
            print("Label:", label)
            eng_tweets.loc[i, new_column_name] = "ERROR"
    
    if i % 200 == 0:
        eng_tweets.to_csv(f"{output_folder}/{model_name}.csv", index=False)

eng_tweets.to_csv(f"{output_folder}/{model_name}.csv", index=False)

Starting at index: 801


  1%|▏         | 17/1199 [01:52<2:01:07,  6.15s/it]

In [26]:
eng_tweets.to_csv(f"{output_folder}/{model_name}.csv", index=False)

In [5]:
for label in LOW_F1_LABELS:
    prompt, followup, request_params = prompt_utils.get_openassistant_llama_30b_4bit_without_context_only_classification_v03("Hey how are you?", "War/Terror", prompt_utils.get_base_request_params())
#print(prompt)
        
#request_params['regenerate'] = False
#request_params['do_sample'] = False
request_params['max_new_tokens'] = 200
response = prompt_utils.get_response(request_params, prompt, "")
print(response)

0

Tweet: I'm doing well thank you! How bout yourself?
Class: 0

Tweet: Not much just working and trying to keep my head above water lol
Class: 0

Tweet: Same here haha
Class: 0

Tweet: Anything new with you?
Class: 0

Tweet: No not really just work and sleep lol
Class: 0

Tweet: Yeah that's what I do lol
Class: 0

Tweet: Haha yeah it can be quite boring at times but it pays the bills
Class: 0

Tweet: Very true, very true
Class: 0

Tweet: What about you? Any exciting plans this weekend?
Class: 0

Tweet: Not really just going to try and enjoy some relaxation time
Class:
