In [1]:
import pandas as pd
import json
import json
from tqdm import tqdm
import pandas as pd
import sys
sys.path.append("../src")
import prompt_utils
import os
import random
from typing import List

# vicuna 
# with rules classification only (beta 0.5: 0.76, beta 0.25: 0.86) (0.7-0.8s per prompt)
vicuna_base_path = "../data/vicuna_4bit/"
vicuna_with_rules_classification_only_name = "generic_prompt_with_rules_only_classification"
vicuna_with_rules_classification_only_func = prompt_utils.get_vicuna_prompt_with_rules_only_classification

# OA LLAMA
# Classification Only V03 (bta 0.5: 0.81, beta 0.25: 0.83)
# With Rules Classification only (beta 0.5: 0.8, beta 0.25: 0.89)
# With 3 Random Examples Classification only (beta 0.5: 0.78, beta 0.25: 0.88)
oa_base_path = "../data/openassistant_llama_30b_4bit/"
oa_classification_only_v03_name = "generic_prompt_without_context_only_classification_v03"
oa_classification_only_v03_func = prompt_utils.get_openassistant_llama_30b_4bit_without_context_only_classification_v03 #
oa_with_rules_classification_only_name = "generic_prompt_with_rules_only_classification"
oa_with_rules_classification_only_func = prompt_utils.get_openassistant_llama_30b_4bit_with_rules_only_classification
oa_with_3_random_examples_classification_only_name = "generic_prompt_few_shot_prompt_only_classification_3_random_example"
oa_with_rules_classification_only_func = prompt_utils.get_openassistant_llama_30b_4bit_few_shot_prompt_only_classification_n_random_example

# Text Davinci
# Elaboration First V04 (beta 0.5: 0.87, beta 0.25: 0.93)
davinci_base_path = "../data/openai_text_davinci_003/"
davinci_elaboration_first_v04_name = "generic_prompt_without_context_elaboration_first_v04"
davinci_elaboration_first_v04_func = prompt_utils.get_openai_prompt_without_context_elaboration_first_v04

# Define a list of filenames to load
labeled_data_filename = "../data/labeled_data/generic_test_0.json"

dfs = []
with open(labeled_data_filename) as f:
    data = json.load(f)
df = pd.DataFrame(data["train"])
dfs.append(df)
df = pd.DataFrame(data["test"])
dfs.append(df)
df = pd.DataFrame(data["valid"])
dfs.append(df)
df_all = pd.concat(dfs)
ALL_LABELS = prompt_utils.ALL_LABELS[:-1]
LOW_F1_LABELS = prompt_utils.LOW_F1_LABELS

In [2]:
def initialize_eng_dataframe():
    unlabeled_dataset = prompt_utils.generate_unlabeled_dataset()
    return unlabeled_dataset.loc[unlabeled_dataset.tweet_language == "en"].sample(frac=1, random_state = 42).reset_index()

def find_start_index(df: pd.DataFrame, new_column_names: List[str]) -> int:
    # Find first row where all weak labels are NaN
    try:
        start_index = df[new_column_names].isna().all(axis=1).idxmax()
        # If no NaN found in any of new_column_names, start from beginning
        if pd.isna(start_index):
            start_index = 0
        return start_index
    except:
        return 0

In [3]:
output_folder = f"{vicuna_base_path}/{vicuna_with_rules_classification_only_name}"
model_name = "low_f1_labels_weak_labeling"
prompt_func = vicuna_with_rules_classification_only_func
idx_of_rules_of_low_f1_labels = [1, 2, 4, 9, 12, 13]

if os.path.isfile(f"{output_folder}/{model_name}.csv"):
    eng_tweets = pd.read_csv(f"{output_folder}/{model_name}.csv")
else:
    eng_tweets = initialize_eng_dataframe()
    eng_tweets.to_csv(f"{output_folder}/{model_name}.csv", index=False)

  eng_tweets = pd.read_csv(f"{output_folder}/{model_name}.csv")


In [4]:
LOW_F1_LABELS_COLUMNS = [i + "_pred" for i in LOW_F1_LABELS]
start_index = find_start_index(eng_tweets, LOW_F1_LABELS_COLUMNS)

In [5]:
start_index

18106

In [6]:
import llm_utils
eng_tweets = eng_tweets.iloc[0:18105]
for label in LOW_F1_LABELS:
    eng_tweets[f'{label}_pred'] = eng_tweets[f'{label}_pred'].apply(llm_utils.get_extraction_function("extract_using_class_token", 1))

In [7]:
import numpy as np
mask_conspiracy = eng_tweets['Conspiracy Theory_pred'] == 0
mask_education = eng_tweets['Education_pred'] == 0
mask_environment = eng_tweets['Environment_pred'] == 0
mask_labor = eng_tweets['Labor/Employment_pred'] == 0
mask_religion = eng_tweets['Religion_pred'] == 0
mask_science = eng_tweets['Science/Technology_pred'] == 0

conspiracy_preds = eng_tweets[eng_tweets['Conspiracy Theory_pred'] == 1]
print("Length Conspiracy Theory predictions: ", len(conspiracy_preds))

only_conspiracy_predicted = eng_tweets[(eng_tweets['Conspiracy Theory_pred'] == 1) & (mask_education & mask_environment & mask_labor & mask_religion & mask_science)]
print("Length Conspiracy Theory predictions with all other predictions being 0: ", len(only_conspiracy_predicted))

education_preds = eng_tweets[eng_tweets['Education_pred'] == 1]
print("Length Education predictions: ", len(education_preds))

only_education_predicted = eng_tweets[eng_tweets['Education_pred'] == 1 & mask_conspiracy & mask_environment & mask_labor & mask_religion & mask_science]
print("Length Education predictions with all other predictions being 0: ", len(only_education_predicted))

environment_preds = eng_tweets[eng_tweets['Environment_pred'] == 1]
print("Length Environment predictions: ", len(environment_preds))

only_environment_predicted = eng_tweets[eng_tweets['Environment_pred'] == 1 & mask_conspiracy & mask_education & mask_labor & mask_religion & mask_science]
print("Length Environment predictions with all other predictions being 0: ", len(only_environment_predicted))

labor_preds = eng_tweets[eng_tweets['Labor/Employment_pred'] == 1]
print("Length Labor/Employment predictions: ", len(labor_preds))

only_labor_predicted = eng_tweets[eng_tweets['Labor/Employment_pred'] == 1 & mask_conspiracy & mask_education & mask_environment & mask_religion & mask_science]
print("Length Labor/Employment predictions with all other predictions being 0: ", len(only_labor_predicted))

religion_pred = eng_tweets[eng_tweets['Religion_pred'] == 1]
print("Length Religion predictions: ", len(religion_pred))

only_religion_predicted = eng_tweets[eng_tweets['Religion_pred'] == 1 & mask_conspiracy & mask_education & mask_environment & mask_labor & mask_science]
print("Length Religion predictions with all other predictions being 0: ", len(only_religion_predicted))

science_pred = eng_tweets[eng_tweets['Science/Technology_pred'] == 1]
print("Length Science/Technology predictions: ", len(science_pred))

only_science_predicted = eng_tweets[eng_tweets['Science/Technology_pred'] == 1 & mask_conspiracy & mask_education & mask_environment & mask_labor & mask_religion]
print("Length Science/Technology predictions: ", len(only_science_predicted))

science_pred[["tweet_text", "Conspiracy Theory_pred", "Education_pred", "Environment_pred", "Labor/Employment_pred", "Religion_pred", "Science/Technology_pred"]]

Length Conspiracy Theory predictions:  962
Length Conspiracy Theory predictions with all other predictions being 0:  879
Length Education predictions:  778
Length Education predictions with all other predictions being 0:  2557
Length Environment predictions:  933
Length Environment predictions with all other predictions being 0:  2579
Length Labor/Employment predictions:  699
Length Labor/Employment predictions with all other predictions being 0:  2627
Length Religion predictions:  348
Length Religion predictions with all other predictions being 0:  2914
Length Science/Technology predictions:  290
Length Science/Technology predictions:  2842


Unnamed: 0,tweet_text,Conspiracy Theory_pred,Education_pred,Environment_pred,Labor/Employment_pred,Religion_pred,Science/Technology_pred
106,The #Angara #rocket is built on a modular desi...,0.0,0.0,0.0,0.0,0.0,1.0
154,RT @djnicknicholas: On #TheHitLab tomorrow @Ka...,0.0,0.0,0.0,0.0,0.0,1.0
246,"VIDEO : OneWeb Founder on Bringing 3,000 Jobs ...",0.0,1.0,0.0,1.0,0.0,1.0
345,RT @NRMOnline: President @KagutaMuseveni comme...,0.0,1.0,0.0,0.0,0.0,1.0
373,RT @CQvMyyB0YfvwrUrsaZ6KI7yqaJfSUDTrAI0joQhgMA...,0.0,1.0,1.0,1.0,0.0,1.0
...,...,...,...,...,...,...,...
17764,Giuliani: Trump to fight hacking with cybersec...,0.0,0.0,0.0,0.0,0.0,1.0
17815,RT @TButaka: Which Definition can you never fo...,0.0,0.0,0.0,0.0,0.0,1.0
17867,This could prevent distracted driving https://...,0.0,1.0,1.0,0.0,0.0,1.0
17893,Wireless headphones for the holidays https://t...,0.0,0.0,0.0,0.0,0.0,1.0


In [36]:
LOW_F1_LABELS_COLUMNS = [i + "_pred" for i in LOW_F1_LABELS]
start_index = find_start_index(eng_tweets, LOW_F1_LABELS_COLUMNS)
start_index = 4143
print("Starting at index:", start_index)
for i in tqdm(range(start_index, len(eng_tweets)), total = len(eng_tweets)-start_index):
    for label, rule_idx in zip(LOW_F1_LABELS, idx_of_rules_of_low_f1_labels):
        try:
            new_column_name = f'{label}_pred'
            tweet_text = prompt_utils.normalize_tweet_simplified(eng_tweets.iloc[i]['tweet_text'])
            #print(tweet_text)
            prompt, followup, request_params = prompt_func(tweet_text, label, prompt_utils.RULES[rule_idx], prompt_utils.get_base_request_params())
            response = prompt_utils.get_response(request_params, prompt, "")
            # for openai
            #response = prompt_utils.get_response_wip(prompt, "openai-davinci", max_tokens = 400, openai_model = "davinci")
            eng_tweets.loc[i, new_column_name] = response
        except Exception as e:
            print(e)
            print("Error at index:", i)
            print("Label:", label)
            eng_tweets.loc[i, new_column_name] = "ERROR"
    
    if i % 300 == 0:
        eng_tweets.to_csv(f"{output_folder}/{model_name}.csv", index=False)
        print("Saved at index:", i)
        print("Last label:", label)
        print("Last response:", response)

eng_tweets.to_csv(f"{output_folder}/{model_name}.csv", index=False)

Starting at index: 4143


  0%|          | 0/1565961 [00:00<?, ?it/s]

'float' object has no attribute 'replace'
Error at index: 4143
Label: Conspiracy Theory
'float' object has no attribute 'replace'
Error at index: 4143
Label: Education
'float' object has no attribute 'replace'
Error at index: 4143
Label: Environment
'float' object has no attribute 'replace'
Error at index: 4143
Label: Labor/Employment
'float' object has no attribute 'replace'
Error at index: 4143
Label: Religion
'float' object has no attribute 'replace'
Error at index: 4143
Label: Science/Technology
'float' object has no attribute 'replace'
Error at index: 4144
Label: Conspiracy Theory
'float' object has no attribute 'replace'
Error at index: 4144
Label: Education
'float' object has no attribute 'replace'
Error at index: 4144
Label: Environment
'float' object has no attribute 'replace'
Error at index: 4144
Label: Labor/Employment
'float' object has no attribute 'replace'
Error at index: 4144
Label: Religion
'float' object has no attribute 'replace'
Error at index: 4144
Label: Science/T

  0%|          | 58/1565961 [04:42<4454:00:35, 10.24s/it]

Saved at index: 4200
Last label: Science/Technology
Last response:  0 (Not about Science/Technology


  0%|          | 358/1565961 [29:29<4407:59:52, 10.14s/it]

Saved at index: 4500
Last label: Science/Technology
Last response:  0 (Not about Science/Technology


  0%|          | 658/1565961 [54:15<4407:01:37, 10.14s/it]

Saved at index: 4800
Last label: Science/Technology
Last response:  0 (Not about Science/Technology


  0%|          | 958/1565961 [1:19:03<4415:10:28, 10.16s/it]

Saved at index: 5100
Last label: Science/Technology
Last response:  0 (Not about Science/Technology


  0%|          | 1258/1565961 [1:43:53<4413:08:09, 10.15s/it]

Saved at index: 5400
Last label: Science/Technology
Last response:  0 (Not about Science/Technology


  0%|          | 1558/1565961 [2:08:42<4437:50:05, 10.21s/it]

Saved at index: 5700
Last label: Science/Technology
Last response:  0 (Not about Science/Technology


  0%|          | 1858/1565961 [2:33:28<4409:13:11, 10.15s/it]

Saved at index: 6000
Last label: Science/Technology
Last response:  0 (Not about Science/Technology


  0%|          | 2158/1565961 [2:58:19<4434:52:46, 10.21s/it]

Saved at index: 6300
Last label: Science/Technology
Last response:  0 (Not about Science/Technology


  0%|          | 2458/1565961 [3:23:07<4421:15:28, 10.18s/it]

Saved at index: 6600
Last label: Science/Technology
Last response:  0 (Not about Science/Technology


  0%|          | 2758/1565961 [3:47:55<4416:44:29, 10.17s/it]

Saved at index: 6900
Last label: Science/Technology
Last response:  0 (Not about Science/Technology


  0%|          | 3058/1565961 [4:12:47<4466:19:13, 10.29s/it]

Saved at index: 7200
Last label: Science/Technology
Last response:  0 (Not about Science/Technology


  0%|          | 3358/1565961 [4:37:37<4444:47:30, 10.24s/it]

Saved at index: 7500
Last label: Science/Technology
Last response:  0 (Not about Science/Technology


  0%|          | 3658/1565961 [5:02:26<4505:46:00, 10.38s/it]

Saved at index: 7800
Last label: Science/Technology
Last response:  0 (Not about Science/Technology


  0%|          | 3958/1565961 [5:27:21<4464:00:52, 10.29s/it]

Saved at index: 8100
Last label: Science/Technology
Last response:  0 (Not related to Science/Techn


  0%|          | 4258/1565961 [5:52:14<4484:07:29, 10.34s/it]

Saved at index: 8400
Last label: Science/Technology
Last response:  0 (Not about Science/Technology


  0%|          | 4558/1565961 [6:17:04<4403:01:54, 10.15s/it]

Saved at index: 8700
Last label: Science/Technology
Last response:  0 (Not about Science/Technology


  0%|          | 4858/1565961 [6:41:58<4410:36:34, 10.17s/it]

Saved at index: 9000
Last label: Science/Technology
Last response:  0 (Not about Science/Technology


  0%|          | 5158/1565961 [7:06:49<4397:13:51, 10.14s/it]

Saved at index: 9300
Last label: Science/Technology
Last response:  0 (Not about Science/Technology


  0%|          | 5458/1565961 [7:32:24<4637:06:47, 10.70s/it]

Saved at index: 9600
Last label: Science/Technology
Last response:  0 (Not about Science/Technology


  0%|          | 5758/1565961 [7:58:39<4606:54:36, 10.63s/it]

Saved at index: 9900
Last label: Science/Technology
Last response:  0 (Not about Science/Technology


  0%|          | 6058/1565961 [8:24:58<4594:05:58, 10.60s/it]

Saved at index: 10200
Last label: Science/Technology
Last response:  0 (Not about Science/Technology


  0%|          | 6358/1565961 [8:51:14<4743:41:19, 10.95s/it]

Saved at index: 10500
Last label: Science/Technology
Last response:  0 (Not about Science/Technology


  0%|          | 6658/1565961 [9:16:58<4442:23:23, 10.26s/it]

Saved at index: 10800
Last label: Science/Technology
Last response:  0 (Not about Science/Technology


  0%|          | 6958/1565961 [9:41:44<4461:21:41, 10.30s/it]

Saved at index: 11100
Last label: Science/Technology
Last response:  0 (Not about Science/Technology


  0%|          | 7258/1565961 [10:06:43<4586:20:27, 10.59s/it]

Saved at index: 11400
Last label: Science/Technology
Last response:  0 (Not about Science/Technology


  0%|          | 7558/1565961 [10:32:18<4614:25:29, 10.66s/it]

Saved at index: 11700
Last label: Science/Technology
Last response:  0 (Not about Science/Technology


  1%|          | 7858/1565961 [10:58:20<4590:22:44, 10.61s/it]

Saved at index: 12000
Last label: Science/Technology
Last response:  0 (Not about Science/Technology


  1%|          | 8158/1565961 [11:24:29<4559:48:49, 10.54s/it]

Saved at index: 12300
Last label: Science/Technology
Last response:  0 (Not about Science/Technology


  1%|          | 8458/1565961 [11:50:12<4566:10:23, 10.55s/it]

Saved at index: 12600
Last label: Science/Technology
Last response:  0 (Not about Science/Technology


  1%|          | 8758/1565961 [12:15:53<4605:23:41, 10.65s/it]

Saved at index: 12900
Last label: Science/Technology
Last response:  0 (Not about Science/Technology


  1%|          | 9058/1565961 [12:41:32<4576:47:27, 10.58s/it]

Saved at index: 13200
Last label: Science/Technology
Last response:  0 (Not about Science/Technology


  1%|          | 9358/1565961 [13:07:11<4575:15:20, 10.58s/it]

Saved at index: 13500
Last label: Science/Technology
Last response:  0 (Not about Science/Technology


  1%|          | 9658/1565961 [13:32:51<4575:10:22, 10.58s/it]

Saved at index: 13800
Last label: Science/Technology
Last response:  0 (Not about Science/Technology


  1%|          | 9958/1565961 [13:58:31<4550:51:54, 10.53s/it]

Saved at index: 14100
Last label: Science/Technology
Last response:  0 (Not about Science/Technology


  1%|          | 10258/1565961 [14:24:11<4570:01:43, 10.58s/it]

Saved at index: 14400
Last label: Science/Technology
Last response:  0 (Not about Science/Technology


  1%|          | 10558/1565961 [14:49:52<4603:59:27, 10.66s/it]

Saved at index: 14700
Last label: Science/Technology
Last response:  0 (Not about Science/Technology


  1%|          | 10858/1565961 [15:15:33<4579:09:44, 10.60s/it]

Saved at index: 15000
Last label: Science/Technology
Last response:  0 (Not about Science/Technology


  1%|          | 11158/1565961 [15:41:16<4543:04:24, 10.52s/it]

Saved at index: 15300
Last label: Science/Technology
Last response:  0 (Not about Science/Technology


  1%|          | 11458/1565961 [16:06:56<4609:52:21, 10.68s/it]

Saved at index: 15600
Last label: Science/Technology
Last response:  0 (Not about Science/Technology


  1%|          | 11758/1565961 [16:32:37<4591:41:39, 10.64s/it]

Saved at index: 15900
Last label: Science/Technology
Last response:  0 (Not about Science/Technology


  1%|          | 12058/1565961 [16:58:26<4704:03:30, 10.90s/it]

Saved at index: 16200
Last label: Science/Technology
Last response:  0 (Not about Science/Technology


  1%|          | 12358/1565961 [17:25:07<4744:59:53, 11.00s/it]

Saved at index: 16500
Last label: Science/Technology
Last response:  0 (Not about Science/Technology


  1%|          | 12658/1565961 [17:51:59<4853:56:39, 11.25s/it]

Saved at index: 16800
Last label: Science/Technology
Last response:  0 (Not about Science/Technology


  1%|          | 12958/1565961 [18:18:54<4827:59:59, 11.19s/it]

Saved at index: 17100
Last label: Science/Technology
Last response:  0 (Not about Science/Technology


  1%|          | 13258/1565961 [18:45:34<4699:41:04, 10.90s/it]

Saved at index: 17400
Last label: Science/Technology
Last response:  0 (Not about Science/Technology


  1%|          | 13558/1565961 [19:12:26<4771:35:21, 11.07s/it]

Saved at index: 17700
Last label: Science/Technology
Last response:  0 (Not about Science/Technology


  1%|          | 13858/1565961 [19:39:07<4776:43:20, 11.08s/it]

Saved at index: 18000
Last label: Science/Technology
Last response:  0 (Not about Science/Technology


  1%|          | 13962/1565961 [19:48:24<2201:42:19,  5.11s/it]


KeyboardInterrupt: 

In [37]:
eng_tweets.to_csv(f"{output_folder}/{model_name}.csv", index=False)

In [35]:
eng_tweets.iloc[4280]

index                                                                  424685
tweetid                                                   1289113606794223616
userid                                                    1108056700450414592
user_display_name                                                Hustle Queen
user_screen_name                                                HustleQueenUg
user_reported_location                                                 Uganda
user_profile_description                      patriot of the great Sevolution
user_profile_url                                                          NaN
follower_count                                                        13491.0
following_count                                                          5975
account_creation_date                                              2019-03-19
account_language                                                           en
tweet_language                                                  

In [26]:
eng_tweets.to_csv(f"{output_folder}/{model_name}.csv", index=False)