In [89]:
import pandas as pd
import json
import matplotlib.pyplot as plt

from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
import re

import requests
import json
from tqdm import tqdm
from emoji import demojize
from nltk.tokenize import TweetTokenizer
import os
from datetime import datetime

import numpy as np
import openai
import time

# Paths of the labeled-data JSON files to load
filenames = ["../data/labeled_data/generic_test_0.json"]

# Each file is expected to hold a "train", a "test" and a "valid" split; all
# splits of all files are stacked into one DataFrame, in that order.
dfs = []
for filename in filenames:
    # JSON is UTF-8 by specification. Without an explicit encoding the platform
    # locale is used, which can fail or garble emoji / curly quotes in tweets.
    with open(filename, encoding="utf-8") as f:
        data = json.load(f)
    for split in ("train", "test", "valid"):
        df = pd.DataFrame(data[split])
        dfs.append(df)
# NOTE: the original row indices of every split are kept (no ignore_index),
# so index labels in df_all are not unique.
df_all = pd.concat(dfs)

def normalizeToken(token):
    """Return the normalized form of a single tweet token.

    Rules, in order of precedence:
      * the typographic apostrophe "’" becomes "'" and the ellipsis "…" becomes "..."
      * tokens starting with "@" become "@USER"
      * tokens starting with "http" or "www" (case-insensitive) become "[url]"
      * any other one-character token is passed through ``demojize``
      * everything else is returned unchanged

    Args:
        token: One token as a string.

    Returns:
        The normalized token string.
    """
    # "’" and "…" are single characters, so they must be tested before the
    # one-character branch below; previously that branch caught them first and
    # these replacements could never happen.
    if token == "’":
        return "'"
    if token == "…":
        return "..."
    if token.startswith("@"):
        return "@USER"
    if token.lower().startswith(("http", "www")):
        return "[url]"
    if len(token) == 1:
        return demojize(token)
    return token
    
def normalizeTweet(tweet):
    """Tokenize a tweet, normalize every token and tidy up the joined text.

    Args:
        tweet: Raw tweet text.

    Returns:
        The normalized tweet with tokens separated by single spaces.
    """
    # Curly apostrophes are straightened before tokenization
    raw_tokens = TweetTokenizer().tokenize(tweet.replace("’", "'"))
    text = " ".join(map(normalizeToken, raw_tokens))

    # Re-join pieces the tokenizer split apart. The order matters: later
    # entries operate on the output of earlier ones.
    rewrites = (
        ("n 't", "n't"),
        ("ca n't", "can't"),
        ("ai n't", "ain't"),
        ("p . m .", "pm"),
        ("p . m", "pm"),
        ("a . m .", "am"),
        ("a . m", "am"),
    )
    for old, new in rewrites:
        text = text.replace(old, new)

    # Collapse any run of whitespace into a single space
    return " ".join(text.split())

def api(prompt):
    """Placeholder with no logic: it only imports ``requests`` locally and returns None.

    ``prompt`` is accepted but never used.
    NOTE(review): looks like the remains of an HTTP client for the endpoint
    configured in HOST/URI right after this function - confirm or remove.
    """
    import requests

# Local API host; served over plain http:// (no SSL)
HOST = 'http://127.0.0.1:5000'
# Text-generation endpoint on that host
URI = HOST + '/api/v1/generate'

# Behind an SSL reverse proxy the endpoint is reached via https:// instead, e.g.:
# URI = 'https://your-uri-here.trycloudflare.com/api/v1/generate'

def get_openai_prompt_without_context_elaboration_first(tweet_text, label):
    """Build the two prompts for the "elaborate first, then classify" setup.

    Args:
        tweet_text: Text of the tweet to judge.
        label: Name of the topic the tweet is checked against.

    Returns:
        A ``(prompt, followup)`` tuple: the first string asks for an
        elaboration, the second asks for the final 1/0 label.
    """
    elaboration_request = (
        f"Elaborate on whether you think the Tweet is about {label} or something else."
        f"\n\nTweet: {tweet_text}\n\n"
    )
    labeling_request = (
        f"\nAssign the label 1 if it's about {label} or 0 for not "
        "based on the elaboration. Only output the number."
    )
    return elaboration_request, labeling_request

def get_openai_prompt_without_context_only_classification(tweet_text, label):
    """Build the single prompt for the direct-classification setup.

    Args:
        tweet_text: Text of the tweet to judge.
        label: Name of the topic the tweet is checked against.

    Returns:
        A ``(prompt, followup)`` tuple; the follow-up is always the empty
        string because this setup uses one request only.
    """
    instruction = f"Classify the Tweet based on if it's about {label}. Use 1 or 0 as class."
    tweet_section = f"\n\nTweet: {tweet_text}\nClass: "
    return instruction + tweet_section, ""

def get_model_by_type(model_type):
    """Return the response function for a model type name.

    Args:
        model_type: Model type name, e.g. "openai-davinci".

    Returns:
        ``get_openai_response`` when the name contains "openai" or "gpt-3.5".
        ``None`` for "llama", "vicuna" and "openassistant" (their response
        functions are not wired up) and for any other name.
    """
    # Local backends: the response functions are currently disabled
    if model_type in ("llama", "vicuna", "openassistant"):
        return None
    if "openai" in model_type or "gpt-3.5" in model_type:
        return get_openai_response
    return None

def get_response(prompt, first_model_type, second_model_type = "", follow_up = "", prompting_type = "simple", context = "", openai_model = ""):
    """Run a "simple" (one request) or "two-way" (two chained requests) prompt.

    In "two-way" mode the first model answers ``prompt``; that answer is then
    handed to the second model as context for ``follow_up`` and the second
    answer is returned.

    Args:
        prompt: First prompt. A string for completion models, a chat message
            dict for chat models.
        first_model_type: Model type answering ``prompt``; one of the valid
            model type names listed in the body.
        second_model_type: Model type answering ``follow_up``. Empty means
            "same as the first model".
        follow_up: Second prompt; required for "two-way".
        prompting_type: "simple" or "two-way".
        context: Context forwarded with the first request.
        openai_model: Model name forwarded to the backend as ``model``.
            Required for "two-way". In "simple" mode it is forwarded (together
            with ``context``) only when given.

    Returns:
        The text returned by the last request.

    Raises:
        ValueError: For an unknown model type or prompting type, a missing
            follow-up, or a missing ``openai_model`` in "two-way" mode.
            These cases used to be ``assert`` checks or silently returned None.
        NotImplementedError: If no response function exists for a model type.
    """
    valid_models = ["llama", "vicuna", "openassistant", "openai-davinci", "openai-gpt-3.5-turbo"]
    valid_list = ", ".join(valid_models)

    # All argument checks run before any backend is looked up or called, so an
    # invalid call never costs a request.
    if first_model_type not in valid_models:
        raise ValueError("First model type needs to be one of the following: " + valid_list)
    if prompting_type not in ("simple", "two-way"):
        raise ValueError(f"Unknown prompting type {prompting_type!r}; use 'simple' or 'two-way'")
    if prompting_type == "two-way":
        if second_model_type != "" and second_model_type not in valid_models:
            raise ValueError("Second model type needs to be one of the following: " + valid_list)
        # Checked for every two-way call, not only when a second model is named
        if follow_up == "":
            raise ValueError("Follow up needs to be specified for two_way prompting type")
        if openai_model == "":
            raise ValueError("openai_model needs to be specified for two-way prompting type")

    first_model = get_model_by_type(first_model_type)
    second_model = first_model
    if prompting_type == "two-way" and second_model_type != "":
        second_model = get_model_by_type(second_model_type)
    if first_model is None or second_model is None:
        # Fail with a clear message instead of "'NoneType' object is not callable"
        raise NotImplementedError("No response function is available for the requested model type")

    if prompting_type == "simple":
        if openai_model != "":
            # Honor an explicitly requested model instead of silently using
            # the backend's default model.
            return first_model(prompt, context = context, model = openai_model)
        return first_model(prompt)

    first_response = first_model(prompt, context = context, model = openai_model)

    # The shape of the second context must match what the backend builds from
    # ``model``: a message list for chat models, one string otherwise. The old
    # code evaluated ``prompt + first_response`` after turning the response
    # into a list, which always raised TypeError on the chat path.
    if "gpt" in openai_model:
        second_context = [prompt, {"role": "assistant", "content": first_response}]
    else:
        second_context = prompt + first_response

    return second_model(follow_up, context = second_context, model = openai_model)


def get_openai_response(prompt, context = None, model = "gpt-3.5-turbo"):
    """Send one request to OpenAI and return the generated text.

    Uses ``openai.ChatCompletion`` for chat models and ``openai.Completion``
    for "davinci" (the interface of the pre-1.0 ``openai`` package).

    Args:
        prompt: For chat models (name contains "gpt") one message dict such as
            ``{"role": "user", "content": "..."}``; for "davinci" a string.
        context: Optional text/messages placed before ``prompt``. For chat
            models an iterable of message dicts; for "davinci" a string.
            ``None`` or any empty value means "no context".
        model: "davinci" (sent as "text-davinci-003") or a chat model name
            containing "gpt", which is passed to the API unchanged.

    Returns:
        The ``text`` of the first choice that has one (completion responses),
        otherwise the message content of the first choice (chat responses).

    Raises:
        ValueError: If ``model`` is neither a "gpt" chat model nor "davinci".
            This previously surfaced as an unbound ``response`` variable.
    """
    is_chat_model = "gpt" in model
    if not is_chat_model and model != "davinci":
        raise ValueError(f"Unsupported OpenAI model {model!r}; use 'davinci' or a 'gpt' chat model")

    if is_chat_model:
        # Conversation so far, followed by the new message
        messages = list(context) if context else []
        messages.append(prompt)
        response = openai.ChatCompletion.create(
            model=model,          # chat model name, forwarded as given
            messages=messages,    # conversation history as a list of message dicts
            max_tokens=200,       # upper bound on generated tokens
            stop=None,            # no stop sequence
            temperature=0.7,      # sampling temperature
        )
    else:
        # An empty/None context leaves the prompt untouched. The former
        # default of [] made ``context + prompt`` raise TypeError here.
        full_prompt = context + prompt if context else prompt
        response = openai.Completion.create(
            model="text-davinci-003",  # completion model behind the "davinci" alias
            prompt=full_prompt,        # context and prompt as one string
            max_tokens=200,            # upper bound on generated tokens
            stop=None,                 # no stop sequence
            temperature=0.7,           # sampling temperature
        )

    # Completion choices carry "text"; chat choices carry "message" instead
    for choice in response.choices:
        if "text" in choice:
            return choice.text

    return response.choices[0].message.content

In [90]:
import pandas as pd

# Inputs/outputs of this cell:
#   df_all       - the full DataFrame loaded in the first cell (read only here)
#   balanced_dfs - filled by the loop after these definitions, one DataFrame per label
#   all_labels   - the topic labels that are processed

# Reset on every run of the cell so re-running does not append duplicates
balanced_dfs = []

# Annotation guidelines: one definition/remark text per entry of all_labels,
# in the same order. Not referenced elsewhere in the visible cells.
rules = ["Oxford dictionary's definition of war: “situation in which two or more countries or groups of people fight against each other over a period of time”. Oxford dictionary's definition of terror (terrorism): “violent action or the threat of violent action that is intended to cause fear, usually for political purposes”. Remark: This category includes also causes and consequences of war/terror (e.g. “the current situation in Ukraine may cause a supply crisis for wheat products”).",
"Oxford dictionary's definition of conspiracy: “a secret plan by a group of people to do something harmful or illegal”. Remark: Assignment of this category may depend on viewpoint and political stance of rater, which can be mitigated by focusing on the definition above. If the content of a tweet describes a conspiratorial activity/process, it will be labeled “conspiracy theory”.",
"Oxford dictionary's definition of education: “a process of teaching, training and learning, especially in schools, colleges or universities, to improve knowledge and develop skills”. Remark: Does not include education/training of soldiers (🡪war/terror).",
"Oxford dictionary's definition of election: “the process of choosing a person or a group of people for a position, especially a political position, by voting”. Remark: This category includes all activities aimed at rallying the population for participation in a public election, description of election outcomes, and conduct of the election itself.",
"Oxford dictionary's definition of environment: “the natural world in which people, animals and plants live”. Remark: This category is typically used for tweet content revolving around activities and processes affecting the environment in some way.",
"Oxford dictionary's definition of government: “the group of people who are responsible for controlling a country or a state”. Oxford dictionary's definition of public: “ordinary people who are not members of a particular group or organization” Remark: This category includes also statements/content about the public perception of activities/processes of government (i.e. voiced criticism or praise for a government).",
"Oxford dictionary's definition of health: “the condition of a person's body or mind”. Remark: This category includes also statements related to public health. In such a case both Health and Government/Public must be selected.",
"Oxford dictionary's definition of immigration: “the process of coming to live permanently in a different country from the one you were born in”. Oxford dictionary's definition of integration: “the act or process of mixing people who have previously been separated, usually because of colour, race, religion, etc.”",
"Oxford dictionary's definition of justice: “the legal system used to punish people who have committed crimes”. Oxford dictionary's definition of crime: “activities that involve breaking the law”. Remark: This category does not include statements/content on war crimes (🡪 war/terror).",
"Oxford dictionary's definition of labor: “work, especially physical work”. Oxford dictionary's definition of employment: “work, especially when it is done to earn money; the state of being employed”.",
"Oxford dictionary's definition of macroeconomics: “the study of large economic systems, such as those of whole countries or areas of the world”. Oxford dictionary's definition of regulation: ”an official rule made by a government or some other authority”. Remark: In case of statements/content on economic regulations, this category may likely co-occur with Government/Public category.", 
"Oxford dictionary's definition of media: “the main ways that large numbers of people receive information and entertainment, that is television, radio, newspapers and the internet”. Oxford dictionary's definition of journalism: “the work of collecting and writing news stories for newspapers, magazines, radio, television or online news sites; the news stories that are written”. Remark: This category will be used for statements/content which explicitly references other media outlets or journalists (e.g. “BBC has reported that …”, “Bellingcat has discovered a secret operation of X”). Content which appears “news-worthy” does not generally fall into this category (🡪 newsworthiness is very subjective and context-dependent).",
"Oxford dictionary's definition of religion: “the belief in the existence of a god or gods, and the activities that are connected with the worship of them, or in the teachings of a spiritual leader”.",
"Oxford dictionary's definition of science: “knowledge about the structure and behavior of the natural and physical world, based on facts that you can prove, for example by experiments”. Oxford dictionary's definition of technology: “scientific knowledge used in practical ways in industry, for example in designing new machines”."]

# Topic labels; each is tested for membership in a row's 'annotations' value
# and later used to name a "<label>_pred" column.
all_labels = ["War/Terror", "Conspiracy Theory", "Education", "Election Campaign", "Environment", 
              "Government/Public", "Health", "Immigration/Integration", 
              "Justice/Crime", "Labor/Employment", 
              "Macroeconomics/Economic Regulation", "Media/Journalism", "Religion", "Science/Technology"]

# Number of rows drawn per class (with / without the label) for every label
SAMPLES_PER_CLASS = 65

# Build one balanced sample per label: rows carrying the label first, followed
# by the same number of rows that do not carry it.
for label in all_labels:
    # Evaluate the membership test once and reuse it for both partitions
    has_label = df_all['annotations'].apply(lambda x: label in x).astype(bool)
    label_rows = df_all[has_label]
    non_label_rows = df_all[~has_label]

    # DataFrame.sample raises ValueError when n exceeds the available rows, so
    # cap n for rare (or near-universal) labels while keeping both halves equal.
    n_samples = min(SAMPLES_PER_CLASS, len(label_rows), len(non_label_rows))
    if n_samples < SAMPLES_PER_CLASS:
        print(f"Only {n_samples} rows per class available for label: {label}")

    # Fixed seed keeps the selection reproducible across runs
    sample_label_rows = label_rows.sample(n=n_samples, random_state=42)
    sample_non_label_rows = non_label_rows.sample(n=n_samples, random_state=42)

    # Positives and negatives stacked under a fresh 0..N-1 index
    balanced_df = pd.concat([sample_label_rows, sample_non_label_rows], ignore_index=True)

    balanced_dfs.append(balanced_df)

In [91]:
# Usage:
# SECURITY: the API key is read from the environment. A secret key used to be
# hard-coded on this line; it must be considered leaked and should be revoked.
# Raises KeyError right away if OPENAI_API_KEY is not set.
openai.api_key = os.environ["OPENAI_API_KEY"]

# Write a checkpoint after this many processed rows of a label
SAVE_EVERY = 100

df_all['normalized_tweet'] = None
normalized_tweets_db = {}

# The output location does not depend on the label, so it is prepared once
#current_timestamp = datetime.now().strftime("%H%M%S")
output_folder = "../data/openai_text_davinci_003/generic_prompt_without_context_elaboration_first/"
os.makedirs(output_folder, exist_ok=True)
output_path = os.path.join(output_folder, 'generic_test_0.csv')

for idx, label in enumerate(all_labels):

    sample_df = balanced_dfs[idx]

    print("Starting requesting for label: " + label + "\n")

    new_column_name = f'{label}_pred'

    # Column receiving the model answers for this label
    df_all[new_column_name] = None

    # i counts processed rows starting at 1, so checkpoints land on exactly
    # every SAVE_EVERY-th row (the previous check fired one row early).
    for i, (index, row) in enumerate(tqdm(sample_df.iterrows(), total=sample_df.shape[0]), start=1):

        tweet_text = normalizeTweet(row['text'])

        # Rows of df_all belonging to this tweet; reused for both assignments
        row_mask = df_all['id'] == row["id"]
        df_all.loc[row_mask, 'normalized_tweet'] = tweet_text

        # Active setup: text-davinci elaborates first, then assigns the label.
        # Other setups tried with this loop: the same two-way prompt sent to
        # "openai-gpt-3.5-turbo" as chat messages, and the single
        # classification-only prompt with prompting_type="simple".
        prompt, followup = get_openai_prompt_without_context_elaboration_first(tweet_text, label)
        response = get_response(prompt, "openai-davinci", prompting_type = "two-way", follow_up=followup, openai_model="davinci")
        #print(response)

        # Store the answer in this label's prediction column
        df_all.loc[row_mask, new_column_name] = response

        if i % SAVE_EVERY == 0:
            df_all.to_csv(output_path, index=False)
            print(f"Saved progress at index {index}")
            print("Sample Tweet: ", tweet_text)
            print("Sample Annotation: ", response)

    # Save after every label, so the file is complete once the last label is done
    df_all.to_csv(output_path, index=False)

Starting requesting for label: War/Terror



  0%|          | 0/130 [00:00<?, ?it/s]

Full prompt:  Elaborate on whether you think the Tweet is about War/Terror or something else.

Tweet: The Western coalition led by the #UnitedStates struck a blow at the village of Hoveibaria , where a temporary camp for refugees from Iraq was placed . As a result of the #attack , at least 18 people were killed #Syria #Khaseke [url]




  0%|          | 0/130 [00:03<?, ?it/s]


KeyboardInterrupt: 