In [3]:
import os
import time
import openai
import numpy as np
import pandas as pd
from dotenv import load_dotenv
from scipy.stats import mannwhitneyu
from sectors.config import INDUSTRY_DATA_DIR


# Read credentials from the environment (populated by python-dotenv) instead of
# hard-coding them in the notebook, then hand them to the OpenAI client.
load_dotenv()
openai.api_key = os.environ.get("OPENAI_SECRET_KEY")
openai.organization = os.environ.get("OPENAI_ORGANIZATION_ID")

In [4]:
def call_openai_api(prompt: str, model_name: str, retry_delay: float = 60):
    """Send a single-turn chat prompt and return the model's reply.

    Args:
        prompt: Text sent as the only user message.
        model_name: Chat model identifier passed to the API.
        retry_delay: Seconds to pause after a failed request before returning,
            so that a caller's retry loop does not hit the API again
            immediately. Defaults to 60, the previously hard-coded value.

    Returns:
        The message content of the first choice, or None if the request
        raised any exception.
    """
    try:
        response = openai.ChatCompletion.create(
            model=model_name,
            messages=[{"role": "user", "content": prompt}],
            max_tokens=1,  # the reply is cut off after a single token
            temperature=0,
        )
        return response.choices[0].message.content
    except Exception as e:
        # Best-effort: report the failure, back off, and signal it with None so
        # the caller can decide whether to try again.
        print(f"Error occurred: {e}")
        time.sleep(retry_delay)
        return None
    
    
def pretraining_knowledge(company_name: str, max_attempts: int = 10):
    """Ask gpt-3.5-turbo whether it claims prior knowledge of a company.

    Args:
        company_name: Name interpolated into the Yes/No question.
        max_attempts: Upper bound on API calls for this company. A failed call
            (None) and an empty reply both count as an unsuccessful attempt.

    Returns:
        True if the reply contains "yes", False if it contains "no"
        (case-insensitive substring match, "yes" checked first), and None when
        the reply contains neither.

    Raises:
        RuntimeError: If no non-empty reply was obtained within
            ``max_attempts`` calls. The loop used to retry without limit,
            which hangs the notebook when the failure is permanent.
    """
    prompt = f"Do you have some kind of pretraining knowledge about a company called '{company_name}'? Answer with Yes or No."

    answer = None
    for _ in range(max_attempts):
        answer = call_openai_api(prompt, "gpt-3.5-turbo")
        if answer:
            break
    else:
        raise RuntimeError(
            f"No usable reply for company '{company_name}' after {max_attempts} attempts."
        )

    normalized = answer.lower()
    if "yes" in normalized:
        return True
    if "no" in normalized:
        return False
    # Neither keyword was found: report the result as undetermined explicitly
    # rather than by silently falling off the end of the function.
    return None
    
    
# Load the preprocessed test split (JSON Lines: one record per line).
TEST_PATH = INDUSTRY_DATA_DIR / "test_preprocessed.json"
test = pd.read_json(TEST_PATH, lines=True)

# One API round-trip per row: ask the model about every company's legal name
# and keep the answer alongside the original columns.
knows_company = test["legal_name"].apply(pretraining_knowledge)
test["pretraining_knowledge"] = knows_company
test["pretraining_knowledge"].value_counts()

Error occurred: The server is overloaded or not ready yet.
Error occurred: The server is overloaded or not ready yet.
Error occurred: The server is overloaded or not ready yet.
Error occurred: The server is overloaded or not ready yet.


pretraining_knowledge
False    680
True     159
Name: count, dtype: int64

In [5]:
# Persist the annotated test set as JSON Lines so the analysis can be re-run
# without repeating the API calls.
# `index=True` was dropped: orient="records" never writes the index, and
# pandas >= 2.1 raises a ValueError when the two are combined.
save_path = f"{INDUSTRY_DATA_DIR}/test_pretraining_knowledge.json"
test.to_json(save_path, orient="records", lines=True)

# Analysis

In [12]:
# Reload the saved annotations from disk.
path = f"{INDUSTRY_DATA_DIR}/test_pretraining_knowledge.json"
test = pd.read_json(path, lines=True)

# Split prediction correctness by whether the model claimed prior knowledge,
# then compare the two groups (claimed-knowledge group first).
known = test["pretraining_knowledge"]
correct_known = test.loc[known, "pred_correct"]
correct_unknown = test.loc[~known, "pred_correct"]
mannwhitneyu(correct_known, correct_unknown)

MannwhitneyuResult(statistic=50993.5, pvalue=0.2434340078492524)

In [5]:
def mann_whitney_z(u_statistic, n1, n2):
    """Return |Z| for a Mann-Whitney U statistic via the normal approximation.

    Uses mean n1*n2/2 and variance n1*n2*(n1+n2+1)/12, i.e. without tie or
    continuity correction. The expression is symmetric in n1 and n2, so the
    order in which the group sizes are passed does not matter.
    """
    mean_u = n1 * n2 / 2
    std_u = np.sqrt((n1 * n2 * (n1 + n2 + 1)) / 12)
    return abs((u_statistic - mean_u) / std_u)


# Group sizes copied from the value_counts() output of the annotation cell.
n1 = 680  # pretraining_knowledge == False
n2 = 159  # pretraining_knowledge == True

# Total sample size, derived rather than typed in a third time.
N = n1 + n2
# U statistic copied from the mannwhitneyu result.
U = 50993.5

# NOTE(review): this Z omits tie/continuity corrections, so it need not match
# the Z behind scipy's reported p-value - confirm this is intended.
Z = mann_whitney_z(U, n1, n2)

# Calculate the effect size r = Z / sqrt(N)
r = Z / np.sqrt(N)
r

0.038482205113296156