In [None]:
import os
import sys
import csv
import constants as constants
from langchain_community.document_loaders import TextLoader
from langchain.indexes import VectorstoreIndexCreator

# Publish the key held in the local `constants` module as an environment
# variable before the LangChain/OpenAI packages are imported.
os.environ["OPENAI_API_KEY"] = constants.APIKEY
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from langchain_openai import ChatOpenAI

# Shared chat client used by every query below; the key is read back from
# the environment variable set above.
openai_client = ChatOpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

# Function to query OpenAI
def query_openai(prompt):
    """Send ``prompt`` to the shared chat client and return the reply text.

    ``openai_client`` is a LangChain ``ChatOpenAI`` chat model, not the raw
    OpenAI SDK client, so it exposes no ``completions.create`` endpoint and
    takes no ``engine`` argument. The request therefore goes through the
    model's ``invoke`` method and uses whichever model the client was
    configured with.

    Args:
        prompt: The full prompt text to send.

    Returns:
        The text of the model's reply with surrounding whitespace removed.
    """
    # Extra keyword arguments to `invoke` are forwarded to the API request,
    # so the original 100-token cap on the answer is preserved.
    response = openai_client.invoke(prompt, max_tokens=100)
    return response.content.strip()


# Entity categories mapped to a few seed examples. The examples are inserted
# verbatim into the prompt sent to the model, so they must be spelled
# correctly ('coronavirus' was previously misspelled as 'coronavirous').
covid_entity_cat_examples = {
    'MEDICAL-CONDITIONS': ['COVID-19', 'coronavirus'],
    'SYMPTOMS': ['cough', 'shortness of breath'],
    'TREATMENTS': ['vaccine', 'hydroxychloroquine'],
    'EVENTS': ['lockdown', 'quarantine'],
    'POLICIES': ['mask mandate', 'travel ban'],
    'METRICS': ['cases', 'deaths', 'hospitalizations', 'recoveries', 'recovery rate']
}


# Standardized prompt template reused for every category query.
prompt_template = PromptTemplate(
    input_variables=["category", "examples"],
    template="""List 10 {category} related to COVID-19, e.g., {examples}. I want results
    in a comma-separated format written in a continuous line. Each result
    needs to be surrounded by double quotes."""
)

# Chain that fills the template and sends it to the chat client. It is named
# `category_chain` rather than `chain` because a later cell runs
# `from itertools import chain`; sharing the name would make the cells
# clobber each other whenever one of them is re-run.
category_chain = LLMChain(llm=openai_client, prompt=prompt_template)


def _parse_quoted_items(response, limit=10):
    """Split a model reply of the form `"a", "b", ...` into clean items.

    The prompt asks for double-quoted, comma-separated values, so the reply is
    parsed as CSV. This removes the surrounding quotes and keeps an item that
    itself contains a comma in one piece, which a plain `split(',')` does not.

    Args:
        response: Raw reply text; it may span several lines.
        limit: Maximum number of items to return.

    Returns:
        Up to `limit` non-empty items, stripped of surrounding whitespace.
    """
    rows = csv.reader(response.splitlines(), skipinitialspace=True)
    items = [field.strip() for row in rows for field in row]
    return [item for item in items if item][:limit]


results_path = 'results.csv'
# The file is opened in append mode, so write the header only when the file is
# new or empty; otherwise every run would add another header row.
needs_header = not os.path.exists(results_path) or os.path.getsize(results_path) == 0

# newline='' is what the csv module requires of files it writes to; without it
# extra blank rows can appear on platforms that translate line endings.
with open(results_path, 'a', newline='') as resfile:
    writer = csv.writer(resfile)
    if needs_header:
        writer.writerow(['label'] + [f'word_{i}' for i in range(1, 10 + 1)])

    for cat, ex in covid_entity_cat_examples.items():
        response = category_chain.run({"category": cat, "examples": ", ".join(ex)})
        writer.writerow([cat] + _parse_quoted_items(response))




In [None]:
# Second query: ask for the 10 organizations most often mentioned in
# connection with COVID-19. The prompt is fixed text, so the template declares
# no placeholder variables and the chain is run with an empty input mapping.
prompt_template2 = PromptTemplate(
    template="List the 10 most talked-about organizations related to the COVID-19 pandemic.",
    input_variables=[],
)
chain2 = LLMChain(prompt=prompt_template2, llm=openai_client)
response_orgs = chain2.run({})

In [None]:
from itertools import chain  # still needed by the following cell, which uses chain.from_iterable
import re

# Matches only the list numbering at the start of a line ("1.", "10)", ...).
# The earlier pattern r'\d\.?' removed every digit anywhere in the line, so a
# name such as "COVID-19 Vaccines Global Access" lost its digits.
_leading_list_number = re.compile(r'^\s*\d+[.)]\s*')

# One entry per organization: drop the numbering, split joint entries written
# as "A/B" into separate names, and skip the empty strings that blank lines in
# the reply would otherwise produce.
full_orgs = [
    part.strip()
    for line in response_orgs.split('\n')
    for part in _leading_list_number.sub('', line).split('/')
    if part.strip()
]

In [None]:
# Collect every parenthesised abbreviation, e.g. "(WHO)" -> "WHO", across all organization names.
abbrevs = list(chain.from_iterable(re.findall(r'\((.*?)\)', name) for name in full_orgs))


In [None]:
# Remove the parenthesised abbreviations from the full names, then trim and lower-case them.
full_orgs = list(map(lambda name: re.sub(r'\((.*?)\)', '', name).strip().lower(), full_orgs))


['world health organization',
 'centers for disease control and prevention',
 'national institutes of health',
 'food and drug administration',
 'bill & melinda gates foundation',
 'united nations',
 'johns hopkins university',
 'pfizer',
 'biontech',
 'moderna',
 'astrazeneca']

In [None]:
# Lower-cased abbreviations first, followed by the already lower-cased full names.
all_orgs = list(map(str.lower, abbrevs)) + full_orgs


['who',
 'cdc',
 'nih',
 'fda',
 'un',
 'world health organization',
 'centers for disease control and prevention',
 'national institutes of health',
 'food and drug administration',
 'bill & melinda gates foundation',
 'united nations',
 'johns hopkins university',
 'pfizer',
 'biontech',
 'moderna',
 'astrazeneca']

In [None]:

# 'who' could be mistaken for the pronoun, but these checks run only after all stopwords have been removed, so that ambiguity does not arise

In [None]:
# Manually extended list of vaccine names and manufacturers, collected from a
# single Google search. Vaccines were first meant to be a separate entity
# type, but their names overlap heavily with the organization names, so they
# are merged into the organizations list instead.
# 'janssen' was previously misspelled as 'jannsen' and so could never match.
all_vaccines = ['johnson & johnson', 'j&j', 'jnj', 'janssen', 'sinopharm', 'comirnaty', 'spikevax', 'vaxzevria', 'bbibp-corv', 'sinovac', 'coronavac', 'covaxin', 'bharat biotech', 'sputnik', 'gamaleya', 'covovax', 'novavax', 'cansinobio', 'convidecia', 'epivaccorona', 'vector institute', 'coviran', 'shifa pharmed', 'soberana', 'finlay institute', 'center for genetic engineering']

# Add only the names that are not already present, so re-running this cell
# does not append the whole vaccine list a second time.
all_orgs += [name for name in all_vaccines if name not in all_orgs]

# Dashes are kept during preprocessing; removing them would corrupt
# scientific names such as 'bbibp-corv'.
all_orgs

['who',
 'cdc',
 'nih',
 'fda',
 'un',
 'world health organization',
 'centers for disease control and prevention',
 'national institutes of health',
 'food and drug administration',
 'bill & melinda gates foundation',
 'united nations',
 'johns hopkins university',
 'pfizer',
 'biontech',
 'moderna',
 'astrazeneca',
 'johnson & johnson',
 'j&j',
 'jnj',
 'jannsen',
 'sinopharm',
 'comirnaty',
 'spikevax',
 'vaxzevria',
 'bbibp-corv',
 'sinovac',
 'coronavac',
 'covaxin',
 'bharat biotech',
 'sputnik',
 'gamaleya',
 'covovax',
 'novavax',
 'cansinobio',
 'convidecia',
 'epivaccorona',
 'vector institute',
 'coviran',
 'shifa pharmed',
 'soberana',
 'finlay institute',
 'center for genetic engineering']

In [None]:
# Append the complete organization list as one CSV row. Because the file is
# opened in append mode, each run adds one more row.
# newline='' is what the csv module requires of files it writes to; without it
# extra blank rows can appear on platforms that translate line endings.
with open('organizations.csv', 'a', newline='') as resfile:
    writer = csv.writer(resfile)
    writer.writerow(all_orgs)
