# Jupyter notebook sample

In [1]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

In [2]:
# imports

import os
from tqdm import tqdm
from dotenv import load_dotenv
from huggingface_hub import login
from datasets import load_dataset
import chromadb
from sklearn.manifold import TSNE
from openai import OpenAI
from langchain_openai import OpenAIEmbeddings


In [3]:
load_dotenv(override=True)
api_key = os.getenv('OPENAI_API_KEY')

In [39]:
import src.scraper.profile_scraper as profile_scraper
faculty = profile_scraper.FacultyProfileScraper("https://gsbs.uth.edu/directory/profile?id=462a94d5-93b0-46e2-b02d-d6d5c2355ecb")
faculty.links
# faculty.text

{'pubmed': 'https://www.ncbi.nlm.nih.gov/myncbi/ziyi.li.1/bibliography/public/',
 'website': 'https://sites.google.com/site/ziyiliemory/e'}

In [5]:
from src.summarize import summarize
summarize(faculty.text)

API key found and looks good so far!


'Sure! Here is the processed text file content based on the provided website information for the faculty member, Yunxin Fu:\n\n---\n\n**Yunxin Fu**  \nProfessor  \nBiostatistics  \n713/500-9813  \nReuel Stallones Building  \n1200 Pressler Street, Houston, TX 77030  \n[View CV]\n\n**About**  \nI was trained as a biostatistician specialized in computational biology, and have spent much of my career in developing population genetics theory and statistical methods for analyzing population samples of DNA sequences, including algorithms for simulating samples for the analysis of large scale data. Recently I have been involved in the analysis of polymorphism data from the 1000 Genomes Project and related large data sets. In addition to my continuous work on population genetics theory and evolution, my recent interests include within-individual polymorphism generated by both classical experiments and next generation sequencing for the purpose of understanding the mutational process during indi

In [6]:
# load json into python dictionary
import json 
input_files = ['data/sph_faculty_list.jsonl', 
              'data/gsbs_faculty_list.jsonl', 
              'data/sbmi_faculty_list.jsonl']
faculty_list = []
for input_file in input_files:
    with open(input_file, 'r', encoding='utf-8') as f:
        for line in f:
            faculty_list.append(json.loads(line.strip()))

len(faculty_list)
faculty_list[0]

{'image_url': 'https://web.sph.uth.edu/thumbs/addy.jpg',
 'name': 'Robert Addy',
 'profile': '<< Back\nRobert Addy\nFaculty Associate\nHealth Promotion & Behavioral Sciences\n713/500-9758\nView CV\nAbout\nDr. Addy has over 25 years of research experience, including collecting, managing, and analyzing data in multiple contexts.  He has served as data manager for multiple projects involving single and multiple sites, as well as cross-sectional and longitudinal community-based studies in diverse settings (e.g. children, adolescents, adults, schools, clinics, and communities) and with diverse populations, including Hispanic, African-American, American Indian and Alaska Native populations.  He has extensive experience in compiling, cleaning, and processing data in preparation for analysis and distribution.  He is part of a teaching team that provides graduate level classes in program evaluation and data management at UTSPH.\nCenter Affiliation\nCenter for Health Promotion and Prevention Res

In [7]:
# vectorize the faculty documents
# vectorizer = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
vectorizer = OpenAIEmbeddings(model="text-embedding-ada-002", 
                              openai_api_key=os.getenv('OPENAI_API_KEY'))

In [8]:
DB = "faculties_vectorstore"

In [9]:
client = chromadb.PersistentClient(path=DB)

In [10]:
# Check if the collection exists and delete it if it does
collection_name = "faculties"

# For old versions of Chroma, use this line instead of the subsequent one
# existing_collection_names = [collection.name for collection in client.list_collections()]
existing_collection_names = client.list_collections()

# if collection_name in [c.name for c in existing_collection_names]:
#     client.delete_collection(collection_name)
#     print(f"Deleted existing collection: {collection_name}")
    
# collection = client.create_collection(collection_name)

In [11]:
# # Uncomment if you'd rather not wait for the full 400,000
# # NUMBER_OF_DOCUMENTS = 20000
# # vectors = vectorizer.encode(faculty_documents).astype(float).tolist()
# faculty_profile = []
# for faculty in faculty_list:
#     faculty_profile.append(faculty['profile'])
# vectors = vectorizer.embed_documents(faculty_profile)
# # metadata is everything except the about field
# metadatas = [{"name": faculty['name'], 'url': faculty['profile_url']} for faculty in faculty_list]
# ids = [f"doc_{j}" for j in range(len(faculty_profile))]
# collection.add(
#     ids=ids,
#     documents=faculty_profile,
#     embeddings=vectors,
#     metadatas=metadatas
# )

In [12]:
collection = client.get_or_create_collection('faculties')

In [13]:
result = collection.get(include=['embeddings', 'documents', 'metadatas'])

In [14]:
def find_similars(description):
    results = collection.query(query_embeddings=vectorizer.embed_query(description), n_results=10)
    documents = results['documents'][0][:]
    name = [m['name'] for m in results['metadatas'][0][:]]
    link = [m['url'] for m in results['metadatas'][0][:]]
    return documents, name, link

In [15]:
find_similars("I am looking for a faculty member who is an expert in epidemiology.")

(['<< Back\nElena Feofanova\nAssistant Professor Non-Tenure Instruction\nEpidemiology\n713/500-9827\nReuel Stallones Building\n1200 Pressler Street, Houston, TX 77030\nView CV\nResearch Interests\nEpidemiology',
  '<< Back\nEric Jones\nAssociate Professor\nEpidemiology\n915/975-8528\nMCA Medical Center of the Americas Foundation\n5130 Gateway East Blvd.  MCA 308\nView CV\nAbout\nAs a social epidemiologist, I conduct research on social aspects of wellbeing. A major focus of my studies is on human behavior and wellbeing in extreme circumstances (like disasters, warfare. Migration, cancer), focusing on the role of relationships in stress, depression, anxiety, and functioning. My broad interest in collective action and social support, often using social network analysis, has also led me recently to study health care coordination, drug and violence prevention coalitions, informal and formal science education, and interorganizational dynamics. I continue to be involved in creating and mainta

In [16]:
def make_context(similars):
    message = "To provide some context, here are some faculty members that might be relevant to your description.\n\n"
    documents, names, links = similars
    for similar, name, link in zip(documents, names, links):
        message += f'''Potentially related faculty:\n{name}\n
        website: {link}\n
        {similar}\n\n'''
    return message
make_context(find_similars("I am looking for a faculty member who is an expert in epidemiology."))



In [17]:
def messages_for(description, similars):
    user_prompt = f"Here is my description: {description}\n\n"
    user_prompt += make_context(similars)
    return {"role": "user", "content": user_prompt}
messages_for("I am looking for a faculty member who is an expert in epidemiology.", find_similars("I am looking for a faculty member who is an expert in epidemiology."))

{'role': 'user',

In [18]:
# environment

load_dotenv(override=True)
os.environ['OPENAI_API_KEY'] = os.getenv('OPENAI_API_KEY', 'your-key-if-not-using-env')
os.environ['HF_TOKEN'] = os.getenv('HF_TOKEN', 'your-key-if-not-using-env')

openai = OpenAI()

In [25]:
# print in markdown format
from IPython.display import Markdown, display
def gpt_4o_mini_rag(description, history):

    system_message = "You are a academic advisor. You estimate the relevance of faculty members to a given description. Suggest relevant faculty members. Don't forget to include a link to the faculty member's profile. You should give explanation for your choice in markdown format."
    system_message = {"role": "system", "content": system_message}
    similars = find_similars(description=description)
    messages = messages_for(description, similars)
    messages = [system_message] + history + [messages]
    
    stream = openai.chat.completions.create(
        model="gpt-4o-mini", 
        messages=messages,
        seed=42,
        stream=True
    )
    
    response = ""
    for chunk in stream:
        response += chunk.choices[0].delta.content or ''
        yield response



def display_markdown_response(description):
    response = gpt_4o_mini_rag(description, [])
    print(response)
    # display(Markdown(response))

display_markdown_response("I am keen on clinical trial. Can you give me faculty members who are experts in develop statistical methods for clinical trial?")

<generator object gpt_4o_mini_rag at 0x1bdd28dc0>


In [40]:
import gradio as gr
MODEL = "gpt-4o-mini"



gr.ChatInterface(fn=gpt_4o_mini_rag, 
                  title="Faculty Advisor Chat",
                    description="Ask about faculty members and their expertise",
                    theme='earneleh/paris',
                    examples=[
                        "I am looking for a faculty member who is an expert in epidemiology",
                        "Can you recommend someone who works on clinical trials?",
                        "Who specializes in machine learning?"
    ],
                 type="messages",

                 ).launch(share=True)


Invalid credentials in Authorization header


* Running on local URL:  http://127.0.0.1:7866
* Running on public URL: https://f621f34332679e690b.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




In [None]:
import src.scraper.profile_scraper as profile_scraper
faculty = profile_scraper.FacultyProfileScraper("https://gsbs.uth.edu/directory/profile?id=462a94d5-93b0-46e2-b02d-d6d5c2355ecb")
faculty.links
# faculty.text

{'pubmed': 'https://www.ncbi.nlm.nih.gov/myncbi/ziyi.li.1/bibliography/public/',
 'website': 'https://sites.google.com/site/ziyiliemory/e'}

In [None]:
import src.scraper.profile_scraper as profile_scraper
faculty = profile_scraper.FacultyProfileScraper("https://gsbs.uth.edu/directory/profile?id=462a94d5-93b0-46e2-b02d-d6d5c2355ecb")
faculty.links
# faculty.text

{'pubmed': 'https://www.ncbi.nlm.nih.gov/myncbi/ziyi.li.1/bibliography/public/',
 'website': 'https://sites.google.com/site/ziyiliemory/e'}