In [1]:
import torch
import time
import os
import langchain
from langchain.chat_models import ChatOpenAI
from langchain.schema import (
    AIMessage,
    HumanMessage,
    SystemMessage
)
from loguru import logger
from dotenv import load_dotenv
from math import ceil
# Load variables from a local .env file into the process environment
# (presumably the OpenAI API key that ChatOpenAI needs -- confirm against your .env).
load_dotenv()
# Sampling rate: later multiplied by ceil(episode_count / 100) * 2 to decide how
# many questions to generate for the podcast.
num_question_to_make_per_hundred = 10


In [27]:
# Shared chat-model client; pipeline_cleanup and pipeline_write_questions both call it.
chat = ChatOpenAI(model_name='gpt-3.5-turbo-16k')

import pandas as pd

# Raw (episode index, model reply) pairs, appended to by the question-generation loop.
res = []

# One row per episode: EPI_NUM / EpiName / Podcast identify the episode, while
# Person / Abstract hold the two generated questions.
columns = ["EPI_NUM", "EpiName", "Podcast", "Person", "Abstract"]
df = pd.DataFrame(columns=columns)
# EPI_NUM intentionally stays an ordinary column (not the index): the generation
# loop appends rows as dicts that carry an "EPI_NUM" key, which must match a column.
print(df)


def pipeline_cleanup(doc):
    """Ask the chat model to strip web-page boilerplate from a downloaded text.

    Args:
        doc: Raw text of the downloaded page; sent to the model as the human
            message, below an "article" separator line.

    Returns:
        The ``content`` of the model's reply, i.e. whatever the model returned
        as the cleaned-up article.
    """
    # Plain (non-f) string: the instructions contain no placeholders.
    root_prompt = """I want you to clean up the following text downloaded from a webpage. 
    Our goal is to keep the actual article without changing the content. 
    1. remove social media, privacy stuff from the end of the document.
    2. Only return the cleaned up article, and nothing else. """

    prompt1 = f"""
    -----------article-----------------  
    {doc}
    """

    messages = [
        SystemMessage(content=root_prompt),
        HumanMessage(content=prompt1),
    ]

    # `chat` is the module-level ChatOpenAI client.
    cleaned_text = chat(messages).content
    return cleaned_text

def pipeline_write_questions(doc):
    """Ask the chat model for two stand-alone questions about a document.

    The system prompt requests one question about a person's work, tagged
    ``[Q1]``, and one about an abstract subject, tagged ``[Q2]``;
    parse_ai_questions keys off those tags.

    Args:
        doc: Article/transcript text; sent to the model as the human message,
            below an "article" separator line.

    Returns:
        The ``content`` of the model's reply. It is expected to be the two
        tagged questions, but nothing here enforces that format.
    """
    # Plain (non-f) string: the instructions contain no placeholders.
    root_prompt = """Write two questions based on the given document. 
    
    [Q1]. Write a ONE PART question about someone's work discussed in the document - The question should not need any context to be understandable. 
    [Q2]. Write a ONE PART question about an abstract subject discussed in the document - The question should not need any context to be understandable. 
    
    
    Questions have to make sense without any context.
    Questions should have only one part.
    
    Avoid the following:
        Questions that point to the article. 
        The words "article" and its synonyms.
        The words "author" and its synonyms.


    GOOD EXAMPLES:
        1. What was Immanuel Kant's preoccupation regarding working out things about the world without experience?
        2. What did Peter Singer argue in his book "The Life You Can Save"?
    
    
    BAD EXAMPLES:
        1. Who is being interviewed in this episode of Philosophy Bites? (requires context)
        2. Have I killed the man intentionally? (requires context)
        3. What is the main theme or concept discussed in the article? (requires context)
        4. Can you explain what metaphysics is and how it can be characterized? (Two parts)
        5. What are green virtues and why should we be concerned with them? (Two parts)
    
    do not output anything but the questions listed in the above order.
    """

    prompt1 = f"""
    -----------article-----------------  
    {doc}
    """

    messages = [
        SystemMessage(content=root_prompt),
        HumanMessage(content=prompt1),
    ]

    # `chat` is the module-level ChatOpenAI client.
    ai_questions = chat(messages).content
    return ai_questions


def parse_ai_questions(ai_questions, doc_idx, podcast_title, episode_title):
    """Turn the model's tagged reply into one row of question data.

    Each line of ``ai_questions`` is expected to start with a ``[Q1]`` or
    ``[Q2]`` tag (typically ``[Q1]. <question>``). Lines with any other prefix
    are ignored.

    Args:
        ai_questions: Raw text returned by the model, one question per line.
        doc_idx: Episode number, stored under ``"EPI_NUM"``.
        podcast_title: Stored under ``"Podcast"``.
        episode_title: Stored under ``"EpiName"``.

    Returns:
        dict with the three identifying keys plus ``"Person"`` (from Q1) and
        ``"Abstract"`` (from Q2) for every tag found. A tag missing from the
        reply leaves its key out of the dict.
    """
    question_dict = {"EPI_NUM": doc_idx, "EpiName": episode_title, "Podcast": podcast_title}

    question_type = {
        "Q1": "Person",
        "Q2": "Abstract",
    }

    for raw_line in ai_questions.split('\n'):
        # Tolerate indentation, trailing blanks and a stray "\r" around the line.
        line = raw_line.strip()
        # Characters 1-2 of "[Q1]..." are the question code.
        q_code = line[1:3]
        if q_code in question_type:
            # Skip the 4-char "[Qn]" tag, then any "." / ":" separator and the
            # surrounding blanks, so the stored question has no leading space.
            question_dict[question_type[q_code]] = line[4:].lstrip(".:").strip()

    logger.info(f"{doc_idx}, {question_dict}")
    return question_dict
    

Empty DataFrame
Columns: [EPI_NUM, EpiName, Podcast, Person, Abstract]
Index: []


In [3]:
import time
# from engine.constants import Podcast
from datetime import date
from math import ceil
# cleaning data for Philosophize This
# Directory holding the raw episode transcripts. Built with os.path.join rather
# than a hand-written backslash literal: sequences such as "\d" or "\p" in a
# normal string are invalid escapes, and the joined form yields the same value
# on Windows while also working with other path separators.
base_url = os.path.join(".", "..", "data", "philosophize_this", "episode_transcripts")
transcripts_path = base_url

# Raw (episode index, model reply) pairs collected by the question loop.
res = []
# Cleaned transcript text keyed by episode index.
cleaned_docs = {}

transcripts = os.listdir(transcripts_path)

# Every directory entry is counted as one episode.
pz_episode_count = len(transcripts)


# Per-hundred rate, scaled by the number of (started) hundreds of episodes, times two.
num_questions_to_make = num_question_to_make_per_hundred * ceil(pz_episode_count / 100) * 2
print(num_questions_to_make)



40


In [23]:

# Pinned, sorted sample of episode numbers to process. The list is hard-coded so
# that every run works on the same episodes (and can reuse their cached,
# already-cleaned transcripts) instead of drawing a new random sample each time.
#
# To draw a fresh sample instead, something along these lines can be used:
#   sorted(torch.randint(1, pz_episode_count + 1, (num_questions_to_make + 5,)).unique().tolist())
# Note that torch.randint's upper bound is exclusive, hence the "+ 1".
pz_episode_indices = [10, 12, 15, 16, 19, 27, 31, 32, 41, 47, 48, 58, 61, 63, 64, 73, 75, 76, 86, 88, 90, 92, 93, 100, 103, 104, 106, 108, 114, 115, 120, 121, 123, 127, 128, 129, 131, 132, 142, 145, 152, 160, 169, 170, 173, 176]
print(pz_episode_indices)
print(len(pz_episode_indices))

[10, 12, 15, 16, 19, 27, 31, 32, 41, 47, 48, 58, 61, 63, 64, 73, 75, 76, 86, 88, 90, 92, 93, 100, 103, 104, 106, 108, 114, 115, 120, 121, 123, 127, 128, 129, 131, 132, 142, 145, 152, 160, 169, 170, 173, 176]
46


In [25]:
import os

# Directory used as an on-disk cache of cleaned transcripts.
processed_url = os.path.join(".", "..", "data", "philosophize_this", "transcripts_processed")
# Make sure the cache directory exists before anything is written into it.
os.makedirs(processed_url, exist_ok=True)


for doc_idx in pz_episode_indices:
    # Raw transcripts are named with a zero-padded, three-digit episode number.
    doc_path = os.path.join(base_url, f"{str(doc_idx).zfill(3)}.txt")
    # Cached, cleaned transcripts use the unpadded episode number.
    cleaned_doc_path = os.path.join(processed_url, f"{doc_idx}.txt")
    if not os.path.isfile(cleaned_doc_path):
        print(f"Cleaning for {doc_idx}")
        # open the raw doc
        with open(doc_path, 'r', encoding="utf8") as raw_doc_file:
            doc = raw_doc_file.read()
        logger.info(f"loading {doc_idx} done, cleaning")
        # clean up the doc
        cleaned_doc = pipeline_cleanup(doc)
        cleaned_docs[doc_idx] = cleaned_doc
        # Brief pause between model calls (presumably to stay under API rate limits).
        time.sleep(2)
        # Save to the exact path the cache check above looks at, so the next run
        # finds it. 'w' rather than append: this branch only runs when the file
        # does not exist yet, and appending could only ever duplicate content.
        # NOTE(review): cache files keep the platform default encoding so that
        # previously cached files still decode; consider migrating to utf8.
        with open(cleaned_doc_path, 'w') as cleaned_doc_file:
            cleaned_doc_file.write(cleaned_doc)
    else: # open the doc
        print(f"Loading doc {doc_idx} from cache")
        with open(cleaned_doc_path, 'r') as cleaned_doc_file:
            cleaned_docs[doc_idx] = cleaned_doc_file.read()



# question creation pipeline



transcripts = os.listdir(transcripts_path)




# podcast_title = Podcast.PHILOSOPHIZE_THIS
podcast_title = "Philosophize This"


Loading doc 10 from cache
Loading doc 12 from cache
Loading doc 15 from cache
Loading doc 16 from cache
Loading doc 19 from cache
Loading doc 27 from cache
Loading doc 31 from cache
Loading doc 32 from cache
Loading doc 41 from cache
Loading doc 47 from cache
Loading doc 48 from cache
Loading doc 58 from cache
Loading doc 61 from cache
Loading doc 63 from cache
Loading doc 64 from cache
Loading doc 73 from cache
Loading doc 75 from cache
Loading doc 76 from cache
Loading doc 86 from cache
Loading doc 88 from cache
Loading doc 90 from cache
Loading doc 92 from cache
Loading doc 93 from cache
Loading doc 100 from cache
Loading doc 103 from cache
Loading doc 104 from cache
Loading doc 106 from cache
Loading doc 108 from cache
Loading doc 114 from cache
Loading doc 115 from cache
Loading doc 120 from cache
Loading doc 121 from cache
Loading doc 123 from cache
Loading doc 127 from cache
Loading doc 128 from cache
Loading doc 129 from cache
Loading doc 131 from cache
Loading doc 132 from cac

In [29]:
# question creation pipeline
#
# For each sampled episode: ask the model for questions, log and parse the reply,
# append the parsed row to `df`, and rewrite the CSV so progress is saved after
# every episode.

today = date.today()
# Neither podcast_title nor today changes inside the loop, so the target path is fixed.
questions_csv_path = f"questions-{podcast_title}-{today}.csv"

for doc_idx in pz_episode_indices:
    doc = cleaned_docs[doc_idx]
    ai_questions = pipeline_write_questions(doc)
    logger.info(ai_questions)
    res.append((doc_idx, ai_questions))

    # NOTE: the episode number stands in for the episode title here
    # (episode title is not passed correctly).
    questions = parse_ai_questions(
        ai_questions,
        doc_idx,
        podcast_title=podcast_title,
        episode_title=doc_idx,
    )
    for field, value in questions.items():
        logger.info(f"{field}: {value}")

    # Append as the next positional row, then checkpoint the whole frame.
    next_row = len(df)
    df.loc[next_row] = questions
    df.to_csv(questions_csv_path, mode='w', index=True)
    time.sleep(2)
    

[32m2023-12-28 21:37:42.073[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m8[0m - [1m[Q1]. What did Diogenes carve into a wall in the middle of town?
[Q2]. According to Epicurus, what is the ultimate form of pleasure and the goal of life?[0m
[32m2023-12-28 21:37:42.073[0m | [1mINFO    [0m | [36m__main__[0m:[36mparse_ai_questions[0m:[36m96[0m - [1m10, {'EPI_NUM': 10, 'EpiName': 10, 'Podcast': 'Philosophize This', 'Person': ' What did Diogenes carve into a wall in the middle of town?', 'Abstract': ' According to Epicurus, what is the ultimate form of pleasure and the goal of life?'}[0m
[32m2023-12-28 21:37:42.073[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m12[0m - [1mEPI_NUM: 10[0m
[32m2023-12-28 21:37:42.073[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m12[0m - [1mEpiName: 10[0m
[32m2023-12-28 21:37:42.073[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m12[0m - [1mPodcast: Philosoph