In [52]:
from langchain.document_loaders import YoutubeLoader

urls = [
    "https://www.youtube.com/watch?v=_8bMMqy37y8&ab_channel=ThisWeekinStartups",
    "https://www.youtube.com/watch?v=1SWEF-lyW28&ab_channel=ThisWeekinStartups",
    "https://www.youtube.com/watch?v=oc5tHbEK0IQ&ab_channel=ThisWeekinStartups",
    "https://www.youtube.com/watch?v=jrd4snFDSVA&ab_channel=ThisWeekinStartups",
]

data = []
for url in urls:
    loader = YoutubeLoader.from_youtube_url(url, add_video_info=True)
    data.append(loader.load())

In [53]:
for doc in data:
    print(doc[0].metadata)

{'source': '_8bMMqy37y8&ab_channel=ThisWeekinStartups', 'title': 'ChatGPT vs Hollywood writers and the WGA strike with Lon Harris | E1750', 'description': None, 'view_count': 95554, 'thumbnail_url': 'https://i.ytimg.com/vi/_8bMMqy37y8/hq720.jpg', 'publish_date': datetime.datetime(2023, 5, 25, 0, 0), 'length': 4611, 'author': 'This Week in Startups'}
{'source': '1SWEF-lyW28&ab_channel=ThisWeekinStartups', 'title': 'Demoing Google’s MusicLM, AssemblyAI, and other AI tools with Sunny Madra | E1747', 'description': None, 'view_count': 104069, 'thumbnail_url': 'https://i.ytimg.com/vi/1SWEF-lyW28/hq720.jpg', 'publish_date': datetime.datetime(2023, 5, 22, 0, 0), 'length': 3526, 'author': 'This Week in Startups'}
{'source': 'oc5tHbEK0IQ&ab_channel=ThisWeekinStartups', 'title': 'Fireside chat with Jason Calacanis & Brad Gerstner hosted by Mubadala’s Ibrahim Ajami | E1746', 'description': None, 'view_count': 96371, 'thumbnail_url': 'https://i.ytimg.com/vi/oc5tHbEK0IQ/hq720.jpg', 'publish_date': 

In [54]:
import tiktoken

def num_tokens_from_string(string):
    """Returns the number of tokens in a text string."""
    encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")
    num_tokens = len(encoding.encode(string))
    return num_tokens

In [55]:
for doc in data:
    print(
        f"Tokens: {num_tokens_from_string(doc[0].page_content)}, chars: {len(doc[0].page_content)}"
    )

Tokens: 16086, chars: 75385
Tokens: 11837, chars: 55701
Tokens: 19134, chars: 92022
Tokens: 11027, chars: 52236


In [66]:
from copy import deepcopy

high_engagement_data = deepcopy(data)
for doc in high_engagement_data:
    chunk = len(doc[0].page_content) // 6
    doc[0].page_content = doc[0].page_content[:chunk]

In [67]:
for doc in high_engagement_data:
    print(
        f"Tokens: {num_tokens_from_string(doc[0].page_content)}, chars: {len(doc[0].page_content)}"
    )

Tokens: 2624, chars: 12564
Tokens: 1968, chars: 9283
Tokens: 3224, chars: 15337
Tokens: 1813, chars: 8706


In [64]:
import json
import asyncio
import time
from langchain.chains import LLMChain
from langchain import PromptTemplate
from langchain.chat_models import ChatOpenAI


async def chain_run(chain, doc, response):
    result = await chain.arun({"text": doc, "response_json": response})
    return result


async def generate_concurrently(data):
    response_json = json.dumps(
        {"topic": "topic here", "theme": "theme here", "summary": "summary  here"}
    )
    prompt_template = """Write a concise summary of the following:
        {text}
        Make sure to return the summary is concise and is 150 words at max.
        Also return the topic of discussion  in the given text. The topic should not be longer than 5 words . You should figure out the theme of discussion of the text as well. 
        Make sure to return your response in json format given below:
        {response_json}
        """
    llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=0.5)
    PROMPT = PromptTemplate(
        template=prompt_template, input_variables=["text", "response_json"]
    )
    chain = LLMChain(llm=llm, prompt=PROMPT, output_key="good_vids")
    tasks = [chain_run(chain, doc, response_json) for doc in data]
    responses = await asyncio.gather(*tasks)
    return responses

In [68]:
s = time.perf_counter()
good_vids_summaries = await generate_concurrently(high_engagement_data)
elapsed = time.perf_counter() - s
print("\033[1m" + f"Concurrently executed in {elapsed:0.2f} seconds." + "\033[0m")

[1mConcurrently executed in 16.03 seconds.[0m


In [69]:
good_vids_summaries

['{"topic": "Writer\'s Strike", "theme": "Impact of streaming on writer\'s career", "summary": "The writer\'s strike is impacting the entertainment industry as writers are demanding better pay and working conditions. The rise of streaming services has changed the landscape of the industry, with shorter seasons and fewer full-time writing positions. While rates have not dipped dramatically, writers are working for less time and finding it harder to build a full career. The competition for jobs is increasing, and many writers are now freelancers who move from one gig to another. This has led to a shortage of showrunners and head writers, making it difficult to train the next generation of talent. The strike is highlighting the need for better compensation and working conditions for writers in the industry."}',
 '{"topic": "AI Tools for Music Generation", "theme": "Exploring the capabilities of AI in generating music", "summary": "In this episode of This Week in Startups, Sunny Madra demo

In [70]:
from langchain.document_loaders import YoutubeLoader

low_engagement_urls = [
    "https://www.youtube.com/watch?v=UeIV4KcSUlk",
    "https://www.youtube.com/watch?v=hNcLMN_bZCM",
    "https://www.youtube.com/watch?v=ANd4jPLnMAU",
    "https://www.youtube.com/watch?v=J8YnxrGEzT4",
]

vid_data = []
for url in low_engagement_urls:
    loader = YoutubeLoader.from_youtube_url(url, add_video_info=True)
    vid_data.append(loader.load())

In [71]:
for doc in vid_data:
    print(doc[0].metadata)

{'source': 'UeIV4KcSUlk', 'title': 'Banking crisis impact, more Meta cuts, and GPT-4 with Sunny Madra and Vinny Lingham | E1700', 'description': None, 'view_count': 20414, 'thumbnail_url': 'https://i.ytimg.com/vi/UeIV4KcSUlk/hq720.jpg', 'publish_date': datetime.datetime(2023, 3, 16, 0, 0), 'length': 4973, 'author': 'This Week in Startups'}
{'source': 'hNcLMN_bZCM', 'title': "Bing dodges $100B bullet & IVP's Tom Loverro on the looming startup collapse | E1678", 'description': None, 'view_count': 20514, 'thumbnail_url': 'https://i.ytimg.com/vi/hNcLMN_bZCM/hq720.jpg', 'publish_date': datetime.datetime(2023, 2, 14, 0, 0), 'length': 4441, 'author': 'This Week in Startups'}
{'source': 'ANd4jPLnMAU', 'title': 'Coinbase cuts 20%, Microsoft to invest $10B into OpenAI & Ascend Elements CEO Mike O’Kronley | E1654', 'description': None, 'view_count': 30715, 'thumbnail_url': 'https://i.ytimg.com/vi/ANd4jPLnMAU/hq720.jpg', 'publish_date': datetime.datetime(2023, 1, 10, 0, 0), 'length': 5704, 'author

In [72]:
from copy import deepcopy

low_engagement_data = []
low_engagement_data = deepcopy(vid_data)
for doc in low_engagement_data:
    chunk = len(doc[0].page_content) // 6
    doc[0].page_content = doc[0].page_content[:chunk]

In [73]:
low_engagement_vids = [doc[0] for doc in low_engagement_data]

s = time.perf_counter()
bad_vid_summaries = await generate_concurrently(low_engagement_vids)
elapsed = time.perf_counter() - s
print("\033[1m" + f"Concurrent executed in {elapsed:0.2f} seconds." + "\033[0m")

[1mConcurrent executed in 15.75 seconds.[0m


In [74]:
bad_vid_summaries

['{"topic": "Banking crisis", "theme": "Impact of Silicon Valley Bank shutdown on startups and banking industry", "summary": "Sunny Madra and Vinny Lingham discuss the impact of the Silicon Valley Bank shutdown on startups and the banking industry. They talk about the anxiety and uncertainty among entrepreneurs and investors, the lack of clarity from banks and regulators, and the $620 billion of unrealized losses due to the rise in interest rates. They also touch on the potential adoption of a CBDC by the US and the impact of social media on banking. The discussion includes tips on how to protect startups and treasuries in these changing times and how to play the startup game with changing rules."}',
 '{"topic": "Data sharing by apps", "theme": "Privacy concerns and need for federal legislation", "summary": "A recent report by The Washington Post reveals that 11 companies have been selling bundles of health data, including information on antidepressants, insomnia, attention issues, Alz

In [75]:
import json


def format_json_objects(data, meta_info):
    formatted_strings = []
    for i, (summary, metadata) in enumerate(zip(data, meta_info), start=1):
        obj = json.loads(summary)  # Convert JSON object to dictionary
        topic = obj["topic"]
        theme = obj["theme"]
        summary = obj["summary"]
        views = metadata["view_count"]
        title = metadata["title"].split("|")[0].strip()

        formatted_string = f"Video {i}\nTitle: {title}\nView Count: {views}\nTopic: {topic}\nTheme : {theme}\nSummary: {summary}\n====================="
        formatted_strings.append(formatted_string)

    result = "\n".join(formatted_strings)
    return result


good_vids_metadata = [doc[0].metadata for doc in high_engagement_data]
bad_vids_metadata = [doc[0].metadata for doc in low_engagement_data]
good_vids_prompt_str = format_json_objects(good_vids_summaries, good_vids_metadata)
bad_vids_prompt_str = format_json_objects(bad_vid_summaries, bad_vids_metadata)

In [76]:
good_vids_prompt_str



In [77]:
response_json = json.dumps(
    {
        "new_ideas": [
            {"topic": "topic 1 here", "theme": "theme 1 here"},
            {"topic": "topic 2 here", "theme": "theme 2 here"},
        ]
    }
)
prompt_template = """ You are helpful AI assistant that helps to increase the engagement of youtube videos by analyzing the scripts of old videos.\
Looking at the given videos below in High_engagement_Videos and Low_engagement_videos sections,\
come up with new ideas for next videos.\
    
High_Engagement_Videos:
{good_perf_vids}
    
Low_Engagement_Videos:
{bad_perf_vids}
    
Make sure to return at least 3 new ideas and return your response in the json format given below. Do not repeat the already existing topics.
{response_json}

Look at the titles of videos, extract the guest names from them, and understand the common patterns and attributes.\
Based on the commonalities and view count of videos, create new video ideas.\
The new videos should not have any content from Low_Engagement_Videos.\
Make sure to not include any speaker name in your suggested video topics or themes.
If you don't know the answer, just say "Hmm, I'm not sure."\
Don't try to make up an answer. 
"""
llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=0.5)
PROMPT = PromptTemplate(
    template=prompt_template,
    input_variables=["good_perf_vids", "bad_perf_vids", "response_json"],
)
final_chain = LLMChain(llm=llm, prompt=PROMPT, verbose=True)
res = final_chain.run(
    {
        "good_perf_vids": good_vids_prompt_str,
        "bad_perf_vids": bad_vids_prompt_str,
        "response_json": response_json,
    }
)



[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3m You are helpful AI assistant that helps to increase the engagement of youtube videos by analyzing the scripts of old videos.Looking at the given videos below in High_engagement_Videos and Low_engagement_videos sections,come up with new ideas for next videos.    
High_Engagement_Videos:
Video 1
Title: ChatGPT vs Hollywood writers and the WGA strike with Lon Harris
View Count: 95554
Topic: Writer's Strike
Theme : Impact of streaming on writer's career
Summary: The writer's strike is impacting the entertainment industry as writers are demanding better pay and working conditions. The rise of streaming services has changed the landscape of the industry, with shorter seasons and fewer full-time writing positions. While rates have not dipped dramatically, writers are working for less time and finding it harder to build a full career. The competition for jobs is increasing, and many writers are now freelancers wh

In [78]:
import pprint

pp = pprint.PrettyPrinter(indent=4)

pp.pprint(json.loads(res))

{   'new_ideas': [   {   'theme': 'Exploring the potential of VR technology in '
                                  'creating immersive experiences for viewers',
                         'topic': 'The Future of Virtual Reality in '
                                  'Entertainment Industry'},
                     {   'theme': 'Discussing the ethical implications of '
                                  'using AI in healthcare and its impact on '
                                  'patient care',
                         'topic': 'The Ethics of AI in Healthcare'},
                     {   'theme': 'Analyzing the growth of social audio '
                                  'platforms and its impact on traditional '
                                  'audio formats and user behavior',
                         'topic': 'The Rise of Social Audio: Opportunities and '
                                  'Challenges'}]}
