# Create Scripts from Wikipedia Articles

The code below creates the scripts for the YouTube videos from the Wikipedia articles about each figure.

TO DO

    * Add the date each script was generated. If that date is older than a certain threshold and the video has not been posted yet, we should pull the data again, or add a feature that checks when the Wikipedia page was last updated and compares against that date.
    * Instead of using OpenAI, use llama.cpp with an open-source model.

## Import Libraries

In [42]:
import docx2txt
import json
import openpyxl
import os
import re
import urllib.parse
import wikipedia

import pandas as pd

from docx import Document
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI

## Set OpenAI Key

Set the OpenAI key from the credentials file

In [43]:
# Read the OpenAI API key from the credentials file and expose it through the
# environment. The `with` block closes the file handle as soon as the JSON has
# been parsed.
with open("../Credentials.json", encoding="utf-8") as credentials_file:
    os.environ["OPENAI_API_KEY"] = json.load(credentials_file)["OPENAI_API_KEY"]

In [1]:
def need_script(df_loc):
    """Return the rows of the tracking workbook that still need a script.

    Parameters
    ----------
    df_loc : str
        Location of the Excel workbook, passed straight to ``pd.read_excel``.

    Returns
    -------
    pandas.DataFrame
        The rows whose ``Script_Created`` column equals ``"No"``.
    """
    figures = pd.read_excel(df_loc)
    is_pending = figures.Script_Created == "No"
    return figures[is_pending]

In [2]:
def parse_wikipedia_url(identifier):
    """Extract the article title from a Wikipedia URL.

    The URL's query string and fragment are ignored, percent-encoding is
    decoded, and a trailing slash is tolerated. When the path contains
    ``/wiki/`` everything after it is the title, so titles that themselves
    contain a slash (e.g. ``AC/DC``) are preserved; otherwise the last path
    segment is used. A bare title with no slash is returned as is.

    Parameters
    ----------
    identifier : str
        A Wikipedia article URL (or a bare article title).

    Returns
    -------
    str
        The decoded article title. Underscores are left untouched.

    Raises
    ------
    ValueError
        If no title can be extracted (for example the URL has an empty path).
    """
    cleaned = identifier.strip()

    if "://" in cleaned:
        # Full URL: let the URL parser drop the scheme, host, query and fragment.
        path = urllib.parse.urlsplit(cleaned).path
    else:
        # No scheme: strip a fragment / query string by hand.
        path = cleaned.partition("#")[0].partition("?")[0]

    # Decode only after splitting so an encoded "?" or "#" inside the title
    # (e.g. "%3F") is not mistaken for a query or fragment delimiter.
    path = urllib.parse.unquote(path).rstrip("/")

    _, marker, after_wiki = path.partition("/wiki/")
    title = after_wiki if marker else path.rpartition("/")[2]

    if not title:
        raise ValueError(
            f"Could not extract a Wikipedia title from {identifier!r}"
        )
    return title

In [3]:
def get_context(identifier):
    """Return the body text of the Wikipedia article behind *identifier*.

    Parameters
    ----------
    identifier : str
        Wikipedia URL of the article; its title is extracted with
        ``parse_wikipedia_url``.

    Returns
    -------
    str
        The ``content`` attribute of the fetched page.
    """
    # Look the page up by the title taken from the URL, with auto-suggest
    # switched off and redirects allowed.
    article = wikipedia.page(
        title=parse_wikipedia_url(identifier),
        auto_suggest=False,
        redirect=True,
    )
    return article.content

In [4]:
# Prompt template for the long-form script: asks for 10 long facts
# (approximately 300 tokens each) about `{figure}`, drawn only from `{context}`.
# This is the default `prompt` of `create_chain` and `create_script`; both
# placeholders are filled in when the chain is invoked.
yt_prompt = """
    Please provide me with 10 very long and interesting facts about {figure} using only the context given below. 
    I have a youtube channel based on influential figures so this information must be accurate as my channel depends on it. 
    Please make each fact approximately 300 tokens long. DO NOT add additional text before the list of facts.
    ----------
    Context: ```{context}```
    ----------
"""

In [5]:
# Prompt template for the short-form variant: asks for 10 short facts about
# `{figure}`, drawn only from `{context}`. It uses the same two placeholders as
# `yt_prompt`, so it can be passed wherever a `prompt` argument is accepted.
yt_short_prompt = """
    Please provide me with 10 short but interesting facts about {figure} using only the context given below. 
    I have a youtube channel based on influential figures so this information must be accurate as my channel depends on it. 
    DO NOT add additional text before the list of facts.
    ----------
    Context: ```{context}```
    ----------
"""

In [6]:
def create_chain(llm_model="gpt-4o", temperature=0.1, prompt=yt_prompt):
    """Assemble the prompt -> chat model -> string parser chain.

    Parameters
    ----------
    llm_model : str
        Model name handed to ``ChatOpenAI``.
    temperature : float
        Sampling temperature handed to ``ChatOpenAI``.
    prompt : str
        Template text turned into a ``ChatPromptTemplate``.

    Returns
    -------
    The composed chain ``template | llm | StrOutputParser()``.
    """
    llm = ChatOpenAI(model=llm_model, temperature=temperature)
    template = ChatPromptTemplate.from_template(prompt)
    to_text = StrOutputParser()
    return template | llm | to_text

In [7]:
def create_script(figure, script_loc, df_loc,
                  llm_model="gpt-4o", temperature=0.1, prompt=yt_prompt):
    """Generate the script for one figure, save it and update the workbook.

    The figure's Wikipedia article is fetched and passed as context to the
    LLM chain; the resulting text is written to ``<script_loc>/<figure>.docx``
    and the figure's row in the Excel workbook is marked as done.

    Parameters
    ----------
    figure : str
        Value of the ``Name`` column identifying the figure.
    script_loc : str
        Directory in which the ``.docx`` script is saved.
    df_loc : str
        Location of the Excel workbook; it is read and then overwritten.
    llm_model : str
        Model name forwarded to ``create_chain``.
    temperature : float
        Sampling temperature forwarded to ``create_chain``.
    prompt : str
        Prompt template forwarded to ``create_chain``.

    Raises
    ------
    ValueError
        If no row of the workbook has ``Name == figure``.
    """
    df = pd.read_excel(df_loc)

    # Compute the row selector once; it is reused for the lookup and for the
    # bookkeeping updates below.
    figure_rows = df.Name == figure
    if not figure_rows.any():
        raise ValueError(f"No row with Name == {figure!r} found in {df_loc!r}")

    # Get the context from Wikipedia (first matching row wins)
    wiki_url = df.loc[figure_rows, "Wikipedia"].values[0]
    print(f"Wikipedia URL: {wiki_url}")
    context = get_context(wiki_url)

    # Create the chain and invoke it
    chain = create_chain(llm_model, temperature, prompt)
    script = chain.invoke({"figure": figure, "context": context})

    # Count the whitespace-separated words in the generated script
    word_count = len(script.split())
    print(f"{figure}: {word_count} Words")

    # Save the script as "<figure>.docx" inside script_loc. os.path.join
    # inserts the separator only when script_loc does not already end in one.
    doc = Document()
    doc.add_paragraph(script)
    save_loc = os.path.join(script_loc, figure + ".docx")
    doc.save(save_loc)

    # Mark the script as created and record the word count and model used
    df.loc[figure_rows, "Script_Created"] = "Yes"
    df.loc[figure_rows, "Word_Count"] = word_count
    df.loc[figure_rows, "Model_Used"] = llm_model

    # Overwrite the Excel file with the updated rows
    df.to_excel(df_loc, index=False)