# Create Scripts from Wikipedia Articles

The code below generates the scripts for the YouTube videos from each figure's Wikipedia article.

TO DO

    * Record the date each script was generated. If a script is older than a set threshold and its video has not been posted yet, we should pull the article again, or add a check for when the Wikipedia page was last updated and compare against that.

## Import Libraries

In [1]:
import json
import os
import re
from urllib.parse import unquote, urlsplit

import docx2txt
import openpyxl
import pandas as pd
import wikipedia
from docx import Document
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI

## Set OpenAI Key

Load the OpenAI API key from the credentials file and store it in the `OPENAI_API_KEY` environment variable.

In [2]:
# Read the API key from the credentials JSON file and store it in the
# OPENAI_API_KEY environment variable. The context manager closes the file
# handle as soon as the JSON has been parsed instead of leaving it open.
with open("../Credentials.json") as credentials_file:
    os.environ["OPENAI_API_KEY"] = json.load(credentials_file)['OPENAI_API_KEY']

## Read the Background Documents

In [3]:
# Folder the generated scripts are saved into
script_loc = "../Scripts/"

# Folder holding the pre-written intro and outro documents
set_scripts_loc = "../Set Scripts/"

# Extract the plain text of the intro and outro Word documents
intro = docx2txt.process(f"{set_scripts_loc}Intro.docx")
outro = docx2txt.process(f"{set_scripts_loc}Outro.docx")

# Spreadsheet listing every figure together with its production status columns
historical_figures_list = pd.read_excel("../Historical Figures List.xlsx")

## Functions to get Context from Wikipedia

In [4]:
def parse_wikipedia_url(identifier):
    """Extract the article title from a Wikipedia URL.

    Everything after ``/wiki/`` in the URL path is treated as the title, so
    titles that themselves contain a slash (e.g. ``AC/DC``) are kept whole.
    URLs without a ``/wiki/`` segment fall back to the last path segment.
    Any ``?query`` or ``#fragment`` is ignored, trailing slashes are dropped,
    and percent-encoded characters are decoded (``Beyonc%C3%A9`` -> ``Beyoncé``).

    Args:
        identifier: The Wikipedia article URL as a string.

    Returns:
        The article title as it appears in the URL (underscores are kept).

    Raises:
        ValueError: If ``identifier`` is not a string or no title can be found.
    """
    # Empty spreadsheet cells arrive as NaN (a float), so fail with a clear message
    if not isinstance(identifier, str):
        raise ValueError(f"Expected a Wikipedia URL string, got {identifier!r}")

    # urlsplit isolates the path from any ?query or #fragment part
    path = urlsplit(identifier.strip()).path

    if '/wiki/' in path:
        # Standard article URL: the title is everything after /wiki/
        raw_title = path.split('/wiki/', 1)[1]
    elif '/' in path:
        # Any other URL shape: use the final path segment
        raw_title = path.rpartition('/')[2]
    else:
        raw_title = ''

    title = unquote(raw_title.strip('/'))
    if not title:
        raise ValueError(f"Could not find an article title in {identifier!r}")

    return title

def get_context(identifier):
    """Return the body text of the Wikipedia article behind a URL.

    Args:
        identifier: Wikipedia article URL; its title is extracted with
            ``parse_wikipedia_url``.

    Returns:
        The ``content`` attribute of the page object returned by
        ``wikipedia.page``.
    """
    # Look the page up by the title taken from the URL, with auto-suggest
    # switched off and redirects enabled
    article = wikipedia.page(
        title=parse_wikipedia_url(identifier),
        auto_suggest=False,
        redirect=True,
    )

    return article.content

## Create Chain

In [5]:
# Name of the OpenAI chat model; the same value is later recorded in the spreadsheet
llm_model = "gpt-4-1106-preview"

# Chat model client used to generate the scripts
model = ChatOpenAI(model=llm_model, temperature=0.1)

# Prompt text with two placeholders: {figure} and {context}, both filled in
# when the chain is invoked
prompt_template = """
    Please provide me with 10 very long and interesting facts about {figure} using only the context given below. 
    I have a youtube channel based on influential figures so this information must be accurate as my channel depends on it. 
    Please make each fact approximately 300 tokens long.
    ----------
    Context: ```{context}```
    ----------
"""

# Wrap the raw template text in a chat prompt object
prompt = ChatPromptTemplate.from_template(prompt_template)

# Parser placed at the end of the chain to handle the model output
output_parser = StrOutputParser()

# Pipeline: prompt -> model -> output parser
chain = prompt | model | output_parser

In [6]:
# Select every figure whose script has not been created yet so that they can
# all be processed in a single loop.
need_script = historical_figures_list.loc[historical_figures_list["Script_Created"] == "No"]

In [7]:
# Display the figures that are still waiting for a script
need_script

Unnamed: 0,Figure_ID,Name,Description,Script_Created,AI_Voice_Generated,Images_Obtained,Youtube_Video_Created,Youtube_Video_Posted,Youtube_URL,Word_Count,Model_Used,Wikipedia
233,234,Bobby Fischer,American chess player,No,No,No,No,No,,,,https://en.wikipedia.org/wiki/Bobby_Fischer
234,235,Garry Kasparov,Chess player,No,No,No,No,No,,,,https://en.wikipedia.org/wiki/Garry_Kasparov


In [8]:
# Only run the generation loop when at least one figure still needs a script
if not need_script.empty:

    # Names of the figures we need scripts for
    figure_scripts = need_script["Name"].tolist()

    for figure in figure_scripts:

        # Boolean mask selecting this figure's row(s) in the master list
        figure_rows = historical_figures_list["Name"] == figure

        # Get the context from Wikipedia and run the chain
        wiki_url = historical_figures_list.loc[figure_rows, "Wikipedia"].values[0]
        context = get_context(wiki_url)

        script = chain.invoke({"figure": figure, "context": context})

        # Count the number of whitespace-separated words in the script
        word_count = len(script.split())

        # Print the figure's name and the word count
        print(f"{figure}: {word_count} Words")

        # Create a new Document, add the generated script and save it under the figure's name
        doc = Document()
        doc.add_paragraph(script)
        save_loc = f"{script_loc}{figure}.docx"
        doc.save(save_loc)

        # Record the result in the master list. Columns are addressed by label
        # with .loc rather than by position with .iloc: a boolean Series is not
        # a supported mask for .iloc, and positional column numbers silently
        # point at the wrong column if the sheet layout ever changes.
        historical_figures_list.loc[figure_rows, "Script_Created"] = "Yes"
        historical_figures_list.loc[figure_rows, "Word_Count"] = word_count
        historical_figures_list.loc[figure_rows, "Model_Used"] = llm_model

        # Overwrite the Excel file after every figure so completed work is
        # already saved if a later figure fails
        historical_figures_list.to_excel(r'../Historical Figures List.xlsx', index=False)

    print("Complete, my guy!")

Bobby Fischer: 814 Words
Garry Kasparov: 842 Words
Complete, my guy!
