In [None]:
# PURPOSE: This script will be used to scrape the web for data and rewrite it with GPT-3

import requests
import json
import os
from datetime import datetime

import openai
import frontmatter
from bs4 import BeautifulSoup

In [None]:
def extract_text_from_class(url: str, class_name: str) -> str:
    """Download a page and return the text of the first element with a class.

    Args:
        url: Address of the page to fetch.
        class_name: Value handed to BeautifulSoup's ``class_`` filter.

    Returns:
        The whitespace-stripped text of the first matching element, or an
        empty string when the request fails or nothing matches. The result
        is therefore always a ``str``, as the annotation promises.
    """
    try:
        # A timeout keeps one stalled server from hanging the whole batch.
        response = requests.get(url, timeout=30)
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"Error fetching the URL: {e}")
        return ""

    soup = BeautifulSoup(response.content, 'html.parser')

    # find() gives the first match or None, so a page without the class no
    # longer ends in an IndexError.
    first_match = soup.find(class_=class_name)
    if first_match is None:
        print(f"No element with class {class_name!r} found at {url}")
        return ""

    return first_match.get_text(strip=True)

def split_text(text, max_words=2900):
    """Yield ``text`` in consecutive pieces of at most ``max_words`` words.

    Words are separated on single space characters only, so newlines stay
    inside the words they touch and repeated spaces produce empty words.
    """
    tokens = text.split(" ")
    yield from (
        ' '.join(tokens[start:start + max_words])
        for start in range(0, len(tokens), max_words)
    )

def rewrite_text(chunk, open_ai_url, headers, is_first_chunk):
    """Ask the chat-completions endpoint to rewrite one chunk of text.

    Args:
        chunk: The text to rewrite.
        open_ai_url: URL the request is POSTed to.
        headers: HTTP headers for the request (content type, authorization).
        is_first_chunk: When true, the prompt also asks the model to put a
            frontmatter block in front of the rewritten text.

    Returns:
        The content of the first choice in the response.

    Raises:
        RuntimeError: If the response is not JSON or contains no completion
            (for example an API error payload). Failing here gives a clear
            message instead of a ``None`` that breaks the later string join.
    """
    prompt = """REWRITE THE FOLLOWING TEXT IN A CONFIDENT TONE. THE TEXT YOU GENERATE SHOULD BE 
     COMPLETELY DIFFERENT FROM THE ORIGINAL TEXT. 
     
     If there is any 'proprietary language' such as 'Our Research' or 'Our Study', please remove it. 
    Make sure every sentence is rewritten.  Also, please remove any references to 
    'McKinsey' or 'Quantum Black'.\n"""

    if is_first_chunk:
        prompt += """
        Add frontmatter to the top of the document with the following information,
        - summary point is only one sentence
        - Provide a maximum of three points 
        - the title and subTitle have quotes (i.e. "<title>") 

        Return the text starting with the frontmater (i.e. "---") and nothing else.
        ---
        title: "<title in quotes>"
        subTitle: "<subTitle in quotes>"
        category: <category>
        date: <Month Year> 
        headers:
        -  "Cache-Control: max-age=86400"
        recommended: true

        points:
        - <point 1>
        - <point 2>
        - <point 3>
        ---
        """

    prompt = prompt + f"**text**\n{chunk}"
    data = {
        "model": "gpt-3.5-turbo",
        "messages": [{"role": "user", "content": prompt}]
    }

    # Generous timeout: long completions are slow, but a dead connection
    # should not block forever.
    response = requests.post(
        open_ai_url, headers=headers, data=json.dumps(data), timeout=300
    )

    try:
        payload = json.loads(response.text)
    except ValueError as e:
        raise RuntimeError(
            f"Non-JSON response (HTTP {response.status_code}): "
            f"{response.text[:500]}"
        ) from e

    try:
        return payload['choices'][0]['message']['content']
    except (KeyError, IndexError, TypeError) as e:
        # Error payloads carry no 'choices'; include them in the message.
        raise RuntimeError(
            f"Response contains no completion: {payload}"
        ) from e



def create_folder_and_file_from_md(text, source_url, base_path="../general"):
    """Save a rewritten article and its source URL in a folder named by title.

    The title is read from the frontmatter of ``text``. A folder with that
    name is created under ``base_path``; it receives ``<title>.md`` with the
    re-serialised document and ``source.txt`` with ``source_url``.

    Args:
        text: Markdown document, expected to start with frontmatter.
        source_url: URL the article was taken from.
        base_path: Directory under which the article folder is created. It is
            used exactly as given.
    """
    parsed = frontmatter.loads(text)

    # The metadata value is not guaranteed to be a non-empty string, so
    # normalise it before using it as a file name.
    title = str(parsed.metadata.get('title') or 'Untitled')

    # A path separator inside the title would point the .md file at a
    # directory that does not exist; colons are replaced as before.
    for forbidden in (':', '/', '\\'):
        title = title.replace(forbidden, '-')

    # Only the title is sanitised: rewriting colons in the joined path would
    # also alter the caller's base_path (e.g. a drive letter).
    folder_path = os.path.join(base_path, title)
    os.makedirs(folder_path, exist_ok=True)

    # Explicit UTF-8 so non-ASCII characters in the article do not depend on
    # the platform's default encoding.
    article_path = os.path.join(folder_path, title + '.md')
    with open(article_path, 'w', encoding='utf-8') as file:
        file.write(frontmatter.dumps(parsed))

    # save source
    source_path = os.path.join(folder_path, 'source.txt')
    with open(source_path, 'w', encoding='utf-8') as file:
        file.write(source_url)

def process_text(text, url, headers):
    """Rewrite ``text`` chunk by chunk and join the results with spaces.

    The text is cut up by ``split_text`` and every piece goes through
    ``rewrite_text``. Only the call for the first piece is flagged as the
    first chunk, so only that call includes the frontmatter prompt.
    """
    rewritten = [
        rewrite_text(piece, url, headers, position == 0)
        for position, piece in enumerate(split_text(text))
    ]
    return ' '.join(rewritten)

In [None]:
import os

# Report only whether the key is configured. Leaving the lookup itself as the
# cell's last expression would echo the secret into the notebook output.
bool(os.environ.get('GPT_ACCESS_KEY'))

In [None]:
# Scrape every article below, rewrite it through the OpenAI chat API and
# save the result in a folder named after the generated title.

# URL and class name
urls = [
    "https://www.mckinsey.com/industries/real-estate/our-insights/generative-ai-can-change-real-estate-but-the-industry-must-change-to-reap-the-benefits",
    "https://www.mckinsey.com/capabilities/quantumblack/our-insights/winning-with-ai-is-a-state-of-mind",
    "https://www.mckinsey.com/capabilities/mckinsey-digital/our-insights/the-data-dividend-fueling-generative-ai",
    "https://www.mckinsey.com/capabilities/mckinsey-digital/our-insights/data-ethics-what-it-means-and-what-it-takes",
    "https://www.mckinsey.com/capabilities/quantumblack/our-insights/demystifying-data-mesh",
    "https://www.mckinsey.com/capabilities/mckinsey-digital/our-insights/how-data-can-help-tech-companies-thrive-amid-economic-uncertainty",
    "https://www.mckinsey.com/capabilities/quantumblack/our-insights/the-data-driven-enterprise-of-2025",
    "https://www.mckinsey.com/capabilities/risk-and-resilience/our-insights/model-risk-management-2-point-0-evolves-to-address-continued-uncertainty-of-risk-related-events",
    "https://www.mckinsey.com/capabilities/mckinsey-digital/our-insights/rewired-and-running-ahead-digital-and-ai-leaders-are-leaving-the-rest-behind",
    "https://www.mckinsey.com/capabilities/growth-marketing-and-sales/our-insights/how-generative-ai-can-boost-consumer-marketing",
    "https://www.mckinsey.com/capabilities/strategy-and-corporate-finance/our-insights/gen-ai-a-guide-for-cfos",
    "https://www.mckinsey.com/industries/consumer-packaged-goods/our-insights/how-consumer-companies-outcompete-high-performing-operating-models",
    "https://www.mckinsey.com/capabilities/quantumblack/our-insights/how-to-unlock-the-full-value-of-data-manage-it-like-a-product",
    "https://www.mckinsey.com/capabilities/mckinsey-digital/our-insights/technologys-generational-moment-with-generative-ai-a-cio-and-cto-guide",
    "https://www.mckinsey.com/capabilities/quantumblack/our-insights/four-essential-questions-for-boards-to-ask-about-generative-ai",
    "https://www.mckinsey.com/capabilities/mckinsey-digital/our-insights/the-economic-potential-of-generative-ai-the-next-productivity-frontier",
    "https://www.mckinsey.com/capabilities/mckinsey-digital/our-insights/tech-forward/realizing-more-value-from-data-projects"
]
class_name = "mdc-o-content-body mck-u-dropcap"

# The request settings are identical for every article, so build them once.
open_ai_url = "https://api.openai.com/v1/chat/completions"

# The key comes from the environment so the secret never lives in the
# notebook. Any key that was ever committed in this file should be revoked.
openai_api_key = os.environ.get('GPT_ACCESS_KEY')
if not openai_api_key:
    raise RuntimeError(
        "Set the GPT_ACCESS_KEY environment variable before running this cell."
    )
headers = {
    "Content-Type": "application/json",
    "Authorization": f"Bearer {openai_api_key}"
}

for url in urls:
    extracted_text = extract_text_from_class(url, class_name)

    # A failed fetch yields an empty result; skip it instead of sending
    # nothing to the API or crashing the rest of the batch.
    if not extracted_text:
        print(f'-------------------- Skipped {url}: no text extracted \n\n')
        continue

    final_text = process_text(extracted_text, open_ai_url, headers)
    create_folder_and_file_from_md(final_text, url)
    print(f'-------------------- Completed {url} @ {datetime.now()} \n\n')

In [None]:
# -------------------- OLD CODE -------------------- 
def split_text(text, max_words=3000):
    """Yield ``text`` in consecutive pieces of at most ``max_words`` words.

    Words are separated on any run of whitespace and each piece is
    re-joined with single spaces.
    """
    tokens = text.split()
    chunk_starts = range(0, len(tokens), max_words)
    for begin in chunk_starts:
        piece = tokens[begin:begin + max_words]
        yield ' '.join(piece)

def rewrite_text_with_openai(chunk):
    """Rewrite ``chunk`` through ``openai.Completion.create``.

    The instruction text is prepended to the chunk and the stripped text of
    the first returned choice is handed back.

    NOTE(review): the prompt plus ``max_tokens=4096`` may exceed what the
    selected engine accepts -- confirm against the model's context limit.
    """
    instruction = "Rewrite the following text in a confident tone. Remove :\n"
    completion = openai.Completion.create(
        engine="davinci",
        prompt=instruction + chunk,
        max_tokens=4096,  # Adjust as needed
    )
    first_choice = completion.choices[0]
    return first_choice.text.strip()

def process_text(text):
    """Rewrite every chunk of ``text`` and return the results joined by spaces.

    Chunks come from ``split_text``; each one is rewritten with
    ``rewrite_text_with_openai`` in order.
    """
    return ' '.join(
        rewrite_text_with_openai(piece) for piece in split_text(text)
    )

# Your API key for OpenAI, read from the environment so the secret never
# lives in the notebook.
openai_api_key = os.environ.get('GPT_ACCESS_KEY')
if not openai_api_key:
    raise RuntimeError(
        "Set the GPT_ACCESS_KEY environment variable before running this cell."
    )
openai.api_key = openai_api_key

# Your long text
long_text = extracted_text

# Process the text. `extracted_text` is already a single string, so it is
# passed whole; indexing it with [0] would rewrite only its first character.
final_text = process_text(long_text)
print(final_text)


In [None]:

def parse_markdown_with_frontmatter(text):
    """Split a markdown document into its frontmatter block and body sections.

    Args:
        text: Markdown that may begin with a frontmatter block delimited by
            ``---`` lines (leading whitespace before the block is allowed).

    Returns:
        A ``(frontmatter, sections)`` tuple. ``frontmatter`` is everything up
        to and including the closing ``---``; ``sections`` is the remaining
        text split on blank lines. When the text has no complete frontmatter
        block, ``frontmatter`` is ``''`` and the whole text is split into
        sections.
    """
    delimiter = '---'

    stripped = text.lstrip()
    if not stripped.startswith(delimiter):
        return '', text.split('\n\n')

    # Look for the closing delimiter only after the opening one.
    opening_index = len(text) - len(stripped)
    closing_index = text.find(delimiter, opening_index + len(delimiter))
    if closing_index == -1:
        # find() returned -1: without this guard the text would be cut at
        # an arbitrary position near its start.
        return '', text.split('\n\n')

    frontmatter_end_index = closing_index + len(delimiter)
    # Named *_block so the imported `frontmatter` module is not shadowed.
    frontmatter_block = text[:frontmatter_end_index]
    content = text[frontmatter_end_index:]

    # Split the content into sections based on '\n\n'
    sections = content.split('\n\n')

    return frontmatter_block, sections

# Split the rewritten article into its frontmatter block and body sections.
# NOTE(review): `rewritten_text` is assigned in a later cell of this notebook,
# so that cell has to be run before this one.
frontmatter_, sections = parse_markdown_with_frontmatter(rewritten_text)
# Optional debug output:
#print("Frontmatter:\n\n", frontmatter_)
#for section in sections:
#    print("Section:\n\n", section)

In [None]:

# rewrite the text via OpenAI
# The key is read from the environment so the secret never lives in the
# notebook.
openai_api_key = os.environ.get('GPT_ACCESS_KEY')
if not openai_api_key:
    raise RuntimeError(
        "Set the GPT_ACCESS_KEY environment variable before running this cell."
    )
url = "https://api.openai.com/v1/chat/completions"
headers = {
    "Content-Type": "application/json",
    "Authorization": f"Bearer {openai_api_key}"
}

# `extracted_text` is already a single string, so it is interpolated whole;
# indexing it with [0] would send only its first character.
data = {
    "model": "gpt-3.5-turbo",
    "messages": [{"role": "user", "content": f"""
    Rewrite the following text in a confident tone. If there is any "proprietary 
    language such as "Our Research" or "Our Study" please remove it. Also, please
    remove any references to "McKinsey" or "Quantum Black".

Add frontmatter to the top of the document with the following information,
where summary point is only one sentence and there is a maximum of three points.
---
title: <title>
subTitle: <subTitle>
category: <category>
date: <Month Year> 
headers:
  Cache-Control: max-age=86400
recommended: true

points:
  - <point 1>
  - <point 2>
  - <point 3>
---

    **text**
    {extracted_text}
    """}]
}
response = requests.post(url, headers=headers, data=json.dumps(data), timeout=300)
# Surface HTTP errors (bad key, rate limit) directly instead of a KeyError
# on the missing 'choices' field below.
response.raise_for_status()
rewritten_text = json.loads(response.text)['choices'][0]['message']['content']

In [None]:

## OLD ##
def split_text(text, max_words=2000):
    """Yield the text in consecutive pieces of at most ``max_words`` words.

    Args:
        text: Either the text itself or a sequence whose first item is the
            text (the list-wrapped form this function originally expected).
        max_words: Maximum number of whitespace-separated words per piece.

    Yields:
        Each piece, with its words re-joined by single spaces.
    """
    # Indexing a plain string with [0] would keep only its first character,
    # so only unwrap when the argument is not already a string.
    source = text if isinstance(text, str) else text[0]
    words = source.split()
    for i in range(0, len(words), max_words):
        yield ' '.join(words[i:i + max_words])

def rewrite_text(chunk, open_ai_url, headers):
    """POST one chunk to the chat endpoint and return the rewritten text.

    The prompt asks for a confident rewrite with a frontmatter block on top.
    The prompt length, the request payload, the target URL and the decoded
    response are printed for debugging. The content of the first choice in
    the response is returned.
    """
    prompt = f"""
        Rewrite the following text in a confident tone. If there is any "proprietary 
        language such as "Our Research" or "Our Study" please remove it. Also, please
        remove any references to "McKinsey" or "Quantum Black".

        Add frontmatter to the top of the document with the following information,
        summary point is only one sentence and there is a maximum of three points. And, the 
        "title" and "subTitle" should be wrapped in quotes.
        ---
        title: <title>
        subTitle: <subTitle>
        category: <category>
        date: <Month Year> 
        headers:
        Cache-Control: max-age=86400
        recommended: true

        points:
        - <point 1>
        - <point 2>
        - <point 3>
        ---

            **text**
            {chunk}
        """
    payload = {
        "model": "gpt-3.5-turbo",
        "messages": [{"role": "user", "content": prompt}],
    }

    print(len(prompt), '<<<< len(data) \n\n\n')
    print(payload, '<<<< data \n\n\n')
    print(open_ai_url, '<<<< open_ai_url \n')

    response = requests.post(
        open_ai_url, headers=headers, data=json.dumps(payload)
    )

    # Decode the body once and use it for both the debug print and the result.
    decoded = json.loads(response.text)
    print(decoded, '<<< response \n\n\n')
    return decoded['choices'][0]['message']['content']

def process_text(text, url, headers):
    """Rewrite ``text`` chunk by chunk and join the results with spaces.

    A timestamped progress line is printed after each chunk comes back.
    """
    def _rewrite_and_report(piece):
        # Rewrite one chunk, then log when it finished.
        result = rewrite_text(piece, url, headers)
        print(f'Completed chunk @ {datetime.now()}')
        return result

    return ' '.join(map(_rewrite_and_report, split_text(text)))

# usage
open_ai_url = "https://api.openai.com/v1/chat/completions"
# The key is read from the environment so the secret never lives in the
# notebook.
openai_api_key = os.environ.get('GPT_ACCESS_KEY')
if not openai_api_key:
    raise RuntimeError(
        "Set the GPT_ACCESS_KEY environment variable before running this cell."
    )
headers = {
    "Content-Type": "application/json",
    "Authorization": f"Bearer {openai_api_key}"
}

final_text = process_text(extracted_text, open_ai_url, headers)
print(final_text)