In [15]:
import nbformat
from nbformat.v4 import new_notebook, new_markdown_cell, new_code_cell

def markdown_to_jupyter(markdown_text):
    """Convert a Markdown document into an nbformat v4 notebook.

    The text is scanned line by line. A line that starts with three backticks
    is a fence: it toggles between prose and code and is itself discarded
    (any language tag after the backticks is ignored). Lines between an
    opening and a closing fence become one code cell; all other lines are
    grouped into Markdown cells.

    Args:
        markdown_text: The Markdown source as a single string.

    Returns:
        The notebook created by ``new_notebook()`` with the generated cells
        appended in document order.
    """
    nb = new_notebook()
    buffered_lines = []
    in_code_block = False

    def flush(as_code):
        """Turn the buffered lines into one cell and reset the buffer."""
        # Blank lines directly around a fence are layout noise, not content.
        source = '\n'.join(buffered_lines).strip('\n')
        buffered_lines.clear()
        if as_code:
            # A closed code fence always yields a cell, even when it is empty.
            nb.cells.append(new_code_cell(source))
        elif source.strip():
            # Whitespace-only prose (e.g. the gap between two adjacent code
            # blocks) would otherwise become an empty Markdown cell.
            nb.cells.append(new_markdown_cell(source))

    for line in markdown_text.split('\n'):
        if line.startswith('```'):
            flush(as_code=in_code_block)
            in_code_block = not in_code_block
        else:
            buffered_lines.append(line)

    # Whatever is left belongs to the state we ended in: a fence that was
    # never closed still holds code, so it must not be emitted as Markdown.
    if buffered_lines:
        flush(as_code=in_code_block)

    return nb

# Example usage
markdown_text = r'''
# Climate Tech Fundraisers Data Scraping and Processing

This notebook demonstrates the process of scraping, cleaning, processing, and performing feature engineering on a climate tech fundraiser dataset. The data is collected from various newsletters and blogs, then cleaned and enriched with additional information for further analysis.

## Table of Contents

1. [Introduction](#introduction)
2. [Data Collection and Cleaning](#data-collection-and-cleaning)
   - [Data Crawling and Scraping](#data-crawling-and-scraping)
   - [Data Extraction with GPT-4](#data-extraction-with-gpt-4)
3. [Deduplication and Location Standardization](#deduplication-and-location-standardization)
   - [Deduplication of Fundraiser Entities](#deduplication-of-fundraiser-entities)
   - [Fetching Addresses Using Selenium](#fetching-addresses-using-selenium)
   - [Location Standardization and Geolocation](#location-standardization-and-geolocation)
   - [Extracting Telephone Numbers](#extracting-telephone-numbers)
   - [Country and Continent Information](#country-and-continent-information)
4. [Outlier Mitigation and Final Steps](#outlier-mitigation-and-final-steps)
5. [Data Display Preparation](#data-display-preparation)

---

<a id='introduction'></a>
## 1. Introduction

In this notebook, we will create a dataset of climate technology fundraisers from March 2020 to January 2024 by scraping newsletters and blogs published by various sources. The dataset captures key investments, innovative startups, and pivotal fundraising events driving solutions to environmental challenges.

---

<a id='data-collection-and-cleaning'></a>
## 2. Data Collection and Cleaning

<a id='data-crawling-and-scraping'></a>
### 2.1 Data Crawling and Scraping

We will use Selenium to crawl and scrape newsletters and blogs from climate websites. This generic code snippet can be adapted for any newsletter website.

```python
# Import necessary libraries
from bs4 import BeautifulSoup
from currency_converter import CurrencyConverter
from io import StringIO
import openai
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
import dateparser
import numpy as np
import os
import pandas as pd
import re
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager

# Set up OpenAI API key
os.environ["OPENAI_API_KEY"] = "<Your-OpenAI-API-Key>"
openai.api_key = os.environ.get("OPENAI_API_KEY")

# Initialize data structures
website_page_html = dict()

# URL of the webpage to scrape
url = 'https://www.keepcool.co/'

# Set up the Selenium WebDriver
service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=service)

# try/finally closes the browser even if the page never loads or the wait
# below times out.
try:
    # Go to the webpage
    driver.get(url)

    # Wait for the page to load and extract links
    WebDriverWait(driver, 10).until(
        EC.presence_of_all_elements_located((By.CSS_SELECTOR, 'a[href]'))
    )
    link_elements = driver.find_elements(By.CSS_SELECTOR, 'a[href]')
    links = [element.get_attribute('href') for element in link_elements]
finally:
    # Close the browser
    driver.quit()

# Filter links to match certain criteria.
# `url` already ends with '/', so building the prefix as f'{url}/p' produced
# 'https://www.keepcool.co//p', which no link contains. Strip the trailing
# slash first. NOTE(review): '/p' also matches any other path starting with
# "p" -- tighten to '/p/' if post URLs always have that form.
post_prefix = url.rstrip('/') + '/p'
# get_attribute can return None; skip those. dict.fromkeys drops repeated
# links while keeping their first-seen order so each post is visited once.
post_links = list(dict.fromkeys(
    href for href in links if href and post_prefix in href
))

# Function to process HTML tags
def process_tag(tag):
    """Flatten a parsed HTML node to text while keeping ``<a>`` elements as HTML.

    Args:
        tag: A BeautifulSoup node -- either an element or a text node.

    Returns:
        A string in which every ``<a>`` element appears as its original markup
        and everything else is reduced to plain text. Returns ``''`` when the
        node cannot be processed.
    """
    try:
        # Text nodes are str subclasses. They must be handled first: on a
        # string, `.find('a')` is str.find (a character search), which used
        # to send text nodes down the element path and lose their content.
        # NOTE(review): this also passes through other string-like nodes such
        # as HTML comments -- filter them here if they show up in the content.
        if isinstance(tag, str):
            return str(tag)
        # If the tag is a link, return it as is
        if tag.name == "a":
            return str(tag)
        # If the tag contains a link somewhere below it, process each child
        if tag.find('a') or tag.find('href'):
            return ''.join(process_tag(child) if child else '' for child in tag.children)
        # Link-free element: take all of its text. `.string` is None as soon
        # as the element has more than one child, which dropped paragraphs
        # containing inline formatting such as <strong>.
        return tag.get_text()
    except Exception:
        # Best effort: an unprocessable node contributes nothing.
        return ''

# Scrape individual posts
driver = webdriver.Chrome(service=service)
posts = dict()

# The element lookups below raise when a page lacks the expected structure.
# try/finally makes sure the browser is closed in that case too.
try:
    for link in post_links:
        print('Processing:', link)

        # Go to the post page
        driver.get(link)

        content = driver.page_source
        soup = BeautifulSoup(content, 'html.parser')

        # Extract information
        organization_name = soup.find('meta', property='og:site_name')['content']
        title = soup.find('title').text
        # Byline text split into lines; the last line is dropped and the rest joined.
        authors = ''.join(soup.select_one('.bh__byline_wrapper').get_text('\n').split('\n')[:-1])
        date_posted = soup.select_one('.bh__byline_wrapper .text-wt-text-on-background').text
        content_blocks = soup.select('#content-blocks p')

        # Process content blocks: each paragraph is flattened by process_tag
        # and re-wrapped in <p> so later steps can split on paragraphs.
        content_with_html = []
        for block in content_blocks:
            processed_block = process_tag(block)
            content_with_html.append(f'<p>{processed_block}</p>')

        content = '\n'.join(content_with_html)
        posts[link] = {
            'title': title,
            'authors': authors,
            'date_posted': date_posted,
            'content': content
        }
finally:
    # Close the browser
    driver.quit()
website_page_html['Keep Cool'] = {'url': url, 'posts': posts}
```

---

<a id='data-extraction-with-gpt-4'></a>
### 2.2 Data Extraction with GPT-4

We use GPT-4 to extract fundraiser information from the scraped content and transform it into a tabular format.

```python
# Helper functions and regex patterns
def contains_fundraising_keywords(text):
    """Tell whether ``text`` mentions money.

    The check is a case-insensitive substring search for a currency symbol
    ("$", "€") or one of the words "million", "billion", "dollar".

    Args:
        text: The string to inspect.

    Returns:
        True if at least one marker occurs in ``text``, otherwise False.
    """
    haystack = text.lower()
    for marker in ("$", "€", "million", "billion", "dollar"):
        if marker in haystack:
            return True
    return False

emoji_pattern = re.compile("["
    u"\U0001F600-\U0001F64F"
    u"\U0001F300-\U0001F5FF"
    u"\U0001F680-\U0001F6FF"
    u"\U0001F1E0-\U0001F1FF"
    u"\U00002500-\U00002BEF"
    u"\U00002702-\U000027B0"
    u"\U000024C2-\U0001F251"
    u"\U0001f926-\U0001f937"
    u"\U00010000-\U0010ffff"
    u"\u2640-\u2642"
    u"\u2600-\u2B55"
    u"\u200d"
    u"\u23cf"
    u"\u23e9"
    u"\u231a"
    u"\ufe0f"
    u"\u3030"
    "]+", flags=re.UNICODE)

# Define the prompt for GPT-4
fundraising_analysis_prompt = """I want you to act as a data analyst. Analyze the newsletter below and consolidate data in a tabular format. The columns in the table should be the company or agency raising the funds or making the donation, the amount raised or donated, the date the fundraiser was published or the funding was made available as outlined in the newsletter, a brief description of the fundraiser or the work done by the company, and a list of links to the fundraiser company's website or the article talking about the funding. Output only the table and no other text. Finally, a column to indicate the sector each fundraising company falls under. Choose from one of the following climate (clean) technology sectors: 

Built Environment: Making our buildings more energy efficient and reliable
Carbon Technology: How we can lower the concentration of CO2 in Earth's atmosphere
Energy and Grid: Electricity powers everything — It's time to electrify
Food and Agriculture: Regenerative agriculture and ways to sustainably produce food
Industry and Manufacturing: Decarbonize the entire lifecycle of the production of goods
Intelligence and Adaptation: Climate risk and data that help us adapt to the effects of climate change
Supporting Catalysts: Businesses that indirectly support decarbonization efforts
Transportation and Mobility: alternative mobility and electrifying how we move people and things
Other: not fitting in any category above"""

newsletter_prompt_template = """Title of newsletter: <TITLE>.
Newsletter date: <NEWSLETTER_DATE>.
Newsletter content: <NEWSLETTER_CONTENT>."""

# Process each newsletter
fundraisers = dict()

for publication_firm in website_page_html:
    fundraisers[publication_firm] = dict()

    for newsletter_link in website_page_html[publication_firm]['posts']:
        print('Processing:', publication_firm, newsletter_link)

        newsletter_post = website_page_html[publication_firm]['posts'][newsletter_link]
        newsletter_content = newsletter_post['content']
        content_paragraphs = newsletter_content.replace('</p><p>', '</p>\n<p>').split('\n')
        fundraising_paragraphs = [p for p in content_paragraphs if contains_fundraising_keywords(p)]
        newsletter_fundraising_content = '\n'.join(fundraising_paragraphs)

        # Clean and prepare the content
        soup = BeautifulSoup(newsletter_fundraising_content, 'html.parser')
        for img_tag in soup.find_all('img'):
            img_tag.decompose()

        newsletter_prompt = fundraising_analysis_prompt + emoji_pattern.sub(
            r'', newsletter_prompt_template.replace('<TITLE>', newsletter_post['title'])
            .replace('<NEWSLETTER_DATE>', newsletter_post['date_posted'])
            .replace('<NEWSLETTER_CONTENT>', str(soup))
        )
        newsletter_prompt = newsletter_prompt.replace(' class="link"', '').replace(' target="_blank"', '').replace(
            ' rel="noopener noreferrer nofollow"', '').replace(
            ' style="-webkit-text-decoration:underline #5c14d9;color:#5c14d9;font-style:italic;text-decoration:underline #5c14d9;"', ''
        ).replace('<strong>', '').replace('</strong>', '').replace('<blockquote>', '').replace('</blockquote>', '')

        # Call GPT-4 API
        response = openai.ChatCompletion.create(
            model="gpt-4",
            messages=[{"role": "user", "content": newsletter_prompt}]
        )
        fundraisers[publication_firm][newsletter_link] = response.choices[0].message.content

# Consolidate the extracted data into a DataFrame
extended_fr_list = []
for publication_firm in fundraisers:
    firm_url = website_page_html[publication_firm]['url']
    for newsletter_link in fundraisers[publication_firm].keys():
        newsletter_post = website_page_html[publication_firm]['posts'][newsletter_link]
        fr_df = pd.read_csv(StringIO(fundraisers[publication_firm][newsletter_link]), sep='|').dropna(axis=1, how='all').iloc[1:]
        fr_df = fr_df[[col for col in fr_df.columns if not col.startswith('Unnamed:')]]
        if len(fr_df.columns) == 7:
            fr_df.drop(list(fr_df.columns)[1], axis=1, inplace=True)
        extended_fr_list.extend([
            [
                publication_firm, firm_url, newsletter_post['title'], newsletter_link, newsletter_post['authors'],
                newsletter_post['date_posted'], *list(row)[1:]
            ] for row in fr_df.itertuples()
        ])

fundraisers_df = pd.DataFrame(
    data=extended_fr_list,
    columns=[
        'name_of_newsletter_firm', 'url_link_to_firms_website', 'title_of_newsletter', 'link_to_newsletter',
        'names_of_newsletter_authors', 'newsletter_date', 'fundraising_entity', 'amount_raised',
        'date_of_funding_reported', 'fundraiser_description', 'link_to_fundraising_announcement', 'clean_technology_sector'
    ]
)
```

---

<a id='deduplication-and-location-standardization'></a>
## 3. Deduplication and Location Standardization

<a id='deduplication-of-fundraiser-entities'></a>
### 3.1 Deduplication of Fundraiser Entities

We perform deduplication using fuzzy string matching.

```python
from rapidfuzz import process, fuzz

def deduplicate(records, threshold=93):
    """Map each record to a canonical spelling using fuzzy string matching.

    Records are processed in order. Each one is compared (``fuzz.WRatio``)
    against every record seen so far; if the best score reaches ``threshold``
    the record is mapped to the canonical value of that best match, otherwise
    it becomes a new canonical value itself.

    Args:
        records: Iterable of strings; ``None`` entries are skipped.
        threshold: Minimum score (0-100) at which two records are treated as
            the same entity. Defaults to 93.

    Returns:
        dict mapping every processed record to its canonical record.
    """
    seen = dict()
    for record in records:
        if record is None:
            continue
        if not seen:
            # Nothing to compare against yet; the first record seeds the map.
            seen[record] = record
            continue
        # Check if similar record is already seen
        match, score, _ = process.extractOne(record, seen.keys(), scorer=fuzz.WRatio)

        if score < threshold:
            seen[record] = record
            continue

        # Point at the match's canonical value, not at the match itself.
        # Mapping to `match` created chains (C -> B while B -> A) that a
        # single replace() pass does not collapse.
        canonical = seen[match]
        if record != match:
            print(f"'{record}' matched with '{match}' at score {score}")
        seen[record] = canonical

    return seen

# Apply deduplication
unique_companies_dict = deduplicate(sorted(fundraisers_df['fundraising_entity'].unique()))
fundraisers_df['fundraising_entity'] = fundraisers_df['fundraising_entity'].replace(unique_companies_dict)
```

---

<a id='fetching-addresses-using-selenium'></a>
### 3.2 Fetching Addresses Using Selenium

We scrape addresses from online databases like CB Insights.

```python
from selenium.webdriver.common.by import By

# Setup WebDriver
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))

def fetch_headquarters_location_with_selenium(company_name):
    """Look up a company's headquarters address on its CB Insights page.

    Uses the module-level ``driver``. The lookup is best effort: any failure
    (page missing, element not found, driver error) yields the sentinel
    string instead of raising.

    Args:
        company_name: URL slug of the company, appended to
            ``https://www.cbinsights.com/company/``.

    Returns:
        The address text with line breaks turned into ``', '``, or
        ``'Address not found'`` if it could not be retrieved.
    """
    from urllib.parse import quote

    try:
        # Navigate to the company's CB Insights page. The slug is
        # percent-encoded so characters such as '&', '#', '?' or '/' in a
        # company name cannot change the meaning of the URL.
        url = f"https://www.cbinsights.com/company/{quote(company_name, safe='')}"
        driver.get(url)

        # Let the element lookups below retry for up to 2 seconds.
        driver.implicitly_wait(2)

        # Find the 'Headquarters Location' element
        header = driver.find_element(By.XPATH, "//h2[contains(text(), 'Headquarters Location')]")
        address = header.find_element(By.XPATH, "./following-sibling::address")

        location = address.text
        return location.replace('\n', ', ').replace(',,', ',')
    except Exception:
        # `except Exception` rather than a bare `except` so that Ctrl-C
        # (KeyboardInterrupt) can still stop a long scraping loop.
        return 'Address not found'

fundraiser_loc_dict = dict()

# try/finally so the browser is closed even if the loop is interrupted.
try:
    # dropna(): a missing entity is a float NaN and has no .lower().
    for entity in fundraisers_df['fundraising_entity'].dropna().unique():
        # Trim padding and stray degree signs BEFORE turning spaces into
        # hyphens. Stripping afterwards (as before) could no longer remove
        # the spaces, so padded names produced slugs like '-acme-'.
        slug = entity.strip(' °').lower().replace(' ', '-').replace('.', '-')
        loc = fetch_headquarters_location_with_selenium(slug)
        if loc not in ("Failed to retrieve search results.", "Address not found", ""):
            fundraiser_loc_dict[entity] = loc
finally:
    driver.quit()

# Entities without a retrieved address are absent from the dict and map to NaN.
fundraisers_df['Location'] = fundraisers_df['fundraising_entity'].map(fundraiser_loc_dict)
```

---

<a id='location-standardization-and-geolocation'></a>
### 3.3 Location Standardization and Geolocation

We standardize company locations and convert them into GPS coordinates using GeoPy.

```python
from geopy.exc import GeocoderTimedOut
from geopy.geocoders import Nominatim

geolocator = Nominatim(user_agent="geoapiExercises")

def standardize_location(row):
    """Return the geocoder's canonical address for ``row['Location']``.

    Uses the module-level ``geolocator``.

    Args:
        row: Mapping (e.g. a DataFrame row) with a ``'Location'`` entry.

    Returns:
        The geocoder's ``address`` string when the location resolves.
        Otherwise the original ``row['Location']`` value unchanged: when it
        is missing, blank or ``'N/A'``, when nothing is found, or when the
        request times out.
    """
    raw_location = row['Location']
    # A missing location may be NaN/None rather than the string 'N/A';
    # anything that is not a non-blank string must not be sent to the geocoder.
    if not isinstance(raw_location, str) or not raw_location.strip() or raw_location == 'N/A':
        return raw_location
    try:
        location = geolocator.geocode(raw_location, exactly_one=True, timeout=10)
    except GeocoderTimedOut:
        return raw_location
    return location.address if location else raw_location

def get_geolocation(row):
    """Geocode ``row['Standardized location']`` into coordinates.

    Uses the module-level ``geolocator``.

    Args:
        row: Mapping (e.g. a DataFrame row) with a
            ``'Standardized location'`` entry.

    Returns:
        A ``(latitude, longitude)`` tuple, or ``None`` when the location is
        missing, blank or ``'N/A'``, cannot be resolved, or the request
        times out.
    """
    address = row['Standardized location']
    # A missing location may be NaN/None rather than the string 'N/A';
    # only non-blank strings are worth a geocoder request.
    if not isinstance(address, str) or not address.strip() or address == 'N/A':
        return None
    try:
        location = geolocator.geocode(address, exactly_one=True, timeout=10)
    except GeocoderTimedOut:
        return None
    if location:
        return (location.latitude, location.longitude)
    return None

# Apply standardization
fundraisers_df['Standardized location'] = fundraisers_df.apply(standardize_location, axis=1)
fundraisers_df['GeoLocation'] = fundraisers_df.apply(get_geolocation, axis=1)
```

---

<a id='extracting-telephone-numbers'></a>
### 3.4 Extracting Telephone Numbers

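We split a trailing telephone number off each standardized location, storing the number in a separate `Telephone` column and keeping the remaining text as the address.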

```python
def extract_telephone(address):
    """Split a trailing phone number off an address string.

    Args:
        address: Address text that may end with a phone number. Non-string
            values (e.g. NaN or None for a missing address) are passed through.

    Returns:
        A ``(cleaned_address, phone)`` tuple. ``phone`` is the number without
        the separating commas/whitespace, or ``None`` when the address does
        not end with one; in that case ``cleaned_address`` is ``address``
        unchanged.
    """
    # Missing addresses are not strings; the regex search would raise TypeError.
    if not isinstance(address, str):
        return address, None

    # Regex pattern to match phone numbers. It is anchored at the end of the
    # string and also swallows the separator (commas/whitespace) before the number.
    pattern = re.compile(r'[\s,]*(?:\+?\d{1,3}[\s-]?)?(?:\(\d{1,3}\)[\s-]?)?\d{1,4}[\s-]?\d{1,4}[\s-]?\d{1,4}(?:[\s-]?\d{1,4})?[\s-]?\d{1,4}$')
    match = pattern.search(address)
    if match:
        # The match begins with the separator; it belongs to neither part.
        phone = match.group().strip(' ,\t\r\n')
        cleaned_address = (address[:match.start()] + address[match.end():]).strip()
        return cleaned_address, phone
    return address, None

fundraisers_df['Standardized location'], fundraisers_df['Telephone'] = zip(
    *fundraisers_df['Standardized location'].apply(extract_telephone)
)
```

---

<a id='country-and-continent-information'></a>
### 3.5 Country and Continent Information

We reverse-geocode the GPS coordinates to obtain each record's country code, then use `pycountry` and `pycountry_convert` to resolve the country name and its continent.

```python
import pycountry
import pycountry_convert as pc

def get_continent(country_name):
    """Resolve a country name to the name of its continent.

    The name is looked up with ``pycountry`` to get its alpha-2 code, which
    ``pycountry_convert`` turns into a continent code and then a continent
    name.

    Args:
        country_name: Country name to look up.

    Returns:
        The continent name, or ``None`` if any step fails (the error is
        printed).
    """
    try:
        country_record = pycountry.countries.get(name=country_name)
        return pc.convert_continent_code_to_continent_name(
            pc.country_alpha2_to_continent_code(country_record.alpha_2)
        )
    except Exception as err:
        print(f"Error processing country name '{country_name}': {err}")
        return None

def get_location_info(row):
    """Derive country and continent from a row's coordinates.

    The ``'GeoLocation'`` value is reverse-geocoded with the module-level
    ``geolocator``; the country code in the result is resolved to a country
    name with ``pycountry`` and to a continent with ``get_continent``.

    Args:
        row: DataFrame row with a ``'GeoLocation'`` entry (``None`` when no
            coordinates are available).

    Returns:
        ``pd.Series([country, continent])``; either element is ``None`` when
        it could not be determined. Failures are printed, not raised.
    """
    country = continent = None
    coordinates = row['GeoLocation']

    # No coordinates: nothing to look up.
    if coordinates is None:
        return pd.Series([country, continent])

    try:
        place = geolocator.reverse(coordinates, exactly_one=True)
        code = place.raw['address'].get('country_code', '').upper()
        if code:
            country = pycountry.countries.get(alpha_2=code).name
            continent = get_continent(country)
    except Exception as err:
        print(f"Error at index {row.name}: {err}")

    return pd.Series([country, continent])

fundraisers_df[['Country', 'Continent']] = fundraisers_df.apply(get_location_info, axis=1)
```

---

<a id='outlier-mitigation-and-final-steps'></a>
## 4. Outlier Mitigation and Final Steps

We omitted records with unrealistic amounts or irrelevant data. We manually reviewed records with significant discrepancies to ensure location accuracy. Clean technology sectors were also inferred using zero-shot text classification to ensure consistency.

---

<a id='data-display-preparation'></a>
## 5. Data Display Preparation

We prepare the DataFrame for display by selecting relevant columns and cleaning data.

```python
def create_climate_fundraisers_data_display_df(climate_fundraisers_df):
    """Build the display table from the processed fundraisers DataFrame.

    The input frame is not modified.

    Args:
        climate_fundraisers_df: DataFrame containing at least the columns
            listed in ``display_columns`` below.

    Returns:
        A new DataFrame with the display columns, where

        * ``link_to_fundraising_announcement`` holds the longest URL found in
          the Markdown-style ``[text](url)`` cell, or None if there is none;
        * ``normalized_amount_raised`` is an integer, or None for ``'N/A'``
          and missing values;
        * ``link_to_newsletter`` and ``url_link_to_firms_website`` carry a
          ``?ref=`` query built from the newsletter title and firm name;
        * ``title_of_newsletter`` has been dropped.
    """
    from urllib.parse import quote

    display_columns = [
        'fundraising_entity', 'fundraiser_description', 'date_of_funding_reported', 'amount_raised',
        'normalized_amount_raised', 'clean_technology_sector', 'Standardized location', 'Country',
        'Telephone', 'link_to_fundraising_announcement', 'title_of_newsletter', 'link_to_newsletter',
        'names_of_newsletter_authors', 'newsletter_date', 'name_of_newsletter_firm', 'url_link_to_firms_website'
    ]

    def _announcement_url(cell):
        """Longest parenthesised http(s) URL in a Markdown link cell, else None."""
        # Missing cells are not strings; `'[' in cell` would raise TypeError.
        if not isinstance(cell, str) or '[' not in cell:
            return None
        urls = re.findall(r'\(https?://[^\s)]+\)', cell)
        # A bracket without a URL left max() with an empty list (ValueError).
        return max(urls, key=len).strip('()') if urls else None

    def _amount_as_int(value):
        """Integer amount, dropping the one-character prefix of string values."""
        if isinstance(value, str):
            # assumes strings look like '$1000000' (one leading symbol, then
            # digits) -- TODO confirm against the normalization step
            return int(value[1:]) if value != 'N/A' else None
        return None if pd.isna(value) else int(value)

    def _as_query_value(series):
        """Percent-encode each value so it is safe inside a URL query string."""
        return series.map(lambda text: quote(str(text), safe=''), na_action='ignore')

    # .copy() gives an independent frame, so the assignments below neither
    # touch the caller's data nor trigger SettingWithCopyWarning.
    data_display_df = climate_fundraisers_df[display_columns].copy()

    # Clean links and amounts
    data_display_df['link_to_fundraising_announcement'] = (
        data_display_df['link_to_fundraising_announcement'].apply(_announcement_url)
    )
    data_display_df['normalized_amount_raised'] = (
        data_display_df['normalized_amount_raised'].apply(_amount_as_int)
    )

    # Titles and firm names contain spaces and punctuation; encode them so
    # the resulting URLs stay valid.
    data_display_df['link_to_newsletter'] = (
        data_display_df['link_to_newsletter'] + "?ref=" + _as_query_value(data_display_df['title_of_newsletter'])
    )
    data_display_df = data_display_df.drop(columns='title_of_newsletter')
    data_display_df['url_link_to_firms_website'] = (
        data_display_df['url_link_to_firms_website'] + "?ref="
        + _as_query_value(data_display_df['name_of_newsletter_firm'])
    )

    return data_display_df

# Create the display DataFrame
data_display_df = create_climate_fundraisers_data_display_df(fundraisers_df)
```

---

**Note:** Ensure all required libraries are installed and replace `<Your-OpenAI-API-Key>` with your actual OpenAI API key. Adjust code snippets as needed based on actual data and website structures.
'''

In [16]:
notebook = markdown_to_jupyter(markdown_text)
# Notebook files are JSON and are expected to be UTF-8. Without an explicit
# encoding, open() falls back to the platform default, which can fail on (or
# mis-encode) the non-ASCII characters in the document.
with open('output_notebook.ipynb', 'w', encoding='utf-8') as f:
    nbformat.write(notebook, f)