<a href="https://colab.research.google.com/github/AC20202021/ArthurCFitler/blob/master/Claude_White_House.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import requests
from bs4 import BeautifulSoup
import re
import os
import zipfile
import datetime
from tqdm.notebook import tqdm
from IPython.display import HTML, display
from datetime import datetime
import ipywidgets as widgets
import time
import random

class WhiteHouseEOScraper:
    """Collect Executive Order pages from whitehouse.gov and bundle them in a zip.

    Typical use is ``scrape_eo_links(...)`` followed by ``download_eos()``.
    ``eo_links``, ``eo_dates`` and ``eo_titles`` are parallel lists: index ``i``
    in each one refers to the same Executive Order.
    """

    # Date layouts tried, in order, by ``parse_date``.
    _DATE_FORMATS = ("%B %d, %Y", "%B %d,%Y", "%b %d, %Y")

    # Browser-like User-Agent header sent with every request.
    _USER_AGENT = (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    )

    def __init__(self):
        self.base_url = "https://www.whitehouse.gov/presidential-actions/executive-orders/"
        self.eo_links = []
        self.eo_dates = []
        self.eo_titles = []
        # Directory for the saved HTML files; "<download_dir>.zip" is the archive.
        self.download_dir = "executive_orders"

    def get_page_content(self, url, retries=3, delay=1, timeout=30):
        """Fetch ``url`` and return the response body as text.

        Args:
            url: Page to request.
            retries: Maximum number of attempts.
            delay: Base pause in seconds before each attempt; a random
                0.5-1.5 s is added on top to be polite to the server.
            timeout: Seconds to wait for the server before giving up on an
                attempt. Without it a stalled connection would block forever.

        Returns:
            The response text, or ``None`` if ``retries`` is 0.

        Raises:
            requests.RequestException: When the final attempt fails (network
                error, timeout, or a non-2xx status).
        """
        for attempt in range(retries):
            try:
                time.sleep(delay + random.uniform(0.5, 1.5))  # Be polite to the server
                response = requests.get(
                    url,
                    headers={'User-Agent': self._USER_AGENT},
                    timeout=timeout,
                )
                response.raise_for_status()
                return response.text
            except requests.RequestException:
                if attempt == retries - 1:
                    raise  # Out of attempts: surface the original error and traceback
                time.sleep(2 ** attempt)  # Exponential backoff
        return None

    def parse_date(self, date_str):
        """Parse a date such as "January 20, 2025" into a ``datetime``.

        Returns ``None`` (after printing a notice) when none of the known
        layouts match.
        """
        for fmt in self._DATE_FORMATS:
            try:
                return datetime.strptime(date_str, fmt)
            except ValueError:
                continue
        print(f"Could not parse date: {date_str}")
        return None

    def scrape_eo_links(self, start_date=None, end_date=None):
        """Walk the paginated listing and record every EO inside the date range.

        Args:
            start_date: Inclusive lower bound as a ``datetime``, or ``None``.
            end_date: Inclusive upper bound as a ``datetime``, or ``None``.

        Returns:
            ``self.eo_links``. ``self.eo_dates`` and ``self.eo_titles`` are
            filled in parallel. Any previous results are discarded first.
        """
        self.eo_links = []
        self.eo_dates = []
        self.eo_titles = []

        page_num = 1

        with tqdm(desc="Scraping pages", unit="page") as pbar:
            while True:
                url = self.base_url if page_num == 1 else f"{self.base_url}page/{page_num}/"

                try:
                    html_content = self.get_page_content(url)
                    soup = BeautifulSoup(html_content, 'html.parser')

                    # Find all EO entries; an empty page means we ran off the end.
                    eo_items = soup.select('.wp-block-whitehouse-post-template')
                    if not eo_items:
                        break

                    for item in eo_items:
                        title_elem = item.select_one('.wp-block-post-title a')
                        date_elem = item.select_one('.wp-block-post-date time')
                        if not (title_elem and date_elem):
                            continue

                        # Skip a malformed entry instead of aborting the whole scrape.
                        link = title_elem.get('href')
                        if not link:
                            continue

                        date = self.parse_date(date_elem.text.strip())
                        if date is None:
                            continue

                        # Apply date filtering
                        after_start = start_date is None or date >= start_date
                        before_end = end_date is None or date <= end_date
                        if after_start and before_end:
                            self.eo_links.append(link)
                            self.eo_dates.append(date)
                            self.eo_titles.append(title_elem.text.strip())

                    pbar.update(1)

                    # Stop when the page offers no "next" pagination link.
                    if not soup.select_one('.wp-block-query-pagination-next'):
                        break
                    page_num += 1

                except Exception as e:
                    # Best effort: keep whatever was collected so far.
                    print(f"Error scraping page {page_num}: {e}")
                    break

        print(f"Found {len(self.eo_links)} Executive Orders matching your criteria.")
        return self.eo_links

    def download_eos(self):
        """Download every scraped EO page, save it locally and add it to a zip.

        Files are named ``<YYYY-MM-DD>_<sanitised title>.html``. A failure on
        one document is reported and skipped; the rest are still processed.

        Returns:
            The zip file name, or ``None`` when there is nothing to download.
        """
        if not self.eo_links:
            print("No Executive Orders to download.")
            return

        os.makedirs(self.download_dir, exist_ok=True)

        zip_filename = f"{self.download_dir}.zip"
        saved = 0

        # ZIP_DEFLATED actually compresses; the default (ZIP_STORED) does not.
        with zipfile.ZipFile(zip_filename, 'w', compression=zipfile.ZIP_DEFLATED) as zipf:
            entries = zip(self.eo_links, self.eo_dates, self.eo_titles)
            for link, date, title in tqdm(entries,
                                          desc="Downloading Executive Orders",
                                          total=len(self.eo_links)):
                try:
                    # Get the EO content
                    eo_content = self.get_page_content(link)

                    # Create a safe filename from the title and date
                    safe_title = re.sub(r'[^\w\-]', '_', title)
                    safe_title = re.sub(r'_+', '_', safe_title)  # Collapse runs of underscores
                    date_str = date.strftime("%Y-%m-%d")
                    filename = f"{date_str}_{safe_title}.html"

                    # Save the file locally first
                    filepath = os.path.join(self.download_dir, filename)
                    with open(filepath, 'w', encoding='utf-8') as f:
                        f.write(eo_content)

                    # Add to zip file
                    zipf.write(filepath, filename)
                    saved += 1

                except Exception as e:
                    print(f"Error downloading {link}: {e}")

        # Report what was really written, not how many were attempted.
        print(f"Downloaded {saved} Executive Orders to {zip_filename}")
        failed = len(self.eo_links) - saved
        if failed:
            print(f"{failed} Executive Orders could not be downloaded.")
        return zip_filename

# Create an interactive UI with date selection
def create_ui():
    """Build the ipywidgets front end for ``WhiteHouseEOScraper``.

    The panel has two optional date pickers and a button. Clicking the button
    scrapes the listing for the chosen range, downloads the matching Executive
    Orders into a zip and, when running inside Google Colab, triggers a
    browser download of that zip.

    Returns:
        The top-level ``widgets.VBox`` to pass to ``display``.
    """
    scraper = WhiteHouseEOScraper()

    # Date pickers
    start_date_picker = widgets.DatePicker(description='Start Date:', disabled=False)
    end_date_picker = widgets.DatePicker(description='End Date:', disabled=False)

    # Checkboxes that switch each bound on or off
    use_start_date = widgets.Checkbox(value=True, description='Use start date', disabled=False)
    use_end_date = widgets.Checkbox(value=True, description='Use end date', disabled=False)

    # Button and the area its handler prints into
    scrape_button = widgets.Button(description='Scrape Executive Orders')
    output = widgets.Output()

    # Layout
    date_filters = widgets.VBox([
        widgets.HBox([use_start_date, start_date_picker]),
        widgets.HBox([use_end_date, end_date_picker])
    ])

    # UI assembly
    ui = widgets.VBox([
        widgets.HTML(value="<h2>White House Executive Order Scraper</h2>"),
        widgets.HTML(value="<p>Select a date range to filter Executive Orders. Leave a date unchecked to not use that filter.</p>"),
        date_filters,
        scrape_button,
        output
    ])

    # Button click handler
    def on_button_click(b):
        with output:
            output.clear_output()
            print("Starting scrape process...")

            # A checked box with no date picked yields None, i.e. no bound.
            start_date = start_date_picker.value if use_start_date.value else None
            end_date = end_date_picker.value if use_end_date.value else None

            # An inverted range can never match; say so instead of crawling
            # every listing page for a guaranteed empty result.
            if start_date and end_date and start_date > end_date:
                print("The start date is after the end date. Please adjust the date range.")
                return

            # Widen the picked days to whole-day datetimes so both ends are inclusive.
            if start_date:
                start_date = datetime.combine(start_date, datetime.min.time())
            if end_date:
                end_date = datetime.combine(end_date, datetime.max.time())

            # Display selected date range
            if start_date and end_date:
                print(f"Searching for EOs from {start_date.strftime('%Y-%m-%d')} to {end_date.strftime('%Y-%m-%d')}")
            elif start_date:
                print(f"Searching for EOs from {start_date.strftime('%Y-%m-%d')} to present")
            elif end_date:
                print(f"Searching for EOs from the beginning to {end_date.strftime('%Y-%m-%d')}")
            else:
                print("Searching for all Executive Orders")

            # Run the scraper
            scraper.scrape_eo_links(start_date, end_date)
            if not scraper.eo_links:
                print("No Executive Orders found matching your criteria.")
                return

            zip_file = scraper.download_eos()
            if not zip_file:
                return

            print(f"\nComplete! Your executive orders are in {zip_file}")
            # The browser download helper only exists inside Google Colab;
            # elsewhere the zip simply stays on disk.
            try:
                from google.colab import files
            except ImportError:
                print("Not running in Google Colab; open the zip file from the working directory.")
            else:
                files.download(zip_file)

    scrape_button.on_click(on_button_click)
    return ui

# Build the scraper widget panel and render it in this cell's output area.
display(create_ui())

VBox(children=(HTML(value='<h2>White House Executive Order Scraper</h2>'), HTML(value='<p>Select a date range …

In [None]:
import os
import re
import zipfile
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter, defaultdict
from wordcloud import WordCloud
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from tqdm.notebook import tqdm
import ipywidgets as widgets
from IPython.display import display, HTML, clear_output
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import warnings
import io
import base64

# Suppress every Python warning from this point on, to keep notebook output
# uncluttered. NOTE(review): this also hides deprecation notices raised by
# the libraries used below.
warnings.filterwarnings('ignore')

class EOAnalyzer:
    """Load Executive Order HTML files from a zip and search/visualize their text.

    ``extract_eo_from_zip`` fills ``self.df`` with one row per document. Every
    other public method reads that DataFrame and returns an empty result
    (``None`` or an empty DataFrame) when nothing has been loaded.
    """

    # Date layouts accepted for the date printed on an EO page.
    _DATE_FORMATS = ('%B %d, %Y', '%b %d, %Y')

    # Matches e.g. "Executive Order 14100" or "Executive Order No. 14100".
    _EO_NUMBER_PATTERN = r'Executive Order (?:No\.?\s*)?(\d+)'

    def __init__(self):
        self.eo_data = None     # list of per-document dicts built by extract_eo_from_zip
        self.df = None          # DataFrame view of eo_data
        self.stop_words = None  # set by initialize_nltk
        self.lemmatizer = None  # set by initialize_nltk

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------
    def _has_data(self):
        """Return True when a non-empty DataFrame has been loaded."""
        return self.df is not None and not self.df.empty

    @staticmethod
    def _as_keyword_list(keywords):
        """Accept a single string or an iterable; drop duplicates, keep order."""
        if isinstance(keywords, str):
            keywords = [keywords]
        return list(dict.fromkeys(keywords))

    @staticmethod
    def _resample_monthly(frame):
        """Sum a date-indexed frame per calendar month on any pandas version."""
        try:
            return frame.resample('ME').sum()
        except ValueError:
            # pandas older than 2.2 only knows the previous 'M' alias.
            return frame.resample('M').sum()

    @staticmethod
    def _yearly_totals(occurrence_df, keywords):
        """Sum keyword counts per calendar year; None when no row has a date."""
        if 'date' not in occurrence_df.columns:
            return None
        dates = pd.to_datetime(occurrence_df['date'], errors='coerce')
        has_date = dates.notna()
        if not has_date.any():
            return None
        years = dates[has_date].dt.year.rename('year')
        return occurrence_df.loc[has_date, keywords].groupby(years).sum()

    def _parse_eo_html(self, html_content, file_name):
        """Turn one saved EO page into the record stored in ``self.eo_data``."""
        soup = BeautifulSoup(html_content, 'html.parser')

        # Extract EO title
        headline = soup.select_one('.wp-block-whitehouse-topper__headline')
        title = headline.get_text(strip=True) if headline else None

        # Extract date text
        date_elem = soup.select_one('.wp-block-whitehouse-topper__meta--date time')
        date_str = date_elem.get_text(strip=True) if date_elem else None

        # Extract main content: all paragraphs joined by single spaces
        content = ""
        content_div = soup.select_one('.wp-block-post-content')
        if content_div:
            content = ' '.join(p.get_text(strip=True) for p in content_div.find_all('p'))

        # Try to extract the EO number from the title first, then the content
        match = (re.search(self._EO_NUMBER_PATTERN, title or '')
                 or re.search(self._EO_NUMBER_PATTERN, content))
        eo_number = match.group(1) if match else None

        # Get filename without extension as a backup title
        filename = os.path.splitext(os.path.basename(file_name))[0]

        # Parse the date
        date = None
        if date_str:
            for fmt in self._DATE_FORMATS:
                try:
                    date = datetime.strptime(date_str, fmt)
                    break
                except ValueError:
                    continue
        if date is None:
            # Files written by WhiteHouseEOScraper.download_eos start with
            # "YYYY-MM-DD_"; use that when the page itself gave no usable date.
            prefix = re.match(r'(\d{4}-\d{2}-\d{2})_', filename)
            if prefix:
                try:
                    date = datetime.strptime(prefix.group(1), '%Y-%m-%d')
                except ValueError:
                    date = None

        return {
            'filename': filename,
            'title': title or filename,
            'eo_number': eo_number,
            'date': date,
            'date_str': date_str,
            'content': content,
            'word_count': len(content.split()),
            'char_count': len(content)
        }

    # ------------------------------------------------------------------
    # Setup and loading
    # ------------------------------------------------------------------
    def initialize_nltk(self):
        """Download any missing NLTK data and set up stop words and lemmatizer."""
        resources = (
            ('tokenizers/punkt', 'punkt'),
            # Newer NLTK releases load the tokenizer from 'punkt_tab' instead.
            ('tokenizers/punkt_tab', 'punkt_tab'),
            ('corpora/stopwords', 'stopwords'),
            ('corpora/wordnet', 'wordnet'),
        )
        for resource_path, package in resources:
            try:
                nltk.data.find(resource_path)
            except LookupError:
                nltk.download(package, quiet=True)

        self.stop_words = set(stopwords.words('english'))
        self.lemmatizer = WordNetLemmatizer()

    def extract_eo_from_zip(self, zip_file_path):
        """Parse every ``.html`` member of a zip into ``self.df``.

        Args:
            zip_file_path: Path to a zip of saved Executive Order pages.

        Returns:
            The populated DataFrame (also stored on ``self.df``). ``year``,
            ``month`` and ``year_month`` columns are added when at least one
            document has a date.

        Raises:
            FileNotFoundError: If ``zip_file_path`` does not exist.
        """
        if not os.path.exists(zip_file_path):
            raise FileNotFoundError(f"Zip file not found: {zip_file_path}")

        self.eo_data = []

        with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
            for file_name in tqdm(zip_ref.namelist(), desc="Extracting and parsing EOs"):
                if not file_name.endswith('.html'):
                    continue
                try:
                    with zip_ref.open(file_name) as file:
                        html_content = file.read().decode('utf-8')
                    self.eo_data.append(self._parse_eo_html(html_content, file_name))
                except Exception as e:
                    # One unreadable document must not abort the whole load.
                    print(f"Error processing {file_name}: {e}")

        # Convert to DataFrame
        self.df = pd.DataFrame(self.eo_data)

        # Extract year and month for time-based analysis
        if 'date' in self.df.columns and not self.df['date'].isna().all():
            self.df['year'] = self.df['date'].dt.year
            self.df['month'] = self.df['date'].dt.month
            self.df['year_month'] = self.df['date'].dt.strftime('%Y-%m')

        print(f"Loaded {len(self.df)} Executive Orders.")
        return self.df

    # ------------------------------------------------------------------
    # Text processing and search
    # ------------------------------------------------------------------
    def preprocess_text(self, text):
        """Lowercase, tokenize, drop stop words/non-alphabetic tokens, lemmatize.

        Returns a list of tokens (empty for missing text).
        """
        if not text or pd.isna(text):
            return []

        # Set NLTK up on first use so callers need not remember to do it.
        if self.lemmatizer is None or self.stop_words is None:
            self.initialize_nltk()

        # Convert to lowercase and tokenize
        tokens = word_tokenize(text.lower())

        # Remove stopwords, punctuation, and lemmatize
        return [
            self.lemmatizer.lemmatize(token) for token in tokens
            if token.isalpha() and token not in self.stop_words and len(token) > 1
        ]

    def search_keywords(self, keywords, case_sensitive=False):
        """Return the rows whose content contains ANY of the keywords.

        Matching is plain substring matching. The result carries an extra
        boolean ``contains_keywords`` column.
        """
        if not self._has_data():
            return pd.DataFrame()

        keywords = self._as_keyword_list(keywords)
        needles = keywords if case_sensitive else [k.lower() for k in keywords]

        # Function to check if any keyword is in the content
        def contains_keywords(text):
            if not text or pd.isna(text):
                return False
            haystack = text if case_sensitive else text.lower()
            return any(kw in haystack for kw in needles)

        # Work on a copy so self.df never gains helper columns
        result_df = self.df.copy()
        result_df['contains_keywords'] = result_df['content'].apply(contains_keywords)

        # Filter for documents containing any of the keywords
        matches = result_df[result_df['contains_keywords']]

        print(f"Found {len(matches)} Executive Orders containing keywords: {', '.join(keywords)}")
        return matches

    def advanced_search(self, query_dict, case_sensitive=False):
        """Search with several keyword sets.

        Args:
            query_dict: Maps a set name to a keyword (or list of keywords).
                A document matches a set when it contains ALL of that set's
                keywords; it is returned when it matches ANY set. Empty sets
                are ignored.
            case_sensitive: Compare without lowercasing when True.

        Returns:
            Matching rows with one ``contains_<set name>`` column per set and
            a combined ``contains_keywords`` column; an empty DataFrame when
            there is no data or no usable set.
        """
        if not self._has_data():
            return pd.DataFrame()

        result_df = self.df.copy()
        # Track the columns we add rather than rediscovering them by prefix,
        # so a set that happens to be named "keywords" is not silently lost.
        set_columns = []

        for set_name, keywords in query_dict.items():
            if not keywords:
                continue

            required = self._as_keyword_list(keywords)
            if not case_sensitive:
                required = [kw.lower() for kw in required]

            # Function to check if all keywords in this set are in the content
            def contains_all_keywords(text, required=required):
                if not text or pd.isna(text):
                    return False
                haystack = text if case_sensitive else text.lower()
                return all(kw in haystack for kw in required)

            column = f'contains_{set_name}'
            result_df[column] = result_df['content'].apply(contains_all_keywords)
            set_columns.append(column)

        # Filter for documents containing any of the keyword sets
        if set_columns:
            any_match = result_df[set_columns].any(axis=1)
            result_df['contains_keywords'] = any_match
            matches = result_df[any_match]
        else:
            matches = pd.DataFrame()

        print(f"Found {len(matches)} Executive Orders matching the advanced search criteria.")
        return matches

    def count_keyword_occurrences(self, keywords, case_sensitive=False):
        """Count substring occurrences of each keyword in every document.

        Returns:
            A DataFrame sorted by date with ``title``, ``date``, ``eo_number``,
            one count column per (de-duplicated) keyword and
            ``total_occurrences``; ``None`` when no data is loaded.
        """
        if not self._has_data():
            return None

        # Duplicates would otherwise be summed twice into total_occurrences.
        keywords = self._as_keyword_list(keywords)

        occurrence_df = self.df[['title', 'date', 'eo_number']].reset_index(drop=True)
        contents = list(self.df['content'])

        for keyword in keywords:
            needle = keyword if case_sensitive else keyword.lower()
            counts = []
            for text in contents:
                if not text or pd.isna(text):
                    counts.append(0)
                else:
                    haystack = text if case_sensitive else text.lower()
                    counts.append(haystack.count(needle))
            occurrence_df[keyword] = counts

        # Add total column
        occurrence_df['total_occurrences'] = occurrence_df[keywords].sum(axis=1)

        # Sort by date (undated rows end up last)
        if 'date' in occurrence_df.columns:
            occurrence_df = occurrence_df.sort_values('date')

        return occurrence_df

    # ------------------------------------------------------------------
    # Visualization
    # ------------------------------------------------------------------
    def visualize_keyword_frequency_plotly(self, keywords, case_sensitive=False, chart_type='bar', colormap='viridis'):
        """Show an interactive Plotly chart of keyword frequencies.

        Args:
            keywords: Keyword or list of keywords to count.
            case_sensitive: Passed through to ``count_keyword_occurrences``.
            chart_type: One of ``'bar'``, ``'pie'``, ``'time'``, ``'heatmap'``
                or ``'stacked_bar'``.
            colormap: Continuous colour scale for the bar chart and heatmap.

        Returns:
            The displayed figure, or ``None`` when there is no data, the
            required date information is missing, or ``chart_type`` is not
            one of the values above.
        """
        if not self._has_data():
            return None

        keywords = self._as_keyword_list(keywords)
        occurrence_df = self.count_keyword_occurrences(keywords, case_sensitive)

        if occurrence_df is None or occurrence_df.empty:
            print("No data available for visualization.")
            return None

        # For bar chart or pie chart
        if chart_type in ('bar', 'pie'):
            total_counts = occurrence_df[keywords].sum().reset_index()
            total_counts.columns = ['Keyword', 'Count']

            if chart_type == 'bar':
                fig = px.bar(
                    total_counts,
                    x='Keyword',
                    y='Count',
                    title='Total Occurrences of Keywords in Executive Orders',
                    labels={'Count': 'Number of Occurrences', 'Keyword': 'Keyword'},
                    color='Count',
                    color_continuous_scale=colormap
                )

                fig.update_layout(
                    xaxis_title='Keyword',
                    yaxis_title='Number of Occurrences',
                    font=dict(size=14),
                    height=600,
                    width=900
                )

            else:  # pie chart
                fig = px.pie(
                    total_counts,
                    values='Count',
                    names='Keyword',
                    title='Proportion of Keyword Occurrences in Executive Orders',
                    color_discrete_sequence=px.colors.sequential.Viridis
                )

                fig.update_layout(
                    font=dict(size=14),
                    height=700,
                    width=900
                )

        # For time series chart
        elif chart_type == 'time':
            # Undated rows cannot be placed on a time axis; leave them out.
            dated = occurrence_df.dropna(subset=['date'])
            if dated.empty:
                print("Time series plot not available: Missing or invalid date information.")
                return None

            counts = dated[keywords].copy()
            counts.index = pd.DatetimeIndex(dated['date'])
            monthly = self._resample_monthly(counts)

            time_data = [
                {'Date': date, 'Keyword': keyword, 'Count': count}
                for keyword in keywords
                for date, count in monthly[keyword].items()
            ]
            time_df = pd.DataFrame(time_data)

            fig = px.line(
                time_df,
                x='Date',
                y='Count',
                color='Keyword',
                markers=True,
                title='Keyword Occurrences Over Time',
                labels={'Count': 'Number of Occurrences', 'Date': 'Date'},
                color_discrete_sequence=px.colors.qualitative.Safe
            )

            fig.update_layout(
                xaxis_title='Date',
                yaxis_title='Number of Occurrences',
                legend_title_text='Keyword',
                font=dict(size=14),
                height=600,
                width=1000
            )

        # For heatmap
        elif chart_type == 'heatmap':
            # The occurrence table has no 'year' column, so derive years
            # from its dates: rows are years, columns are keywords.
            yearly = self._yearly_totals(occurrence_df, keywords)
            if yearly is None:
                print("Heatmap not available: Missing year information.")
                return None

            fig = px.imshow(
                yearly.values,
                labels=dict(x="Keyword", y="Year", color="Count"),
                x=list(yearly.columns),
                y=[str(year) for year in yearly.index],
                color_continuous_scale=colormap,
                title='Keyword Frequency Heatmap by Year',
                aspect='auto'
            )

            fig.update_layout(
                xaxis_title='Keyword',
                yaxis_title='Year',
                font=dict(size=14),
                height=600,
                width=900
            )

        # For stacked bar chart by year
        elif chart_type == 'stacked_bar':
            yearly = self._yearly_totals(occurrence_df, keywords)
            if yearly is None:
                print("Stacked bar chart not available: Missing year information.")
                return None

            yearly_data = yearly.reset_index()

            fig = px.bar(
                yearly_data,
                x='year',
                y=keywords,
                title='Yearly Keyword Occurrences in Executive Orders',
                labels={'year': 'Year', 'value': 'Number of Occurrences', 'variable': 'Keyword'},
                color_discrete_sequence=px.colors.qualitative.Bold
            )

            fig.update_layout(
                xaxis_title='Year',
                yaxis_title='Number of Occurrences',
                legend_title_text='Keyword',
                barmode='stack',
                font=dict(size=14),
                height=600,
                width=1000
            )

        else:
            # Without this branch ``fig`` would be unbound below.
            print(f"Unsupported chart type: {chart_type}")
            return None

        # Display the figure
        fig.show()
        return fig

    def create_word_cloud(self, text_column='content', max_words=100, background_color='white'):
        """Draw a word cloud of ``text_column`` across all loaded documents.

        Returns the ``WordCloud`` object, or ``None`` when there is no data or
        no token survives preprocessing.
        """
        if not self._has_data():
            return

        # Combine all text
        all_text = ' '.join(self.df[text_column].fillna('').astype(str))

        # Preprocess the text
        processed_text = ' '.join(self.preprocess_text(all_text))
        if not processed_text:
            # WordCloud cannot be generated from an empty string.
            print("No text available for the word cloud.")
            return None

        # Generate word cloud
        wordcloud = WordCloud(
            width=800,
            height=400,
            max_words=max_words,
            background_color=background_color,
            colormap='viridis',
            contour_width=1,
            contour_color='steelblue'
        ).generate(processed_text)

        # Display the word cloud
        plt.figure(figsize=(16, 8))
        plt.imshow(wordcloud, interpolation='bilinear')
        plt.axis('off')
        plt.title('Word Cloud of Executive Orders', fontsize=20)
        plt.tight_layout()
        plt.show()

        return wordcloud

    def create_summary_dashboard(self):
        """Show a 2x2 dashboard: EOs per year, average length per year, the ten
        longest EOs, and the word-count distribution.

        Returns the figure, or ``None`` when no data is loaded. The two
        per-year panels stay empty when the data has no ``year`` column.
        """
        if not self._has_data():
            print("No data available for the summary dashboard.")
            return

        # Create a 2x2 subplot layout
        fig = make_subplots(
            rows=2,
            cols=2,
            subplot_titles=(
                'EOs by Year',
                'Average Word Count by Year',
                'Top 10 Longest EOs',
                'Distribution of EO Word Counts'
            ),
            specs=[
                [{"type": "bar"}, {"type": "scatter"}],
                [{"type": "bar"}, {"type": "histogram"}]
            ]
        )

        if 'year' in self.df.columns:
            # Plot 1: Executive Orders by Year
            yearly_counts = self.df.groupby('year').size().reset_index(name='count')

            fig.add_trace(
                go.Bar(
                    x=yearly_counts['year'],
                    y=yearly_counts['count'],
                    marker_color='royalblue'
                ),
                row=1, col=1
            )

            # Plot 2: Average Word Count by Year
            yearly_word_counts = self.df.groupby('year')['word_count'].mean().reset_index()

            fig.add_trace(
                go.Scatter(
                    x=yearly_word_counts['year'],
                    y=yearly_word_counts['word_count'],
                    mode='lines+markers',
                    marker=dict(color='firebrick'),
                    line=dict(width=3)
                ),
                row=1, col=2
            )

        # Plot 3: Top 10 Longest EOs
        top_eos = self.df.sort_values('word_count', ascending=False).head(10)

        # Documents without a detected EO number would all share the label
        # "None" and collapse into one bar; label those by (shortened) title.
        bar_labels = [
            str(number) if pd.notna(number) else str(title)[:40]
            for number, title in zip(top_eos['eo_number'], top_eos['title'])
        ]

        fig.add_trace(
            go.Bar(
                x=bar_labels,
                y=top_eos['word_count'],
                marker_color='green',
                hovertext=top_eos['title']
            ),
            row=2, col=1
        )

        # Plot 4: Word Count Distribution
        fig.add_trace(
            go.Histogram(
                x=self.df['word_count'],
                nbinsx=20,
                marker_color='purple'
            ),
            row=2, col=2
        )

        # Update layout
        fig.update_layout(
            height=900,
            width=1200,
            title_text="Executive Orders Analysis Dashboard",
            showlegend=False
        )

        # Update axes labels
        fig.update_xaxes(title_text="Year", row=1, col=1)
        fig.update_yaxes(title_text="Number of EOs", row=1, col=1)

        fig.update_xaxes(title_text="Year", row=1, col=2)
        fig.update_yaxes(title_text="Average Word Count", row=1, col=2)

        fig.update_xaxes(title_text="Executive Order Number", row=2, col=1)
        fig.update_yaxes(title_text="Word Count", row=2, col=1)

        fig.update_xaxes(title_text="Word Count", row=2, col=2)
        fig.update_yaxes(title_text="Frequency", row=2, col=2)

        fig.show()

        return fig

# Create the UI for the analyzer
def create_eo_analyzer_ui():
    """Build the interactive Executive Orders analysis UI.

    Creates an ``EOAnalyzer``, runs its NLTK initialization, assembles the
    upload / search / visualization widgets, wires their event handlers and
    returns the top-level container. Nothing is rendered here; the caller is
    expected to pass the returned widget to ``display``.

    Returns:
        widgets.VBox: the assembled UI.
    """
    # Initialize the analyzer
    analyzer = EOAnalyzer()
    analyzer.initialize_nltk()

    # Set up the UI components
    upload_widget = widgets.FileUpload(
        accept='.zip',
        multiple=False,
        description='Upload EO ZIP:',
        layout=widgets.Layout(width='300px')
    )

    # Keyword search
    keyword_input = widgets.Textarea(
        value='climate, energy, immigration, border, healthcare',
        placeholder='Enter keywords separated by commas',
        description='Keywords:',
        layout=widgets.Layout(width='500px', height='60px')
    )

    case_sensitive_checkbox = widgets.Checkbox(
        value=False,
        description='Case Sensitive',
        layout=widgets.Layout(width='150px')
    )

    # Visualization options
    visualization_dropdown = widgets.Dropdown(
        options=[
            ('Bar Chart', 'bar'),
            ('Pie Chart', 'pie'),
            ('Time Series', 'time'),
            ('Heatmap', 'heatmap'),
            ('Stacked Bar Chart', 'stacked_bar'),
            ('Word Cloud', 'wordcloud'),
            ('Dashboard', 'dashboard')
        ],
        value='bar',
        description='Visualization:',
        layout=widgets.Layout(width='200px')
    )

    colormap_dropdown = widgets.Dropdown(
        options=[
            ('Viridis', 'viridis'),
            ('Plasma', 'plasma'),
            ('Inferno', 'inferno'),
            ('Magma', 'magma'),
            ('Cividis', 'cividis'),
            ('Cool', 'cool'),
            ('Rainbow', 'rainbow')
        ],
        value='viridis',
        description='Color Scheme:',
        layout=widgets.Layout(width='200px')
    )

    # Analysis buttons
    search_button = widgets.Button(
        description='Search Keywords',
        button_style='primary',
        layout=widgets.Layout(width='150px')
    )

    visualize_button = widgets.Button(
        description='Visualize',
        button_style='success',
        layout=widgets.Layout(width='150px')
    )

    export_button = widgets.Button(
        description='Export Analysis',
        button_style='info',
        layout=widgets.Layout(width='150px')
    )

    # Save filtered results button
    save_filtered_button = widgets.Button(
        description='Save Filtered Results',
        button_style='warning',
        layout=widgets.Layout(width='180px')
    )

    # Status and results
    status_output = widgets.Output(layout=widgets.Layout(height='100px'))
    results_output = widgets.Output(layout=widgets.Layout(border='1px solid #ddd', padding='10px'))
    viz_output = widgets.Output(layout=widgets.Layout(border='1px solid #ddd', padding='10px'))

    # Organize UI layout
    header = widgets.HTML(
        """
        <div style="background-color:#f8f9fa; padding:10px; border-radius:5px; margin-bottom:10px;">
            <h1 style="text-align:center; color:#2c3e50;">Executive Orders Analysis Tool</h1>
            <p style="text-align:center; color:#7f8c8d;">
                Upload your EO zip file, search for keywords, and generate visualizations to analyze patterns.
            </p>
        </div>
        """
    )

    upload_section = widgets.VBox([
        widgets.HTML("<h3>Step 1: Upload Executive Orders Zip File</h3>"),
        upload_widget,
        status_output
    ])

    search_section = widgets.VBox([
        widgets.HTML("<h3>Step 2: Search for Keywords</h3>"),
        widgets.HBox([keyword_input, case_sensitive_checkbox]),
        search_button
    ])

    visualization_section = widgets.VBox([
        widgets.HTML("<h3>Step 3: Generate Visualizations</h3>"),
        widgets.HBox([visualization_dropdown, colormap_dropdown]),
        widgets.HBox([visualize_button, export_button, save_filtered_button])
    ])

    results_section = widgets.VBox([
        widgets.HTML("<h3>Results:</h3>"),
        results_output
    ])

    viz_section = widgets.VBox([
        widgets.HTML("<h3>Visualizations:</h3>"),
        viz_output
    ])

    # Main UI layout
    main_ui = widgets.VBox([
        header,
        upload_section,
        search_section,
        visualization_section,
        results_section,
        viz_section
    ])

    # State shared between the handlers below:
    #   zip_path    - name of the most recently saved upload (None until one exists)
    #   filtered_df - result of the most recent keyword search (None until one exists)
    shared_state = {
        'zip_path': None,
        'filtered_df': None
    }

    # ------------------------------------------------------------------
    # Small helpers shared by the event handlers
    # ------------------------------------------------------------------
    def show_status(message):
        """Replace the contents of the status area with ``message``."""
        with status_output:
            clear_output()
            print(message)

    def parse_keywords():
        """Return the stripped, non-empty comma-separated keywords from the text box."""
        return [k.strip() for k in keyword_input.value.split(',') if k.strip()]

    def has_data():
        """Return True if the analyzer holds a non-empty dataframe; otherwise tell the user."""
        if analyzer.df is None or analyzer.df.empty:
            show_status("Please upload a zip file first.")
            return False
        return True

    def has_search_results():
        """Return True if a keyword search has been run; otherwise tell the user."""
        if shared_state['filtered_df'] is None:
            show_status("Please search for keywords first.")
            return False
        return True

    # ------------------------------------------------------------------
    # Event handlers
    # ------------------------------------------------------------------
    def on_upload_change(change):
        """Save the uploaded archive to the working directory and load it into the analyzer.

        Args:
            change: traitlets change dict for the upload widget's ``value``.
        """
        uploaded = change['new']
        if not uploaded:
            return

        with status_output:
            clear_output()
            print("Uploading file...")

            try:
                # ipywidgets 7.x: value is {filename: {'metadata': {...}, 'content': bytes}}
                # ipywidgets 8.x: value is a tuple of {'name': ..., 'content': memoryview, ...}
                if isinstance(uploaded, dict):
                    file_data = next(iter(uploaded.values()))
                    file_name = file_data['metadata']['name']
                else:
                    file_data = uploaded[0]
                    file_name = file_data['name']

                # The name comes from the client; drop any directory components
                # so the file can only be written into the working directory.
                file_name = os.path.basename(file_name.replace('\\', '/'))
                if not file_name:
                    raise ValueError("uploaded file has no usable name")

                # Save the file
                with open(file_name, 'wb') as f:
                    f.write(file_data['content'])

                print(f"Uploaded {file_name}")
                shared_state['zip_path'] = file_name
                # Results from a previous archive no longer describe the loaded data.
                shared_state['filtered_df'] = None

                # Load the data
                analyzer.extract_eo_from_zip(file_name)

            except Exception as e:
                # UI boundary: report the failure to the user instead of
                # letting the callback die silently.
                print(f"Error: {e}")

    def on_search_click(b):
        """Run the keyword search and show a preview of the matches.

        Args:
            b: the clicked button (unused).
        """
        if not has_data():
            return

        keywords = parse_keywords()
        if not keywords:
            show_status("Please enter at least one keyword.")
            return

        with results_output:
            clear_output()
            print("Searching for keywords...")

            # Search for keywords
            filtered_df = analyzer.search_keywords(
                keywords,
                case_sensitive=case_sensitive_checkbox.value
            )

            # Store the filtered dataframe
            shared_state['filtered_df'] = filtered_df

            # Display results
            if filtered_df is not None and not filtered_df.empty:
                display(HTML(f"<p>Found {len(filtered_df)} Executive Orders containing the keywords.</p>"))

                # Show a preview of the results
                display(HTML("<h4>Top 10 EOs containing keywords:</h4>"))

                # Create a summary table from whichever summary columns exist,
                # so a missing column does not abort the preview.
                summary_columns = [
                    column
                    for column in ('title', 'date', 'eo_number', 'word_count')
                    if column in filtered_df.columns
                ]
                summary_df = filtered_df[summary_columns]

                if 'date' in summary_df.columns and not summary_df['date'].isna().all():
                    summary_df = summary_df.sort_values('date', ascending=False)

                display(summary_df.head(10))
            else:
                display(HTML("<p>No Executive Orders found containing the specified keywords.</p>"))

    def on_visualize_click(b):
        """Render the visualization selected in the dropdown.

        Args:
            b: the clicked button (unused).
        """
        if not has_data() or not has_search_results():
            return

        with viz_output:
            clear_output()
            print("Generating visualization...")

            # Get keywords and visualization type
            keywords = parse_keywords()
            viz_type = visualization_dropdown.value
            colormap = colormap_dropdown.value

            # Generate visualization
            if viz_type == 'wordcloud':
                analyzer.create_word_cloud(max_words=150, background_color='white')
            elif viz_type == 'dashboard':
                analyzer.create_summary_dashboard()
            else:
                analyzer.visualize_keyword_frequency_plotly(
                    keywords,
                    case_sensitive=case_sensitive_checkbox.value,
                    chart_type=viz_type,
                    colormap=colormap
                )

    def on_export_click(b):
        """Export the keyword analysis to ``eo_analysis.html``.

        Args:
            b: the clicked button (unused).
        """
        if not has_data() or not has_search_results():
            return

        with status_output:
            clear_output()
            print("Exporting analysis...")

            # Export analysis
            analyzer.export_analysis_to_html(
                parse_keywords(),
                case_sensitive=case_sensitive_checkbox.value,
                filename='eo_analysis.html'
            )

            print("Analysis exported to 'eo_analysis.html'")

    def on_save_filtered_click(b):
        """Save the most recent search results to ``filtered_eos.csv``.

        Args:
            b: the clicked button (unused).
        """
        if not has_search_results():
            return

        with status_output:
            clear_output()
            print("Saving filtered results...")

            # Save filtered results
            analyzer.save_filtered_eos(shared_state['filtered_df'], 'filtered_eos.csv')

            print("Filtered results saved to 'filtered_eos.csv'")

    # Attach event handlers
    upload_widget.observe(on_upload_change, names='value')
    search_button.on_click(on_search_click)
    visualize_button.on_click(on_visualize_click)
    export_button.on_click(on_export_click)
    save_filtered_button.on_click(on_save_filtered_click)

    return main_ui

# Build the analyzer UI and render it as this cell's output.
# NOTE: this runs as soon as the cell/module executes; building the UI also
# constructs an EOAnalyzer and runs its NLTK initialization.
display(create_eo_analyzer_ui())

VBox(children=(HTML(value='\n        <div style="background-color:#f8f9fa; padding:10px; border-radius:5px; ma…