<a href="https://colab.research.google.com/github/ElizabethWaithera/Data_sciences_projects/blob/main/website_analyzer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [7]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse
import re
from typing import List, Dict

class WebsiteAnalyzer:
    def __init__(self, urls: List[str]):
        self.urls = urls
        self.design_patterns = {}
        self.color_scheme = {}
        self.layout_patterns = {}
        self.component_library = {}

    def fetch_website_content(self, url: str) -> tuple:
        """Fetches HTML and CSS content from a URL"""
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        try:
            response = requests.get(url, headers=headers, timeout=10)
            response.raise_for_status()  # Raise an exception for bad status codes
            soup = BeautifulSoup(response.text, 'html.parser')

            # Get HTML content
            html_content = str(soup)

            # Extract CSS content
            css_content = ""
            # From style tags
            for style in soup.find_all('style'):
                css_content += style.string or ""

            # From external stylesheets
            for link in soup.find_all('link', rel='stylesheet'):
                if link.get('href'):
                    try:
                        css_url = link['href']
                        if not css_url.startswith(('http://', 'https://')):
                            base_url = '{uri.scheme}://{uri.netloc}'.format(
                                uri=urlparse(url))
                            css_url = base_url + css_url
                        css_response = requests.get(css_url, headers=headers, timeout=5)
                        if css_response.status_code == 200:
                            css_content += css_response.text
                    except Exception as e:
                        print(f"Warning: Could not fetch CSS from {css_url}: {str(e)}")
                        continue

            return html_content, css_content
        except Exception as e:
            print(f"Error fetching content from {url}: {str(e)}")
            return None, None

    def analyze_structure(self, html_content: str) -> Dict:
        """Analyzes website structure and identifies key patterns"""
        soup = BeautifulSoup(html_content, 'html.parser')

        structure = {
            'layout_type': self._detect_layout_type(soup),
            'main_sections': self._identify_main_sections(soup),
            'navigation_type': self._analyze_navigation(soup),
            'grid_system': self._detect_grid_system(soup),
            'responsive_elements': self._find_responsive_elements(soup)
        }
        return structure

    def _detect_layout_type(self, soup) -> str:
        """Detect the main layout type used"""
        if soup.find_all(class_=re.compile(r'grid|flex')):
            return 'modern-flex-grid'
        elif soup.find_all('table', recursive=False):
            return 'table-based'
        else:
            return 'standard-flow'

    def _identify_main_sections(self, soup) -> List[str]:
        """Identify main sections of the website"""
        sections = []
        for tag in ['header', 'nav', 'main', 'footer', 'aside']:
            if soup.find(tag):
                sections.append(tag)
        return sections

    def _analyze_navigation(self, soup) -> str:
        """Analyze navigation structure"""
        nav = soup.find('nav')
        if nav:
            if nav.find('ul'):
                return 'hierarchical'
            elif nav.find_all(class_=re.compile(r'menu|navigation')):
                return 'custom-menu'
        return 'simple'

    def _detect_grid_system(self, soup) -> str:
        """Detect if a grid system is being used"""
        grid_classes = soup.find_all(class_=re.compile(r'grid|row|col'))
        if grid_classes:
            return 'grid-based'
        return 'no-grid'

    def _find_responsive_elements(self, soup) -> List[str]:
        """Find responsive design elements"""
        responsive_elements = []
        media_queries = soup.find_all(class_=re.compile(r'mobile|tablet|desktop|lg|md|sm|xs'))
        if media_queries:
            responsive_elements.append('media-queries')
        if soup.find('meta', attrs={'name': 'viewport'}):
            responsive_elements.append('viewport-meta')
        return responsive_elements

    def analyze_all_websites(self) -> Dict:
        """Analyzes all provided websites"""
        results = {}

        for url in self.urls:
            print(f"\nAnalyzing {url}...")
            try:
                html_content, css_content = self.fetch_website_content(url)

                if html_content and css_content:
                    results[url] = {
                        'structure': self.analyze_structure(html_content),
                        'design_system': self.extract_design_system(html_content, css_content)
                    }
                    print(f"Successfully analyzed {url}")
                else:
                    print(f"Could not fetch content from {url}")
            except Exception as e:
                print(f"Error analyzing {url}: {str(e)}")
                continue

        return results

    def extract_design_system(self, html_content: str, css_content: str) -> Dict:
        """Extracts complete design system information"""
        return {
            'colors': self._extract_colors(css_content),
            'typography': self._extract_typography(css_content),
            'spacing': self._extract_spacing(css_content),
            'components': self._extract_components(html_content)
        }

    def _extract_colors(self, css_content: str) -> Dict:
        """Extract color patterns from CSS"""
        colors = {
            'primary': set(),
            'background': set(),
            'text': set()
        }

        # Find all color values in CSS
        color_pattern = r'#[0-9a-fA-F]{3,6}|rgb\([^)]+\)|rgba\([^)]+\)'
        found_colors = re.findall(color_pattern, css_content)

        for color in found_colors:
            colors['primary'].add(color)

        return {k: list(v) for k, v in colors.items()}  # Convert sets to lists

    def _extract_typography(self, css_content: str) -> Dict:
        """Extract typography patterns"""
        typography = {
            'fonts': set(),
            'sizes': set()
        }

        # Find font families
        font_pattern = r'font-family:\s*([^;}]+)'
        fonts = re.findall(font_pattern, css_content)
        typography['fonts'].update(fonts)

        # Find font sizes
        size_pattern = r'font-size:\s*([^;}]+)'
        sizes = re.findall(size_pattern, css_content)
        typography['sizes'].update(sizes)

        return {k: list(v) for k, v in typography.items()}

    def _extract_spacing(self, css_content: str) -> Dict:
        """Extract spacing patterns"""
        spacing = {
            'margins': set(),
            'paddings': set()
        }

        # Find margin and padding values
        margin_pattern = r'margin:\s*([^;}]+)'
        padding_pattern = r'padding:\s*([^;}]+)'

        spacing['margins'].update(re.findall(margin_pattern, css_content))
        spacing['paddings'].update(re.findall(padding_pattern, css_content))

        return {k: list(v) for k, v in spacing.items()}

    def _extract_components(self, html_content: str) -> List[str]:
        """Extract common components"""
        soup = BeautifulSoup(html_content, 'html.parser')
        components = []

        # Look for common component patterns
        if soup.find('nav'): components.append('navigation')
        if soup.find('footer'): components.append('footer')
        if soup.find_all(class_=re.compile(r'button|btn')): components.append('buttons')
        if soup.find_all(class_=re.compile(r'card')): components.append('cards')
        if soup.find_all('form'): components.append('forms')

        return components

# Example usage:
websites = ["https://www.dior.com/en_ie", "https://eu.louisvuitton.com/eng-e1/homepage"]
analyzer = WebsiteAnalyzer(websites)
results = analyzer.analyze_all_websites()

# Print results nicely formatted
for url, data in results.items():
    print(f"\nAnalysis results for {url}:")
    print("\nStructure:")
    for key, value in data['structure'].items():
        print(f"  {key}: {value}")
    print("\nDesign System:")
    for key, value in data['design_system'].items():
        print(f"  {key}: {value}")


Analyzing https://www.dior.com/en_ie...
Successfully analyzed https://www.dior.com/en_ie

Analyzing https://eu.louisvuitton.com/eng-e1/homepage...
Error fetching content from https://eu.louisvuitton.com/eng-e1/homepage: HTTPSConnectionPool(host='eu.louisvuitton.com', port=443): Read timed out. (read timeout=10)
Could not fetch content from https://eu.louisvuitton.com/eng-e1/homepage

Analysis results for https://www.dior.com/en_ie:

Structure:
  layout_type: modern-flex-grid
  main_sections: ['header', 'nav', 'main', 'footer']
  navigation_type: simple
  grid_system: grid-based
  responsive_elements: ['media-queries', 'viewport-meta']

Design System:
  colors: {'primary': ['rgb(0 0 0/70%)', 'rgb(0 0 0/40%)', 'rgba(51, 56, 60, 0.04)', '#f6f6f6', '#b3b3b3', 'rgba(35, 39, 42, 1)', '#303030', '#f3f3f3', 'rgba(0, 0, 0, 0.6)', '#acb2b4', 'rgba(0,0,0,.1)', 'rgba(177, 179, 180, 1)', '#5D676C', 'rgba(0,0,0,.05)', '#f2f2f4', '#f3f3f5', '#2ac212', 'rgb(0 0 0/50.2%)', '#acce', 'rgba(0,0,0,.5)', '