
# 🌟 Exercise 5: Scrape & Analyze Weather Data (AccuWeather)

This notebook:
- Uses **Selenium** to load a JavaScript-enabled AccuWeather page  
- Parses the rendered HTML with **BeautifulSoup**  
- Extracts **temperature, condition, humidity**  
- Computes **average temperature** and **most common condition**  
- Saves results to CSV

> Works in **VSCode Jupyter** and **Google Colab**. If running in Colab, the install cell will set up Chromium automatically.


In [None]:

# 🔧 Install dependencies
# - In Colab: also install Chromium & Chromedriver
# - In VSCode Jupyter: only pip install needed packages

import sys, subprocess

def run(cmd):
    """Echo a shell command, then execute it and fail loudly on error.

    Args:
        cmd: Command line passed to the system shell as-is.

    Raises:
        subprocess.CalledProcessError: If the command exits with a
            non-zero status.
    """
    # Show the command first so notebook output records what was executed.
    print(f"$ {cmd}")
    subprocess.run(cmd, check=True, shell=True)

# Detect Google Colab by probing for its `google.colab` module.
try:
    import google.colab  # type: ignore  # noqa: F401
except ImportError:
    IN_COLAB = False
else:
    IN_COLAB = True

# `run` hands its argument to a shell, so the interpreter path must be quoted:
# sys.executable can contain spaces (e.g. under "C:\Program Files").
_pip_install = f'"{sys.executable}" -m pip -q install'
_packages = ["selenium", "beautifulsoup4", "lxml", "pandas", "python-dateutil"]

if IN_COLAB:
    # Colab: install Chromium and its matching driver from apt.
    run("apt-get -qq update")
    run("apt-get -qq install -y chromium-browser chromium-chromedriver")
else:
    # Local/VSCode: webdriver-manager supplies the chromedriver instead.
    _packages.insert(1, "webdriver-manager")

# Both environments upgrade pip and then install the Python packages.
run(f"{_pip_install} --upgrade pip")
run(f"{_pip_install} {' '.join(_packages)}")

print("Environment:", "Colab" if IN_COLAB else "Local/VSCode")


In [None]:

# 📦 Imports & constants
import re, time, os
from collections import Counter
import pandas as pd
from bs4 import BeautifulSoup

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Forecast page that the notebook loads and parses.
URL = "https://www.accuweather.com/en/us/attica/30607/weather-forecast/2139413"

# Regex helpers
# The degree sign is spelled as the escape \u00b0 (resolved by the `re`
# module) so the pattern keeps matching real page text even if this file is
# re-saved with a different encoding; a mis-encoded literal never matches.
TEMP_RE = re.compile(r"(-?\d+)\s*\u00b0")          # e.g. "73" + degree sign
HUM_RE  = re.compile(r"Humidity:?\s*(\d+)\s*%", re.I)
# Keywords searched for (as whole words) when guessing the weather condition.
COND_WORDS = [
    "Sunny","Cloudy","Partly","Mostly","Clear","Rain","Showers","Storm",
    "Thunder","Snow","Overcast","Fog","Haze","Drizzle","Wind"
]

def best_condition(text: str):
    """Pick the most specific condition keyword mentioned in *text*.

    Every entry of COND_WORDS is searched for as a whole word, ignoring case.

    Returns:
        The longest matching keyword (the one listed first in COND_WORDS wins
        a length tie), or None when no keyword occurs.
    """
    matched = []
    for word in COND_WORDS:
        pattern = rf"\b{re.escape(word)}\b"
        if re.search(pattern, text, re.I):
            matched.append(word)
    # max() yields the first of the longest entries, i.e. the same element a
    # stable descending-length sort would place at index 0.
    return max(matched, key=len, default=None)

def parse_weather(html: str) -> pd.DataFrame:
    """Extract temperature / humidity / condition rows from rendered HTML.

    Every ``article``, ``li``, ``section`` and ``div`` element is flattened to
    its text and scanned with TEMP_RE and HUM_RE. An element contributes a row
    only when at least one of the two patterns matches, so elements carrying
    neither a temperature nor a humidity reading are skipped.

    Args:
        html: Page source to parse.

    Returns:
        A de-duplicated DataFrame with the columns ``temp`` (the mean of the
        first two temperatures found, or the only one), ``humidity`` (percent)
        and ``condition`` (keyword from ``best_condition`` or ``"Unknown"``).
        The columns are present even when no row was extracted.

    Note:
        The selector also matches nested elements, so a container and its
        children can each yield a row; only exact duplicates are removed.
    """
    columns = ["temp", "humidity", "condition"]
    soup = BeautifulSoup(html, "lxml")
    rows = []
    # AccuWeather markup changes, so scan generic containers rather than
    # relying on specific class names.
    for card in soup.select("article, li, section, div"):
        text = " ".join(card.stripped_strings)

        # Filter with the same patterns used for extraction, so the filter and
        # the parsed values can never disagree.
        temps = [int(t) for t in TEMP_RE.findall(text)[:2]]
        hum_match = HUM_RE.search(text)
        if not temps and hum_match is None:
            continue

        if len(temps) == 2:
            temp_val = round((temps[0] + temps[1]) / 2, 1)  # average hi/lo
        else:
            temp_val = temps[0] if temps else None

        rows.append({
            "temp": temp_val,
            "humidity": int(hum_match.group(1)) if hum_match else None,
            "condition": best_condition(text) or "Unknown",
        })

    df = pd.DataFrame(rows, columns=columns).drop_duplicates().reset_index(drop=True)
    if not df.empty:
        # Missing readings were stored as None; coerce both columns to numeric.
        df["temp"] = pd.to_numeric(df["temp"], errors="coerce")
        df["humidity"] = pd.to_numeric(df["humidity"], errors="coerce")
    return df


In [None]:

# 🚗 Selenium setup for Colab (Chromium) AND Local VSCode (Chrome via webdriver-manager)

import sys
# Same Colab probe as in the install cell; defining IN_COLAB here lets this
# cell run without relying on that cell's variables.
try:
    import google.colab  # type: ignore
except Exception:
    IN_COLAB = False
else:
    IN_COLAB = True

def create_driver(headless=True):
    """Build a Chrome/Chromium WebDriver for the current environment.

    Args:
        headless: When truthy, start the browser with ``--headless=new``.

    Returns:
        A ``webdriver.Chrome`` instance. In Colab it uses the Chromium binary
        and chromedriver found on disk; elsewhere the driver path comes from
        webdriver-manager.

    Raises:
        FileNotFoundError: In Colab, when no chromedriver exists at any of the
            expected locations.
    """
    user_agent = (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124 Safari/537.36"
    )
    flags = ["--headless=new"] if headless else []
    flags += [
        "--no-sandbox",
        "--disable-dev-shm-usage",
        "--window-size=1400,1200",
        f"--user-agent={user_agent}",
    ]

    options = Options()
    for flag in flags:
        options.add_argument(flag)

    if not IN_COLAB:
        # Local VSCode: webdriver-manager provides a driver for the installed Chrome.
        from webdriver_manager.chrome import ChromeDriverManager
        service = Service(ChromeDriverManager().install())
    else:
        # Colab: use the Chromium binary and the first chromedriver present on disk.
        options.binary_location = "/usr/bin/chromium-browser"
        driver_path = None
        for candidate in ("/usr/lib/chromium-browser/chromedriver", "/usr/bin/chromedriver"):
            if os.path.exists(candidate):
                driver_path = candidate
                break
        if driver_path is None:
            raise FileNotFoundError("Chromedriver not found in Colab. Re-run the install cell.")
        service = Service(driver_path)

    return webdriver.Chrome(service=service, options=options)


In [None]:

# 🌐 Load page, wait for JS content, collect HTML
from selenium.common.exceptions import TimeoutException

# Wait for forecast-like containers; layout can vary
CANDIDATES = [
    "article",
    "[data-qa='daily']",
    "[class*='DailyForecast']"
]

driver = create_driver(headless=True)
try:
    driver.get(URL)

    # Stop at the first selector that shows up; a timeout just means this
    # layout variant is absent, so move on to the next candidate.
    for sel in CANDIDATES:
        try:
            WebDriverWait(driver, 15).until(
                EC.presence_of_all_elements_located((By.CSS_SELECTOR, sel))
            )
        except TimeoutException:
            continue
        break

    # Small scroll to trigger lazy content
    for _ in range(2):
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(1.2)

    html = driver.page_source
finally:
    # Always close the browser, even when loading or waiting fails, so no
    # Chrome process is left running.
    driver.quit()

# 🧩 Parse
df = parse_weather(html)
display(df.head(20) if not df.empty else df)


In [None]:

# 📊 Analysis: average temperature & most common condition
if not df.empty:
    # Ignore missing readings; report N/A when a column has no values at all.
    temps = df["temp"].dropna()
    avg_temp = temps.mean() if not temps.empty else None
    conditions = df["condition"].dropna()
    most_common_condition = conditions.value_counts().idxmax() if not conditions.empty else None

    print("\n=== Analysis ===")
    # \u00b0 is the degree sign; written as an escape so the output stays
    # correct regardless of how this file is encoded.
    print(f"Average Temperature: {avg_temp:.1f}\u00b0" if avg_temp is not None else "Average Temperature: N/A")
    print(f"Most Common Condition: {most_common_condition or 'N/A'}")

    # 💾 Save results
    out_csv = "accuweather_attica_forecast.csv"
    df.to_csv(out_csv, index=False)
    # \u2192 is a rightwards arrow.
    print(f"Saved \u2192 {out_csv}")
else:
    print("No data extracted. Try re-running after a few seconds or adjust selectors.")
