In [None]:
"""
    WEB SCRAPING
"""

In [None]:
import csv
import random
import time
from pathlib import Path
from typing import List, Dict, Optional
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

In [None]:
# HTTP/S settings (target site, session, etc.)
# BASE_URL is the site root; it is also used to build absolute next-page links.
BASE_URL='https://quotes.toscrape.com'
# START_URL is the default first page for crawl_all().
START_URL=f"{BASE_URL}/"

In [None]:
# Shared HTTP session: fetch() issues every request through it, so the
# custom User-Agent header set here is sent with each request.
session=requests.Session()
session.headers.update({
    "user-agent":"Welcome to page"
})


In [None]:
# Tunables consumed by fetch().
DEFAULT_TIMEOUT=10  # passed as `timeout` to session.get (seconds)
MAX_RETRIES=3       # maximum number of attempts per URL
BACKOFF_BASE=1.5    # wait after attempt n is BACKOFF_BASE**n seconds plus random jitter


In [None]:
# Fetch a URL
def fetch(url:str)->Optional[requests.Response]:
    """Download ``url`` through the shared session, retrying on failure.

    Up to ``MAX_RETRIES`` attempts are made. An attempt fails when the
    response status is not 200 or when ``requests`` raises a
    ``RequestException``; each failure is reported with ``print``. Between
    attempts the function sleeps ``BACKOFF_BASE ** attempt`` seconds plus up
    to 0.5 s of random jitter.

    Args:
        url: Address to request.

    Returns:
        The HTTP 200 response, or ``None`` when every attempt failed.
    """
    for attempt in range(1, MAX_RETRIES + 1):
        try:
            response = session.get(url, timeout=DEFAULT_TIMEOUT)
            if response.status_code == 200:
                return response
            print(f"(UYARI) {url} -> HTTP {response.status_code}")
        except requests.RequestException as e:
            print(f"(HATA) {url} istek hatasi: {e}")

        # Back off only when another attempt will follow; sleeping after the
        # final failure would just delay returning None to the caller.
        if attempt < MAX_RETRIES:
            back_time = BACKOFF_BASE ** attempt + random.uniform(0, 0.5)
            time.sleep(back_time)
    print("PEsss!!")
    return None

In [None]:
# Parser
def quotes_parser(html:str)->List[Dict]:
    """Extract every quote found in a page of HTML.

    Each ``div.quote`` element yields one record. Missing text or author
    elements produce empty strings instead of raising.

    Args:
        html: Raw HTML of a quotes listing page.

    Returns:
        A list of dicts with the keys ``"text"``, ``"author"`` and ``"tags"``;
        ``"tags"`` holds all tag names of the quote joined by commas (empty
        string when the quote has no tags).
    """
    soup = BeautifulSoup(html, "html.parser")
    results = []

    for quote in soup.select("div.quote"):
        text = quote.select_one("span.text")
        author = quote.select_one("small.author")
        # select() returns *all* matching tag links; select_one() would return
        # only the first element, silently dropping the remaining tags.
        tags = quote.select("div.tags a.tag")

        results.append({
            "text": text.get_text(strip=True) if text else "",
            "author": author.get_text(strip=True) if author else "",
            "tags": ",".join(t.get_text(strip=True) for t in tags),
        })
    return results

In [72]:
# Quick check of fetch() + quotes_parser(): append the quotes of the first
# page to quote.txt as human-readable records.
res = fetch(BASE_URL)
if res is None:
    # fetch() returns None after exhausting its retries; there is nothing to parse.
    print("(HATA) ilk sayfa alinamadi, quote.txt yazilmadi")
else:
    result = quotes_parser(res.text)
    # Explicit UTF-8: the quotes contain non-ASCII typographic characters, and
    # the platform default encoding may not be able to represent them.
    with open("quote.txt", "a", encoding="utf-8") as f:
        for q in result:
            # The trailing newline keeps the separator from running into the next record.
            f.write(f'Text: {q["text"]}\nAuthor: {q["author"]}\nTags: {q["tags"]}\n{"-"*30}\n')


In [68]:
def find_next_page_url(html:str)->Optional[str]:
    """Return the absolute URL of the next page, if the page links to one.

    Looks for the pagination link ``li.next > a`` and resolves its ``href``
    against ``BASE_URL``.

    Args:
        html: Raw HTML of the current page.

    Returns:
        The absolute URL of the next page, or ``None`` when there is no
        "next" link or the link has no ``href``.
    """
    soup = BeautifulSoup(html, "html.parser")
    next_link = soup.select_one("li.next>a")
    href = next_link.get("href") if next_link else None
    if not href:
        return None
    # urljoin handles root-relative ("/page/2/"), relative ("page/2/") and
    # absolute hrefs; plain concatenation is only correct for the first form.
    return urljoin(BASE_URL, href)

In [69]:
def save_to_csv(rows:List[Dict],out_path:Path)->None:
    """Append ``rows`` to a UTF-8 CSV file, creating it (and its folder) if needed.

    The header row ``text,author,tags`` is written only when the file does not
    exist yet or exists but is empty, so repeated calls keep extending the
    same table.

    Args:
        rows: Records with the keys ``"text"``, ``"author"`` and ``"tags"``.
        out_path: Destination CSV file; missing parent directories are created.
    """
    out_path.parent.mkdir(parents=True, exist_ok=True)
    fieldnames = ["text", "author", "tags"]
    # An existing but empty file (pre-created or truncated) still needs a
    # header; checking existence alone would leave it headerless.
    write_header = not out_path.exists() or out_path.stat().st_size == 0

    # newline="" lets the csv module control line endings itself.
    with out_path.open("a", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        if write_header:
            writer.writeheader()
        writer.writerows(rows)

In [70]:
def crawl_all(start_url:str=START_URL,out_csv: str='./data/quotes.csv')->int:
    """Crawl the paginated quotes listing and append every quote to a CSV file.

    Starting at ``start_url``, each page is fetched, parsed and saved, then the
    "next" link is followed until no further page exists or a page cannot be
    fetched. Progress is reported with ``print``.

    Args:
        start_url: First page to crawl.
        out_csv: Path of the CSV file the quotes are appended to.

    Returns:
        The total number of quote records written during this crawl.
    """
    current_url = start_url
    csv_path = Path(out_csv)
    page_no = 1
    total = 0
    print("Veri kazima basladi!!...")
    while current_url:
        print(f"[GET] {page_no} crawling...")
        resp = fetch(current_url)
        if resp is None:
            # fetch() already exhausted its retries; give up on the crawl.
            print("Durdu!!")
            break
        quotes = quotes_parser(resp.text)
        if quotes:
            save_to_csv(quotes, csv_path)
            # Count records, not pages, so the total matches what was saved.
            total += len(quotes)
            print(f'[ok] {page_no} : {len(quotes)} kayir eklendi')
        else:
            print("Uyari burada bir sey yok!!")

        next_url = find_next_page_url(resp.text)
        if not next_url:
            print("Bitti")
            break

        # Politeness delay, applied only when another request will follow.
        time.sleep(random.uniform(0.8, 1.6))
        current_url = next_url
        page_no += 1
    return total

In [71]:
# Entry point: start the full crawl with the default start URL and CSV path.
# In a notebook kernel __name__ is "__main__" as well, so running this cell
# performs the crawl (see the recorded output below).
if __name__=="__main__":
    crawl_all()


Veri kazima basladi!!...
[GET] 1 crawling...
[ok] 1 : 10 kayir eklendi
[GET] 2 crawling...
[ok] 2 : 10 kayir eklendi
[GET] 3 crawling...
[ok] 3 : 10 kayir eklendi
[GET] 4 crawling...
[ok] 4 : 10 kayir eklendi
[GET] 5 crawling...
[ok] 5 : 10 kayir eklendi
[GET] 6 crawling...
[ok] 6 : 10 kayir eklendi
[GET] 7 crawling...
[ok] 7 : 10 kayir eklendi
[GET] 8 crawling...
[ok] 8 : 10 kayir eklendi
[GET] 9 crawling...
[ok] 9 : 10 kayir eklendi
[GET] 10 crawling...
[ok] 10 : 10 kayir eklendi
Bitti
