In [7]:
!pip -q install requests beautifulsoup4 pandas networkx matplotlib lxml html5lib

In [10]:
import re, requests
from bs4 import BeautifulSoup
import pandas as pd
from urllib.parse import urlparse

GSW_PLAYERS = {
    "Stephen Curry","Klay Thompson","Kevin Durant","Draymond Green","Zaza Pachulia",
    "Andre Iguodala","Shaun Livingston","David West","JaVale McGee","Jordan Bell",
    "Kevon Looney","Omri Casspi","Nick Young","Patrick McCaw","Quinn Cook"
}

def box_to_pbp_url(bbr_box_url: str) -> str:
    return bbr_box_url.replace("/boxscores/","/boxscores/pbp/")

ASSIST_RE = re.compile(r"\(assist by ([^)]+)\)", re.IGNORECASE)
MAKES_RE = re.compile(r"^(.*?)\s+makes\s+", re.IGNORECASE)

def parse_pbp_pairs(pbp_url: str, gsw_players: set):
    """Retorna lista de (assistant, finisher) para o GSW a partir do BBRef PBP."""
    html = requests.get(pbp_url, timeout=30).text
    soup = BeautifulSoup(html, "lxml")


    desc_cells = soup.select("td[class^='font-xxs'] , td[class*='pbp'] , td[class*='descr'] , td:nth-of-type(6)")
    if not desc_cells:
        desc_cells = soup.find_all("td")

    pairs = []
    for td in desc_cells:
        text = " ".join(td.get_text(" ").split())
        if "assist by" in text.lower() and " makes " in text.lower():
            m_a = ASSIST_RE.search(text)
            m_f = MAKES_RE.search(text)
            if m_a and m_f:
                assistant = m_a.group(1).strip()
                finisher = m_f.group(1).strip()
                if finisher in gsw_players:
                    pairs.append((assistant, finisher))
    return pairs


In [12]:
import time, re, requests
from urllib.parse import urlparse
from bs4 import BeautifulSoup
import pandas as pd
from collections import Counter
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

links = pd.read_csv("/content/gsw10_games_links_2017_18.csv")

GSW_PLAYERS = {
    "Stephen Curry","Klay Thompson","Kevin Durant","Draymond Green","Zaza Pachulia",
    "Andre Iguodala","Shaun Livingston","David West","JaVale McGee","Jordan Bell",
    "Kevon Looney","Omri Casspi","Nick Young","Patrick McCaw","Quinn Cook"
}

def box_to_pbp_bbref(bbr_box_url: str) -> str:
    return bbr_box_url.replace("/boxscores/","/boxscores/pbp/")

def game_to_pbp_espn(espn_game_url: str) -> str:
    gid = espn_game_url.split("gameId/")[-1].split("/")[0]
    return f"https://www.espn.com/nba/playbyplay/_/gameId/{gid}"

def make_session():
    s = requests.Session()
    retries = Retry(
        total=5, connect=5, read=5,
        backoff_factor=1.5,
        status_forcelist=[429, 500, 502, 503, 504],
        allowed_methods=["GET"]
    )
    s.headers.update({"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) Colab scraper (academic)"})
    s.mount("https://", HTTPAdapter(max_retries=retries))
    s.mount("http://", HTTPAdapter(max_retries=retries))
    return s

session = make_session()

ASSIST_BY = re.compile(r"\(assist by ([^)]+)\)", re.IGNORECASE)
MAKES    = re.compile(r"^(.*?)\s+makes\s+", re.IGNORECASE)

def parse_bbref_pairs(html: str, gsw_players: set):
    soup = BeautifulSoup(html, "lxml")
    desc_tds = soup.find_all("td")
    pairs = []
    for td in desc_tds:
        text = " ".join(td.get_text(" ").split())
        if "assist by" in text.lower() and " makes " in text.lower():
            m_a = ASSIST_BY.search(text)
            m_f = MAKES.search(text)
            if m_a and m_f:
                assistant = m_a.group(1).strip()
                finisher  = m_f.group(1).strip()
                if finisher in gsw_players:
                    pairs.append((assistant, finisher))
    return pairs

ESP_ASSIST_BY   = re.compile(r"\(assist by ([^)]+)\)", re.IGNORECASE)
ESP_NAME_ASSIST = re.compile(r"\(([^)]+) assists\)", re.IGNORECASE)

def parse_espn_pairs(html: str, gsw_players: set):
    soup = BeautifulSoup(html, "lxml")
    pairs = []
    for tr in soup.find_all("tr"):
        text = " ".join(tr.get_text(" ").split())
        if " makes " in text.lower() and ("assist" in text.lower()):
            m_f = MAKES.search(text)
            finisher = m_f.group(1).strip() if m_f else None
            m1 = ESP_ASSIST_BY.search(text)
            m2 = ESP_NAME_ASSIST.search(text) if not m1 else None
            assistant = (m1.group(1).strip() if m1 else (m2.group(1).strip() if m2 else None))
            if finisher and assistant and finisher in gsw_players:
                pairs.append((assistant, finisher))
    return pairs

all_rows = []
failures = []

for _, row in links.iterrows():
    date = row["date"]; opp = row["opp"]
    bbr_box = str(row["bbr_box"]).strip()
    espn_game = str(row["espn_game"]).strip()

    bbref_pbp = box_to_pbp_bbref(bbr_box) if bbr_box else None
    espn_pbp  = game_to_pbp_espn(espn_game) if espn_game else None

    counts = Counter()

    ok = False
    try:
        if bbref_pbp:
            r = session.get(bbref_pbp, timeout=60)
            r.raise_for_status()
            pairs = parse_bbref_pairs(r.text, GSW_PLAYERS)
            if pairs:
                for a,b in pairs: counts[(a,b)] += 1
                ok = True
    except Exception as e:
        print(f"[BBRef timeout/falha] {date} {opp}: {e}")

    if not ok and espn_pbp:
        try:
            r = session.get(espn_pbp, timeout=60)
            r.raise_for_status()
            pairs = parse_espn_pairs(r.text, GSW_PLAYERS)
            if pairs:
                for a,b in pairs: counts[(a,b)] += 1
                ok = True
        except Exception as e:
            print(f"[ESPN fallback falhou] {date} {opp}: {e}")

    out_rows = [[date, opp, "GSW", a, b, c] for (a,b), c in counts.items()]
    df_game = pd.DataFrame(out_rows, columns=["date","opponent","team","assistant","finisher","assists"])
    df_game.to_csv(f"/content/pairs_{date}_{opp}.csv", index=False)
    all_rows.extend(out_rows)

    print(f"[{date} {opp}] pares extraídos: {len(out_rows)}")
    time.sleep(2.0)

agg = pd.DataFrame(all_rows, columns=["date","opponent","team","assistant","finisher","assists"])
agg.to_csv("/content/out_pairs_10games.csv", index=False)

summary = agg.groupby(["date","opponent"]).agg(
    total_assists=("assists","sum"),
    unique_pairs=("assists","count"),
    gsw_players=("finisher", lambda s: len(set(s)))
).reset_index()
summary.to_csv("/content/summary_by_game.csv", index=False)

print("Concluído. Arquivos gerados:")
print("/content/out_pairs_10games.csv  (agregado)")
print("/content/summary_by_game.csv    (resumo por jogo)")



[2017-10-17 HOU] pares extraídos: 0
[2017-10-25 TOR] pares extraídos: 0
[2017-11-02 SAS] pares extraídos: 0
[2017-11-16 BOS] pares extraídos: 0
[2017-12-25 CLE] pares extraídos: 0
[2018-01-04 HOU] pares extraídos: 0
[2018-01-15 CLE] pares extraídos: 0
[2018-01-27 BOS] pares extraídos: 0
[2018-02-10 SAS] pares extraídos: 0
[2018-02-14 POR] pares extraídos: 0
Concluído. Arquivos gerados:
/content/out_pairs_10games.csv  (agregado)
/content/summary_by_game.csv    (resumo por jogo)
