# Day20
## 網頁結構解析：使用 lxml 套件操作 XPath
- 使用 lxml.html
- 使用 XPath 語法獲取子節點

## 作業說明
由於 Day18 作業我們已經練習過一些定位工具，今天使用和 Day19 一樣的網站，針對 XPath 更多變化用法再深入練習吧。

- 題目網站：
https://pokemondb.net/pokedex/all
- 使用 XPath 技巧把寶可夢表格抓下來

In [1]:
import lxml.html
import requests

### `GET` Request

In [2]:
url = "https://pokemondb.net/pokedex/all"
headers = {
   "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Safari/537.36",
   "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
   "Accept-Encoding": "gzip, deflate, br",
   "Accept-Language": "zh-TW,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6",
   "Cache-Control": "max-age=0",
   "Upgrade-Insecure-Requests": "1",
   "Referer": "https://pokemondb.net/",
   "X-Frame-Options": "SAMEORIGIN",
   "X-Content-Type-Options": "nosniff",
   "X-XSS-Protection": "1; mode=block"
}

r = requests.get(url, headers=headers)
r.encoding = 'utf-8'

print(r.status_code, "\n\n", r.text[:1000])


200 

 <!DOCTYPE html>
<html lang="en">
<head>
	<meta charset="utf-8">
	<title>Pokémon Pokédex: list of Pokémon with stats | Pokémon Database</title>

	<link rel="preconnect" href="https://img.pokemondb.net">
	<link rel="preconnect" href="https://s.pokemondb.net">
	<link rel="preload" href="/static/fonts/fira-sans-v17-latin-400.woff2" as="font" type="font/woff2" crossorigin>
	<link rel="preload" href="/static/fonts/fira-sans-v17-latin-400i.woff2" as="font" type="font/woff2" crossorigin>
	<link rel="preload" href="/static/fonts/fira-sans-v17-latin-600.woff2" as="font" type="font/woff2" crossorigin>
		<link rel="stylesheet" href="/static/css/pokemondb-9acb309c4f.css">

	<meta name="viewport" content="width=device-width, initial-scale=1">

	<meta property="og:description" name="description" content="The Pokédex contains detailed stats for every creature from the Pokémon games, up to and including the latest Scarlet/Violet games.">
	<link rel="canonical" href="https://pokemondb.net/pokedex

### 轉為 HTML Element 物件
- 使用 `lxml.html.fromstring()`

In [5]:
# 轉為 Element 物件
tree = lxml.html.fromstring(r.content)
tree

<Element html at 0x28fb65bfa70>

### 指定相符特徵的節點
- 找到寶可夢資訊表格
- 使用：`tree.xpath('//<tag_name>[@<attribute>=<attribute_value>]')`


In [6]:
# Hint: 
# table = tree.xpath('//table[@id="_____"]')[0]
table = tree.xpath('//table[@id="pokedex"]')[0]
table

<Element table at 0x28fb65bf9d0>

### 連續查找
- 取得所有表格中的列

In [44]:
# Hint: 
# header = table.xpath("./_____")[0].xpath(".//__")
# body_rows = table.xpath("./_____")[0].xpath(".//__")

header = table.xpath("./thead")[0].xpath(".//th")
body_rows = table.xpath("./tbody")[0].xpath(".//tr")

In [45]:
header

[<Element th at 0x28fb681b700>,
 <Element th at 0x28fb6818190>,
 <Element th at 0x28fb681b520>,
 <Element th at 0x28fb681b250>,
 <Element th at 0x28fb681a490>,
 <Element th at 0x28fb681b930>,
 <Element th at 0x28fb681a760>,
 <Element th at 0x28fb681b430>,
 <Element th at 0x28fb681bde0>,
 <Element th at 0x28fb681bac0>]

In [46]:
body_rows

[<Element tr at 0x28fbadc3c50>,
 <Element tr at 0x28fbadc28a0>,
 <Element tr at 0x28fbadc3ca0>,
 <Element tr at 0x28fbadc3e80>,
 <Element tr at 0x28fbadc2e40>,
 <Element tr at 0x28fbadc3390>,
 <Element tr at 0x28fbadc3c00>,
 <Element tr at 0x28fbadc35c0>,
 <Element tr at 0x28fbadc3b10>,
 <Element tr at 0x28fbadc2080>,
 <Element tr at 0x28fbadc1950>,
 <Element tr at 0x28fbadc1d10>,
 <Element tr at 0x28fbadc1d60>,
 <Element tr at 0x28fbadc1ea0>,
 <Element tr at 0x28fbadc1e50>,
 <Element tr at 0x28fbadc18b0>,
 <Element tr at 0x28fbadc19a0>,
 <Element tr at 0x28fbadc1220>,
 <Element tr at 0x28fbadc1270>,
 <Element tr at 0x28fbadc12c0>,
 <Element tr at 0x28fbadc1a90>,
 <Element tr at 0x28fbadc1cc0>,
 <Element tr at 0x28fbadc1860>,
 <Element tr at 0x28fbadc11d0>,
 <Element tr at 0x28fbadc1ae0>,
 <Element tr at 0x28fbadc0690>,
 <Element tr at 0x28fbadc0410>,
 <Element tr at 0x28fbadc09b0>,
 <Element tr at 0x28fbadc0a50>,
 <Element tr at 0x28fbadc0dc0>,
 <Element tr at 0x28fbadc0e60>,
 <Elemen

### 指定節點文字相符：找出文字是 Ivysaur 的節點
- Hint: 使用 `tree.xpath('//<tag_name>[text()="some_string"]')`

In [47]:
tree.xpath('//a[text()="Ivysaur"]')[0].text

'Ivysaur'

### 找出屬性包含部分文字的節點：找出各種類型的寶可夢種類標籤

- 包含： `tree.xpath('//<tag_name>[contains(<attribute>, <attribute_value>)]')`
- 不包含： `tree.xpath('//<tag_name>[not(contains(<attribute>, <attribute_value>))]')`

In [48]:
# 找出各種類型的寶可夢種類標籤(GRASS, POISON, ...)，用 set 過濾出不重複種類有哪幾種

# Hint: set(tree.xpath('//a[________(@_____, "type-")]/text()'))

set(tree.xpath('//a[contains(@class, "type-")]/text()'))

{'Bug',
 'Dark',
 'Dragon',
 'Electric',
 'Fairy',
 'Fighting',
 'Fire',
 'Flying',
 'Ghost',
 'Grass',
 'Ground',
 'Ice',
 'Normal',
 'Poison',
 'Psychic',
 'Rock',
 'Steel',
 'Water'}

### 將資訊組成表格

In [49]:
# Hint: 
# header_cols = [col.xpath(".//____()")[0] for col in header]
# row_values = [["".join(col.xpath('.//____()')) for col in row.xpath('.//__')] for row in body_rows]


header_cols = [col.xpath(".//text()")[0] for col in header]
row_values = [["".join(col.xpath('.//text()')) for col in row.xpath('.//td')] for row in body_rows]


In [50]:
row_values[0]

['\n\t\n\t\n0001',
 'Bulbasaur',
 'Grass Poison',
 '318',
 '45',
 '49',
 '49',
 '65',
 '65',
 '45']

In [51]:
import pandas as pd
df = pd.DataFrame(row_values, columns=header_cols)
df["Type"] = df["Type"].apply(lambda x: x.strip().split(" "))
df

Unnamed: 0,#,Name,Type,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed
0,\n\t\n\t\n0001,Bulbasaur,"[Grass, Poison]",318,45,49,49,65,65,45
1,\n\t\n\t\n0002,Ivysaur,"[Grass, Poison]",405,60,62,63,80,80,60
2,\n\t\n\t\n0003,Venusaur,"[Grass, Poison]",525,80,82,83,100,100,80
3,\n\t\n\t\n0003,Venusaur Mega Venusaur,"[Grass, Poison]",625,80,100,123,122,120,80
4,\n\t\n\t\n0004,Charmander,[Fire],309,39,52,43,60,50,65
...,...,...,...,...,...,...,...,...,...,...
1210,\n\t\n\t\n1023,Iron Crown,"[Steel, Psychic]",590,90,72,100,122,108,98
1211,\n\t\n\t\n1024,Terapagos Normal Form,[Normal],450,90,65,85,65,85,60
1212,\n\t\n\t\n1024,Terapagos Terastal Form,[Normal],600,95,95,110,105,110,85
1213,\n\t\n\t\n1024,Terapagos Stellar Form,[Normal],700,160,105,110,130,110,85


In [52]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1215 entries, 0 to 1214
Data columns (total 10 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   #        1215 non-null   object
 1   Name     1215 non-null   object
 2   Type     1215 non-null   object
 3   Total    1215 non-null   object
 4   HP       1215 non-null   object
 5   Attack   1215 non-null   object
 6   Defense  1215 non-null   object
 7   Sp. Atk  1215 non-null   object
 8   Sp. Def  1215 non-null   object
 9   Speed    1215 non-null   object
dtypes: object(10)
memory usage: 95.1+ KB
