## 服务

- fastapi, flask
- 本地服务
- 云服务
- postman测试
- requests测试

In [1]:
import requests

# Base URL of the FastAPI application under test.
# The request helpers in this cell build their endpoint URLs from this value.
base_url = "http://localhost:8000"

# Test the GET method
def test_get_example():
    """Call GET /api/get_example with ``name=Alice`` and print the JSON reply.

    Raises:
        requests.exceptions.Timeout: the service did not answer within 10 s.
        requests.exceptions.HTTPError: the service answered with a 4xx/5xx status.
    """
    # requests has no default timeout; without one this call blocks forever
    # when the local service is not running.
    response = requests.get(
        f"{base_url}/api/get_example",
        params={"name": "Alice"},
        timeout=10,
    )
    # Fail loudly on an error status instead of printing an error payload as
    # if it were a successful reply.
    response.raise_for_status()
    print("GET Response:", response.json())

# Test the POST method
def test_post_example():
    """Call POST /api/post_example with JSON body ``{"name": "Bob"}`` and print the JSON reply.

    Raises:
        requests.exceptions.Timeout: the service did not answer within 10 s.
        requests.exceptions.HTTPError: the service answered with a 4xx/5xx status.
    """
    # requests has no default timeout; without one this call blocks forever
    # when the local service is not running.
    response = requests.post(
        f"{base_url}/api/post_example",
        json={"name": "Bob"},
        timeout=10,
    )
    # Fail loudly on an error status instead of printing an error payload as
    # if it were a successful reply.
    response.raise_for_status()
    print("POST Response:", response.json())


# Run both request helpers; each one sends a request to base_url and prints the reply.
test_get_example()
test_post_example()

GET Response: {'message': 'Hello, Alice!'}
POST Response: {'message': 'Hello, Bob!'}


## HTML

### xpath

In [5]:
from lxml import etree

# Load the sample page elements.html into memory as UTF-8 text
with open('elements.html', mode='r', encoding='utf-8') as file:
    html_content = file.read()

# Turn the HTML text into an lxml element tree that can be queried with XPath
html_tree = etree.HTML(html_content)



In [4]:
# Echo html_content so the notebook shows the string as this cell's result.
html_content

'<!DOCTYPE html>\n<html lang="zh-CN">\n<head>\n    <meta charset="UTF-8">\n    <meta name="viewport" content="width=device-width, initial-scale=1.0">\n    <title>HTML元素综合示例</title>\n    <style>\n        body { font-family: Arial, sans-serif; }\n        code, pre { background-color: #f4f4f4; padding: 1px; }\n        table, th, td { border: 1px solid black; border-collapse: collapse; padding: 8px; }\n        img { max-width: 100%; height: auto; }\n    </style>\n</head>\n<body>\n\n<h1>HTML元素综合示例页面</h1>\n\n<h2>1. 标题级别演示</h2>\n<p>本节展示不同级别的标题。</p>\n<h3>1.1 三级标题</h3>\n<h4>1.1.1 四级标题</h4>\n\n<h2>2. 表格示例</h2>\n<table>\n    <thead>\n        <tr>\n            <th>列1</th>\n            <th>列2</th>\n            <th>列3</th>\n        </tr>\n    </thead>\n    <tbody>\n        <tr>\n            <td>数据1</td>\n            <td>数据2</td>\n            <td>数据3</td>\n        </tr>\n        <tr>\n            <td>更多数据1</td>\n            <td>更多数据2</td>\n            <td>更多数据3</td>\n        </tr>\n    </tbody>\n</ta

In [3]:
# Show the Python type of html_content as this cell's result.
type(html_content)

str

### header

In [6]:
# Example: select every heading element (h1 through h6) via an XPath union,
# then print each element's tag name next to its text.
titles = html_tree.xpath('//h1 | //h2 | //h3 | //h4 | //h5 | //h6')
for title in titles:
    print("{}: {}".format(title.tag, title.text))

h1: HTML元素综合示例页面
h2: 1. 标题级别演示
h3: 1.1 三级标题
h4: 1.1.1 四级标题
h2: 2. 表格示例
h2: 3. 代码块与行内代码
h2: 4. 列表示例
h3: 4.1 无序列表
h3: 4.2 有序列表
h2: 5. 链接与URL
h2: 6. 图像示例


### table

In [29]:
print("=" * 20)
# Example: collect every <td> cell located anywhere below a <table>
# and print the text of each one.
table_data = html_tree.xpath('//table//td')
for data in table_data:
    print("Table Data:", data.text)

Table Data: 数据1
Table Data: 数据2
Table Data: 数据3
Table Data: 更多数据1
Table Data: 更多数据2
Table Data: 更多数据3


### url

In [30]:
print("=" * 20)
# Example: pull the href attribute of every <a> element and print each value.
links = html_tree.xpath('//a/@href')
for link in links:
    print("Link: {}".format(link))

Link: https://www.baidu.com
Link: https://www.baidu.com


### Unordered List Item

In [31]:
print("="*20)
# Example: print every <li> that is a direct child of a <ul>
# (the query also matches the <li> items of nested <ul> lists).
unordered_list_items = html_tree.xpath('//ul/li')
for item in unordered_list_items:
    # Element.text is only the text that precedes the first child element.
    # For an <li> that contains a nested <ul> it therefore ends with the
    # newline and indentation in front of that child, and it is None when the
    # <li> starts directly with a child element. Normalise both cases so each
    # item prints as one clean line.
    print(f"Unordered List Item: {(item.text or '').strip()}")

Unordered List Item: 项目一
        
Unordered List Item: 子项目1.1
Unordered List Item: 子项目1.2
Unordered List Item: 项目二
        
Unordered List Item: 子项目2.1
Unordered List Item: 子项目2.2
Unordered List Item: 项目三
        
Unordered List Item: 子项目3.1
Unordered List Item: 子项目3.2


### Ordered List Item

In [32]:
print("=" * 20)
# Example: gather every <li> that is a direct child of an <ol>
# and print the text of each one.
ordered_list_items = html_tree.xpath('//ol/li')
for item in ordered_list_items:
    print("Ordered List Item:", item.text)

Ordered List Item: 第一步
Ordered List Item: 第二步
Ordered List Item: 第三步


### Image

In [33]:
print("=" * 20)
# Example: read the src attribute of every <img> element and print each value.
images = html_tree.xpath('//img/@src')
for img in images:
    print("Image Source: {}".format(img))


Image Source: https://p1.itc.cn/images01/20230621/23acf071a82b437fa8a7bc738e4c69d8.jpeg


## HTTP

### 请求的示例

In [7]:
import requests

# Build the search request: endpoint URL plus its query parameters
search_url = 'https://www.baidu.com/s'
params = {
    'wd': 'Python爬虫'  # search keyword
}

# Send the GET request. requests appends params to the URL as a query string.
# requests has no default timeout, so set one to keep the cell from hanging
# indefinitely when the server does not respond.
response = requests.get(search_url, params=params, timeout=10)

# Print the final URL that was requested (query string included)
print('请求URL:', response.url)

请求URL: https://www.baidu.com/s?wd=Python%E7%88%AC%E8%99%AB


### 浏览器的开发者工具（Developer Tools）

按 F12（部分笔记本键盘需要按 Fn+F12）打开浏览器的开发者工具。

刷新页面后，找到关键请求的 URL：
![image.png](attachment:image.png)

### unquote

将URL编码的字符串解码为普通字符串，然后再将其编码回URL编码格式

In [8]:
from urllib.parse import unquote, quote

# Percent-encoded sample string.
# Other samples that can be swapped in, with their decoded text:
#   "%E5%A4%A7%E6%A8%A1%E5%9E%8B"              -> 大模型
#   "%E9%AD%94%E7%AB%A5%E9%99%8D%E4%B8%96"      -> 魔童降世
#   "http%E8%AF%B7%E6%B1%82%E4%BB%8B%E7%BB%8D"  -> http请求介绍
url_encoded_str = "Python%E7%88%AC%E8%99%AB"

# Decode the percent-escapes back into readable text
decoded_str = unquote(url_encoded_str)
print(decoded_str)  # prints: Python爬虫

# Encode the readable text back into percent-encoded form
encoded_str = quote(decoded_str)
print(encoded_str)  # prints: Python%E7%88%AC%E8%99%AB

# Sanity check for the round trip this cell demonstrates: decoding the
# re-encoded string must give back the decoded text.
assert unquote(encoded_str) == decoded_str

Python爬虫
Python%E7%88%AC%E8%99%AB


### 响应的示例

In [9]:
import requests
from urllib.parse import quote

# Keyword to search for
search_keyword = "魔童降世"

# Percent-encode the keyword so it can be placed in the query string
encoded_keyword = quote(search_keyword)

# Baidu search URL, using the plain HTTP scheme
url = f"http://www.baidu.com/s?wd={encoded_keyword}"

# Send the GET request. requests has no default timeout, so set one to keep
# the cell from hanging indefinitely when the server does not respond.
response = requests.get(url, timeout=10)

# Check whether the request succeeded
if response.status_code == 200:
    # Keep the page body as text.
    # NOTE: this rebinds html_content, which an earlier cell filled from
    # elements.html; re-run that cell before re-displaying the local page.
    html_content = response.text
    print(html_content)
else:
    print(f"请求失败，状态码：{response.status_code}")

<!DOCTYPE html>
<!--STATUS OK-->



























































	







    


    
    
    




<html class="">
	<head>
		
		<meta http-equiv="X-UA-Compatible" content="IE=edge,chrome=1">
		<meta http-equiv="content-type" content="text/html;charset=utf-8">
							<meta content="always" name="referrer">
		        <meta name="theme-color" content="#ffffff">
        <link rel="shortcut icon" href="https://www.baidu.com/favicon.ico" type="image/x-icon" />
        <link rel="icon" sizes="any" mask href="https://www.baidu.com/favicon.ico">
        <link rel="search" type="application/opensearchdescription+xml" href="/content-search.xml" title="百度搜索" />
	  	<link rel="stylesheet" data-for="result" href="https://pss.bdstatic.com/r/www/static/font/cosmic/pc/cos-icon_8bae49a.css"/>
		<link rel="apple-touch-icon-precomposed" href="https://psstatic.cdn.bcebos.com/video/wiseindex/aa6eef91f8b5b1a33b454c401_1660835115000.png">
		
		
	<title>魔童降世_百度搜索</title>

		

		
	<s

In [10]:
# Echo the percent-encoded keyword as this cell's result.
encoded_keyword

'%E9%AD%94%E7%AB%A5%E9%99%8D%E4%B8%96'

In [11]:
# Status code of the response
print('响应状态码:', response.status_code)

# Every response header, one "name: value" pair per line
print('响应头部:')
for key, value in response.headers.items():
    print('{}: {}'.format(key, value))

# Beginning of the response body (first 200 characters)
print('响应内容（前200个字符）:')
print(response.text[:200])

响应状态码: 200
响应头部:
Bdpagetype: 3
Bdqid: 0xa1f6267f00eabd1d
Cache-Control: private
Ckpacknum: 2
Ckrndstr: f00eabd1d
Connection: keep-alive
Content-Encoding: gzip
Content-Security-Policy: frame-ancestors 'self' https://chat.baidu.com http://mirror-chat.baidu.com https://fj-chat.baidu.com https://hba-chat.baidu.com https://hbe-chat.baidu.com https://njjs-chat.baidu.com https://nj-chat.baidu.com https://hna-chat.baidu.com https://hnb-chat.baidu.com http://debug.baidu-int.com;
Content-Type: text/html;charset=utf-8
Date: Sun, 02 Mar 2025 08:20:21 GMT
P3p: CP=" OTI DSP COR IVA OUR IND COM ", CP=" OTI DSP COR IVA OUR IND COM "
Server: BWS/1.1
Set-Cookie: BAIDUID=0355AB27F6DE637FEF48F9701B0300CA:FG=1; expires=Thu, 31-Dec-37 23:55:55 GMT; max-age=2147483647; path=/; domain=.baidu.com, BIDUPSID=0355AB27F6DE637FEF48F9701B0300CA; expires=Thu, 31-Dec-37 23:55:55 GMT; max-age=2147483647; path=/; domain=.baidu.com, PSTM=1740903621; expires=Thu, 31-Dec-37 23:55:55 GMT; max-age=2147483647; path=/; domain=

## requests

### text

reference: https://book.douban.com/subject/26952485/

In [21]:
import requests
from lxml import etree

# URL of the target page
url = "https://book.douban.com/subject/26952485/reviews"

# Request headers: send a browser-style User-Agent
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/132.0.0.0 Safari/537.36 Edg/132.0.0.0'
}

# Fetch the page. requests has no default timeout, so set one to keep the
# cell from hanging indefinitely when the server does not respond.
response = requests.get(url, headers=headers, timeout=10)
response.encoding = 'utf-8'  # adjust to the page's actual encoding
if response.status_code != 200:
    # Report an error status: whatever gets parsed below is then an error
    # page, which would otherwise look like a mere XPath mismatch.
    print(f"请求失败，状态码：{response.status_code}")

# Parse the page text into an element tree
html = etree.HTML(response.text)

# Extract data with an XPath expression.
# Adjust the expression to the actual page structure.
xpath = '//*[@id="link-report-8460000"]/div[1]'
content = html.xpath(xpath)

# An XPath that matches nothing returns an empty list; say so explicitly
# instead of letting the cell finish with no output at all.
if not content:
    print(f"未匹配到任何节点，请检查XPath：{xpath}")

# Print each matched element serialised as markup
for item in content:
    print(etree.tostring(item, encoding='unicode', pretty_print=True).strip())

In [22]:
# Echo the list returned by the XPath query as this cell's result.
content

[]

In [20]:
# Select the <span> under the 5th <li> of the list inside the element with
# id="score". Anchoring on an id is used rather than an absolute
# /html/body/... path, which breaks whenever the page layout shifts.
# NOTE(review): `html` comes from another cell; confirm which page it was
# parsed from before relying on this expression.
xpath = '//*[@id="score"]/ul/li[5]/div/p/span'
content = html.xpath(xpath)

# An XPath that matches nothing returns an empty list; say so explicitly
# instead of letting the cell finish with no output at all.
if not content:
    print(f"未匹配到任何节点，请检查XPath：{xpath}")

# Print each matched element serialised as markup
for item in content:
    print(etree.tostring(item, encoding='unicode', pretty_print=True).strip())

<span class="short">看了两集的同名大热剧，感觉水准很不稳定，不太想追了，找来了这本原著小说，儿戏一般的过家家，勉勉强强看完，电视剧是不会再去浪费时间追了。</span>


### image

reference: https://it.sohu.com/a/687912449_121731195

In [23]:

import requests

# 1. Download a single image
# URL of the target image
url = "https://p0.itc.cn/images01/20230621/b3090aaf0ea742a683daea3b2ce61a1b.jpeg"

# Request headers: send a browser-style User-Agent
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
}

# Send the request. requests has no default timeout, so set one to keep the
# cell from hanging indefinitely when the server does not respond.
res = requests.get(url, headers=headers, timeout=10)

# Save the body only when the request succeeded
if res.status_code == 200:
    # Binary mode: res.content is the raw bytes of the image
    with open('机器狗1.jpg', 'wb') as f:
        f.write(res.content)
    print("图片已成功保存到本地。")
else:
    print("请求失败，状态码：", res.status_code)

图片已成功保存到本地。


### mp3

reference: https://music.163.com/#/song?id=556863701

In [24]:
import requests

# 2. Download a single song
# URL of the target audio file.
# NOTE(review): the path carries a timestamp-like segment and an authSecret
# parameter, so this looks like a time-limited link that must be re-copied
# from the browser once it expires - confirm.
# NOTE(review): the URL points at an .m4a file while the file below is saved
# with an .mp3 extension - confirm that this is intended.
url = "https://m704.music.126.net/20250302171352/46a2a9a33d0ce076de5b720de398600c/jdyyaac/5509/030b/0658/bc5f0b804d4af613f30b1209a040a564.m4a?vuutv=PszbQ73LAVl//TbAOa7sb5gFcGMMU+tZMMHdiCNigmr7/E4dzn5RRT6QDyPylJRNvDjdQk762nyZvwJdK4wOp8gtTT7eD4z0jG8JODCTyNQ=&authSecret=00000195560b401a12cd0a3b194e57a0"
# Request headers: send a browser-style User-Agent
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
}

# Send the request. requests has no default timeout, so set one to keep the
# cell from hanging indefinitely when the server does not respond.
res2 = requests.get(url, headers=headers, timeout=10)

# Save the body only when the request succeeded
if res2.status_code == 200:
    # Binary mode: res2.content is the raw bytes of the audio file
    with open('网易云2.mp3', 'wb') as f:
        f.write(res2.content)
    print("音乐文件已成功保存到本地。")
else:
    print("请求失败，状态码：", res2.status_code)

音乐文件已成功保存到本地。


### mp4

reference: https://music.163.com/#/mv?id=10729090

In [25]:
import requests

# 3. Download a single music video
# URL of the target video file.
# NOTE(review): the wsSecret / wsTime parameters suggest a time-limited link
# that must be re-copied from the browser once it expires - confirm.
url = "https://vodkgeyttp8.vod.126.net/cloudmusic/5439/mv/52d8/2d759514199d2a93d5a632c2c9a1d42e.mp4?wsSecret=17fa347f84db9bf32aebee4e633dc2a4&wsTime=1740905452"

# Request headers: send a browser-style User-Agent
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
}

# Send the request.
# stream=True defers downloading the body, so the video is written to disk
# chunk by chunk instead of being held in memory as a whole.
# timeout keeps the cell from hanging indefinitely; requests has no default.
# The with-block releases the connection once the download is finished.
with requests.get(url, headers=headers, stream=True, timeout=10) as res3:
    # Save the body only when the request succeeded
    if res3.status_code == 200:
        with open('网易云2.mp4', 'wb') as f:
            # Copy the body in 1 MiB pieces
            for chunk in res3.iter_content(chunk_size=1024 * 1024):
                f.write(chunk)
        print("视频文件已成功保存到本地。")
    else:
        print("请求失败，状态码：", res3.status_code)

视频文件已成功保存到本地。
