# 使用网络爬虫技术自动爬取网页内容

### 首先需要安装以下2个库：
  - pip install requests
  - pip install beautifulsoup4

### 目标1：爬取单个网页
https://www.zhouyi.cc/zhouyi/yijing64/4103.html

#### 确定当前工作目录，并测试输出文件（`RawData/raw_dataTest.txt-时间戳`）是否可以正常创建

In [12]:
import requests
from bs4 import BeautifulSoup
import os

# Make sure the 'RawData' output folder exists. exist_ok=True replaces the
# separate os.path.exists() check, which is racy (the folder could appear
# between the check and the create) and needs two calls instead of one.
os.makedirs('RawData', exist_ok=True)

# Build a timestamp (second resolution) that is appended to the file name
import datetime
timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
file_name_or_path = 'raw_dataTest.txt'
# Despite its name, output_dir is the path of the output *file*; the name is
# kept because the following cell opens this variable for writing.
# NOTE(review): the timestamp is placed after ".txt", so the resulting name
# does not end in ".txt". Left unchanged in case other code relies on it.
output_dir = f"RawData/{file_name_or_path}-{timestamp}"

# Show the current working directory, i.e. where 'RawData' is resolved from
print("Current working directory:", os.getcwd())

# Check that the output file can be opened for writing
try:
    with open(output_dir, 'w', encoding='utf-8') as file:
        print("File opened successfully for writing.")
        # Write a small test payload
        file.write("This is a test.")
except OSError as e:
    # open()/write() failures (permissions, missing path, disk errors) are
    # all OSError subclasses; anything else is a programming error and
    # should surface instead of being printed and ignored.
    print("Error occurred while trying to write to file:", e)


Current working directory: /root/transformershomework/Crawl_rawData
File opened successfully for writing.


#### 确定目标网站url地址，查看网页响应的编码格式

In [13]:
# Target page URL
url = 'https://www.zhouyi.cc/zhouyi/yijing64/4103.html'

# Send the HTTP request. requests applies no timeout by default, so pass one
# explicitly to keep the cell from hanging forever on a stalled server.
response = requests.get(url, timeout=10)

# Inspect the first bytes of the body to help identify the text encoding.
# b'\xef\xbb\xbf' is the UTF-8 byte order mark (BOM). Note that
# b'\x1f\x8b\x08' is the gzip magic number and says nothing about the text
# encoding.
print(response.content[:4])

# Inspect the response headers (Content-Type, Content-Encoding, ...)
print("Response headers:", response.headers)

# Inspect the raw response body bytes
print("Response content bytes:", response.content)

b'\xef\xbb\xbf<'
Response headers: {'Server': 'nginx', 'Date': 'Thu, 29 Feb 2024 10:09:23 GMT', 'Content-Type': 'text/html', 'Last-Modified': 'Thu, 30 Nov 2023 12:02:52 GMT', 'Transfer-Encoding': 'chunked', 'Connection': 'keep-alive', 'Vary': 'Accept-Encoding', 'ETag': 'W/"656879ec-be33"', 'Content-Encoding': 'gzip'}
Response content bytes: b'\xef\xbb\xbf<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">\n<html xmlns="http://www.w3.org/1999/xhtml">\n<head>\n<title>\xe5\x91\xa8\xe6\x98\x93\xe7\xac\xac1\xe5\x8d\xa6_\xe4\xb9\xbe\xe5\x8d\xa6(\xe4\xb9\xbe\xe4\xb8\xba\xe5\xa4\xa9)_\xe4\xb9\xbe\xe4\xb8\x8a\xe4\xb9\xbe\xe4\xb8\x8b_\xe6\x98\x93\xe5\xae\x89\xe5\xb1\x85\xe5\x90\x89\xe7\xa5\xa5\xe7\xbd\x91</title>\n<meta name="mobile-agent" content="format=xhtml; url=https://m.zhouyi.cc/zhouyi/yijing64/4103.html" />\n<link rel="canonical" href="https://m.zhouyi.cc/zhouyi/yijing64/4103.html" />\n<meta http-equiv="Content-Type" c

In [14]:
# Parse the HTML with BeautifulSoup, declaring the byte encoding as UTF-8
soup = BeautifulSoup(response.content, 'html.parser', from_encoding='utf-8')

# Select the elements to scrape. The class string is the selector found for
# the wanted section in the browser developer tools. find_all() is the
# current name of the deprecated findAll() alias.
paragraphs = soup.find_all('div', attrs={'class': 'gualist tleft f14 lh25'})

# Write the extracted text to the output file (the file is truncated first)
with open(output_dir, 'w', encoding='utf-8') as file:
    for paragraph in paragraphs:
        file.write(paragraph.text)
        file.write('\n')  # separate consecutive blocks with a newline

# Report the outcome so the run can be monitored. An empty match is reported
# explicitly instead of being announced as a successful write.
if paragraphs:
    print(f"Content from {url} has been written to the file.")
else:
    print(f"Warning: no matching content found at {url}; the output file is empty.")

Content from https://www.zhouyi.cc/zhouyi/yijing64/4103.html has been written to the file.


### 目标2：爬取多个网页，且每个网页爬取的位置相同

#### 目标网页：
 - https://www.zhouyi.cc/zhouyi/yijing64/4103.html 乾卦 周易第1卦
 - https://www.zhouyi.cc/zhouyi/yijing64/4105.html 坤卦 周易第2卦
 - https://www.zhouyi.cc/zhouyi/yijing64/4106.html 屯卦 周易第3卦
 - https://www.zhouyi.cc/zhouyi/yijing64/4107.html 蒙卦 周易第4卦
 - https://www.zhouyi.cc/zhouyi/yijing64/4108.html 需卦 周易第5卦
 - https://www.zhouyi.cc/zhouyi/yijing64/4109.html 讼卦 周易第6卦
 - https://www.zhouyi.cc/zhouyi/yijing64/4110.html 师卦 周易第7卦
 - https://www.zhouyi.cc/zhouyi/yijing64/4111.html 比卦 周易第8卦

In [15]:
import requests
from bs4 import BeautifulSoup
import os

# Pages to crawl; the same section is extracted from each of them
urls = [
    'https://www.zhouyi.cc/zhouyi/yijing64/4103.html',
    'https://www.zhouyi.cc/zhouyi/yijing64/4105.html',
    'https://www.zhouyi.cc/zhouyi/yijing64/4106.html',
    'https://www.zhouyi.cc/zhouyi/yijing64/4107.html',
    'https://www.zhouyi.cc/zhouyi/yijing64/4108.html',
    'https://www.zhouyi.cc/zhouyi/yijing64/4109.html',
    'https://www.zhouyi.cc/zhouyi/yijing64/4110.html',
    'https://www.zhouyi.cc/zhouyi/yijing64/4111.html',
]

# Make sure the 'RawData' output folder exists (no separate exists() check,
# which would be racy)
os.makedirs('RawData', exist_ok=True)

# Build a timestamp (second resolution) that is appended to the file name
import datetime
timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
file_name_or_path = 'raw_dataTest.txt'
# Despite its name, output_dir is the path of the output *file*.
# NOTE(review): the timestamp is placed after ".txt", so the resulting name
# does not end in ".txt". Left unchanged in case other code relies on it.
output_dir = f"RawData/{file_name_or_path}-{timestamp}"

# All pages are appended to one output file. A Session is used so the
# connection to the host can be reused across the requests.
with open(output_dir, 'w', encoding='utf-8') as file, requests.Session() as session:
    for url in urls:
        # Send the HTTP request. requests applies no timeout by default, so
        # pass one explicitly; a network failure on one page is reported and
        # skipped instead of aborting the remaining pages.
        try:
            response = session.get(url, timeout=10)
        except requests.RequestException as exc:
            print(f"Failed to retrieve the webpage from {url}. Error: {exc}")
            continue

        # Skip pages that did not come back with HTTP 200
        if response.status_code != 200:
            print(f"Failed to retrieve the webpage from {url}. Status code: {response.status_code}")
            continue

        # Parse the HTML with BeautifulSoup, declaring the bytes as UTF-8
        soup = BeautifulSoup(response.content, 'html.parser', from_encoding='utf-8')

        # Select the elements to scrape; find_all() is the current name of
        # the deprecated findAll() alias.
        paragraphs = soup.find_all('div', attrs={'class': 'gualist tleft f14 lh25'})

        # Report an empty match explicitly rather than as a successful write
        if not paragraphs:
            print(f"Warning: no matching content found at {url}; nothing was written for it.")
            continue

        # Write the extracted text, one block per line
        for paragraph in paragraphs:
            file.write(paragraph.text)
            file.write('\n')  # separate consecutive blocks with a newline

        # Report progress for this URL so the run can be monitored
        print(f"Content from {url} has been written to the file.")


Content from https://www.zhouyi.cc/zhouyi/yijing64/4103.html has been written to the file.
Content from https://www.zhouyi.cc/zhouyi/yijing64/4105.html has been written to the file.
Content from https://www.zhouyi.cc/zhouyi/yijing64/4106.html has been written to the file.
Content from https://www.zhouyi.cc/zhouyi/yijing64/4107.html has been written to the file.
Content from https://www.zhouyi.cc/zhouyi/yijing64/4108.html has been written to the file.
Content from https://www.zhouyi.cc/zhouyi/yijing64/4109.html has been written to the file.
Content from https://www.zhouyi.cc/zhouyi/yijing64/4110.html has been written to the file.
Content from https://www.zhouyi.cc/zhouyi/yijing64/4111.html has been written to the file.
