In [25]:
# Shell escape: recursively delete the rruff_files directory, which is the
# default output_dir that RRUFFScraper.download_xray_data writes into.
!rm -r rruff_files

In [24]:
import requests
from bs4 import BeautifulSoup
import time
import os
from tqdm import tqdm

class RRUFFScraper:
    """Download powder-diffraction X-ray data files from rruff.info.

    Typical use is ``RRUFFScraper().scrape_all('a', 'z')``, which walks the
    per-letter mineral listings and saves the "X-ray Data (XY - Processed)"
    and "X-ray Data (XY - RAW)" files linked from each mineral page.
    """

    # Link captions in the powder-diffraction table that should be downloaded.
    XRAY_LINK_LABELS = (
        "X-ray Data (XY - Processed)",
        "X-ray Data (XY - RAW)",
    )

    def __init__(self, timeout=30):
        """Set up the base URL, request headers and per-request timeout.

        Args:
            timeout: Seconds to wait for each HTTP request before giving up.
                Without a timeout a stalled connection blocks the scrape
                indefinitely.
        """
        self.base_url = "https://rruff.info"
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        self.timeout = timeout

    @staticmethod
    def _parse_submit_href(href):
        """Extract ``(mineral_name, mineral_id)`` from a ``SubmitWin('..','..')`` href.

        Returns ``None`` when the href is not a SubmitWin link or does not
        contain two single-quoted arguments.
        """
        if 'SubmitWin' not in href:
            return None
        parts = href.split("'")
        # Two quoted arguments produce at least four quote-separated pieces
        # before the closing text; anything shorter is malformed.
        if len(parts) < 4:
            return None
        return parts[1], parts[3]

    @staticmethod
    def _label_to_slug(label):
        """Turn a link caption into the lowercase fragment used in file names."""
        return label.strip().replace(" ", "_").replace("(", "").replace(")", "").lower()

    def get_mineral_links(self, letter):
        """Return ``(mineral_name, mineral_id)`` pairs for minerals starting with ``letter``.

        Args:
            letter: The initial letter to look up.

        Returns:
            A list of ``(mineral_name, mineral_id)`` string tuples, in page order.

        Raises:
            requests.exceptions.RequestException: If the listing page cannot
                be fetched or the server answers with an HTTP error status.
        """
        # NOTE(review): "(letter)" at the end of this URL is literal text, not an
        # interpolated value -- confirm that is what the endpoint expects.
        url = f"{self.base_url}/index.php/r=lookup_minerals/letter={letter}/calling_form=frm_sample_search/name_field=txt_mineral/id_field=(letter)"
        response = requests.get(url, headers=self.headers, timeout=self.timeout)
        # Fail loudly instead of parsing an error page into an empty list.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        links = []
        for strong_tag in soup.find_all('strong'):
            a_tag = strong_tag.find('a')
            if a_tag is None:
                continue
            parsed = self._parse_submit_href(a_tag.get('href', ''))
            if parsed is not None:
                links.append(parsed)
        return links

    def _find_xray_links(self, soup):
        """Map each wanted X-ray data URL on a mineral page to its file-name slug.

        Only the first table whose first ``<th>`` mentions "POWDER DIFFRACTION"
        is searched. Returns an empty dict when there is no such table or it
        holds none of the wanted links.
        """
        powder_table = None
        for table in soup.find_all('table'):
            th = table.find('th')
            if th is not None and "POWDER DIFFRACTION" in th.text:
                powder_table = table
                break

        download_links = {}
        if powder_table is None:
            return download_links

        for tr in powder_table.find_all('tr'):
            for a in tr.find_all('a', href=True):
                if not any(label in a.text for label in self.XRAY_LINK_LABELS):
                    continue
                link = a['href']
                # Site-relative links are prefixed with the base URL.
                full_link = link if link.startswith('http') else self.base_url + link
                download_links[full_link] = self._label_to_slug(a.text)
        return download_links

    def download_xray_data(self, mineral_info, output_dir='rruff_files'):
        """Download the processed and raw XY X-ray data files for one mineral.

        Args:
            mineral_info: A ``(mineral_name, mineral_id)`` pair as returned by
                ``get_mineral_links``.
            output_dir: Directory to save files into; created if missing.

        Returns:
            ``True`` if the page offered X-ray data links and all of them were
            saved, ``False`` if there were none or any step failed. Errors are
            printed, never raised.
        """
        mineral_name, mineral_id = mineral_info
        url = f"{self.base_url}/{mineral_name}/R{mineral_id}"

        try:
            response = requests.get(url, headers=self.headers, timeout=self.timeout)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'html.parser')

            download_links = self._find_xray_links(soup)
            if not download_links:
                return False

            os.makedirs(output_dir, exist_ok=True)

            # NOTE(review): the file name depends only on the mineral name and
            # the link caption, so several links sharing a caption overwrite
            # one another (last one wins) -- confirm this is intended.
            for xray_data_link, file_info in download_links.items():
                filename = os.path.join(output_dir, f"{mineral_name}_{file_info}.txt")

                file_response = requests.get(
                    xray_data_link, headers=self.headers, timeout=self.timeout
                )
                # Do not save an HTTP error page as if it were data.
                file_response.raise_for_status()
                with open(filename, 'wb') as f:
                    f.write(file_response.content)

            return True

        except requests.exceptions.RequestException as e:
            # The page may never have been parsed when we get here, so do not
            # touch the parsed document: referencing it raised an
            # UnboundLocalError that escaped this handler.
            print(f"Error during requests to {url}: {str(e)}")
            return False
        except Exception as e:
            print(f"Error processing {mineral_info}: {str(e)}")
            return False

    def scrape_all(self, start_letter='a', end_letter='z', delay=2):
        """Download RRUFF X-ray files for every letter from start to end.

        Args:
            start_letter: First initial letter to process (case-insensitive).
            end_letter: Last initial letter to process, inclusive.
            delay: Seconds to sleep after each mineral page.

        Returns:
            The number of minerals for which ``download_xray_data`` succeeded.
            An empty range (start after end) returns 0.
        """
        success_count = 0
        first_code = ord(start_letter.lower())
        last_code = ord(end_letter.lower())

        for code in range(first_code, last_code + 1):
            letter = chr(code)

            try:
                mineral_links = self.get_mineral_links(letter)
            except requests.exceptions.RequestException as e:
                # One unreachable listing should not abort the remaining letters.
                print(f"Error listing minerals for letter {letter}: {str(e)}")
                continue

            # tqdm renders one progress bar per letter.
            for mineral_info in tqdm(mineral_links, desc=f"Processing letter {letter}"):
                if self.download_xray_data(mineral_info):
                    success_count += 1
                time.sleep(delay)

        # The count is per mineral, not per file written.
        print(f"\nDownload completed! Successfully downloaded {success_count} X-ray Data files.")
        return success_count

if __name__ == "__main__":
    # Scrape only the letters 'a' and 'b', sleeping 0.01 s after each mineral.
    scraper = RRUFFScraper()
    scraper.scrape_all(start_letter='a', end_letter='b', delay=0.01)

Processing letter a:  15%|█▍        | 34/231 [00:27<02:36,  1.26it/s]


KeyboardInterrupt: 