In [2]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import pandas as pd
import numpy as np
import os

# 定义 ChromeDriver 路径
CHROME_DRIVER_PATH = r'C:\Users\19521\Desktop\chromedriver-win64\chromedriver.exe'
# 定义目标网页 URL
URL = "https://www.qschina.cn/university-rankings/world-university-rankings/2024"
# 定义要爬取的页数
PAGES_TO_SCRAPE = 40
# 定义保存文件夹路径（可根据需要修改）
SAVE_FOLDER = r"C:\Users\19521\Desktop\我的论文\代码"

def setup_driver():
    """
    初始化 Chrome 浏览器驱动
    :return: 浏览器驱动实例
    """
    service = Service(executable_path=CHROME_DRIVER_PATH)
    driver = webdriver.Chrome(service=service)
    return driver

def click_element(driver, xpath):
    """
    点击指定 XPath 的元素，使用显式等待确保元素可点击
    :param driver: 浏览器驱动实例
    :param xpath: 元素的 XPath
    """
    try:
        element = WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable((By.XPATH, xpath))
        )
        element.click()
    except Exception as e:
        print(f"点击元素时出现错误: {e}")

def scroll_page(driver, scroll_distance):
    """
    滚动网页到指定距离
    :param driver: 浏览器驱动实例
    :param scroll_distance: 滚动的距离
    """
    driver.execute_script(f"document.documentElement.scrollTop={scroll_distance}")
    time.sleep(1)

def get_text(driver, data_lists):
    """
    从当前页面提取所需数据并存储到相应列表中
    :param driver: 浏览器驱动实例
    :param data_lists: 存储数据的列表集合
    """
    rank_all, name_all, href_all, location_all, score_all, academic_all, repu_all, \
    ts_all, citation_all, internationT_all, internationS_all, research_all, \
    employment_all, sustain_all = data_lists

    # rank 排名
    list_rank = driver.find_elements(By.XPATH, '//*[@id="qs-rankings-indicators"]/tbody/tr//td[@class=" rank"]')
    for li_rank in list_rank:
        try:
            rank = li_rank.get_attribute('innerHTML').split('td-wrap-in">')[1].split('</div>')[0]
            rank_all.append(rank)
        except IndexError:
            rank_all.append(np.nan)

    # name 大学名称
    list_name = driver.find_elements(By.XPATH, '//*[@id="qs-rankings-indicators"]/tbody/tr//td[@class=" uni"]/div/div')
    for li_name in list_name:
        try:
            name = li_name.find_element(By.XPATH, './a').text
            name_all.append(name)
        except Exception:
            name_all.append(np.nan)

    # href 大学详情链接地址
    list_href = driver.find_elements(By.XPATH, '//*[@id="qs-rankings-indicators"]/tbody/tr//td[@class=" uni"]/div/div')
    for li_href in list_href:
        try:
            href = li_href.find_element(By.XPATH, './a').get_attribute('href')
            href_all.append(href)
        except Exception:
            href_all.append(np.nan)

    # location 大学所处国家
    list_location = driver.find_elements(By.XPATH, '//*[@id="qs-rankings-indicators"]/tbody/tr//td[@class=" uni"]/div/div/div')
    for li_location in list_location:
        location = li_location.text
        location_all.append(location)

    # 综合得分
    list_score = driver.find_elements(By.XPATH, '//*[@id="qs-rankings-indicators"]/tbody/tr//td[@class="ind-col ind-overall sorting_1"]')
    for li_score in list_score:
        score = li_score.text
        score_all.append(score)

    # 学术声誉
    list_academic = driver.find_elements(By.XPATH, '//*[@id="qs-rankings-indicators"]/tbody/tr//td[@class=" ind-col ind-76"]')
    for li_academic in list_academic:
        try:
            academic = li_academic.find_element(By.XPATH, './div/div').text
            academic_all.append(academic)
        except Exception:
            academic_all.append(np.nan)

    # 雇主声誉
    list_repu = driver.find_elements(By.XPATH, '//*[@id="qs-rankings-indicators"]/tbody/tr//td[@class=" ind-col ind-77"]')
    for li_repu in list_repu:
        try:
            repu = li_repu.find_element(By.XPATH, './div/div').text
            repu_all.append(repu)
        except Exception:
            repu_all.append(np.nan)

    # 每位教员引用率
    list_citation = driver.find_elements(By.XPATH, '//*[@id="qs-rankings-indicators"]/tbody/tr//td[@class=" ind-col ind-73"]')
    for li_citation in list_citation:
        try:
            citation = li_citation.find_element(By.XPATH, './div/div').text
            citation_all.append(citation)
        except Exception:
            citation_all.append(np.nan)

    # 师生比
    list_ts = driver.find_elements(By.XPATH, '//*[@id="qs-rankings-indicators"]/tbody/tr//td[@class=" ind-col ind-36"]')
    for li_ts in list_ts:
        try:
            ts = li_ts.find_element(By.XPATH, './div/div').text
            ts_all.append(ts)
        except Exception:
            ts_all.append(np.nan)

    # 国际学生占比
    list_internationS = driver.find_elements(By.XPATH, '//*[@id="qs-rankings-indicators"]/tbody/tr//td[@class=" ind-col ind-14"]')
    for li_internationS in list_internationS:
        try:
            internationS = li_internationS.find_element(By.XPATH, './div/div').text
            internationS_all.append(internationS)
        except Exception:
            internationS_all.append(np.nan)

    # 国际教师占比
    list_internationT = driver.find_elements(By.XPATH, '//*[@id="qs-rankings-indicators"]/tbody/tr//td[@class=" ind-col ind-18"]')
    for li_internationT in list_internationT:
        try:
            internationT = li_internationT.find_element(By.XPATH, './div/div').text
            internationT_all.append(internationT)
        except Exception:
            internationT_all.append(np.nan)

    # 国际研究网络
    list_research = driver.find_elements(By.XPATH, '//*[@id="qs-rankings-indicators"]/tbody/tr//td[@class=" ind-col ind-15"]')
    for li_research in list_research:
        try:
            research = li_research.find_element(By.XPATH, './div/div').text
            research_all.append(research)
        except Exception:
            research_all.append(np.nan)

    # 就业结果
    list_employment = driver.find_elements(By.XPATH, '//*[@id="qs-rankings-indicators"]/tbody/tr//td[@class=" ind-col ind-2177844"]')
    for li_employment in list_employment:
        try:
            employment = li_employment.find_element(By.XPATH, './div/div').text
            employment_all.append(employment)
        except Exception:
            employment_all.append(np.nan)

    # 可持续性
    list_sustain = driver.find_elements(By.XPATH, '//*[@id="qs-rankings-indicators"]/tbody/tr//td[@class=" ind-col ind-2208745"]')
    for li_sustain in list_sustain:
        try:
            sustain = li_sustain.find_element(By.XPATH, './div/div').text
            sustain_all.append(sustain)
        except Exception:
            sustain_all.append(np.nan)

    return [rank_all, name_all, href_all, location_all, score_all, academic_all, repu_all,
            ts_all, citation_all, internationT_all, internationS_all, research_all,
            employment_all, sustain_all]

def clean_location(location_all):
    """
    清理 location_all 列表中的无效数据
    :param location_all: 存储大学所处国家的列表
    :return: 清理后的列表
    """
    invalid_values = ['5+ QS Stars', '5 QS Stars', '4 QS Stars', '3 QS Stars', '']
    return [loc for loc in location_all if loc not in invalid_values]

def clean_rank(value):
    """
    清理排名数据，将字符串转换为数值
    :param value: 排名数据
    :return: 清理后的数值
    """
    try:
        if isinstance(value, str):
            if value.startswith('='):
                return int(value[1:])
            elif '-' in value:
                start, end = map(int, value.split('-'))
                return (start + end) // 2
            else:
                return int(value)
        return int(value)
    except ValueError:
        return np.nan

def main():
    driver = setup_driver()
    driver.get(URL)

    # 点击指定元素
    click_element(driver, '//*[@id="qs-rankings-datatables"]/div[1]/ul/li[2]/a')

    # 初始化数据列表，改为列表类型
    data_lists = [[], [], [], [], [], [], [], [], [], [], [], [], [], []]

    # 爬取第一页数据
    data_lists = get_text(driver, data_lists)
    print(f'第一页的学校名称有：{data_lists[1]}')
    print(f'第一页的学校排名有：{data_lists[0]}')

    # 滚动页面并点击下一页
    scroll_page(driver, 2500)
    click_element(driver, '//*[@id="qs-rankings-indicators_next"]')
    data_lists = get_text(driver, data_lists)
    print(f'前 2 页的学校排名有：{data_lists[0]}')

    # 循环爬取剩余页面
    for _ in range(2, PAGES_TO_SCRAPE):
        scroll_page(driver, 2100)
        click_element(driver, '//*[@id="qs-rankings-indicators_next"]')
        data_lists = get_text(driver, data_lists)

    # 清理 location 数据
    data_lists[3] = clean_location(data_lists[3])

    # 创建 DataFrame
    df = pd.DataFrame({
        '排名': data_lists[0],
        '大学名称': data_lists[1],
        '详情链接': data_lists[2],
        '所处国家': data_lists[3],
        '综合得分': data_lists[4],
        '学术声誉': data_lists[5],
        '雇主声誉': data_lists[6],
        '师生比': data_lists[7],
        '每位教员引用率': data_lists[8],
        '国际教师占比': data_lists[9],
        '国际学生占比': data_lists[10],
        '国际研究网络': data_lists[11],
        '就业结果': data_lists[12],
        '可持续性': data_lists[13]
    })

    # 清理排名数据
    df['排名'] = df['排名'].apply(clean_rank)
    df.reset_index(drop=True, inplace=True)

    # 确保保存文件夹存在，如果不存在则创建
    if not os.path.exists(SAVE_FOLDER):
        os.makedirs(SAVE_FOLDER)

    # 拼接完整的文件路径
    file_path = os.path.join(SAVE_FOLDER, "qs_ranking.xlsx")

    # 将数据保存到指定路径
    df.to_excel(file_path, index=False)

    # 关闭浏览器
    driver.quit()

if __name__ == "__main__":
    main()

第一页的学校名称有：['麻省理工学院', '剑桥大学', '牛津大学', '哈佛大学', '斯坦福大学', '帝国理工学院', '苏黎世联邦理工大学（瑞士联邦理工学院）', '新加坡国立大学', '伦敦大学学院', '加州大学伯克利分校', '芝加哥大学', '宾夕法尼亚大学', '康奈尔大学', '墨尔本大学', '加州理工大学（Caltech)', '耶鲁大学', '北京大学', '普林斯顿大学', '新南威尔士大学（UNSW）', '悉尼大学', '多伦多大学', '爱丁堡大学', '哥伦比亚大学', '巴黎科学艺术人文大学', '清华大学']
第一页的学校排名有：['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '=17', '=17', '=19', '=19', '21', '22', '23', '24', '25']
前 2 页的学校排名有：['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '=17', '=17', '=19', '=19', '21', '22', '23', '24', '25', '=26', '=26', '28', '28', '29', '30', '32', '33', '=34', '=34', '36', '37', '=38', '=38', '40', '41', '42', '43', '=44', '45', '46', '=47', '=47', '=47', '50']


In [8]:
import pandas as pd
import numpy as np

# 假设已经读取并处理好数据，存储在 df 中
# 选择用于聚类分析的特征列
features = ['综合得分', '学术声誉', '雇主声誉', '师生比', '每位教员引用率', 
            '国际教师占比', '国际学生占比', '国际研究网络', '就业结果', '可持续性']

X = df[features]

# 数据标准化，因为不同特征的量纲可能不同
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

NameError: name 'df' is not defined

In [5]:
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

# 尝试不同的 k 值
inertia = []
k_range = range(1, 11)
for k in k_range:
    kmeans = KMeans(n_clusters=k, random_state=42)
    kmeans.fit(X_scaled)
    inertia.append(kmeans.inertia_)

# 绘制肘部图
plt.plot(k_range, inertia, marker='o')
plt.xlabel('Number of clusters (k)')
plt.ylabel('Inertia')
plt.title('Elbow Method')
plt.show()

NameError: name 'X_scaled' is not defined