In [1]:
import time
import pandas as pd
from selenium import webdriver
from bs4 import BeautifulSoup
from selenium.common.exceptions import StaleElementReferenceException, TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from urllib.parse import urljoin
from datetime import datetime

In [2]:
# This is the first page "精彩推荐"
#
# The page shows one featured post at the top (scraped once, before paging)
# followed by a paginated list (scraped page by page through the "next" link).

# Define the URL
url = 'https://www.tsinghua.edu.cn/news/spqh/jctj.htm'

# One DataFrame per scraped chunk; they are concatenated once after the loop
# instead of re-copying the growing table on every page.
page_frames = []

driver = webdriver.Safari()
try:
    # Open the web page in the driver
    driver.get(url)

    # Parse the first page with Beautiful Soup
    soup = BeautifulSoup(driver.page_source, 'html.parser')

    # --- Featured post ---
    # Each field is read from a single element. Iterating over a Tag walks its
    # child nodes, so building the columns that way only produces a one-row
    # frame when every tag happens to have exactly one child.
    featured_title = soup.find('p', class_="bt").text.strip()
    featured_anchor = soup.find('div', class_='qhrw2_first').find('a')
    featured_link = urljoin(url, featured_anchor.get('href'))

    # The first <div class="sj"> holds the day in <p> and "year.month" in <span>
    sj_div = soup.find('div', class_='sj')
    day_text = sj_div.find('p').text.strip()
    year_month_text = sj_div.find('span').text.strip()
    featured_date = datetime.strptime(
        f"{day_text}/{year_month_text}", '%d/%Y.%m'
    ).strftime('%Y/%m/%d')

    featured_description = soup.find('p', class_='zy').text.strip()

    page_frames.append(pd.DataFrame([{
        'Title': featured_title,
        'Date': featured_date,
        'URL': featured_link,
        'Description': featured_description,
    }]))

    # --- Paginated list ---
    while True:
        try:
            # Parse whatever page the browser is currently showing
            soup = BeautifulSoup(driver.page_source, 'html.parser')

            # Titles: the first <p class="bt"> (featured post) is skipped and
            # at most eight list entries are taken
            titles = [p.text.strip() for p in soup.find_all('p', class_="bt")[1:9]]

            # Links and dates both come from the <ul class="qhrw2_ul"> list
            ul_tag = soup.find('ul', class_='qhrw2_ul')
            links = [urljoin(url, a_tag.get('href')) for a_tag in ul_tag.find_all('a')]

            # Each <div class="sj"> holds the day in <p> and "year.month" in <span>
            dates = []
            for div_tag in ul_tag.find_all('div', class_='sj'):
                day_text = div_tag.find('p').text.strip()
                year_month_text = div_tag.find('span').text.strip()
                parsed = datetime.strptime(f"{day_text}/{year_month_text}", '%d/%Y.%m')
                dates.append(parsed.strftime('%Y/%m/%d'))

            # Descriptions: the first <p class="zy"> (featured post) is skipped
            descriptions = [p.text.strip() for p in soup.find_all('p', class_='zy')[1:]]

            # Truncate every column to the shortest one so the frame is rectangular
            num_rows = min(len(titles), len(dates), len(links), len(descriptions))
            page_frames.append(pd.DataFrame({
                'Title': titles[:num_rows],
                'Date': dates[:num_rows],
                'URL': links[:num_rows],
                'Description': descriptions[:num_rows],
            }))

            # Wait for the next button to become clickable
            next_link = WebDriverWait(driver, 10).until(
                EC.element_to_be_clickable((By.XPATH, "//a[@class='next']"))
            )

            # An href of "javascript:;" marks the last page
            if next_link.get_attribute('href') == "javascript:;":
                break

            # Click on the next button
            next_link.click()

        except StaleElementReferenceException:
            # The page changed while the link was being used: loop again so the
            # page is re-parsed and the link re-located. Rows collected twice
            # because of this are removed by drop_duplicates() below.
            continue

        except TimeoutException:
            # If the "next" button is not clickable, it means that we have reached the last page
            break
finally:
    # Always close the browser, even when scraping raised, so a failed run
    # does not leave a Safari automation session open.
    driver.quit()

columns = ['Title', 'Date', 'URL', 'Description']
if page_frames:
    df = pd.concat(page_frames, ignore_index=True)
else:
    df = pd.DataFrame(columns=columns)

# Remove repeated rows and tag every row with its category
df = df.drop_duplicates().assign(Category='精彩推荐')

df_jctj = df


# Print a message to confirm the cell finished
print('Done')

Done


In [3]:
# This is the second page "清华映像"
#
# Scrapes the paginated list page by page, following the "next" link.

# Define the URL
url = 'https://www.tsinghua.edu.cn/news/spqh/qhyx.htm'

# One DataFrame per scraped page; they are concatenated once after the loop
# instead of re-copying the growing table on every page.
page_frames = []

driver = webdriver.Safari()
try:
    # Open the web page in the driver
    driver.get(url)

    while True:
        try:
            # Parse whatever page the browser is currently showing
            soup = BeautifulSoup(driver.page_source, 'html.parser')

            # Titles: the first <p class="bt"> is skipped and at most eight
            # list entries are taken
            titles = [p.text.strip() for p in soup.find_all('p', class_="bt")[1:9]]

            # Links and dates both come from the <ul class="qhrw2_ul"> list
            ul_tag = soup.find('ul', class_='qhrw2_ul')
            links = [urljoin(url, a_tag.get('href')) for a_tag in ul_tag.find_all('a')]

            # Each <div class="sj"> holds the day in <p> and "year.month" in <span>
            dates = []
            for div_tag in ul_tag.find_all('div', class_='sj'):
                day_text = div_tag.find('p').text.strip()
                year_month_text = div_tag.find('span').text.strip()
                parsed = datetime.strptime(f"{day_text}/{year_month_text}", '%d/%Y.%m')
                dates.append(parsed.strftime('%Y/%m/%d'))

            # Descriptions: the first <p class="zy"> is skipped
            descriptions = [p.text.strip() for p in soup.find_all('p', class_='zy')[1:]]

            # Truncate every column to the shortest one so the frame is rectangular
            num_rows = min(len(titles), len(dates), len(links), len(descriptions))
            page_frames.append(pd.DataFrame({
                'Title': titles[:num_rows],
                'Date': dates[:num_rows],
                'URL': links[:num_rows],
                'Description': descriptions[:num_rows],
            }))

            # Wait for the next button to become clickable
            next_link = WebDriverWait(driver, 10).until(
                EC.element_to_be_clickable((By.XPATH, "//a[@class='next']"))
            )

            # An href of "javascript:;" marks the last page
            if next_link.get_attribute('href') == "javascript:;":
                break

            # Click on the next button
            next_link.click()

        except StaleElementReferenceException:
            # The page changed while the link was being used: loop again so the
            # page is re-parsed and the link re-located. Rows collected twice
            # because of this are removed by drop_duplicates() below.
            continue

        except TimeoutException:
            # If the "next" button is not clickable, it means that we have reached the last page
            break
finally:
    # Always close the browser, even when scraping raised, so a failed run
    # does not leave a Safari automation session open.
    driver.quit()

columns = ['Title', 'Date', 'URL', 'Description']
if page_frames:
    df = pd.concat(page_frames, ignore_index=True)
else:
    df = pd.DataFrame(columns=columns)

# Remove repeated rows and tag every row with its category
df = df.drop_duplicates().assign(Category='清华映像')

df_qhyx = df

print('Done')

Done


In [6]:
# This is the third page "专题集锦"

# Define the URL
url = 'https://www.tsinghua.edu.cn/news/spqh/ztjj.htm'

# One DataFrame per scraped page; they are concatenated once after the loop
# instead of re-copying the growing table on every page.
page_frames = []

driver = webdriver.Safari()
try:
    # Open the web page in the driver
    driver.get(url)

    while True:
        try:
            # Parse whatever page the browser is currently showing
            soup = BeautifulSoup(driver.page_source, 'html.parser')

            # Titles: the first <p class="bt"> is skipped and at most eight
            # list entries are taken
            titles = [p.text.strip() for p in soup.find_all('p', class_="bt")[1:9]]

            # Links and dates both come from the <ul class="qhrw2_ul"> list
            ul_tag = soup.find('ul', class_='qhrw2_ul')
            links = [urljoin(url, a_tag.get('href')) for a_tag in ul_tag.find_all('a')]

            # Each <div class="sj"> holds the day in <p> and "year.month" in <span>
            dates = []
            for div_tag in ul_tag.find_all('div', class_='sj'):
                day_text = div_tag.find('p').text.strip()
                year_month_text = div_tag.find('span').text.strip()
                parsed = datetime.strptime(f"{day_text}/{year_month_text}", '%d/%Y.%m')
                dates.append(parsed.strftime('%Y/%m/%d'))

            # Descriptions: the first <p class="zy"> is skipped
            descriptions = [p.text.strip() for p in soup.find_all('p', class_='zy')[1:]]

            # Truncate every column to the shortest one so the frame is rectangular
            num_rows = min(len(titles), len(dates), len(links), len(descriptions))
            page_frames.append(pd.DataFrame({
                'Title': titles[:num_rows],
                'Date': dates[:num_rows],
                'URL': links[:num_rows],
                'Description': descriptions[:num_rows],
            }))

            # Wait for the next button to become clickable
            next_link = WebDriverWait(driver, 10).until(
                EC.element_to_be_clickable((By.XPATH, "//a[@class='next']"))
            )

            # An href of "javascript:;" marks the last page
            if next_link.get_attribute('href') == "javascript:;":
                break

            # Click on the next button
            next_link.click()

        except StaleElementReferenceException:
            # The page changed while the link was being used: loop again so the
            # page is re-parsed and the link re-located. Rows collected twice
            # because of this are removed by drop_duplicates() below.
            continue

        except TimeoutException:
            # If the "next" button is not clickable, it means that we have reached the last page
            break
finally:
    # Always close the browser, even when scraping raised, so a failed run
    # does not leave a Safari automation session open.
    driver.quit()

columns = ['Title', 'Date', 'URL', 'Description']
if page_frames:
    df = pd.concat(page_frames, ignore_index=True)
else:
    df = pd.DataFrame(columns=columns)

# Remove repeated rows and tag every row with its category
df = df.drop_duplicates().assign(Category='专题集锦')

df_ztjj = df

print('Done')

Done


In [7]:
# This is the fourth page "拍客日记"
        

# Define the URL
url = 'https://www.tsinghua.edu.cn/news/spqh/pkrj.htm'

# One DataFrame per scraped page; they are concatenated once after the loop
# instead of re-copying the growing table on every page.
page_frames = []

driver = webdriver.Safari()
try:
    # Open the web page in the driver
    driver.get(url)

    while True:
        try:
            # Parse whatever page the browser is currently showing
            soup = BeautifulSoup(driver.page_source, 'html.parser')

            # Titles: the first <p class="bt"> is skipped and at most eight
            # list entries are taken
            titles = [p.text.strip() for p in soup.find_all('p', class_="bt")[1:9]]

            # Links and dates both come from the <ul class="qhrw2_ul"> list
            ul_tag = soup.find('ul', class_='qhrw2_ul')
            links = [urljoin(url, a_tag.get('href')) for a_tag in ul_tag.find_all('a')]

            # Each <div class="sj"> holds the day in <p> and "year.month" in <span>
            dates = []
            for div_tag in ul_tag.find_all('div', class_='sj'):
                day_text = div_tag.find('p').text.strip()
                year_month_text = div_tag.find('span').text.strip()
                parsed = datetime.strptime(f"{day_text}/{year_month_text}", '%d/%Y.%m')
                dates.append(parsed.strftime('%Y/%m/%d'))

            # Descriptions: the first <p class="zy"> is skipped
            descriptions = [p.text.strip() for p in soup.find_all('p', class_='zy')[1:]]

            # Truncate every column to the shortest one so the frame is rectangular
            num_rows = min(len(titles), len(dates), len(links), len(descriptions))
            page_frames.append(pd.DataFrame({
                'Title': titles[:num_rows],
                'Date': dates[:num_rows],
                'URL': links[:num_rows],
                'Description': descriptions[:num_rows],
            }))

            # Wait for the next button to become clickable
            next_link = WebDriverWait(driver, 10).until(
                EC.element_to_be_clickable((By.XPATH, "//a[@class='next']"))
            )

            # An href of "javascript:;" marks the last page
            if next_link.get_attribute('href') == "javascript:;":
                break

            # Click on the next button
            next_link.click()

        except StaleElementReferenceException:
            # The page changed while the link was being used: loop again so the
            # page is re-parsed and the link re-located. Rows collected twice
            # because of this are removed by drop_duplicates() below.
            continue

        except TimeoutException:
            # If the "next" button is not clickable, it means that we have reached the last page
            break
finally:
    # Always close the browser, even when scraping raised, so a failed run
    # does not leave a Safari automation session open.
    driver.quit()

columns = ['Title', 'Date', 'URL', 'Description']
if page_frames:
    df = pd.concat(page_frames, ignore_index=True)
else:
    df = pd.DataFrame(columns=columns)

# Remove repeated rows and tag every row with its category
df = df.drop_duplicates().assign(Category='拍客日记')

df_pkrj = df

print('Done')

Done


In [10]:
# This is the fifth page "文化印记"

# Define the URL
url = 'https://www.tsinghua.edu.cn/news/spqh/whyj.htm'

# One DataFrame per scraped page; they are concatenated once after the loop
# instead of re-copying the growing table on every page.
page_frames = []

driver = webdriver.Safari()
try:
    # Open the web page in the driver
    driver.get(url)

    while True:
        try:
            # Parse whatever page the browser is currently showing
            soup = BeautifulSoup(driver.page_source, 'html.parser')

            # Titles: the first <p class="bt"> is skipped and at most eight
            # list entries are taken
            titles = [p.text.strip() for p in soup.find_all('p', class_="bt")[1:9]]

            # Links and dates both come from the <ul class="qhrw2_ul"> list
            ul_tag = soup.find('ul', class_='qhrw2_ul')
            links = [urljoin(url, a_tag.get('href')) for a_tag in ul_tag.find_all('a')]

            # Each <div class="sj"> holds the day in <p> and "year.month" in <span>
            dates = []
            for div_tag in ul_tag.find_all('div', class_='sj'):
                day_text = div_tag.find('p').text.strip()
                year_month_text = div_tag.find('span').text.strip()
                parsed = datetime.strptime(f"{day_text}/{year_month_text}", '%d/%Y.%m')
                dates.append(parsed.strftime('%Y/%m/%d'))

            # Descriptions: the first <p class="zy"> is skipped
            descriptions = [p.text.strip() for p in soup.find_all('p', class_='zy')[1:]]

            # Truncate every column to the shortest one so the frame is rectangular
            num_rows = min(len(titles), len(dates), len(links), len(descriptions))
            page_frames.append(pd.DataFrame({
                'Title': titles[:num_rows],
                'Date': dates[:num_rows],
                'URL': links[:num_rows],
                'Description': descriptions[:num_rows],
            }))

            # Wait for the next button to become clickable
            # (this category waits up to 15 seconds rather than 10)
            next_link = WebDriverWait(driver, 15).until(
                EC.element_to_be_clickable((By.XPATH, "//a[@class='next']"))
            )

            # An href of "javascript:;" marks the last page
            if next_link.get_attribute('href') == "javascript:;":
                break

            # Click on the next button
            next_link.click()

        except StaleElementReferenceException:
            # The page changed while the link was being used: loop again so the
            # page is re-parsed and the link re-located. Rows collected twice
            # because of this are removed by drop_duplicates() below.
            continue

        except TimeoutException:
            # If the "next" button is not clickable, it means that we have reached the last page
            break
finally:
    # Always close the browser, even when scraping raised, so a failed run
    # does not leave a Safari automation session open.
    driver.quit()

columns = ['Title', 'Date', 'URL', 'Description']
if page_frames:
    df = pd.concat(page_frames, ignore_index=True)
else:
    df = pd.DataFrame(columns=columns)

# Remove repeated rows and tag every row with its category
df = df.drop_duplicates().assign(Category='文化印记')

df_whyj = df

print('Done')

Done


In [11]:
# This is the sixth page "活动全录"

# Define the URL
url = 'https://www.tsinghua.edu.cn/news/spqh/hdql.htm'

# One DataFrame per scraped page; they are concatenated once after the loop
# instead of re-copying the growing table on every page.
page_frames = []

driver = webdriver.Safari()
try:
    # Open the web page in the driver
    driver.get(url)

    while True:
        try:
            # Parse whatever page the browser is currently showing
            soup = BeautifulSoup(driver.page_source, 'html.parser')

            # Titles: the first <p class="bt"> is skipped and at most eight
            # list entries are taken
            titles = [p.text.strip() for p in soup.find_all('p', class_="bt")[1:9]]

            # Links and dates both come from the <ul class="qhrw2_ul"> list
            ul_tag = soup.find('ul', class_='qhrw2_ul')
            links = [urljoin(url, a_tag.get('href')) for a_tag in ul_tag.find_all('a')]

            # Each <div class="sj"> holds the day in <p> and "year.month" in <span>
            dates = []
            for div_tag in ul_tag.find_all('div', class_='sj'):
                day_text = div_tag.find('p').text.strip()
                year_month_text = div_tag.find('span').text.strip()
                parsed = datetime.strptime(f"{day_text}/{year_month_text}", '%d/%Y.%m')
                dates.append(parsed.strftime('%Y/%m/%d'))

            # Descriptions: the first <p class="zy"> is skipped
            descriptions = [p.text.strip() for p in soup.find_all('p', class_='zy')[1:]]

            # Truncate every column to the shortest one so the frame is rectangular
            num_rows = min(len(titles), len(dates), len(links), len(descriptions))
            page_frames.append(pd.DataFrame({
                'Title': titles[:num_rows],
                'Date': dates[:num_rows],
                'URL': links[:num_rows],
                'Description': descriptions[:num_rows],
            }))

            # Wait for the next button to become clickable
            next_link = WebDriverWait(driver, 10).until(
                EC.element_to_be_clickable((By.XPATH, "//a[@class='next']"))
            )

            # An href of "javascript:;" marks the last page
            if next_link.get_attribute('href') == "javascript:;":
                break

            # Click on the next button
            next_link.click()

        except StaleElementReferenceException:
            # The page changed while the link was being used: loop again so the
            # page is re-parsed and the link re-located. Rows collected twice
            # because of this are removed by drop_duplicates() below.
            continue

        except TimeoutException:
            # If the "next" button is not clickable, it means that we have reached the last page
            break
finally:
    # Always close the browser, even when scraping raised, so a failed run
    # does not leave a Safari automation session open.
    driver.quit()

columns = ['Title', 'Date', 'URL', 'Description']
if page_frames:
    df = pd.concat(page_frames, ignore_index=True)
else:
    df = pd.DataFrame(columns=columns)

# Remove repeated rows and tag every row with its category
df = df.drop_duplicates().assign(Category='活动全录')

# NOTE: the variable is spelled "hdjl" although the URL slug is "hdql"; the
# name is kept because the combining cell refers to df_hdjl.
df_hdjl = df

print('Done')

Done


In [12]:
# Stack the six per-category tables into one. Each input keeps its own row
# labels, so ignore_index=True is passed (as in every other concat in this
# notebook) to give the combined table a fresh, unique 0..n-1 index.
df_combined = pd.concat(
    [df_jctj, df_qhyx, df_ztjj, df_pkrj, df_whyj, df_hdjl],
    ignore_index=True,
)

In [13]:
# Write the combined table to disk as CSV, leaving out the DataFrame index column
output_path = r'/Users/davidsmacbook/Desktop/Work/THU_Website.csv'
df_combined.to_csv(output_path, index=False)