In [1]:
# import json

# with open('new_all_qguide_data.html', 'r') as file:
#     json_data = json.load(file)

# with open('new_all_qguide_data.json', 'w') as file:
#     json.dump(json_data, file, separators=(',', ':'))


In [7]:
from bs4 import BeautifulSoup
import pandas as pd
import requests
from tabulate import tabulate

def extract_table_data(url):
    # Fetch the HTML content from the URL
    response = requests.get(url)
    html = response.text

    # Parse the HTML
    soup = BeautifulSoup(html, 'html.parser')

    # Define table names
    table_names = ['Responses', 'General', 'Instructor', 'Hours', 'Recommendation', 'Recommendation_stats', 'Reasons', 'Comments']

    # Find all tables containing the required data
    tables = soup.find_all('table')

    # Create a dictionary to store DataFrames for each table
    dataframes = {}

    # Iterate through each table
    for idx, table in enumerate(tables):
        # Extract headers
        headers = []
        for th in table.find('tr').find_all('th'):
            headers.append(th.text.strip())

        # Extract data rows
        data = []
        for row in table.find_all('tr'):
            row_data = [cell.text.strip() for cell in row.find_all(['td', 'th'])]
            if len(row_data) == len(headers):
                data.append(row_data)

        # If data rows exist
        if len(data) > 1:
            # Create a DataFrame excluding the header row
            df = pd.DataFrame(data[1:], columns=headers)
            dataframes[table_names[idx]] = df

    return dataframes


In [4]:
# Example usage:
url = "https://harvard.bluera.com/harvard/rpv-eng.aspx?lang=eng&redi=1&SelectedIDforPrint=7a0a2af3e410e112bf9de805337d95874c27341b5445e2f5053d952af958c58c9549a839ccebc274f7760f73d58e1223&ReportType=2&regl=en-US"
tables_data = extract_table_data(url)

# Print each table with its name
for table_name, table_df in tables_data.items():
    print(f"Table '{table_name}':")
    print(tabulate(table_df, headers='keys', tablefmt='grid'))
    print()

Table 'Responses':
+----+----------------+------------+
|    | Raters         | Students   |
|  0 | Responded      | 18         |
+----+----------------+------------+
|  1 | Invited        | 24         |
+----+----------------+------------+
|  2 | Response Ratio | 75%        |
+----+----------------+------------+

Table 'General':
+----+--------------------------------------------------------------------------------------------+---------+-------------+-------------+--------+--------+------------------+---------------+------------+
|    |                                                                                            |   Count | Excellent   | Very Good   | Good   | Fair   | Unsatisfactory   |   Course Mean |   FAS Mean |
|  0 | Evaluate the course overall.                                                               |      17 | 59%         | 35%         | 6%     | 0%     | 0%               |          4.53 |       4.21 |
+----+-------------------------------------------------

In [12]:
import json
import time

with open('new_all_qguide_data.json', 'r') as f:
    all_qguide_data = json.load(f)

course_data_list = []

start_time = time.time()

total_entries = len(all_qguide_data['data'])

processed_entries = 0

for entry in all_qguide_data['data']:
    iteration_start_time = time.time()

    course_data = {}
    for key in ['title', 'url', 'instructor', 'department', 'term', 'subject', 'blueCourseId']:
        if key in entry:
            course_data[key.capitalize()] = entry[key]
        else:
            # If blueCourseId is missing, assign 0
            if key == 'blueCourseId':
                course_data['Bluecourseid'] = "POST-SPRING2024"

    tables_data_dict = {table_name: table_df.to_dict(orient='records') for table_name, table_df in tables_data.items()}

    course_data['Feedback'] = tables_data_dict

    course_data_list.append(course_data)

    processed_entries += 1

    iteration_elapsed_time = time.time() - iteration_start_time

    percent_finished = (processed_entries / total_entries) * 100

    avg_time_per_iteration = (time.time() - start_time) / processed_entries

    remaining_entries = total_entries - processed_entries
    remaining_time_seconds = remaining_entries * avg_time_per_iteration

    hours = int(remaining_time_seconds // 3600)
    remaining_time_seconds %= 3600
    minutes = int(remaining_time_seconds // 60)
    seconds = int(remaining_time_seconds % 60)

    print(f"Iteration {processed_entries}/{total_entries} - {percent_finished:.2f}% completed - Time taken: {iteration_elapsed_time:.2f}s - Estimated time remaining: {hours}h {minutes}m {seconds}s")

end_time = time.time()

with open('new_course_data.json', 'w') as json_file:
    json.dump(course_data_list, json_file, indent=4)


Iteration 1/9372 - 0.01% completed - Time taken: 0.00s - Estimated time remaining: 0h 0m 25s
Iteration 2/9372 - 0.02% completed - Time taken: 0.00s - Estimated time remaining: 0h 0m 17s
Iteration 3/9372 - 0.03% completed - Time taken: 0.00s - Estimated time remaining: 0h 0m 14s
Iteration 4/9372 - 0.04% completed - Time taken: 0.00s - Estimated time remaining: 0h 0m 13s
Iteration 5/9372 - 0.05% completed - Time taken: 0.00s - Estimated time remaining: 0h 0m 12s
Iteration 6/9372 - 0.06% completed - Time taken: 0.00s - Estimated time remaining: 0h 0m 11s
Iteration 7/9372 - 0.07% completed - Time taken: 0.00s - Estimated time remaining: 0h 0m 11s
Iteration 8/9372 - 0.09% completed - Time taken: 0.00s - Estimated time remaining: 0h 0m 10s
Iteration 9/9372 - 0.10% completed - Time taken: 0.00s - Estimated time remaining: 0h 0m 10s
Iteration 10/9372 - 0.11% completed - Time taken: 0.00s - Estimated time remaining: 0h 0m 10s
Iteration 11/9372 - 0.12% completed - Time taken: 0.00s - Estimated t