In [None]:
# import json

# with open('all_qguide_data.html', 'r') as file:
#     json_data = json.load(file)

# with open('all_qguide_data.json', 'w') as file:
#     json.dump(json_data, file, separators=(',', ':'))


In [None]:
from bs4 import BeautifulSoup
import pandas as pd
import requests
from functools import lru_cache
from tabulate import tabulate

# Captions of report tables that are deliberately left out of the scrape.
# Captions are compared in lowercase, so the entries are normalised once here
# instead of on every call.
_CAPTIONS_TO_SKIP = tuple(
    caption.lower().strip()
    for caption in (
        "Table for [QTitle]-In this course, most students listen attentively with an open mind and a willingness to change their point of view as they learn more about the topic.-Statistics.",
        "Table for [QTitle]-In this course (including sections), I feel comfortable expressing my views on controversial topics.-Statistics.",
        "Table for [qtitle]-in this course, most students listen attentively with an open mind and a willingness to change their point of view as they learn more about the topic..",
        "Table for [qtitle]-in this course (including sections), i feel comfortable expressing my views on controversial topics..",
        "Table for [qtitle]-in this course, most students listen attentively with an open mind and a willingness to change their point of view as they learn more about the topic..",
    )
)


def _parse_feedback_title(title_text):
    """Split a 'Feedback for <course> - <instructor>' header into (course, instructor).

    Returns None when the text does not match the expected pattern.
    """
    import re  # local import: `re` is not in the notebook's import cell

    # The instructor group may not contain a hyphen, so the split happens at
    # the last hyphen before '<br>' / '(click'.
    # NOTE(review): a hyphenated instructor name would therefore be cut at its
    # own hyphen - confirm against real report titles.
    match = re.search(r'Feedback for\s+(.*?)\s*-\s*([^-<\n]+?)(?:\s*<br>|\s*\(click)', title_text)
    if not match:
        return None
    return match.group(1).strip(), match.group(2).strip()


def _departments_from_title(full_course_title):
    """Return the unique leading tokens of each comma-separated title part, joined by ', '."""
    departments = []
    for part in full_course_title.split(','):
        tokens = part.split()
        if not tokens:
            # Empty part (e.g. trailing comma): nothing to record.
            continue
        if tokens[0] not in departments:
            departments.append(tokens[0])
    return ', '.join(departments)


def _table_to_dataframe(table):
    """Convert one HTML table into a DataFrame, or return None if it has no usable data."""
    first_row = table.find('tr')
    if first_row is None:
        # A table without any rows has neither headers nor data.
        return None

    headers = [th.text.strip() for th in first_row.find_all('th')]
    if not headers:
        return None

    # Keep only rows whose cell count matches the header row; the header row
    # itself is the first match and is dropped below.
    rows = []
    for row in table.find_all('tr'):
        cells = [cell.text.strip() for cell in row.find_all(['td', 'th'])]
        if len(cells) == len(headers):
            rows.append(cells)

    if len(rows) <= 1:  # Only the header row matched
        return None
    return pd.DataFrame(rows[1:], columns=headers)


@lru_cache(maxsize=None)
def extract_table_data(url):
    """Download a course feedback report and return its tables as DataFrames.

    Args:
        url: Address of the report page.

    Returns:
        A dict mapping names to DataFrames, or None when the request fails,
        no '#report' header link is found, or the header text cannot be parsed.
        The dict always holds the single-cell frames 'Instructor Name',
        'Course Name' and 'Department', plus one frame per table, keyed by the
        snake-cased text of the nearest preceding <h3>. Tables sharing an <h3>
        overwrite each other; the last one wins.

    Results, including None, are memoised per URL by ``lru_cache``, so a
    failed download is not retried and every caller receives the same dict.
    """
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
    except requests.RequestException:
        return None

    soup = BeautifulSoup(response.text, 'html.parser')

    # The header link carries the course title and instructor name
    feedback_header = soup.find('a', {'href': lambda x: x and x.startswith('#report')})
    if not feedback_header:
        return None

    # Prefer the title attribute; fall back to the visible link text
    title_text = feedback_header.get('title', '') or feedback_header.text

    parsed_title = _parse_feedback_title(title_text)
    if parsed_title is None:
        return None
    full_course_title, instructor_name = parsed_title

    department = _departments_from_title(full_course_title)

    # Store basic course info
    dataframes = {
        'Instructor Name': pd.DataFrame([instructor_name], columns=['Instructor Name']),
        'Course Name': pd.DataFrame([full_course_title], columns=['Course Name']),
        'Department': pd.DataFrame([department], columns=['Department']),
    }

    for table in soup.find_all('table'):
        # Tables are named after the nearest preceding h3; unnamed tables are ignored
        h3_tag = table.find_previous('h3')
        if not h3_tag:
            continue
        table_name = "_".join(h3_tag.text.strip().lower().split()).replace("-", "_")

        caption_tag = table.find('caption')
        if caption_tag:
            caption_text = caption_tag.text.strip().lower()
            if any(skip_caption in caption_text for skip_caption in _CAPTIONS_TO_SKIP):
                continue

        table_df = _table_to_dataframe(table)
        if table_df is not None:
            dataframes[table_name] = table_df

    return dataframes

In [None]:
# Smoke test: scrape a single report and show every table that was extracted
url = "https://harvard.bluera.com/harvard/rpv-eng.aspx?lang=eng&redi=1&SelectedIDforPrint=2a25bbb354dd26d2a5018145b3eccac802438e93cb86651665bfd7dc8f5e8b888dcf958e60eb641327093338906fe195&ReportType=2&regl=en-US"
tables_data = extract_table_data(url)

if tables_data is None:
    # extract_table_data returns None when the request fails or the page
    # cannot be parsed; iterating over it would raise AttributeError.
    print(f"No feedback tables could be extracted from {url}")
else:
    # Print instructor name and each table with its name
    for table_name, table_df in tables_data.items():
        print(table_name, table_df)
        print(tabulate(table_df, headers='keys', tablefmt='grid'))
        print()

In [None]:
import json
import time
from concurrent.futures import ThreadPoolExecutor

# Load the course index; every item under 'data' describes one report to scrape
with open('all_qguide_data.json', 'r') as index_file:
    all_qguide_data = json.load(index_file)

# Results collected by the scraping loop
course_data_list = []

# State used for progress reporting (the clock starts once the index is loaded)
processed_entries = 0
total_entries = len(all_qguide_data['data'])
start_time = time.time()

def process_entry(entry):
    """Build the output record for one course index entry.

    Copies the known metadata keys that are present in ``entry`` (with the key
    passed through ``str.capitalize``) and attaches the scraped tables under
    'Feedback' as lists of row dicts. 'Feedback' is an empty dict when
    ``extract_table_data`` returns None.
    """
    metadata_keys = ('title', 'url', 'instructor', 'department', 'term', 'subject', 'blueCourseId')

    course_data = {}
    for key in metadata_keys:
        if key in entry:
            course_data[key.capitalize()] = entry[key]

    tables_data = extract_table_data(entry['url'])

    if tables_data is not None:
        feedback = {
            name: frame.to_dict(orient='records')
            for name, frame in tables_data.items()
        }
    else:
        # Nothing could be scraped for this course
        feedback = {}

    course_data['Feedback'] = feedback
    return course_data

def update_progress():
    """Count one more finished entry and print progress with an estimated time remaining.

    Reads the module-level ``total_entries`` and ``start_time`` and increments
    the module-level ``processed_entries``.
    """
    global processed_entries
    processed_entries += 1

    elapsed = time.time() - start_time
    seconds_per_entry = elapsed / processed_entries
    eta_seconds = (total_entries - processed_entries) * seconds_per_entry
    percent_finished = (processed_entries / total_entries) * 100

    # Break the estimate down into hours / minutes / seconds
    hours, leftover = divmod(eta_seconds, 3600)
    minutes, seconds = divmod(leftover, 60)
    hours, minutes, seconds = int(hours), int(minutes), int(seconds)

    print(f"Iteration {processed_entries}/{total_entries} - {percent_finished:.2f}% completed - Estimated time remaining: {hours}h {minutes}m {seconds}s")

# Scrape every entry on a pool of 16 worker threads. Results are collected in
# submission order, so course_data_list follows the order of the index file.
with ThreadPoolExecutor(max_workers=16) as executor:
    futures = [
        executor.submit(process_entry, entry)
        for entry in all_qguide_data['data']
    ]

    for future in futures:
        try:
            result = future.result()
            if result is not None:
                course_data_list.append(result)
            update_progress()
        except Exception as e:
            # A failed entry is reported and skipped, but still counts as processed
            print(f"Error processing entry: {e}")
            update_progress()

# Persist everything that was scraped successfully
with open('course_data.json', 'w') as json_file:
    json.dump(course_data_list, json_file, indent=4)

# The section below is no longer possible because I forgot to scrape the instructor names from 2019 :(

In [None]:
# import json

# def add_unique_courses(json1, json2):
#     # Create a set of Bluecourseid from the first JSON to keep track of added courses
#     existing_course_ids = {course['Bluecourseid'] for course in json1}
    
#     # Loop through the second JSON and add courses that don't exist in the first
#     for course in json2:
#         if course['Bluecourseid'] not in existing_course_ids:
#             json1.append(course)  # Add the new course to the first JSON
    
#     return json1

# # Load JSON data from files
# with open('course_data.json', 'r') as file1:
#     json1 = json.load(file1)

# with open('new_course_data.json', 'r') as file2:
#     json2 = json.load(file2)

# # Add unique courses from json2 to json1
# combined_json = add_unique_courses(json1, json2)

# # Save the combined JSON array
# with open('2020-2024_Course_Data.json', 'w') as outfile:
#     json.dump(combined_json, outfile, indent=4)


In [None]:
import pandas as pd
import json
from tabulate import tabulate

# Read the scraped records back from disk
with open('course_data.json', 'r') as f:
    course_data = json.load(f)

# One row per course record
df = pd.DataFrame(course_data)

# Show the first ten rows as a grid to get a feel for the structure
data_sample = df.head(10)
sample_grid = tabulate(data_sample, headers='keys', tablefmt='grid')
print("Data sample (general head):")
print(sample_grid)

In [1]:
import pandas as pd
import json
import numpy as np

# Load the data
with open('course_data.json', 'r') as f:
    course_data = json.load(f)


def safe_percentage(value):
    """Convert a percentage string such as '45%' to a fraction; 'N/A' counts as 0."""
    if value == 'N/A':
        return 0
    return float(value.strip('%')) / 100


# Rating column in the scraped table -> accumulator it feeds
rating_columns = {
    'Excellent': 'excellent_count',
    'Very Good': 'very_good_count',
    'Good': 'good_count',
    'Fair': 'fair_count',
    'Unsatisfactory': 'unsatisfactory_count',
}

# Aggregated rating counts per instructor, keyed by instructor name
instructor_data = {}

for course in course_data:
    feedback = course.get('Feedback', {})
    instructor_name = feedback.get('Instructor Name', {})
    if not instructor_name:
        continue

    instructor_name = instructor_name[0].get('Instructor Name')
    if not instructor_name:
        continue

    department = feedback.get('Department', {})
    if department:
        # Fall back to "Unknown" if the stored value is missing or empty
        department = department[0].get('Department') or "Unknown"
    else:
        department = "Unknown"

    records = feedback.get('general_instructor_questions')
    if not records:
        # No instructor rating table (or an empty one) for this course
        continue
    course_info = records[0]

    # Parse the whole row before touching any totals, so that a malformed row
    # contributes nothing instead of a partial update.
    try:
        total_count = float(course_info['Count'])
        rating_counts = {
            accumulator: total_count * safe_percentage(course_info[column])
            for column, accumulator in rating_columns.items()
        }
    except (ValueError, KeyError):
        continue

    if not total_count > 0:
        # Rows without responses carry no information; registering an
        # instructor with zero responses would cause a division by zero
        # when the scores are computed.
        continue

    if instructor_name not in instructor_data:
        # NOTE(review): departments come from the first course seen for the
        # instructor only; later courses in other departments are not merged.
        instructor_data[instructor_name] = {
            'departments': sorted(set(department.split(", "))),  # Unique departments, stable order
            'excellent_count': 0,
            'very_good_count': 0,
            'good_count': 0,
            'fair_count': 0,
            'unsatisfactory_count': 0,
            'total_responses': 0
        }

    totals = instructor_data[instructor_name]
    for accumulator, amount in rating_counts.items():
        totals[accumulator] += amount
    totals['total_responses'] += total_count

# Convert to DataFrame
df = pd.DataFrame.from_dict(instructor_data, orient='index')

# Global mean and variance of all ratings, scored 5 (Excellent) down to 1 (Unsatisfactory)

# Per-instructor sum of rating scores
score_sums = (
    df['excellent_count'] * 5 +
    df['very_good_count'] * 4 +
    df['good_count'] * 3 +
    df['fair_count'] * 2 +
    df['unsatisfactory_count'] * 1
)

# Per-instructor sum of squared rating scores
squared_score_sums = (
    df['excellent_count'] * 25 +
    df['very_good_count'] * 16 +
    df['good_count'] * 9 +
    df['fair_count'] * 4 +
    df['unsatisfactory_count'] * 1
)

total_weighted_sum = score_sums.sum()
total_responses = df['total_responses'].sum()
global_mean = total_weighted_sum / total_responses

# Variance as E[X^2] - E[X]^2 over all responses
global_variance = (squared_score_sums.sum() / total_responses) - global_mean ** 2

# Empirical Bayes calculation function
def calculate_empirical_bayes(row):
    """Shrink an instructor's mean rating towards the global mean.

    Args:
        row: Mapping with the five ``*_count`` rating totals and
            ``total_responses`` (a DataFrame row or a plain dict).

    Returns:
        The shrunken mean rounded to 3 decimals. When ``total_responses`` is
        zero, negative or NaN there is nothing to observe and the rounded
        module-level ``global_mean`` is returned.

    Reads the module-level ``global_mean`` and ``global_variance``.
    """
    total = row['total_responses']

    # Values taken from a DataFrame row are NumPy floats: dividing by zero
    # gives nan/inf rather than raising ZeroDivisionError, so the no-data
    # case has to be checked explicitly.
    if not total > 0:
        return round(global_mean, 3)

    weighted_sum = (row['excellent_count'] * 5 +
                    row['very_good_count'] * 4 +
                    row['good_count'] * 3 +
                    row['fair_count'] * 2 +
                    row['unsatisfactory_count'] * 1)
    observed_mean = weighted_sum / total
    # More responses -> smaller observed variance -> less shrinkage
    observed_variance = 1 / total
    shrinkage = global_variance / (global_variance + observed_variance)
    return round((shrinkage * observed_mean) + ((1 - shrinkage) * global_mean), 3)

# Score every instructor, then express each score as a 0-100 percentile rank
df['Empirical Bayes Average'] = df.apply(calculate_empirical_bayes, axis=1)
df['Overall Percentile'] = df['Empirical Bayes Average'].rank(pct=True).mul(100)

# Grade bands as (lower, upper, grade), measured in percent from the top:
# lower is inclusive, upper is exclusive.
grade_boundaries = [
    (0, 0.1, 'S+'),
    (0.1, 0.5, 'S'),
    (0.5, 1, 'S-'),
    (1, 2, 'A+'),
    (2, 5, 'A'),
    (5, 10, 'A-'),
    (10, 20, 'B+'),
    (20, 40, 'B'),
    (40, 60, 'B-'),
    (60, 80, 'C'),
    (80, 95, 'D'),
    (95, 100, 'F'),
]

def assign_grade(percentile):
    """Return the grade whose band contains ``100 - percentile``; 'F' if no band matches."""
    distance_from_top = 100 - percentile
    matching_grades = (
        grade
        for lower, upper, grade in grade_boundaries
        if lower <= distance_from_top < upper
    )
    return next(matching_grades, 'F')

# Overall grade and rank (rank 1 = highest score; ties share the lowest rank number)
df['Overall Grade'] = df['Overall Percentile'].apply(assign_grade)
df['Global Rank'] = df['Empirical Bayes Average'].rank(ascending=False, method='min').astype(int)

# Per-instructor dicts, filled below with {department: grade} and {department: rank}
df['Department Grades'] = [{} for _ in range(len(df))]
df['Department Ranks'] = [{} for _ in range(len(df))]

# Every department that appears for at least one instructor
all_departments = set().union(*df['departments'])

for department in all_departments:
    # Instructors listed under this department
    dept_mask = df['departments'].map(lambda members: department in members)
    dept_df = df[dept_mask].copy()

    if dept_df.empty:
        continue

    # Percentile, rank and grade relative to the department only
    dept_scores = dept_df['Empirical Bayes Average']
    dept_df['Dept Percentile'] = dept_scores.rank(pct=True) * 100
    dept_df['Dept Rank'] = dept_scores.rank(ascending=False, method='min').astype(int)
    dept_df['Dept Grade'] = dept_df['Dept Percentile'].apply(assign_grade)

    # Write the results into the per-instructor dicts of the main frame
    for idx in dept_df.index:
        grades_for_instructor = df.at[idx, 'Department Grades']
        ranks_for_instructor = df.at[idx, 'Department Ranks']
        grades_for_instructor[department] = dept_df.at[idx, 'Dept Grade']
        ranks_for_instructor[department] = dept_df.at[idx, 'Dept Rank']

# Format department metrics
def format_dept_metrics(row):
    """Render one '<dept>: <grade> (Rank <rank>)' item per department, joined by ' | '.

    Departments without a stored grade or rank are shown as 'N/A'.
    """
    return " | ".join(
        f"{dept}: {row['Department Grades'].get(dept, 'N/A')} (Rank {row['Department Ranks'].get(dept, 'N/A')})"
        for dept in row['departments']
    )

# Assemble the presentation table, best score first
summary_columns = {
    'Instructor': df.index,
    'Departments': df['departments'].apply(lambda names: ", ".join(names)),
    'Total Ratings': df['total_responses'],
    'EB Score': df['Empirical Bayes Average'],
    'Global Rank': df['Global Rank'],
    'Overall Grade': df['Overall Grade'],
    'Department Metrics': df.apply(format_dept_metrics, axis=1),
}
result_df = pd.DataFrame(summary_columns).sort_values('EB Score', ascending=False)

# Print the full ranking and write it out as a JSON array of records
print(result_df.to_string(index=False))
result_df.to_json('professors.json', orient='records', indent=4)

                            Instructor                Departments  Total Ratings  EB Score  Global Rank Overall Grade                                                  Department Metrics
                      Katharine Clarke                      EXPOS           84.0     4.992            1            S+                                                  EXPOS: S+ (Rank 1)
                      William Friedman                     FRSEMR           47.0     4.986            2            S+                                                 FRSEMR: S+ (Rank 1)
                       Susan Farbstein                     FRSEMR           39.0     4.983            3            S+                                                 FRSEMR: S- (Rank 2)
                       Cirrus Foroughi                       ECON           35.0     4.981            4             S                                                   ECON: S+ (Rank 1)
                              Jing Cai                      CHNSE     