In [None]:
# import json

# with open('all_qguide_data.html', 'r') as file:
#     json_data = json.load(file)

# with open('all_qguide_data.json', 'w') as file:
#     json.dump(json_data, file, separators=(',', ':'))


In [None]:
from bs4 import BeautifulSoup
import pandas as pd
import requests
from functools import lru_cache
from tabulate import tabulate

@lru_cache(maxsize=None)
def extract_table_data(url):
    """Download a course feedback report and turn its HTML tables into DataFrames.

    The page is expected to contain an ``<a href="#report...">`` link whose
    title (or text) looks like ``Feedback for <course title> - <instructor>``.

    :param url: Address of the report page.
    :return: ``None`` when the request fails, the header link is missing, or
        the header text does not match the expected pattern. Otherwise a dict
        of DataFrames: the keys 'Instructor Name', 'Course Name' and
        'Department' hold one-cell frames, and every other key is the
        normalised text of the ``<h3>`` preceding a table.

    Results (including ``None``) are memoised per URL by ``lru_cache``, so
    repeated calls return the very same dict object.
    """
    # Function-scope import: `re` is not part of the notebook's import cell.
    import re

    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
    except requests.RequestException:
        return None

    soup = BeautifulSoup(response.text, 'html.parser')

    # Find the feedback header link
    feedback_header = soup.find('a', {'href': lambda x: x and x.startswith('#report')})
    if not feedback_header:
        return None

    # Prefer the title attribute, fall back to the visible link text
    title_text = feedback_header.get('title', '')
    if not title_text:
        title_text = feedback_header.text

    # Course title is everything up to the hyphen that precedes the
    # instructor name; the name itself may not contain '-', '<' or a newline.
    match = re.search(r'Feedback for\s+(.*?)\s*-\s*([^-<\n]+?)(?:\s*<br>|\s*\(click)', title_text)
    if not match:
        return None

    full_course_title = match.group(1).strip()
    instructor_name = match.group(2).strip()

    # Handle multiple departments in course title: the first word of each
    # comma-separated segment is taken as a department code.
    departments = []
    for part in full_course_title.split(','):
        tokens = part.split()
        if not tokens:
            # Empty segment (e.g. trailing or doubled comma) has no code.
            continue
        dept = tokens[0]
        if dept not in departments:
            departments.append(dept)

    department = ', '.join(departments)

    # Skip certain tables that we don't want to process
    captions_to_skip = [
        "Table for [QTitle]-In this course, most students listen attentively with an open mind and a willingness to change their point of view as they learn more about the topic.-Statistics.",
        "Table for [QTitle]-In this course (including sections), I feel comfortable expressing my views on controversial topics.-Statistics.",
        "Table for [qtitle]-in this course, most students listen attentively with an open mind and a willingness to change their point of view as they learn more about the topic..",
        "Table for [qtitle]-in this course (including sections), i feel comfortable expressing my views on controversial topics..",
        "Table for [qtitle]-in this course, most students listen attentively with an open mind and a willingness to change their point of view as they learn more about the topic.."
    ]

    # Captions are compared case-insensitively as substrings
    captions_to_skip = [caption.lower().strip() for caption in captions_to_skip]

    dataframes = {}

    # Store basic course info
    dataframes['Instructor Name'] = pd.DataFrame([instructor_name], columns=['Instructor Name'])
    dataframes['Course Name'] = pd.DataFrame([full_course_title], columns=['Course Name'])
    dataframes['Department'] = pd.DataFrame([department], columns=['Department'])

    # Process each table
    for table in soup.find_all('table'):
        # The nearest preceding h3 names the table; tables without one are ignored
        h3_tag = table.find_previous('h3')
        if not h3_tag:
            continue
        table_name = "_".join(h3_tag.text.strip().lower().split()).replace("-", "_")

        caption_tag = table.find('caption')
        if caption_tag:
            caption_text = caption_tag.text.strip().lower()
            if any(skip_caption in caption_text for skip_caption in captions_to_skip):
                continue

        # A table without any <tr> cannot provide headers; skip it rather
        # than failing the whole page with an AttributeError.
        header_row = table.find('tr')
        if header_row is None:
            continue

        headers = [th.text.strip() for th in header_row.find_all('th')]
        if not headers:
            continue

        # Keep only rows whose cell count matches the header row
        data = []
        for row in table.find_all('tr'):
            row_data = [cell.text.strip() for cell in row.find_all(['td', 'th'])]
            if len(row_data) == len(headers):
                data.append(row_data)

        # Create DataFrame if we have data; data[0] is the header row itself.
        # A later table under the same h3 replaces an earlier one.
        if len(data) > 1:
            dataframes[table_name] = pd.DataFrame(data[1:], columns=headers)

    return dataframes

In [None]:
url = "https://harvard.bluera.com/harvard/rpv-eng.aspx?lang=eng&redi=1&SelectedIDforPrint=2a25bbb354dd26d2a5018145b3eccac802438e93cb86651665bfd7dc8f5e8b888dcf958e60eb641327093338906fe195&ReportType=2&regl=en-US"
tables_data = extract_table_data(url)

# extract_table_data returns None when the page cannot be fetched or parsed,
# so guard before iterating instead of failing with an AttributeError.
if tables_data is None:
    print(f"No feedback data could be extracted from {url}")
else:
    # Print instructor name and each table with its name
    for table_name, table_df in tables_data.items():
        print(table_name, table_df)
        print(tabulate(table_df, headers='keys', tablefmt='grid'))
        print()

In [None]:
import json
import time
from concurrent.futures import ThreadPoolExecutor

# Results of the scrape accumulate here, one dict per course.
course_data_list = []

# Index of every report to scrape; the entries live under the 'data' key.
with open('all_qguide_data.json', 'r') as qguide_file:
    all_qguide_data = json.load(qguide_file)

# Counters read and updated by update_progress() for the ETA display.
start_time = time.time()
processed_entries = 0
total_entries = len(all_qguide_data['data'])

def process_entry(entry):
    """Build the output record for one index entry.

    Copies the known metadata fields that are present in ``entry`` (keys are
    passed through ``str.capitalize``, so 'blueCourseId' becomes
    'Bluecourseid') and attaches the scraped tables under 'Feedback'.

    :param entry: Mapping describing one course; must contain 'url'.
    :return: Dict with the copied metadata plus a 'Feedback' dict that maps
        table name to a list of row records (empty when scraping yields None).
    """
    metadata_keys = ('title', 'url', 'instructor', 'department', 'term', 'subject', 'blueCourseId')

    course_data = {}
    for key in metadata_keys:
        if key in entry:
            course_data[key.capitalize()] = entry[key]

    tables_data = extract_table_data(entry['url'])

    if tables_data is not None:
        # DataFrames are not JSON-serialisable; store them as lists of row dicts
        course_data['Feedback'] = {
            name: frame.to_dict(orient='records')
            for name, frame in tables_data.items()
        }
    else:
        course_data['Feedback'] = {}

    return course_data

def update_progress():
    """Count one more finished entry and print progress with a time estimate.

    Increments the module-level ``processed_entries`` counter and derives the
    estimate from the average time per entry since ``start_time``.
    """
    global processed_entries
    processed_entries += 1

    percent_finished = (processed_entries / total_entries) * 100
    avg_time_per_iteration = (time.time() - start_time) / processed_entries
    remaining_time_seconds = (total_entries - processed_entries) * avg_time_per_iteration

    # Break the remaining seconds down into h / m / s
    whole_hours, leftover = divmod(remaining_time_seconds, 3600)
    whole_minutes, leftover_seconds = divmod(leftover, 60)
    hours = int(whole_hours)
    minutes = int(whole_minutes)
    seconds = int(leftover_seconds)

    print(f"Iteration {processed_entries}/{total_entries} - {percent_finished:.2f}% completed - Estimated time remaining: {hours}h {minutes}m {seconds}s")

# Fan the scraping out over 16 worker threads, then collect the results in
# submission order so course_data_list mirrors the order of the index file.
with ThreadPoolExecutor(max_workers=16) as executor:
    futures = [
        executor.submit(process_entry, entry)
        for entry in all_qguide_data['data']
    ]

    for pending in futures:
        try:
            outcome = pending.result()
            if outcome is not None:
                course_data_list.append(outcome)
            update_progress()
        except Exception as exc:
            # A failed entry is reported and still counted as processed
            print(f"Error processing entry: {exc}")
            update_progress()

# Persist everything that was scraped successfully
with open('course_data.json', 'w') as json_file:
    json.dump(course_data_list, json_file, indent=4)

# The section below is no longer possible because instructor names were not scraped for the 2019 data :(

In [None]:
# import json

# def add_unique_courses(json1, json2):
#     # Create a set of Bluecourseid from the first JSON to keep track of added courses
#     existing_course_ids = {course['Bluecourseid'] for course in json1}
    
#     # Loop through the second JSON and add courses that don't exist in the first
#     for course in json2:
#         if course['Bluecourseid'] not in existing_course_ids:
#             json1.append(course)  # Add the new course to the first JSON
    
#     return json1

# # Load JSON data from files
# with open('course_data.json', 'r') as file1:
#     json1 = json.load(file1)

# with open('new_course_data.json', 'r') as file2:
#     json2 = json.load(file2)

# # Add unique courses from json2 to json1
# combined_json = add_unique_courses(json1, json2)

# # Save the combined JSON array
# with open('2020-2024_Course_Data.json', 'w') as outfile:
#     json.dump(combined_json, outfile, indent=4)


In [None]:
import pandas as pd
import json
from tabulate import tabulate

# Read the scraped course records back from disk
with open('course_data.json', 'r') as f:
    course_data = json.load(f)

# One row per scraped course
df = pd.DataFrame(course_data)

# Show the first ten rows as a grid to get a feel for the structure
data_sample = df.iloc[:10]
print("Data sample (general head):")
print(tabulate(data_sample, tablefmt='grid', headers='keys'))

In [13]:
import pandas as pd
import json
import numpy as np

# Load the data
with open('course_data.json', 'r') as f:
    course_data = json.load(f)


def safe_percentage(value):
    """Convert a percentage string such as '42%' to a fraction; 'N/A' counts as 0."""
    if value == 'N/A':
        return 0
    return float(value.strip('%')) / 100


# Rating column of the scraped table -> key of the running tally it feeds
rating_columns = {
    'Excellent': 'excellent_count',
    'Very Good': 'very_good_count',
    'Good': 'good_count',
    'Fair': 'fair_count',
    'Unsatisfactory': 'unsatisfactory_count',
}

# Create a dictionary to store aggregated data per instructor
instructor_data = {}

for course in course_data:
    feedback = course.get('Feedback', {})
    instructor_name = feedback.get('Instructor Name', {})
    if not instructor_name:
        continue

    instructor_name = instructor_name[0].get('Instructor Name')
    if not instructor_name:
        continue

    department = feedback.get('Department', {})
    if department:
        # A missing/empty value falls back to "Unknown" instead of breaking .split below
        department = department[0].get('Department') or "Unknown"
    else:
        department = "Unknown"

    # Only courses that have the instructor question table contribute
    if "general_instructor_questions" not in feedback:
        continue
    records = feedback["general_instructor_questions"]

    if instructor_name not in instructor_data:
        # NOTE: departments are taken from the first course seen for this
        # instructor; later courses do not extend the list.
        instructor_data[instructor_name] = {
            'departments': department.split(", "),  # Split multiple departments into a list
            'excellent_count': 0,
            'very_good_count': 0,
            'good_count': 0,
            'fair_count': 0,
            'unsatisfactory_count': 0,
            'total_responses': 0
        }

    # Parse the whole row first so that a malformed value cannot leave the
    # tallies half-updated (counts added without the matching total).
    try:
        course_info = records[0]  # only the first row of the table is used
        total_count = float(course_info['Count'])
        increments = {
            tally_key: total_count * safe_percentage(course_info[column])
            for column, tally_key in rating_columns.items()
        }
    except (ValueError, KeyError, IndexError):
        continue

    tallies = instructor_data[instructor_name]
    for tally_key, amount in increments.items():
        tallies[tally_key] += amount
    tallies['total_responses'] += total_count

# Convert to DataFrame
df = pd.DataFrame.from_dict(instructor_data, orient='index')

# Calculate the global mean and variance (τ²) dynamically based on dataset.
# Ratings are scored Excellent=5 ... Unsatisfactory=1.
total_weighted_sum = (
    df['excellent_count'] * 5 +
    df['very_good_count'] * 4 +
    df['good_count'] * 3 +
    df['fair_count'] * 2 +
    df['unsatisfactory_count'] * 1
).sum()
total_responses = df['total_responses'].sum()
global_mean = total_weighted_sum / total_responses  # Global mean (μ)

# Calculate global variance (τ²) as E[x²] - μ² over all individual responses
global_variance = (
    (
        df['excellent_count'] * 5 ** 2 +
        df['very_good_count'] * 4 ** 2 +
        df['good_count'] * 3 ** 2 +
        df['fair_count'] * 2 ** 2 +
        df['unsatisfactory_count'] * 1 ** 2
    ).sum() / total_responses
) - global_mean ** 2

# Function to calculate Empirical Bayes Average with normal prior
def calculate_empirical_bayes_normal(row, global_mean=global_mean, global_variance=global_variance):
    """
    Calculate the Empirical Bayes Average using a normal prior.

    The observed mean (Excellent=5 ... Unsatisfactory=1) is pulled towards
    the prior mean; the fewer responses a row has, the stronger the pull.

    :param row: Data row containing the five rating counts and 'total_responses'.
    :param global_mean: The overall mean (prior mean).
    :param global_variance: The prior variance.
    :return: Empirical Bayes score rounded to 3 decimals, or ``global_mean``
        unchanged when the row has no (positive) number of responses.
    """
    n_responses = row['total_responses']

    # Explicit guard instead of catching ZeroDivisionError: NumPy scalars do
    # not raise on division by zero (they produce nan/inf), and the negated
    # comparison also sends NaN totals to the prior.
    if not n_responses > 0:
        return global_mean  # Default to prior mean if no responses

    # Weighted sum and observed mean for the instructor
    weighted_sum = (row['excellent_count'] * 5 +
                    row['very_good_count'] * 4 +
                    row['good_count'] * 3 +
                    row['fair_count'] * 2 +
                    row['unsatisfactory_count'] * 1)
    observed_mean = weighted_sum / n_responses

    # Variance of observed mean (ε²), assuming unit variance per response
    observed_variance = 1 / n_responses

    # Shrinkage weight (B_i): approaches 1 as the number of responses grows
    shrinkage_weight = global_variance / (global_variance + observed_variance)

    # Empirical Bayes estimate
    eb_score = shrinkage_weight * observed_mean + (1 - shrinkage_weight) * global_mean
    return round(eb_score, 3)

# Score every instructor row with the shrunken (Empirical Bayes) average
df['Empirical Bayes Average'] = df.apply(calculate_empirical_bayes_normal, axis=1)

# Express each score's position within the whole population on a 0-100 scale
df['Percentile Rank'] = 100 * df['Empirical Bayes Average'].rank(pct=True)

# Each tuple is (lower, upper, grade) with lower inclusive and upper
# exclusive. The numbers are percentiles measured from the TOP of the
# ranking, not raw scores, so the scale is deliberately unconventional.
grade_boundaries = [
        (0, 0.1, 'S+'),
        (0.1, 0.5, 'S'),
        (0.5, 1, 'S-'),
        (1, 2, 'A+'),
        (2, 5, 'A'),
        (5, 10, 'A-'),
        (10, 20, 'B+'),
        (20, 40, 'B'),
        (40, 60, 'B-'),
        (60, 80, 'C'),
        (80, 95, 'D'),
        (95, 100, 'F'),
]

def assign_letter_grade(percentile):
    """Map a 0-100 percentile rank to a letter grade.

    The rank is converted to a distance from the top (``100 - percentile``)
    and looked up in ``grade_boundaries``; anything that falls in no band
    gets 'F'.
    """
    distance_from_top = 100 - percentile
    matching_grades = (
        grade
        for lower, upper, grade in grade_boundaries
        if lower <= distance_from_top < upper
    )
    return next(matching_grades, 'F')

df['Overall Letter Grade'] = df['Percentile Rank'].apply(assign_letter_grade)

# Add rank for Empirical Bayes Average (1 = best; ties share the lowest rank)
df['Empirical Bayes Rank'] = df['Empirical Bayes Average'].rank(ascending=False, method='min').astype(int)

# ============================================
# Intra-Department Empirical Bayes Rankings
# ============================================

# Initialize a dictionary to store intra-department ranks: instructor -> {department: rank}
intra_dept_ranks = {instructor: {} for instructor in df.index}

# Get unique departments
unique_departments = set(dept for depts in df['departments'] for dept in depts)

# Initialize dictionary columns properly (one independent dict per row)
df['Intra-Department EB Averages'] = [{} for _ in range(len(df))]
df['Intra-Department Letter Grades'] = [{} for _ in range(len(df))]
df['Intra-Department Ranks'] = ""


# Iterate through each department and calculate intra-department scores and ranks
for department in unique_departments:
    # Filter the DataFrame for instructors in the current department
    dept_df = df[df['departments'].apply(lambda x: department in x)].copy()

    if dept_df.empty:
        continue  # Skip if no instructors in this department

    # Calculate department-specific global mean and variance
    total_weighted_sum_dept = (
        dept_df['excellent_count'] * 5 +
        dept_df['very_good_count'] * 4 +
        dept_df['good_count'] * 3 +
        dept_df['fair_count'] * 2 +
        dept_df['unsatisfactory_count'] * 1
    ).sum()
    total_responses_dept = dept_df['total_responses'].sum()

    if total_responses_dept == 0:
        continue  # Avoid division by zero

    global_mean_dept = total_weighted_sum_dept / total_responses_dept  # μ_dept

    global_variance_dept = (
        (
            dept_df['excellent_count'] * 5 ** 2 +
            dept_df['very_good_count'] * 4 ** 2 +
            dept_df['good_count'] * 3 ** 2 +
            dept_df['fair_count'] * 2 ** 2 +
            dept_df['unsatisfactory_count'] * 1 ** 2
        ).sum() / total_responses_dept
    ) - global_mean_dept ** 2  # τ²_dept

    # Apply department-specific Empirical Bayes calculation. This reuses the
    # overall scoring function with the department's prior (apply forwards
    # the keyword arguments) instead of redefining an identical helper on
    # every iteration.
    dept_df['Empirical Bayes Dept Average'] = dept_df.apply(
        calculate_empirical_bayes_normal,
        axis=1,
        global_mean=global_mean_dept,
        global_variance=global_variance_dept,
    )

    # Calculate percentile ranks within the department
    dept_df['Percentile Rank Dept'] = dept_df['Empirical Bayes Dept Average'].rank(pct=True) * 100

    # Assign letter grades based on department-specific percentiles
    dept_df['Dept Letter Grade'] = dept_df['Percentile Rank Dept'].apply(assign_letter_grade)

    # Rank within the department
    dept_df['Intra-Department Rank'] = dept_df['Empirical Bayes Dept Average'].rank(
        ascending=False, method='min'
    ).astype(int)

    # Update the main DataFrame
    for instructor in dept_df.index:
        intra_dept_ranks[instructor][department] = dept_df.at[instructor, 'Intra-Department Rank']
        # Store department-specific scores and grades in the per-row dictionaries
        df.at[instructor, 'Intra-Department EB Averages'][department] = dept_df.at[instructor, 'Empirical Bayes Dept Average']
        df.at[instructor, 'Intra-Department Letter Grades'][department] = dept_df.at[instructor, 'Dept Letter Grade']

# Format the department-specific information for output
def format_dept_info(row):
    """Render one row's per-department score, grade and rank as a single string.

    Departments are listed in the order of ``row['departments']`` and joined
    with ' | '; any value missing for a department is shown as 'N/A'.
    """
    def describe(dept):
        eb_avg = row['Intra-Department EB Averages'].get(dept, 'N/A')
        letter_grade = row['Intra-Department Letter Grades'].get(dept, 'N/A')
        # Ranks are kept outside the DataFrame, keyed by the row's index label
        rank = intra_dept_ranks[row.name].get(dept, 'N/A')
        return f"{dept}: {eb_avg} ({letter_grade}, Rank: {rank})"

    return " | ".join([describe(dept) for dept in row['departments']])

# Create final output DataFrame with rankings
result_df = pd.DataFrame({
    'Instructor Name': df.index,
    'Departments': df['departments'].apply(lambda x: ", ".join(x)),
    'Total Ratings': df['total_responses'],
    'Empirical Bayes Average': df['Empirical Bayes Average'],
    'Empirical Bayes Rank': df['Empirical Bayes Rank'],
    'Overall Letter Grade': df['Overall Letter Grade'],
    'Department Metrics': df.apply(format_dept_info, axis=1)
}).sort_values(by='Empirical Bayes Average', ascending=False)


# Display results
print(result_df.to_string(index=False))

# Convert final DataFrame to a JSON-friendly dictionary
result_json = result_df.to_dict(orient='records')

# Save as a JSON file for further processing or directly return as JSON string
with open('professors.json', 'w') as f:
    json.dump(result_json, f, indent=4)

# Optional: Output the JSON string for verification
print(json.dumps(result_json, indent=4))



                       Instructor Name                Departments  Total Ratings  Empirical Bayes Average  Empirical Bayes Rank Overall Letter Grade                                                                          Department Metrics
                      Katharine Clarke                      EXPOS           84.0                    4.992                     1                   S+                                                                  EXPOS: 4.991 (S+, Rank: 1)
                      William Friedman                     FRSEMR           47.0                    4.986                     2                   S+                                                                 FRSEMR: 4.986 (S+, Rank: 1)
                       Susan Farbstein                     FRSEMR           39.0                    4.983                     3                   S+                                                                 FRSEMR: 4.983 (S-, Rank: 2)
                       Cirrus Foroug