In [None]:
from schema import parse_json_file, School

# Parse 'joemama.json' into the School object that the analysis cells below read from.
school: School = parse_json_file('joemama.json')


In [None]:
import statistics
from collections import Counter, defaultdict
from pathlib import Path
from typing import Any, List, Dict

import matplotlib.pyplot as plt
import pandas as pd

from schema import School, Professor, Review, Course


def analyze_professors(school: School) -> Dict[str, Dict[str, Any]]:
    """Summarise the reviews of every professor that has at least one review.

    Args:
        school: Object whose ``professors`` each expose ``firstname``,
            ``lastName`` and ``reviews``; every review must expose
            ``difficulty``, ``quality`` and an iterable ``tags``.

    Returns:
        A dict keyed by ``"<firstname> <lastName>"``. Each value is a dict
        with:

        * ``'review_count'``: number of reviews,
        * ``'avg_difficulty'``: ``statistics.mean`` of the difficulties,
        * ``'avg_quality'``: ``statistics.mean`` of the qualities,
        * ``'tag_frequency'``: plain dict mapping each tag to how many times
          it occurs across the professor's reviews.

        Professors without reviews are omitted. Because the key is the
        display name, two professors with the same name share one entry and
        the later one wins.
    """
    analytics = {}

    for professor in school.professors:
        reviews = professor.reviews
        if not reviews:
            # Nothing to average; statistics.mean would raise on empty data.
            continue

        # Single pass over all tags instead of one list.count() per distinct tag.
        tag_counts = Counter(tag for review in reviews for tag in review.tags)

        analytics[f"{professor.firstname} {professor.lastName}"] = {
            'review_count': len(reviews),
            'avg_difficulty': statistics.mean(
                [review.difficulty for review in reviews]),
            'avg_quality': statistics.mean(
                [review.quality for review in reviews]),
            # Converted back to a plain dict so callers printing it see {...}.
            'tag_frequency': dict(tag_counts),
        }

    return analytics


def plot_review_timeseries(school: School):
    """Plot monthly mean difficulty and quality, one line per professor.

    One row is collected per review across ``school.professors``. The rows
    are indexed by review date and, for each professor, resampled to
    month-end buckets. Two stacked subplots sharing the x axis are drawn:
    mean difficulty on top and mean quality below. The figure is shown with
    ``plt.show()``.

    If there is no review at all, a message is printed and nothing is
    plotted.

    Args:
        school: Object whose ``professors`` each expose ``firstname``,
            ``lastName`` and ``reviews``; every review must expose ``date``,
            ``difficulty`` and ``quality``.
    """
    rows = [
        {
            'professor': f"{professor.firstname} {professor.lastName}",
            'date': review.date,
            'difficulty': review.difficulty,
            'quality': review.quality
        }
        for professor in school.professors
        for review in (professor.reviews or ())
    ]

    if not rows:
        # An empty frame has no 'date' column, so the indexing below would
        # raise KeyError; bail out early instead.
        print("No review data available to plot")
        return

    df = pd.DataFrame(rows)
    df['date'] = pd.to_datetime(df['date'])
    df.set_index('date', inplace=True)

    # Convert difficulty and quality to numeric, forcing errors to NaN
    df['difficulty'] = pd.to_numeric(df['difficulty'], errors='coerce')
    df['quality'] = pd.to_numeric(df['quality'], errors='coerce')

    # Month-end buckets. An offset object is used because the 'M' string
    # alias is deprecated in recent pandas releases.
    month_end = pd.offsets.MonthEnd()

    fig, (ax_difficulty, ax_quality) = plt.subplots(
        2, 1, figsize=(14, 10), sharex=True)

    # sort=False keeps professors in order of first appearance.
    for name, prof_rows in df.groupby('professor', sort=False):
        # Resample once and reuse the result for both subplots.
        monthly = prof_rows[['difficulty', 'quality']].resample(month_end).mean()
        monthly['difficulty'].plot(ax=ax_difficulty, label=name)
        monthly['quality'].plot(ax=ax_quality, label=name)

    ax_difficulty.set_title('Average Difficulty Over Time')
    ax_difficulty.set_ylabel('Average Difficulty')
    ax_difficulty.legend()

    ax_quality.set_title('Average Quality Over Time')
    ax_quality.set_ylabel('Average Quality')
    ax_quality.legend()

    # The x axis is shared, so only the bottom subplot carries the label.
    ax_quality.set_xlabel('Date')
    plt.show()


def plot_review_timeseries_2(school: School):
    """Plot school-wide monthly difficulty and quality statistics.

    One row is collected per review across ``school.professors``. The rows
    are indexed by review date and resampled to month-end buckets, and the
    mean, standard deviation, minimum and maximum of difficulty and quality
    are computed per bucket. Gaps are filled by linear interpolation. Two
    stacked subplots sharing the x axis are drawn (difficulty on top,
    quality below) and shown with ``plt.show()``.

    If there is no review at all, a message is printed and nothing is
    plotted.

    Args:
        school: Object whose ``professors`` each expose ``firstname``,
            ``lastName`` and ``reviews``; every review must expose ``date``,
            ``difficulty`` and ``quality``.
    """
    rows = [
        {
            'professor': f"{professor.firstname} {professor.lastName}",
            'date': review.date,
            'difficulty': review.difficulty,
            'quality': review.quality
        }
        for professor in school.professors
        for review in (professor.reviews or ())
    ]

    if not rows:
        # An empty frame has no 'date' column, so the indexing below would
        # raise KeyError; bail out early instead.
        print("No review data available to plot")
        return

    df = pd.DataFrame(rows)
    df['date'] = pd.to_datetime(df['date'])
    df.set_index('date', inplace=True)

    # Convert difficulty and quality to numeric, forcing errors to NaN
    df['difficulty'] = pd.to_numeric(df['difficulty'], errors='coerce')
    df['quality'] = pd.to_numeric(df['quality'], errors='coerce')

    # (statistic, legend prefix, extra plot kwargs), in drawing order.
    line_styles = (
        ('mean', 'Average', {}),
        ('std', 'Std Dev', {'linestyle': '--'}),
        ('min', 'Min', {'linestyle': ':'}),
        ('max', 'Max', {'linestyle': ':'}),
    )
    stat_names = [stat for stat, _, _ in line_styles]

    # Month-end buckets. An offset object is used because the 'M' string
    # alias is deprecated in recent pandas releases.
    resampled_df = df.resample(pd.offsets.MonthEnd()).agg({
        'difficulty': stat_names,
        'quality': stat_names
    }).interpolate(method='linear')

    fig, (ax_difficulty, ax_quality) = plt.subplots(
        2, 1, figsize=(14, 10), sharex=True)

    for metric, axis in (('difficulty', ax_difficulty), ('quality', ax_quality)):
        for stat, prefix_label, style in line_styles:
            resampled_df[metric][stat].plot(
                ax=axis, label=f'{prefix_label} {metric.capitalize()}', **style)

    ax_difficulty.set_title('Difficulty Statistics Over Time')
    ax_difficulty.set_ylabel('Difficulty')
    ax_difficulty.legend()

    ax_quality.set_title('Quality Statistics Over Time')
    ax_quality.set_ylabel('Quality')
    ax_quality.legend()

    # The x axis is shared, so only the bottom subplot carries the label.
    ax_quality.set_xlabel('Date')
    plt.show()


def plot_review_timeseries_by_course_prefix(school: School):
    """Save one difficulty/quality statistics figure per course prefix.

    Professors are grouped with ``group_professors_by_course_prefix``. For
    each group the reviews are indexed by date and resampled to month-end
    buckets, and the mean, standard deviation and minimum of difficulty and
    quality are computed per bucket. Gaps are filled by linear
    interpolation. Each figure has two stacked subplots sharing the x axis
    and is written to ``plot/review_timeseries_<prefix>.png``; the ``plot``
    directory is created if it is missing. Figures are closed after saving
    rather than shown.

    Groups without any review are reported with a printed message and
    skipped.

    Args:
        school: Object whose ``professors`` each expose ``firstname``,
            ``lastName``, ``courses`` and ``reviews``; every review must
            expose ``date``, ``difficulty`` and ``quality``.
    """
    course_prefix_groups = group_professors_by_course_prefix(school)

    output_dir = Path('plot')
    # Month-end buckets. An offset object is used because the 'M' string
    # alias is deprecated in recent pandas releases.
    month_end = pd.offsets.MonthEnd()

    # (statistic, legend prefix, extra plot kwargs), in drawing order.
    # The maximum is not drawn in this per-prefix view.
    line_styles = (
        ('mean', 'Average', {}),
        ('std', 'Std Dev', {'linestyle': '--'}),
        ('min', 'Min', {'linestyle': ':'}),
    )
    stat_names = [stat for stat, _, _ in line_styles]

    for prefix, professors in course_prefix_groups.items():
        rows = [
            {
                'professor': f"{professor.firstname} {professor.lastName}",
                'date': review.date,
                'difficulty': review.difficulty,
                'quality': review.quality
            }
            for professor in professors
            for review in (professor.reviews or ())
        ]

        if not rows:
            print(f"No data available for course prefix: {prefix}")
            continue

        df = pd.DataFrame(rows)
        df['date'] = pd.to_datetime(df['date'])
        df.set_index('date', inplace=True)

        # Convert difficulty and quality to numeric, forcing errors to NaN
        df['difficulty'] = pd.to_numeric(df['difficulty'], errors='coerce')
        df['quality'] = pd.to_numeric(df['quality'], errors='coerce')

        resampled_df = df.resample(month_end).agg({
            'difficulty': stat_names,
            'quality': stat_names
        }).interpolate(method='linear')

        fig, (ax_difficulty, ax_quality) = plt.subplots(
            2, 1, figsize=(14, 10), sharex=True)

        for metric, axis in (('difficulty', ax_difficulty), ('quality', ax_quality)):
            for stat, prefix_label, style in line_styles:
                resampled_df[metric][stat].plot(
                    ax=axis, label=f'{prefix_label} {metric.capitalize()}', **style)

        ax_difficulty.set_title(f'Difficulty Statistics Over Time for {prefix}')
        ax_difficulty.set_ylabel('Difficulty')
        ax_difficulty.legend()

        ax_quality.set_title(f'Quality Statistics Over Time for {prefix}')
        ax_quality.set_ylabel('Quality')
        ax_quality.legend()

        # The x axis is shared, so only the bottom subplot carries the label.
        ax_quality.set_xlabel('Date')

        # savefig does not create missing directories, so create it first.
        output_dir.mkdir(parents=True, exist_ok=True)
        fig.savefig(output_dir / f'review_timeseries_{prefix}.png')
        # Close explicitly so figures do not accumulate across prefixes.
        plt.close(fig)


def group_professors_by_course_prefix(school: School) -> Dict[str, List[Professor]]:
    """Bucket professors by the alphabetic prefix of their first course code.

    Only the first key yielded by ``professor.courses.keys()`` is considered.
    The prefix is that code with every non-alphabetic character removed, so a
    code without letters yields the empty-string prefix. Professors with no
    courses are left out.

    Args:
        school: Object whose ``professors`` each expose a ``courses`` mapping
            keyed by course code.

    Returns:
        A ``defaultdict(list)`` mapping each prefix to its professors, in the
        order they appear in ``school.professors``.
    """
    buckets = defaultdict(list)

    for member in school.professors:
        codes = iter(member.courses.keys())
        try:
            first_code = next(codes)
        except StopIteration:
            # No courses at all: this professor is not grouped anywhere.
            continue

        letters_only = ''.join(ch for ch in first_code if str.isalpha(ch))
        buckets[letters_only].append(member)

    return buckets

In [None]:
# Compute the per-professor summary and print it as a plain-text report:
# one block per professor, averages rounded to two decimals, blank line between blocks.
professor_analytics = analyze_professors(school)
for professor, data in professor_analytics.items():
    print(f"Professor: {professor}")
    print(f"  Review Count: {data['review_count']}")
    print(f"  Average Difficulty: {data['avg_difficulty']:.2f}")
    print(f"  Average Quality: {data['avg_quality']:.2f}")
    print(f"  Tag Frequency: {data['tag_frequency']}")
    print()
    

In [None]:
# Show monthly mean difficulty and quality with one line per professor.
plot_review_timeseries(school)

In [None]:
# Show school-wide monthly mean/std/min/max of difficulty and quality.
plot_review_timeseries_2(school)

In [None]:
# Write one statistics figure per course prefix to plot/review_timeseries_<prefix>.png.
plot_review_timeseries_by_course_prefix(school)