In [11]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import json
import os

In [15]:
# 1. Fixed data loading and cleaning code
file_path = 'dataset.xlsx'
try:
    # List all sheets in the excel file
    xl = pd.ExcelFile(file_path)
    print(f"Available sheets in the excel file: {xl.sheet_names}")
    
    # Read each sheet directly by name to avoid confusion
    lecturer_details = pd.read_excel(file_path, sheet_name='Lecturer Details')
    rooms_data = pd.read_excel(file_path, sheet_name='Rooms data')
    courses_data = pd.read_excel(file_path, sheet_name='Course list')
    student_requests = pd.read_excel(file_path, sheet_name='Student requests')
    
    print("Successfully loaded all data sheets")

    # Clean data - handle missing values but PRESERVE original column names
    lecturer_details = lecturer_details.fillna('')
    rooms_data = rooms_data.fillna('')
    courses_data = courses_data.fillna('')
    student_requests = student_requests.fillna('')
    
    # Display the actual column names to confirm what we're working with
    print("\n--- Lecturer Details Columns ---")
    print(lecturer_details.columns.tolist())
    
    print("\n--- Rooms Data Columns ---")
    print(rooms_data.columns.tolist())
    
    print("\n--- Courses Data Columns ---")
    print(courses_data.columns.tolist())
    
    print("\n--- Student Requests Columns ---")
    print(student_requests.columns.tolist())
    
    # Display sample data
    print("\n--- Lecturer Details Sample ---")
    display(lecturer_details.head(3))
    
    # Create comprehensive JSON that preserves ALL columns
    data_dict = {
        'lecturer_details': lecturer_details.to_dict(orient='records'),
        'rooms_data': rooms_data.to_dict(orient='records'),
        'courses_data': courses_data.to_dict(orient='records'),
        'student_requests': student_requests.to_dict(orient='records')
    }
    
    # Save to JSON file with proper formatting
    with open('crestwood_data_fixed.json', 'w') as f:
        json.dump(data_dict, f, indent=2)
    
    print("\nData successfully converted to JSON and saved as 'crestwood_data_fixed.json'")
    
    # Validate data based on rules
    print("\n--- Data Validation Based on Actual Structure ---")
    
    # 1. Check valid blocks
    valid_blocks = ["1A", "1B", "2A", "2B", "3", "4A", "4B"]
    
    # Identify the actual column names for available/unavailable blocks
    # (print first to confirm the exact column names)
    print("\nCourses data sample to check column names:")
    display(courses_data[['Course Code', 'Title', 'Available Blocks', 'Unavailable Blocks']].head(2))
    
    # Parse blocks function
    def parse_blocks(block_str):
        if not isinstance(block_str, str):
            return []
        return [b.strip() for b in block_str.split(',')]
    
    # Apply parsing to the correctly named columns
    if 'Available Blocks' in courses_data.columns:
        courses_data['available_blocks_list'] = courses_data['Available Blocks'].apply(parse_blocks)
    if 'Unavailable Blocks' in courses_data.columns:
        courses_data['unavailable_blocks_list'] = courses_data['Unavailable Blocks'].apply(parse_blocks)
    
    # Validate blocks (with correct column names)
    invalid_blocks = []
    for _, course in courses_data.iterrows():
        if 'available_blocks_list' in courses_data.columns:
            for block in course['available_blocks_list']:
                if block not in valid_blocks:
                    invalid_blocks.append((course['Title'], block))
    
    if invalid_blocks:
        print(f"Found {len(invalid_blocks)} courses with invalid blocks:")
        for course, block in invalid_blocks:
            print(f"  - Course '{course}' has invalid block '{block}'")
    else:
        print("✓ All courses have valid block assignments")
    
    # 2. Additional insights based on actual data structure
    print("\n--- Data Insights Based on Actual Structure ---")
    
    # Course length distribution
    if 'Length' in courses_data.columns:
        course_lengths = courses_data['Length'].value_counts()
        print("\nCourse distribution by length:")
        print(course_lengths)
    
    # Most requested courses
    if 'Title' in student_requests.columns:
        popular_courses = student_requests['Title'].value_counts().head(10)
        print("\nTop 10 most requested courses:")
        print(popular_courses)
    
    # Request type distribution
    if 'Type' in student_requests.columns:
        type_counts = student_requests['Type'].value_counts()
        print("\nRequest type distribution:")
        print(type_counts)

except FileNotFoundError:
    print(f"Error: The file '{file_path}' was not found")
except Exception as e:
    print(f"Error: {str(e)}")

Available sheets in the excel file: ['Lecturer Details', 'Rooms data', 'Course list', 'Student requests', 'RULES']
Successfully loaded all data sheets

--- Lecturer Details Columns ---
['Lecturer ID', 'Lecture Title', 'lecture Code', 'Length', 'Start Term', 'Section number']

--- Rooms Data Columns ---
['Course Title', 'Section number', ' Year', 'Term Description', 'prof ID', 'lecture ID', 'Course Code', 'Course Length', 'Term name', 'Room Number']

--- Courses Data Columns ---
['Course code', 'Title', 'Length', 'Priority', 'Available blocks', 'Unavailable blocks', 'Minimum section size', 'Target section size', 'Maximum section size', 'Number of sections', 'Total credits']

--- Student Requests Columns ---
['College Year', 'Request start term', 'Title', 'Type', 'student ID', 'Course ID', 'Length', 'Course code', 'Priority', 'Department(s)', 'Credits']

--- Lecturer Details Sample ---


Unnamed: 0,Lecturer ID,Lecture Title,lecture Code,Length,Start Term,Section number
0,5361519,Band - High,ARTBND,2,1,1
1,5361487,Chorus - High,ARTChor,2,1,1
2,5361415,Drawing and Painting,ARTDRAW,2,1,1



Data successfully converted to JSON and saved as 'crestwood_data_fixed.json'

--- Data Validation Based on Actual Structure ---

Courses data sample to check column names:
Error: "['Course Code', 'Available Blocks', 'Unavailable Blocks'] not in index"
