In [6]:
import re

def clean_and_structure_content(content):
    """Normalize raw course text into blank-line-separated sections.

    The text is split in front of each known section marker ("Week Date",
    "KEY CONCEPTS", "REQUIRED PREPARATION", "EVALUATION",
    "Instructional Plan"). Within every section, whitespace is collapsed to
    single spaces, the number trailing the running course header is dropped,
    "Page X of Y" markers are removed, and every upper-case label followed by
    a colon (e.g. "KEY CONCEPTS:") is moved onto its own line.

    Args:
        content: The raw text to clean.

    Returns:
        The cleaned sections joined by blank lines. Sections that are empty
        after cleaning are omitted; an empty input yields an empty string.
    """
    # The lookahead keeps each marker at the start of the section it opens.
    sections = re.split(
        r'\n(?=Week Date|KEY CONCEPTS|REQUIRED PREPARATION|EVALUATION|Instructional Plan)',
        content,
    )

    cleaned_sections = []
    for section in sections:
        # Collapse whitespace first so the single-space patterns below match
        # regardless of how the source text was wrapped.
        section = re.sub(r'\s+', ' ', section)

        # Drop the number that trails the running header (the header itself
        # is kept). The whitespace is only consumed together with digits, so
        # the header is never glued to the word that follows it.
        section = re.sub(
            r'(MGMT-8680 IT Operations & Project Management -Prof\. Sean Yo)(?:\s*\d+)?',
            r'\1',
            section,
        )

        # Remove page numbers
        section = re.sub(r'Page \d+ of \d+', '', section)

        # The removals above can leave doubled or dangling spaces behind.
        section = re.sub(r' {2,}', ' ', section).strip()

        # Put each upper-case label on its own line. The pattern captures the
        # whole run of upper-case words so multi-word labels stay intact,
        # \b prevents a break inside a mixed-case word, and the optional
        # leading space is consumed so no line ends in trailing whitespace.
        section = re.sub(r' ?\b([A-Z]+(?: [A-Z]+)*):', r'\n\1:', section)

        # A label at the very start of a section needs no newline before it.
        section = section.lstrip('\n')

        if section:
            cleaned_sections.append(section)

    # Join cleaned sections
    return '\n\n'.join(cleaned_sections)

def main(input_path='cleaned_course_content.txt',
         output_path='structured_course_content.txt'):
    """Clean the course text in ``input_path`` and write it to ``output_path``.

    Both files are read and written as UTF-8. Called without arguments, the
    function uses the original file names.

    Args:
        input_path: Path of the text file to read.
        output_path: Path of the file that receives the structured text.
    """
    # Read the content from the file
    with open(input_path, 'r', encoding='utf-8') as file:
        content = file.read()

    # Clean and structure the content
    structured_content = clean_and_structure_content(content)

    # Write the structured content to a new file
    with open(output_path, 'w', encoding='utf-8') as file:
        file.write(structured_content)

    print(f"Content has been cleaned and structured. Check '{output_path}'")

if __name__ == "__main__":
    main()

Content has been cleaned and structured. Check 'structured_course_content.txt'


In [3]:
import json

In [4]:
# Read the structured course content file
with open('structured_course_content.txt', 'r', encoding='utf-8') as f:
    structured_content_full = f.read()

# Read the transcriptions JSON file. The encoding is given explicitly so the
# load does not depend on the platform's locale default, matching how the
# text file above is opened.
with open('transcriptions.json', 'r', encoding='utf-8') as f:
    transcriptions_full = json.load(f)

# Print the beginning of each content to confirm loading
print("Structured Course Content (first 500 chars):")
print(structured_content_full[:500])

# Preview only the first three entries, truncated to 200 characters each.
print("\nTranscriptions JSON Sample:")
for key, value in list(transcriptions_full.items())[:3]:
    print(f"{key}: {value[:200]}...")


Structured Course Content (first 500 chars):
1 Dec 2008 A Tale of Two Projects ~ By Robert Howard A business tale of what it takes to turn around troubled projects. The year is 2005 and times are good. The business environment is vibrant and the economy is strong. Large businesses are committing large amounts of capital and resources to implement new strategies, establish new capabilities, and open new markets. It was no different at PintCo, where Jack works as a Director of Customer Relationship Management. Jack walked into work on Monday

Transcriptions JSON Sample:
Unit 10 - Topic 1.webm: welcome to unit ten managing organizational change in this unit we'll discuss an explore how to manage organizational change with a focus on the integration of project management and service managemen...
Unit 10 - Topic 2.webm: and now move on to organizational change models so let's make sure we're all on the same page and level set on what change models are these are a process to manage organizat

In [None]:
from transformers import pipeline

# Initialize the LLM summarization pipeline.
# The checkpoint is named explicitly so results do not shift if the library's
# task default changes.
# NOTE(review): this is believed to be the checkpoint transformers selects
# for "summarization" when no model is given - confirm against the installed
# version before relying on identical output.
summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")

# Segment the structured content
def segment_text(text, max_length=500):
    """Split ``text`` into consecutive chunks of at most ``max_length`` characters.

    A chunk boundary is moved back to the nearest preceding whitespace when
    the fixed-width cut would fall inside a word, so words are not fed to
    the summarizer in halves. A run of non-whitespace longer than the window
    is still cut at ``max_length``. No characters are dropped: joining the
    chunks reproduces ``text``.

    Args:
        text: The string to split.
        max_length: Maximum number of characters per chunk; must be positive.

    Returns:
        A list of chunks in order; empty when ``text`` is empty.

    Raises:
        ValueError: If ``max_length`` is not positive.
    """
    if max_length <= 0:
        raise ValueError("max_length must be a positive integer")

    chunks = []
    total = len(text)
    start = 0
    while start < total:
        end = min(start + max_length, total)
        # Only look for a better boundary when the cut would split a word,
        # i.e. the character just past the window is not whitespace.
        if end < total and not text[end].isspace():
            cut = end
            while cut > start and not text[cut - 1].isspace():
                cut -= 1
            # Back off only if that leaves real content in this chunk;
            # otherwise fall back to the hard cut so progress is guaranteed.
            if text[start:cut].strip():
                end = cut
        chunks.append(text[start:end])
        start = end
    return chunks

# Use the LLM to summarize segments and identify topics
segments = segment_text(structured_content_full)
summarized_segments = [
    summarizer(segment, max_length=150, min_length=30, do_sample=False)
    for segment in segments
]

# Key each summary by a 1-based "Section N" title; the summary text is read
# from summary[0]['summary_text'].
sectioned_content = {
    f"Section {idx}": summary[0]['summary_text']
    for idx, summary in enumerate(summarized_segments, start=1)
}

# Print out the structured sections as a demonstration
print("Structured Sections Generated by LLM:")
for title, content in sectioned_content.items():
    print(f"{title}: {content[:200]}...")

# Integrate transcriptions
# NOTE(review): titles here are always "Section N", while the transcription
# keys printed earlier look like "Unit 10 - Topic 1.webm", so this lookup
# appears never to match and no transcription is attached. The intended
# section-to-transcription mapping needs to be defined.
for title in sectioned_content:
    if title in transcriptions_full:
        sectioned_content[title] += f"\n\nTranscription:\n{transcriptions_full[title]}"

# Create a hierarchical JSON structure: one module per generated section,
# each holding a single lesson with that section's content.
structured_data_llm = {
    "course": {
        "name": "Information Technology Operations and Project Management",
        "code": "MGMT8680",
        "modules": [
            {
                "title": title,
                "lessons": [
                    {
                        "title": f"Lesson from {title}",
                        "content": content
                    }
                ]
            }
            for title, content in sectioned_content.items()
        ]
    }
}

# Save the LLM-structured data to a JSON file. The path is relative to the
# working directory, like the other files this notebook reads and writes,
# rather than pointing at the filesystem root.
output_path_llm = 'structured_course_data_llm.json'
with open(output_path_llm, 'w', encoding='utf-8') as f:
    json.dump(structured_data_llm, f, indent=2)

output_path_llm
