In [1]:
import os
import PyPDF2
import textract
import fitz

In [2]:
# Directories
notebooks_directory = "Lesson-Plans"
pdf_slides_directory = "Slides"
transcripts_directory = "Transcripts"
output_directory = "Output"

In [4]:
# Function to read markdown/code files
def extract_notebook_content(notebook_path, delimiter="##--CODE--##\n"):
    with open(notebook_path, 'r', encoding='utf-8') as nb_file:
        nb = nbformat.read(nb_file, as_version=4)
        combined_content = []
        for cell in nb.cells:
            if cell.cell_type == 'markdown':
                combined_content.append(cell.source)
            elif cell.cell_type == 'code':
                combined_content.append(delimiter + cell.source)
        return "\n\n".join(combined_content) + "\n\n"

def extract_md_content(md_path):
    with open(md_path, 'r', encoding='utf-8') as md_file:
        return md_file.read()

def extract_py_content(py_path):
    with open(py_path, 'r', encoding='utf-8') as py_file:
        return py_file.read()

def process_directory_and_combine(directory_path, output_directory, delimiter="##--CODE--##\n"):
    processed_files = 0
    skipped_files = 0
    for root, dirs, files in os.walk(directory_path):
        for file in files:
            if 'unsolved' in file.lower():  # Skip files with 'unsolved' in the filename (case insensitive)
                skipped_files += 1
                print(f"Skipped: {file}")
                continue  # Skip the rest of the loop for this file

            try:
                content = ""
                if file.endswith(".ipynb"):
                    content = extract_notebook_content(os.path.join(root, file), delimiter)
                elif file.endswith(".md"):
                    content = extract_md_content(os.path.join(root, file))
                elif file.endswith(".py"):
                    content = extract_py_content(os.path.join(root, file))
                else:
                    continue

                if content:
                    processed_files += 1
                    # Derive the subject name from the directory structure
                    subject_name = os.path.basename(os.path.dirname(root))
                    output_filename = f"{subject_name}_{file.split('.')[0]}_combined_content.txt"
                    output_path = os.path.join(output_directory, output_filename)
                    with open(output_path, 'w', encoding='utf-8') as outfile:
                        outfile.write(content)
                    print(f"Processed: {file} into {output_filename}")
            except Exception as e:
                print(f"Failed to process {file}: {e}")

    print(f"Total processed files: {processed_files}")
    print(f"Total skipped files: {skipped_files}")


# Call the function
process_directory_and_combine(notebooks_directory, output_directory)

Processed: CommonCommands.md into 01-Introduction-to-AI_CommonCommands_combined_content.txt
Processed: terminal.md into 01-Introduction-to-AI_terminal_combined_content.txt
Processed: README.md into Activities_README_combined_content.txt
Processed: README.md into Activities_README_combined_content.txt
Processed: README.md into Activities_README_combined_content.txt
Processed: README.md into Activities_README_combined_content.txt
Processed: README.md into Activities_README_combined_content.txt
Processed: README.md into Activities_README_combined_content.txt
Processed: README.md into Activities_README_combined_content.txt
Processed: dishes_pseudocode.md into 01-Ins_Pseudocoding_dishes_pseudocode_combined_content.txt
Processed: README.md into Activities_README_combined_content.txt
Processed: laundry_pseudocode_solution.md into 02-Stu_Pseudocoding_Practice_laundry_pseudocode_solution_combined_content.txt
Processed: laundry_pseudocode.md into 02-Stu_Pseudocoding_Practice_laundry_pseudocode_c

In [5]:
def read_all_files(directory):
    file_data = {}
    for filename in os.listdir(directory):
        if filename.endswith(".txt"):
            subject_name = filename.split('_')[0]  # Assuming the subject is the first part of the filename
            file_path = os.path.join(directory, filename)
            with open(file_path, 'r', encoding='utf-8') as file:
                file_data.setdefault(subject_name, []).append(file.read())
    return file_data

def read_pdf_files(pdf_directory):
    pdf_data = {}
    for filename in os.listdir(pdf_directory):
        if filename.endswith(".pdf"):
            subject_name = filename.split('_')[0]
            pdf_path = os.path.join(pdf_directory, filename)
            try:
                with fitz.open(pdf_path) as doc:
                    text_content = []
                    for page in doc:
                        text_content.append(page.get_text())
                    pdf_data.setdefault(subject_name, []).append("\n".join(text_content))
            except Exception as e:
                print(f"Failed to read {filename}: {e}")
    return pdf_data

def read_transcript_files(transcripts_directory):
    transcript_data = {}
    for filename in os.listdir(transcripts_directory):
        if filename.endswith(".txt"):
            subject_name = filename.split('_')[0]  # Assuming the subject is the first part of the filename
            file_path = os.path.join(transcripts_directory, filename)
            with open(file_path, 'r', encoding='utf-8') as file:
                transcript_data.setdefault(subject_name, []).append(file.read())
    return transcript_data

# Paths to the directories
notebooks_directory = "Lesson-Plans"
pdf_slides_directory = "Slides"
transcripts_directory = "Transcripts"
output_directory = "Output"

# Read the processed markdown and code files
markdown_code_data = read_all_files(notebooks_directory)
print("Markdown/code files read successfully.")

# Read the PDF slides
pdf_slides_data = read_pdf_files(pdf_slides_directory)
print("PDF slides read successfully.")

# Read transcripts
transcripts_data = read_transcript_files(transcripts_directory)
print("Transcript files read successfully.")

# Combine the data by subject
combined_subject_data = {}
all_subjects = set(markdown_code_data) | set(pdf_slides_data) | set(transcripts_data)
for subject in all_subjects:
    combined_subject_data[subject] = []
    if subject in markdown_code_data:
        combined_subject_data[subject].extend(markdown_code_data[subject])
    if subject in pdf_slides_data:
        combined_subject_data[subject].extend(pdf_slides_data[subject])
    if subject in transcripts_data:
        combined_subject_data[subject].extend(transcripts_data[subject])

print("All subjects combined successfully.")







Markdown/code files read successfully.
PDF slides read successfully.
Transcript files read successfully.
All subjects combined successfully.


In [6]:
def display_data_snippets(data, num_lines=5):
    for subject, contents in data.items():
        print(f"Subject: {subject}\n")
        for content in contents:
            # Split the content into lines and display the first few lines
            lines = content.split('\n')
            print('\n'.join(lines[:num_lines]))
            print("\n---\n")  # Separator between different contents
        print("\n==========\n")  # Separator between different subjects

# Call the function to display snippets from the markdown/code data
display_data_snippets(markdown_code_data)

# Call the function to display snippets from the PDF slides data
display_data_snippets(pdf_slides_data)


Subject: M1.2

AI Boot Camp
The Impact of 
Machine Learning
Module 1 Day 2
© 2023 edX Boot Camps LLC. Conﬁdential and Proprietary. All Rights Reserved.

---



Subject: M1.3

AI Boot Camp
Overview of Machine 
Learning Tools
Module 1 Day 3
© 2023 edX Boot Camps LLC. Conﬁdential and Proprietary. All Rights Reserved.

---



Subject: M11.1

AI Bootcamp
Module 11 Day 1
© 2023 edX Boot Camps LLC. Conﬁdential and Proprietary. All Rights Reserved.
Introduction to 
Machine Learning

---



Subject: M11.2

AI Bootcamp
Module 11 Day 2
© 2023 edX Boot Camps LLC. Conﬁdential and Proprietary. All Rights Reserved.
Unsupervised Learning 
in Practice

---



Subject: M11.3

AI Bootcamp
Module 11 Day 3
© 2023 edX Boot Camps LLC. Conﬁdential and Proprietary. All Rights Reserved.
Principal Component 
Analysis (PCA)

---



Subject: M12.1

AI Bootcamp
Module 12 Day 1
© 2023 edX Boot Camps LLC. Conﬁdential and Proprietary. All Rights Reserved.
Introduction to 
Supervised Learning 

---



Subject: M12.2

A

In [7]:
# Display all subjects from markdown/code files
print("Subjects from markdown/code files:")
for subject in markdown_code_data.keys():
    print(subject)

# Display all subjects from PDF slides
print("\nSubjects from PDF slides:")
for subject in pdf_slides_data.keys():
    print(subject)

# Display all subjects from transcripts
print("\nSubjects from transcripts:")
for subject in transcripts_data.keys():
    print(subject)

Subjects from markdown/code files:

Subjects from PDF slides:
M1.2
M1.3
M11.1
M11.2
M11.3
M12.1
M12.2
M13.1
M13.2
M13.3
M14.1
M14.2
M14.3
M15.1
M15.2
M15.3
M18.1
M18.2
M18.3
M19.1
M19.2
M19.3
M2.1
M2.2
M2.3
M20.1
M20.2
M20.3
M21.1
M21.2
M21.3
M22.1
M22.2
M22.3
M3.1
M3.2
M3.3
M4.2
M4.3
M5.1
M5.2
M5.3
M6.1
M6.2
M6.3
M7.1
M7.2
M7.3
M8.1
M8.2
M8.3

Subjects from transcripts:
Data Ethics 1.txt
Data Ethics 2 - Keith Ellis.txt
Data Ethics 3.txt
Deep Learning 1.txt
Deep Learning 2.txt
Deep Learning 3.txt
Emerging Topics in AI 1.txt
Emerging Topics in AI 2.txt
Emerging Topics in AI 3.txt
Exploring Data through Visualizations 1 (Section 2).txt
Exploring Data through Visualizations 1.txt
Exploring Data through Visualizations 2.txt
Exploring Data through Visualizations 3.txt
Extra Review 1.txt
Extra Review Static Website.txt
Introduction to AI 1.txt
Introduction to AI 2.txt
Introduction to AI 3.txt
Machine Learning Optimization 1.txt
Machine Learning Optimization 2.txt
Making Predictions with Data