In [1]:
# %%
from pprint import pprint
import yaml
import os
from PIL import Image
from utils.client import Content, Projects, Project
import networkx as nx
import matplotlib.pyplot as plt
import os
from datetime import datetime
from pprint import pprint
# Here since image is too large thinks it's a decompression DOS attack
Image.MAX_IMAGE_PIXELS = 933120000


def get_name(id: str, modules: list) -> str:
    for module in modules:
        if module["id"] == id:
            return module["name"]
    return "unknown"


class Tree:
    def __init__(self) -> None:
        self.content = Content()

    def generate_full_content_graph(self):
        g = self.get_graph_from_lesson_subset(self.content.lessons)
        self._show_and_save_and_archive(g)

    def get_graph_from_lesson_subset(self, lessons, color_mode="default"):
        graph = nx.DiGraph()
        # TODO solve issue: this needs to always build the graph including all lessons so that all recursive prerequisites show
        for lesson in lessons:
            # see other attrs here https://graphviz.gitlab.io/doc/info/attrs.html
            attrs = {"fillcolor": self._get_node_color(
                lesson, mode=color_mode)}
            graph.add_node(lesson.name, **attrs)
            if len(lesson.prerequisites) == 0:
                print(f"{lesson.path} has no prerequisites")
                # graph.add_node(lesson)
            for prerequisite in lesson.prerequisites:
                try:
                    prerequisite = self.content.get_lesson_from_id(
                        prerequisite)
                except Exception as e:
                    print('Error with', prerequisite, 'in', lesson.path)
                    raise Exception(e)

                graph.add_edge(prerequisite, lesson.name)
        return graph

    def _get_node_color(self, lesson, mode="default"):

        green = "#71D271"
        purple = '#BE2ED6'
        blue = '#3C68DB'
        amber = '#FBBB63'
        red = '#E05151'

        if mode == "default":
            if lesson.video:
                return green
            # elif lesson.recorded and lesson.notebook:
            #     return '#F2672C'}  # dark orange
            elif lesson.needs_uploading:
                return purple
            elif lesson.recorded:
                return blue
            elif lesson.notebook:
                return amber
            else:
                return red
        elif mode == "practicals_exist":
            if lesson.practicals:
                return green
            else:
                return red

    def show_and_save(self, G, fp="workflow_graph/images/tree.png"):
        path, obj = nx.nx_agraph.view_pygraphviz(G)

        save_dir = os.path.join(*fp.split('/')[:-1])
        os.makedirs(save_dir, exist_ok=True)

        img = Image.open(path)
        img.save(fp)
        return img

    def _show_and_save_and_archive(self, *args):
        img = self.show_and_save(*args)
        img.save(f"workflow_graph/images/evolution/{datetime.now()}.png")

    # def save(self):

    def generate_project_content_graphs(self):
        # TODO remove old projects causing issues

        projects = self.set_all_projects_graph_attr()
        for project in projects:
            self.show_and_save(
                project.graph, fp=f"workflow_graph/images/project-dependencies/{project.name}.png")
            print()
        # input("hit enter to continue")

    def set_all_projects_graph_attr(self, graph_mode="default"):
        projects = Projects()
        for project in projects:
            # TODO put in project client
            print(project.name)
            print(project.path)
            project_prereqs = []
            for task in project.tasks:
                prerequisites_for_this_task = [self.content.get_lesson_from_id(
                    p) for p in task.prerequisites]
                recursive_tasks = []
                for prerequisite in prerequisites_for_this_task:
                    message = f"Creating prerequisites for {prerequisite.name}"
                    # print(f"{message:*^100}")
                    recursive_tasks.extend(
                        self.content.get_recursive_prerequisites(prerequisite))
                    # We need to add the prerequisite itself to the list of recursive tasks
                    recursive_tasks.append(prerequisite)
                    # print(recursive_tasks)
                recursive_tasks = self.__keep_only_first(recursive_tasks)
                # task_graph = self.get_graph_from_lesson_subset(recursive_tasks)

                print(
                    f"\nPrerequisite for task {task.id}: {recursive_tasks}\n")
                project_prereqs.extend(recursive_tasks)
            project_prereqs = self.__keep_only_first(project_prereqs)
            graph = self.get_graph_from_lesson_subset(
                project_prereqs, color_mode=graph_mode)
            project.graph = graph
        return projects

    def save_all_project_task_recursive_prerequisites(self):
        projects = Projects()
        for project in projects:
            self.save_project_task_recursive_prerequisites(project)

    def save_project_task_recursive_prerequisites(self, project):
        print(project.name)
        print(project.path)

        task_to_recursive_prereqs = {}

        for task in project.tasks:
            prerequisites_for_this_task = [self.content.get_lesson_from_id(
                p) for p in task.prerequisites]
            recursive_tasks = []
            for prerequisite in prerequisites_for_this_task:
                print(f"Creating prerequisites for {prerequisite.name}")
                recursive_tasks.extend(
                    self.content.get_recursive_prerequisites(prerequisite))
                # We need to add the prerequisite itself to the list of recursive tasks
                recursive_tasks.append(prerequisite)
                # print(recursive_tasks)
            recursive_tasks = self.__keep_only_first(recursive_tasks)

            if recursive_tasks:
                task_graph = self.get_graph_from_lesson_subset(
                    recursive_tasks)
                task_graph_save_dir = os.path.join(
                    project.path.replace("/specification.yaml", ""), "status", "task-prereqs")
                if not os.path.exists(task_graph_save_dir):
                    os.makedirs(task_graph_save_dir, exist_ok=True)
                task_graph_save_path = os.path.join(
                    task_graph_save_dir,
                    f"M{task.milestone_idx}T{task.idx}.png"
                )
                self.show_and_save(
                    task_graph,
                    task_graph_save_path
                )

            # print(
            #     f"\nPrerequisite for task {task.id}: {recursive_tasks}\n")
            recursive_prereqs_for_this_task = [
                l.id for l in recursive_tasks]
            task_to_recursive_prereqs[task.id] = recursive_prereqs_for_this_task

        # print(task_to_recursive_prereqs)
        prereqs_file = os.path.join(
            *project.path.split('/')[:-1], "task-to-recursive-prereqs.yaml")
        with open(prereqs_file, 'w') as f:
            yaml.dump(task_to_recursive_prereqs, f)
        print()

    def generate_project_practicals_exist(self):
        projects = self.set_project_graph_attr(graph_mode="practicals_exist")
        for project in projects:
            self.show_and_save(
                project.graph, fp=f"Projects/scenarios/{project.name}/status/practicals.png")
            print()

    @staticmethod
    def __keep_only_first(prereq_list: list) -> list:
        '''
        Iterate through the list of prerequisites and keep only the first instance of each prerequisite.

        Parameters
        ----------
        prereq_list : list
            List of prerequisites

        Returns
        -------
        list
            List of prerequisites with only the first instance of each prerequisite
        '''
        already_seen = []
        for task in prereq_list:
            if task not in already_seen:
                already_seen.append(task)
        return already_seen

In [3]:
if __name__ == '__main__':

    tree = Tree()
    graph = tree.get_graph_from_lesson_subset(tree.content.lessons)
    # # tree.generate_project_content_graphs()
    # project = Project(
    #     "Projects/scenarios/Movie-Recommendation")
    # tree.save_project_task_recursive_prerequisites(project)
    # tree.save_all_project_task_recursive_prerequisites()
    # tree.generate_project_practicals_exist()

# %%


  validate(nb)
/home/ivanyingxuan/miniconda3/envs/content-projects_new/lib/python3.10/site-packages/nbformat/__init__.py:93: DuplicateCellId: Non-unique cell id '308f9b23' detected. Corrected to '61c01d1b'.
  validate(nb)


Content/units/Machine-Learning/2. Introduction to ML/0. What is machine learning? has no prerequisites
Content/units/Data-Analytics/1. Tableau/1. What is Data Analytics? has no prerequisites
Content/units/MLOps/0. Intro to MLOps/0. What is MLOps? has no prerequisites
Content/units/Data-Handling/3. SQL/0. What is SQL? has no prerequisites
Content/units/Cloud-and-DevOps/2. Docker/0. What is Docker? has no prerequisites
Content/units/Cloud-and-DevOps/0. Intro to Cloud/0. What is the Cloud? has no prerequisites
Content/units/Essentials/2. The Command Line/0. What is the command line has no prerequisites
Content/units/Essentials/5. Common File Types for Working with Data/0. Markdown has no prerequisites
Content/units/Essentials/4. Common Data Types /0. Tabular Data has no prerequisites
Content/units/Essentials/4. Common Data Types /2. Text Data has no prerequisites
Content/units/Essentials/4. Common Data Types /1. Image Data has no prerequisites
Content/units/Essentials/4. Common Data Types

In [11]:
len(tree.content.lessons)

268

In [16]:
graph.nodes

NodeView(('Integrating Kafka & Spark', Kafka Essentials, Spark Essentials, 'Data Storage', Data Pipelines, 'Data Transformation - ETL & ELT', Batch Processing and Streaming, 'Data Ingestion', 'Batch Processing and Streaming', Data Ingestion, 'Data Pipelines', The Data Engineering Lifecycle, 'Enterprise Data Warehouses', Data Storage, 'What is Data Engineering?', Summary of Common File Types, Summary of Common Data Types, pyscopg2 and SQLAlchemy, 'The Data Engineering Landscape', Data Transformation - ETL & ELT, Enterprise Data Warehouses, 'The Data Engineering Lifecycle', What is Data Engineering?, 'What is Kafka?', The Data Engineering Landscape, 'Streaming in Kafka', 'Kafka Essentials', What is Kafka?, 'Kafka-Python', 'Columnar NoSQL Storage', What is NoSQL?, 'What is NoSQL?', 'Key-Value NoSQL Storage', 'Document-Oriented NoSQL Storage', 'Graph-Oriented NoSQL Storage', 'What is Apache Hadoop?', 'Installing Apache Hadoop', What is Apache Hadoop?, 'What is Apache Spark?', 'Integrating 