# ETL Milestone

### Import necessary libraries and initialize variables

In [1]:
from pipelines import GitHubPipeline, YouTubePipeline, MediumPipeline, LinkedInPipeline
from clearml import Task
import logging
import os

In [2]:
# Configure the root logger at INFO level so the progress messages emitted via
# logging.info(...) during the ETL run show up in the cell output.
logging.basicConfig(level=logging.INFO)

In [3]:
%env CLEARML_WEB_HOST=http://webserver:8080/
%env CLEARML_API_HOST=http://apiserver:8008
%env CLEARML_FILES_HOST=http://fileserver:8081
%env CLEARML_API_ACCESS_KEY=WOLAKOQGPNE6UYI1O3MGC57ZLNX7IN
%env CLEARML_API_SECRET_KEY=JTD4z46ycq_VRIlG60rRPAdE2kkjd1O4jtHWTvhVmiRKvg82MKV-YfoeNIgX12bNuIQ

env: CLEARML_WEB_HOST=http://webserver:8080/
env: CLEARML_API_HOST=http://apiserver:8008
env: CLEARML_FILES_HOST=http://fileserver:8081
env: CLEARML_API_ACCESS_KEY=<redacted>
env: CLEARML_API_SECRET_KEY=<redacted>


### Define the `ETLPipeline` class that runs the scraping pipeline for each source

In [4]:
class ETLPipeline:
    """Drive the per-source ETL pipelines, tracking each run as a ClearML task.

    Each source (GitHub, YouTube, Medium, LinkedIn) is run through its own
    pipeline object. A pipeline that fails while running is logged and reported
    as a zero-count result, so one failing source does not stop the others.
    """

    def __init__(self) -> None:
        self.add_parameters()

    def add_parameters(self):
        """Set the MongoDB connection settings and GitHub token shared by all runs."""
        self.mongodb_uri = 'mongodb://mongo:27017/'
        self.db_name = 'ros2_docs'
        # None when GITHUB_TOKEN is not set in the environment.
        self.github_token = os.getenv('GITHUB_TOKEN')

    def run_etl_pipeline(self, pipeline_name, mongodb_uri, db_name, github_token=None):
        """Run one source pipeline inside its own ClearML task.

        Args:
            pipeline_name: One of "GitHubPipeline", "YouTubePipeline",
                "MediumPipeline" or "LinkedInPipeline".
            mongodb_uri: MongoDB connection string handed to the pipeline.
            db_name: Database name handed to the pipeline.
            github_token: Token passed to GitHubPipeline only; ignored by the
                other pipelines.

        Returns:
            A dict with "count" (number of URLs found in the pipeline's
            collection) and "top" (the first ten of them). If the pipeline
            fails while running, "count" is 0, "top" is empty and an "error"
            key carries the exception message, so callers can always index
            the result.

        Raises:
            ValueError: If pipeline_name is not a known pipeline. This is
                raised before any ClearML task is created.
        """
        # Constructors are wrapped in lambdas so nothing is built until the
        # name has been validated and the task exists.
        factories = {
            "GitHubPipeline": lambda: GitHubPipeline(mongodb_uri, db_name, github_token),
            "YouTubePipeline": lambda: YouTubePipeline(mongodb_uri, db_name),
            "MediumPipeline": lambda: MediumPipeline(mongodb_uri, db_name),
            "LinkedInPipeline": lambda: LinkedInPipeline(mongodb_uri, db_name),
        }
        # Validate first: raising after Task.init would leave the task open.
        if pipeline_name not in factories:
            raise ValueError(f"Unknown pipeline: {pipeline_name}")

        #Initialize task for current scraping source
        task = Task.init(project_name="ROS2 RAG System", task_name=f"ETL - {pipeline_name}", task_type=Task.TaskTypes.data_processing)
        try:
            logger = task.get_logger()
            # A constructor failure still propagates to the caller, but the
            # finally clause below guarantees the task is closed.
            pipeline = factories[pipeline_name]()
            return self._extract_and_report(task, logger, pipeline, pipeline_name)
        finally:
            task.close()

    def _extract_and_report(self, task, logger, pipeline, pipeline_name):
        """Run a constructed pipeline, save its URL list and upload it as an artifact.

        Failures are logged and converted into a zero-count result instead of
        being raised.
        """
        try:
            #Perform ETL
            logging.info(f"Running {pipeline_name}")
            logger.report_text(f"Starting pipeline: {pipeline_name}")
            pipeline.extract_and_transform_and_load()
            logger.report_text(f"Successfully completed pipeline: {pipeline_name}")

            # Collect the url field of every document in the pipeline's collection.
            # NOTE(review): find() has no filter, so documents left over from
            # earlier runs are presumably included too - confirm.
            urls = [doc["url"] for doc in pipeline.collection.find()]
            file_name = f"{pipeline_name}_urls.txt"
            # Explicit encoding so the file does not depend on the platform default.
            with open(file_name, "w", encoding="utf-8") as f:
                f.write("\n".join(urls))

            #Upload artifact to clearml server (labelled "Top URLs", but the file holds every URL)
            logger.report_text(f"URLs saved to {file_name}")
            task.upload_artifact("Top URLs", artifact_object=file_name)

            return {"count": len(urls), "top": urls[:10]}

        except Exception as e:
            logger.report_text(f"Error in pipeline {pipeline_name}: {e}")
            logging.error(f"Error in {pipeline_name}: {e}", exc_info=True)
            # Same shape as the success result so downstream cells can index it.
            return {"count": 0, "top": [], "error": str(e)}

    def run_all_pipelines(self):
        """Run every source pipeline in order and return its result keyed by source label."""
        # (result key, pipeline name, token); only GitHub takes a token.
        sources = (
            ("GitHub", "GitHubPipeline", self.github_token),
            ("YouTube", "YouTubePipeline", None),
            ("Medium", "MediumPipeline", None),
            ("LinkedIn", "LinkedInPipeline", None),
        )
        return {
            label: self.run_etl_pipeline(name, self.mongodb_uri, self.db_name, token)
            for label, name, token in sources
        }

### Run the ETL pipeline for all sources

In [5]:
# Build the ETL driver and run every source pipeline in sequence.
# all_urls maps each source label ('GitHub', 'YouTube', 'Medium', 'LinkedIn')
# to the value returned by run_etl_pipeline for that source.
etl = ETLPipeline()
all_urls = etl.run_all_pipelines()

ClearML Task: created new task id=7d475db36ac446c9bf4f6bfb6a9da30f
ClearML results page: http://webserver:8080/projects/e54dc86522f8446889c50edbdf493bef/experiments/7d475db36ac446c9bf4f6bfb6a9da30f/output/log
CLEARML-SERVER new package available: UPGRADE to v1.17.0 is recommended!
Release Notes:
### New Features 
- New ClearML Model dashboard: View all live model endpoints in a single location, complete with real time metrics reporting.
- New UI pipeline run table comparative view: compare plots and scalars of selected pipeline runs
- Improve services agent behavior: If no credentials are specified, agent uses default credentials ([ClearML Server GitHub issue #140](https://github.com/allegroai/clearml-server/issues/140))
- Add UI re-enqueue of failed tasks
- Add UI experiment scalar results table view
- Add "Block running user's scripts in the browser" UI setting option for added security
- Add UI "Reset" to set task installed packages to originally recorded values 
- Add UI edit of de

INFO:root:Running GitHubPipeline
INFO:root:Starting to extract GitHub repositories
INFO:root:Processing source: ros2/ros2_documentation


Starting pipeline: GitHubPipeline


INFO:root:Temporary directory created - /tmp/tmptphb02hq
INFO:root:Fetching all files in repository


ClearML Monitor: GPU monitoring failed getting GPU reading, switching off GPU monitoring


INFO:root:Found 271 files in current repo
INFO:root:Converting RST Files to HTML
INFO:root:Completed Conversion
Inserting Data: 100%|██████████| 273/273 [00:00<00:00, 288.60it/s]            
INFO:root:Processing source: moveit/moveit2_tutorials
INFO:root:Temporary directory created - /tmp/tmp7n3e8et3
INFO:root:Fetching all files in repository


ClearML Monitor: Could not detect iteration reporting, falling back to iterations as seconds-from-start


INFO:root:Found 85 files in current repo
INFO:root:Converting RST Files to HTML
INFO:root:Completed Conversion
Inserting Data: 100%|██████████| 87/87 [00:00<00:00, 89.23it/s]             
INFO:root:Processing source: gazebosim/docs
INFO:root:Temporary directory created - /tmp/tmp86ce23d_
INFO:root:Fetching all files in repository
INFO:root:Found 238 files in current repo
Inserting Data: 100%|██████████| 238/238 [00:00<00:00, 251.88it/s]
INFO:root:Processing source: ros-navigation/docs.nav2.org
INFO:root:Temporary directory created - /tmp/tmp6d81g37d
INFO:root:Fetching all files in repository
INFO:root:Found 206 files in current repo
INFO:root:Converting RST Files to HTML
INFO:root:Completed Conversion
Inserting Data: 100%|██████████| 208/208 [00:00<00:00, 487.75it/s]            


Successfully completed pipeline: GitHubPipeline
URLs saved to GitHubPipeline_urls.txt
ClearML Task: created new task id=64459c65c5c64e839917c1df93699154
ClearML results page: http://webserver:8080/projects/e54dc86522f8446889c50edbdf493bef/experiments/64459c65c5c64e839917c1df93699154/output/log


INFO:root:Running YouTubePipeline
INFO:root:Starting YouTube ROS Documentation Transcript Pipeline


Starting pipeline: YouTubePipeline


Search Terms:   0%|          | 0/6 [00:00<?, ?it/s]INFO:root:Searching YouTube for: ROS2 Tutorial


ClearML Monitor: GPU monitoring failed getting GPU reading, switching off GPU monitoring


INFO:root:Processed 4 videos for term - ROS2 Tutorial
Search Terms:  17%|█▋        | 1/6 [00:08<00:41,  8.37s/it]INFO:root:Searching YouTube for: ROS Robot Operating System Documentation
INFO:root:Processed 11 videos for term - ROS Robot Operating System Documentation
Search Terms:  33%|███▎      | 2/6 [00:21<00:44, 11.21s/it]INFO:root:Searching YouTube for: ROS Programming Explained
INFO:root:Processed 9 videos for term - ROS Programming Explained
Search Terms:  50%|█████     | 3/6 [00:35<00:37, 12.48s/it]INFO:root:Searching YouTube for: ROS Robotics Tutorials
INFO:root:Processed 6 videos for term - ROS Robotics Tutorials
Search Terms:  67%|██████▋   | 4/6 [00:43<00:21, 10.59s/it]INFO:root:Searching YouTube for: ROS Navigation Stack
INFO:root:Processed 12 videos for term - ROS Navigation Stack
Search Terms:  83%|████████▎ | 5/6 [01:01<00:13, 13.19s/it]INFO:root:Searching YouTube for: ROS Perception Tutorial
INFO:root:Processed 9 videos for term - ROS Perception Tutorial
Search Terms: 

Successfully completed pipeline: YouTubePipeline
URLs saved to YouTubePipeline_urls.txt
ClearML Task: created new task id=b2391bbfaf4542eba231bc7432074508
ClearML results page: http://webserver:8080/projects/e54dc86522f8446889c50edbdf493bef/experiments/b2391bbfaf4542eba231bc7432074508/output/log


INFO:root:Running MediumPipeline


Starting pipeline: MediumPipeline
ClearML Monitor: GPU monitoring failed getting GPU reading, switching off GPU monitoring


INFO:root:Scrolling page
Scroll count: 100%|██████████| 20/20 [01:06<00:00,  3.30s/it]
Article Parsing Progress: 100%|██████████| 60/60 [00:32<00:00,  1.83it/s]
Data Insert: 100%|██████████| 60/60 [00:02<00:00, 25.74it/s]


Successfully completed pipeline: MediumPipeline
URLs saved to MediumPipeline_urls.txt
ClearML Task: created new task id=2f62cfb5bec34e09afbdb87531fb32bb
ClearML results page: http://webserver:8080/projects/e54dc86522f8446889c50edbdf493bef/experiments/2f62cfb5bec34e09afbdb87531fb32bb/output/log


INFO:root:Running LinkedInPipeline


Starting pipeline: LinkedInPipeline
ClearML Monitor: GPU monitoring failed getting GPU reading, switching off GPU monitoring


Article Processing Progress: 100%|██████████| 100/100 [02:30<00:00,  1.50s/it]


Successfully completed pipeline: LinkedInPipeline
URLs saved to LinkedInPipeline_urls.txt


## Total URLs parsed:

In [10]:
# Add up the per-source URL counts. Sources with no result (for example a
# pipeline that failed and returned nothing) are skipped instead of raising a
# TypeError when indexed.
total_count = sum(result['count'] for result in all_urls.values() if result)
print(f"{total_count} urls parsed and stored in database!")

1017 urls parsed and stored in database!


### Print the scraped information for each source

In [14]:
#GitHub: report the number of stored URLs and the sample returned by the pipeline
count, top = (all_urls['GitHub'][field] for field in ('count', 'top'))
print(
    f"The total number of urls scraped for GitHub: {count}",
    "Sample URLs scraped: ",
    top,
    sep="\n",
)

The total number of urls scraped for GitHub: 806
Sample URLs scraped: 
['https://github.com/ros2/ros2_documentation/tree/jazzy/../source/build/How-To-Guides.rst', 'https://github.com/ros2/ros2_documentation/tree/jazzy/../source/build/Related-Projects.rst', 'https://github.com/ros2/ros2_documentation/tree/jazzy/../source/build/The-ROS2-Project.rst', 'https://github.com/ros2/ros2_documentation/tree/jazzy/../source/build/Tutorials.rst', 'https://github.com/ros2/ros2_documentation/tree/jazzy/../source/build/Installation.rst', 'https://github.com/ros2/ros2_documentation/tree/jazzy/../source/build/Releases.rst', 'https://github.com/ros2/ros2_documentation/tree/jazzy/../source/build/Package-Docs.rst', 'https://github.com/ros2/ros2_documentation/tree/jazzy/../source/build/index.rst', 'https://github.com/ros2/ros2_documentation/tree/jazzy/../source/build/search.rst', 'https://github.com/ros2/ros2_documentation/tree/jazzy/../source/build/Contact.rst']


In [15]:
#Medium: report the number of stored URLs and the sample returned by the pipeline
count, top = (all_urls['Medium'][field] for field in ('count', 'top'))
print(
    f"The total number of urls scraped for Medium: {count}",
    "Sample URLs scraped: ",
    top,
    sep="\n",
)

The total number of urls scraped for Medium: 60
Sample URLs scraped: 
['https://medium.com/@robofoundry/trying-out-ros2-jazzy-jalisco-in-10-mins-177ad6bcbed2?source=tag_recommended_stories_page------ros2---0-84--------------------7a26b50a_abf8_4e52_8f3b_c9b4e5095595-------', 'https://medium.com/@antonioconsiglio/integrating-orb-slam3-with-ros2-humble-on-raspberry-pi-5-a-step-by-step-guide-78e7b911c361?source=tag_recommended_stories_page------ros2---1-85--------------------7a26b50a_abf8_4e52_8f3b_c9b4e5095595-------', 'https://medium.com/@myequation/why-ros-is-the-lifeline-of-every-engineer-d9c137103de6?source=tag_recommended_stories_page------ros2---2-84--------------------7a26b50a_abf8_4e52_8f3b_c9b4e5095595-------', 'https://medium.com/@tfoldi/ros-2-live-depth-cam-point-cloud-visualization-with-rerun-66534561557a?source=tag_recommended_stories_page------ros2---3-85--------------------7a26b50a_abf8_4e52_8f3b_c9b4e5095595-------', 'https://medium.com/@psreeram/building-a-home-service-r

In [16]:
#YouTube: report the number of stored URLs and the sample returned by the pipeline
count, top = (all_urls['YouTube'][field] for field in ('count', 'top'))
print(
    f"The total number of urls scraped for YouTube: {count}",
    "Sample URLs scraped: ",
    top,
    sep="\n",
)

The total number of urls scraped for YouTube: 51
Sample URLs scraped: 
['https://www.youtube.com/watch?v=7TVWlADXwRw&pp=ygUNUk9TMiBUdXRvcmlhbA%3D%3D', 'https://www.youtube.com/watch?v=Gg25GfA456o&pp=ygUNUk9TMiBUdXRvcmlhbA%3D%3D', 'https://www.youtube.com/watch?v=idQb2pB-h2Q&t=291s&pp=ygUNUk9TMiBUdXRvcmlhbA%3D%3D', 'https://www.youtube.com/watch?v=GHb6Wr_exxI&pp=ygUNUk9TMiBUdXRvcmlhbA%3D%3D', 'https://www.youtube.com/watch?v=96XsJ7xfsS8&pp=ygUoUk9TIFJvYm90IE9wZXJhdGluZyBTeXN0ZW0gRG9jdW1lbnRhdGlvbg%3D%3D', 'https://www.youtube.com/watch?v=N6K2LWG2kRI&pp=ygUoUk9TIFJvYm90IE9wZXJhdGluZyBTeXN0ZW0gRG9jdW1lbnRhdGlvbg%3D%3D', 'https://www.youtube.com/watch?v=7TVWlADXwRw&pp=ygUoUk9TIFJvYm90IE9wZXJhdGluZyBTeXN0ZW0gRG9jdW1lbnRhdGlvbg%3D%3D', 'https://www.youtube.com/watch?v=MWKnMPX0Yjg&pp=ygUoUk9TIFJvYm90IE9wZXJhdGluZyBTeXN0ZW0gRG9jdW1lbnRhdGlvbg%3D%3D', 'https://www.youtube.com/watch?v=8QfI5a7lTKU&pp=ygUoUk9TIFJvYm90IE9wZXJhdGluZyBTeXN0ZW0gRG9jdW1lbnRhdGlvbg%3D%3D', 'https://www.youtube.com/watch

In [17]:
#LinkedIn: report the number of stored URLs and the sample returned by the pipeline
count, top = (all_urls['LinkedIn'][field] for field in ('count', 'top'))
print(
    f"The total number of urls scraped for LinkedIn: {count}",
    "Sample URLs scraped: ",
    top,
    sep="\n",
)

The total number of urls scraped for LinkedIn: 100
Sample URLs scraped: 
['https://www.linkedin.com/advice/0/how-can-you-diagnose-fix-ros-problems-roswtf-skills-ros', 'https://www.linkedin.com/advice/3/what-benefits-getting-ros-certification-your-career-skills-ros', 'https://www.linkedin.com/advice/1/what-steps-use-ros-navigation-tools-skills-ros', 'https://www.linkedin.com/advice/1/how-do-you-optimize-ros-performance-version-management-skills-ros', 'https://www.linkedin.com/advice/0/how-can-you-link-ros-other-systems-skills-ros', 'https://www.linkedin.com/advice/3/how-can-you-teach-ros-robotics-professionals-students-skills-ros', 'https://www.linkedin.com/advice/3/what-pros-cons-using-slam-navigation-skills-ros', 'https://www.linkedin.com/advice/0/how-can-you-navigate-mobile-robots-autonomously-ros-skills-ros', 'https://www.linkedin.com/advice/0/how-can-you-use-ros-3d-perception-skills-ros', 'https://www.linkedin.com/advice/0/how-can-you-avoid-common-ros-package-library-errors-skills-