### Run this notebook on Google Colab after running the scrpe_links.py script

In [1]:
# Mount Google Drive into the Colab runtime at /content/drive so later cells
# can access files under that path.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [7]:
# Install the scraping dependencies into the runtime: refresh the apt package
# index, install the chromium-chromedriver package, then install the selenium
# Python package with pip.
!apt-get update
!apt install -y chromium-chromedriver
!pip install selenium

In [6]:
import os

# Sanity check: show what is directly inside the mounted Google Drive root.
directory = '/content/drive/MyDrive/'

# os.listdir returns the names of both files and sub-directories.
files = os.listdir(directory)

# Emit one name per line; print nothing at all when the folder is empty.
if files:
    print('\n'.join(files))

In [5]:
import csv
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from concurrent.futures import ThreadPoolExecutor
from google.colab import drive
import time

# Mount Google Drive at /content/drive; the scraping code below reads its
# input CSV from, and writes its output CSVs to, paths under this mount point.
drive.mount('/content/drive')

def _labelled_sibling(driver, label, sibling_tag):
    """Return the first `sibling_tag` sibling following the <p> whose text is exactly `label`."""
    label_element = driver.find_element(By.XPATH, f"//p[text()='{label}']")
    return label_element.find_element(By.XPATH, f"following-sibling::{sibling_tag}")


def _labelled_text(driver, label):
    """Return the stripped text of the <p> that follows the `label` paragraph, or "N/A"."""
    try:
        return _labelled_sibling(driver, label, 'p').text.strip()
    except Exception:
        # Best effort: a missing section must not discard the rest of the row.
        return "N/A"


def _extract_youtube_link(driver, url, attempts=3):
    """Return the watch URL for the embedded YouTube video on the page, or "N/A"."""
    for attempt in range(attempts):
        try:
            WebDriverWait(driver, 20).until(EC.presence_of_element_located((By.CLASS_NAME, 'gemini-project-video')))
            video_div = driver.find_element(By.CLASS_NAME, 'gemini-project-video')
            iframe = video_div.find_element(By.TAG_NAME, 'iframe')
            video_src = iframe.get_attribute('src')
            if video_src and 'youtube.com/embed/' in video_src:
                # Embed URLs look like .../embed/<id>?<params>; keep only <id>.
                video_id = video_src.split('/embed/')[1].split('?')[0]
                return f"https://www.youtube.com/watch?v={video_id}"
        except Exception as e:
            print(f"Attempt failed for YouTube link on {url}: {e}")
            # No point pausing once there are no attempts left.
            if attempt < attempts - 1:
                time.sleep(2)
    return "N/A"


def scrape_project_data(url):
    """Scrape a single project page with a fresh headless Chrome instance.

    A new WebDriver is created for every call, so the function can be run
    from several worker threads at once without sharing a browser.

    Args:
        url: Address of the project page to load.

    Returns:
        A dict with the keys 'Title', 'Sub-Title', 'YouTube Link',
        'What it Does', 'Built With', 'By', 'Location' and 'Project Link'.
        Optional sections that cannot be located are reported as "N/A".
        Returns None (after printing the error) when the page cannot be
        scraped at all, e.g. the browser fails to start or the title never
        appears within 20 seconds.
    """
    try:
        # Setup Chrome options for Colab
        chrome_options = Options()
        chrome_options.add_argument("--headless")
        chrome_options.add_argument("--no-sandbox")
        chrome_options.add_argument("--disable-dev-shm-usage")

        # Initialize the WebDriver
        driver = webdriver.Chrome(options=chrome_options)
        try:
            driver.get(url)
            # The title is mandatory: if it never shows up, the whole page is
            # treated as a failure by the outer handler.
            WebDriverWait(driver, 20).until(EC.presence_of_element_located((By.CLASS_NAME, 'gemini-type-h1')))
            title = driver.find_element(By.CLASS_NAME, 'gemini-type-h1').text.strip()

            # Sub-title (description under the title) is optional.
            try:
                sub_title = driver.find_element(By.CLASS_NAME, 'gemini-type-t1').text.strip()
            except Exception:
                sub_title = "N/A"
                print(f"Subtitle not found for {url}")

            video_link = _extract_youtube_link(driver, url)
            what_it_does = _labelled_text(driver, 'What it does')

            # "Built with" is a <ul>; flatten its items into one comma-separated cell.
            try:
                built_with_list = _labelled_sibling(driver, 'Built with', 'ul')
                built_with = ', '.join(
                    li.text.strip() for li in built_with_list.find_elements(By.TAG_NAME, 'li')
                )
            except Exception:
                built_with = "N/A"

            by = _labelled_text(driver, 'By')
            location = _labelled_text(driver, 'From')

            # Return all scraped data as a dictionary
            return {
                'Title': title,
                'Sub-Title': sub_title,
                'YouTube Link': video_link,
                'What it Does': what_it_does,
                'Built With': built_with,
                'By': by,
                'Location': location,
                'Project Link': url
            }
        finally:
            # Always shut the browser down, including on timeouts and other
            # errors; otherwise every failed page leaks a Chrome process.
            driver.quit()

    except Exception as e:
        print(f"Error scraping {url}: {e}")
        return None

def scrape_projects_from_csv(input_csv, num_links=None, chunk_size=200, *,
                             output_dir='/content/drive/MyDrive', scrape_fn=None):
    """Scrape every project listed in a CSV and save the results in chunked CSV files.

    The first row of `input_csv` is treated as a header and skipped; the first
    column of each remaining row is used as the project URL. Blank rows are
    ignored. URLs are scraped concurrently on 5 worker threads, and results
    are kept in input order. Projects whose scrape returned a falsy value
    (e.g. None on failure) are dropped.

    Args:
        input_csv: Path of the CSV file containing the project links.
        num_links: Maximum number of links to process. None (the default)
            means no limit; 0 processes nothing.
        chunk_size: Maximum number of projects written to each output file.
        output_dir: Directory that receives the `project_data_<n>.csv` files,
            numbered from 1.
        scrape_fn: Callable taking a URL and returning a row dict (or a falsy
            value on failure). Defaults to `scrape_project_data`.

    Raises:
        ValueError: If `chunk_size` is smaller than 1.
    """
    import os  # local import: the surrounding cell does not import os itself

    # Fail before any scraping happens rather than after hours of work.
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")

    fieldnames = ['Title', 'Sub-Title', 'YouTube Link', 'What it Does',
                  'Built With', 'By', 'Location', 'Project Link']

    # Read the project links from CSV and remove the first (header) row
    with open(input_csv, mode='r', newline='', encoding='utf-8') as file:
        reader = csv.reader(file)
        next(reader, None)
        # Blank lines come back as [] from csv.reader; skip them instead of
        # letting row[0] raise inside a worker thread.
        project_urls = [row[0].strip() for row in reader if row and row[0].strip()]

    if num_links is not None:
        project_urls = project_urls[:num_links]

    if scrape_fn is None:
        scrape_fn = scrape_project_data

    def process_url(project_url):
        print(f"Scraping data from: {project_url}")
        return scrape_fn(project_url)

    # Use ThreadPoolExecutor to parallelize requests
    with ThreadPoolExecutor(max_workers=5) as executor:
        project_data = [result for result in executor.map(process_url, project_urls) if result]

    # Split data into chunks and write to separate CSV files. The range starts
    # at 0 so the first scraped project is included (the header row was
    # already removed when reading the input).
    for chunk_number, start in enumerate(range(0, len(project_data), chunk_size), start=1):
        chunk = project_data[start:start + chunk_size]
        output_csv = os.path.join(output_dir, f'project_data_{chunk_number}.csv')
        with open(output_csv, mode='w', newline='', encoding='utf-8') as file:
            writer = csv.DictWriter(file, fieldnames=fieldnames)
            writer.writeheader()
            writer.writerows(chunk)

# Run the scraper over the links CSV stored in Drive; num_links=None applies
# no limit, so every data row in the file is processed.
scrape_projects_from_csv('/content/drive/MyDrive/project_links.csv', num_links=None)