========================= READING THE DATA ==========================

In [None]:
# ============================================
# STEP 1 : IMPORT LIBRARIES
# ============================================

import requests
from bs4 import BeautifulSoup
import os
import time


# ----------------------------------------------------------------------
# Define a function to read the youtube video urls from a local folder
# ----------------------------------------------------------------------

def read_local_urls_file(file_name):
    """
    Reads a text file containing URLs, where each URL is on a new line.
    Returns a list of URLs or None if the file is not found.
    """
    try:
        with open(file_name, 'r', encoding='utf-8') as f:
            urls = [line.strip() for line in f if line.strip()]
        print(f"Successfully read {len(urls)} URLs from '{file_name}'.")
        return urls
    except FileNotFoundError:
        print(f"Error: The file '{file_name}' was not found. Please make sure it's in the same directory as the script.")
        return None
    except Exception as e:
        print(f"An error occurred while reading the file '{file_name}': {e}")
        return None


# ----------------------------------------------------------------------
# Define a function to read the blogs and articles urls from a local folder
# ----------------------------------------------------------------------

def scrape_web_page(url):
    """
    Fetches the content of a web page related to AI and extracts the main text.
    """
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status() # Raise an exception for bad status codes

        soup = BeautifulSoup(response.content, 'html.parser')

        # Get the title of the article
        title = soup.title.string if soup.title else "Untitled"

        # Find all paragraph tags and combine their text
        paragraphs = soup.find_all('p')
        text = ' '.join([p.get_text() for p in paragraphs])

        # Basic cleaning: remove extra whitespace and newlines
        cleaned_text = ' '.join(text.split())

        return title, cleaned_text
    except requests.exceptions.RequestException as e:
        print(f"Error fetching URL {url}: {e}")
        return None, None
    except Exception as e:
        print(f"Error parsing content from {url}: {e}")
        return None, None

# --- Main script execution ---

# Get URLs from youtube_urls.txt and process them.
youtube_urls = read_local_urls_file("youtube_urls.txt")
if youtube_urls:
    print(f"\nProcessing {len(youtube_urls)} YouTube URLs...")
    # Add your YouTube scraping logic here

# Get URLs from web_urls.txt and process them.
web_urls = read_local_urls_file("web_urls.txt")
if web_urls:
    # Create a directory to store the scraped content
    if not os.path.exists("articles"):
        os.makedirs("articles")

    articles_processed = 0

    for url in web_urls:
        print(f"Processing {url}...")
        title, content = scrape_web_page(url)
        if content:
            # Create a clean filename from the URL to save the content
            filename = url.replace("https://", "").replace("http://", "").replace("/", "_").replace(".", "_")
            output_filename = os.path.join("articles", f"{filename}.txt")

            with open(output_filename, "w", encoding="utf-8") as f:
                f.write(f"Title: {title}\n\n")
                f.write(content)

            articles_processed += 1
            print(f"Successfully scraped and saved to {output_filename}")

        # Be a good web citizen and add a small delay
        time.sleep(1)

    print("\n--- Process Complete ---")
    print(f"Successfully processed {articles_processed} articles.")
    print("The content is saved in the 'articles' directory.")


Successfully read 15 URLs from 'youtube_urls.txt'.

Processing 15 YouTube URLs...
Successfully read 10 URLs from 'web_urls.txt'.
Processing https://www.sprinklr.com/blog/ai-social-media-content-creation/...
Successfully scraped and saved to articles\www_sprinklr_com_blog_ai-social-media-content-creation_.txt
Processing https://www.gwi.com/blog/free-ai-tools-for-content-creation...
Successfully scraped and saved to articles\www_gwi_com_blog_free-ai-tools-for-content-creation.txt
Processing https://www.getblend.com/blog/10-best-ai-tools-to-use-for-content-creation/...
Successfully scraped and saved to articles\www_getblend_com_blog_10-best-ai-tools-to-use-for-content-creation_.txt
Processing https://www.hostpapa.com/blog/marketing/how-to-use-ai-for-content-creation/...
Successfully scraped and saved to articles\www_hostpapa_com_blog_marketing_how-to-use-ai-for-content-creation_.txt
Processing https://www.usemotion.com/blog/ai-content-creation...
Successfully scraped and saved to articles

In [None]:
# ==============================================================
# Check if the youtube videos have the transcript available
# ==============================================================

from youtube_transcript_api import YouTubeTranscriptApi
from youtube_transcript_api._errors import NoTranscriptFound, TranscriptsDisabled, VideoUnavailable

def has_transcript(video_id: str) -> bool:
    try:
        api = YouTubeTranscriptApi()
        api.fetch(video_id)
        return True
    except (NoTranscriptFound, TranscriptsDisabled, VideoUnavailable):
        return False

print(has_transcript("V5ych2rxtnQ"))
print(has_transcript("1MQ5ozIvgzE"))
print(has_transcript("tQ84XYcP-nA"))
print(has_transcript("S2TAa4P2IuY&t=347s"))
print(has_transcript("1MQ5ozIvgzE&t=171s"))
print(has_transcript("UKeCWtI_lfA"))  # Replace with any YouTube video ID


True
True
True
False
False
True


===================== FETCH THE TRANSCRIPTS FROM THE YOUTUBE VIDEOS =====================

In [None]:
import os
import re
from youtube_transcript_api import YouTubeTranscriptApi
from youtube_transcript_api._errors import NoTranscriptFound, TranscriptsDisabled, VideoUnavailable

# --- CONFIG ---
input_file = "youtube_urls.txt"   # Your file with one YouTube URL per line
output_folder = "YT_transcripts"   # Assign folder to save transcripts
os.makedirs(output_folder, exist_ok=True)


# -----------------------------------------
# --- FUNCTION: Extract Video ID ---
# -----------------------------------------

def extract_video_id(url: str) -> str:
    """
    Extract YouTube video ID from a URL (works with standard and shortened URLs)
    """
    match = re.search(r"v=([A-Za-z0-9_-]{11})", url)
    if match:
        return match.group(1)
    match = re.search(r"youtu\.be/([A-Za-z0-9_-]{11})", url)
    if match:
        return match.group(1)
    return None


# ---------------------------------------------------------
# --- FUNCTION: Fetch Transcript from the video ID ---
# ---------------------------------------------------------

def fetch_transcript(video_id: str):
    try:
        api = YouTubeTranscriptApi()
        transcript = api.fetch(video_id)
        # Combine text lines into a single string using dot notation
        full_text = " ".join([entry.text for entry in transcript])
        return full_text
    except (NoTranscriptFound, TranscriptsDisabled, VideoUnavailable):
        return None

# ---------------------------------------------------------
# --- SAVE the fetched youtube transcripts ---
# ---------------------------------------------------------

with open(input_file, "r") as f:
    urls = [line.strip() for line in f.readlines()]

for url in urls:
    video_id = extract_video_id(url)
    if not video_id:
        print(f"❌ Could not extract video ID from URL: {url}")
        continue

    print(f"Processing video ID: {video_id} ...")
    transcript_text = fetch_transcript(video_id)

    if transcript_text:
        output_path = os.path.join(output_folder, f"{video_id}.txt")
        with open(output_path, "w", encoding="utf-8") as out_file:
            out_file.write(transcript_text)
        print(f"✅ Transcript saved: {output_path}")
    else:
        print(f"⚠️ No transcript available for video: {video_id}")


Processing video ID: tQ84XYcP-nA ...
✅ Transcript saved: YT_transcripts\tQ84XYcP-nA.txt
Processing video ID: _SpyH8wTA-4 ...
✅ Transcript saved: YT_transcripts\_SpyH8wTA-4.txt
Processing video ID: S2TAa4P2IuY ...
✅ Transcript saved: YT_transcripts\S2TAa4P2IuY.txt
Processing video ID: 1MQ5ozIvgzE ...
✅ Transcript saved: YT_transcripts\1MQ5ozIvgzE.txt
Processing video ID: UKeCWtI_lfA ...
✅ Transcript saved: YT_transcripts\UKeCWtI_lfA.txt
Processing video ID: rM0xpwENa8I ...
✅ Transcript saved: YT_transcripts\rM0xpwENa8I.txt
Processing video ID: uxrSyA-VxWs ...
✅ Transcript saved: YT_transcripts\uxrSyA-VxWs.txt
Processing video ID: Tw9HButMNu8 ...
✅ Transcript saved: YT_transcripts\Tw9HButMNu8.txt
Processing video ID: LEJGFnjIWmQ ...
✅ Transcript saved: YT_transcripts\LEJGFnjIWmQ.txt
Processing video ID: V5ych2rxtnQ ...
✅ Transcript saved: YT_transcripts\V5ych2rxtnQ.txt
Processing video ID: -lSDKrX01xA ...
✅ Transcript saved: YT_transcripts\-lSDKrX01xA.txt
Processing video ID: 1MQ5ozIvgzE

In [None]:
#  ===================================
#  Load the youtube video transcripts
#  ===================================

import os

transcripts_folder = "YT_transcripts"
transcript_files = [f for f in os.listdir(transcripts_folder) if f.endswith(".txt")]

YT_transcripts_data = []

for file_name in transcript_files:
    file_path = os.path.join(transcripts_folder, file_name)
    with open(file_path, "r", encoding="utf-8") as f:
        text = f.read()
        YT_transcripts_data.append({
            "video_id": file_name.replace(".txt", ""),
            "text": text
        })

print(f"✅ Loaded {len(YT_transcripts_data)} transcripts.")
print("Example:", YT_transcripts_data[0]["video_id"], YT_transcripts_data[0]["text"][:500])


✅ Loaded 14 transcripts.
Example: tQ84XYcP-nA Every week there's a new AI tool making headlines, and right now there are more AI video generators than ever. But most of them don't work as well as you'd expect. Some generate great videos, but only if you stay within their style limits. If you try to get more creative, like detailed anime or wild fantasy worlds, they often mess up, and most of the time they're too expensive or just confusing to use. I've tested every major tool that's come out recently, and I've seen where they shine and wher


In [None]:
# ====================================
# chunk the youtube video transcripts
# ====================================


def chunk_text(text, max_length=500):  # max length define the length of each chunk
    """
    Split text into smaller chunks of approximately max_length words.
    """
    words = text.split()
    chunks = []
    for i in range(0, len(words), max_length):
        chunks.append(" ".join(words[i:i+max_length]))
    return chunks

yt_chunks = []

for item in YT_transcripts_data:
    chunks = chunk_text(item["text"], max_length=500)  # you can adjust length
    for i, chunk in enumerate(chunks):
        yt_chunks.append({
            "video_id": item["video_id"],
            "chunk_index": i,
            "text": chunk
        })

print(f"✅ Total YouTube chunks created: {len(yt_chunks)}")
print("Example chunk:", yt_chunks[0])

✅ Total YouTube chunks created: 96
Example chunk: {'video_id': 'tQ84XYcP-nA', 'chunk_index': 0, 'text': "Every week there's a new AI tool making headlines, and right now there are more AI video generators than ever. But most of them don't work as well as you'd expect. Some generate great videos, but only if you stay within their style limits. If you try to get more creative, like detailed anime or wild fantasy worlds, they often mess up, and most of the time they're too expensive or just confusing to use. I've tested every major tool that's come out recently, and I've seen where they shine and where they completely fall apart. So, in this video, I'm going to show you the AI video tools that actually deliver the kind of quality you'd want to publish. The best way to use them without wasting hours learning clunky software, and the one platform that ties everything together, so you can create full videos without switching between five different sites. Let's break it down. All right, let's

In [None]:
#  ====================================================================================
# chunk the youtube video transcripts with max_length of 200 characters for each chunk
#  ====================================================================================


# def chunk_text(text, max_length=500):
#     """
#     Split text into smaller chunks of approximately max_length words.
#     """
#     words = text.split()
#     chunks = []
#     for i in range(0, len(words), max_length):
#         chunks.append(" ".join(words[i:i+max_length]))
#     return chunks

# yt_chunks = []

# for item in transcripts_data:
#     chunks = chunk_text(item["text"], max_length=200)  # you can adjust length
#     for i, chunk in enumerate(chunks):
#         yt_chunks.append({
#             "video_id": item["video_id"],
#             "chunk_index": i,
#             "text": chunk
#         })

# print(f"✅ Total YouTube chunks created: {len(yt_chunks)}")
# print("Example chunk:", yt_chunks[0])


✅ Total YouTube chunks created: 104
Example chunk: {'video_id': 'V5ych2rxtnQ', 'chunk_index': 0, 'text': "If you've ever used Chat GBT to create content, you've probably run into the same issue I have. It sounds okay, but it never really sounds like you. Yeah, this sounds nothing like me. But what if I could tell you that you could visually teach AI how to sound like you? Your tone, the structure, and even your sense of humor, all by just dropping in your old work. The problem isn't that AI can't sound like you. The problem is there's no clear or creative way to show it how. That is, there wasn't until now. And I'm going to show you a tool that has helped me save hours of time in my creative workflow. With just a few inputs from my past work, you're going to see this thing work like magic. So, let me show you. Poppy AI. All right. Here we are in a brand new Poppy board. This is the visual space that you will be working in. And I'm going to start an AI chat window, which is not going to

In [None]:
#  =======================================
#  Generate embeddings for each YT chunk
#  =======================================

import os
from openai import OpenAI
from dotenv import load_dotenv
from tqdm import tqdm
import time

# Load API key from .env
load_dotenv()
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))


def get_yt_embedding(text):
    # Ensure text isn’t too long
    if len(text) > 8000:  # token proxy via characters
        text = text[:8000]
    try:
        response = client.embeddings.create(
            model="text-embedding-3-small",    # OpenAI embedding model
            input=text
        )
        return response.data[0].embedding
    except Exception as e:
        print(f"❌ Embedding failed: {e}")
        return None

# Loop with a pause to respect rate limits
for chunk in tqdm(yt_chunks):
    emb = get_yt_embedding(chunk['text'])
    if emb:
        chunk['embedding'] = emb
    time.sleep(0.5)  # add small delay


print(f"✅ Embeddings generated for {len(yt_chunks)} chunks.")


100%|██████████| 96/96 [01:21<00:00,  1.18it/s]

✅ Embeddings generated for 96 chunks.





In [None]:
#  ===========================================================
#  save generated YouTube embeddings locally as .json file)
#  ===========================================================

import json

# Example: storing locally as JSON
output_file = "embedded_yt_transcripts.json"   # Assign the file name to save the embedded data

with open(output_file, "w", encoding="utf-8") as f:
    json.dump(yt_chunks, f, ensure_ascii=False, indent=2)

print(f"✅ Embedded data saved to {output_file}")


✅ Embedded data saved to embedded_yt_transcripts.json


======================= BLOGS AND ARTICLES ========================

In [10]:
#  ===================================
#  Load the blogs and articles
#  ===================================


articles_folder = "articles"  # your folder containing article .txt files
article_files = [f for f in os.listdir(articles_folder) if f.endswith(".txt")]

articles_data = []

for file_name in article_files:
    file_path = os.path.join(articles_folder, file_name)
    with open(file_path, "r", encoding="utf-8") as f:
        text = f.read()
        articles_data.append({
            "article_id": file_name.replace(".txt", ""),
            "text": text
        })

print(f"✅ Loaded {len(articles_data)} articles.")
print("Example:", articles_data[1]["article_id"], articles_data[0]["text"][:500])


✅ Loaded 10 articles.
Example: blog_hootsuite_com_ai-content-creation-tools_ Title: 15 Free AI Tools For Content Creation | GWI

SHARE You’ve got five briefs due, a campaign launching tomorrow, and your designer is on leave. Stressful, right? That’s where AI tools can step in - to make the chaos more manageable. Marketers, strategists, and creatives are no longer just looking for tools to help them write faster, but are in desperate need of solutions that can do everything from analyzing audience behavior to creating visuals, optimizing content marketing, and more. That’


In [11]:
#  ===================================
#  Chunk the blogs and articles
#  ===================================

def chunk_text(text, max_length=500):
    words = text.split()
    chunks = []
    for i in range(0, len(words), max_length):
        chunks.append(" ".join(words[i:i+max_length]))
    return chunks

article_chunks = []

for item in articles_data:
    chunks = chunk_text(item["text"], max_length=500)  # adjust size if needed
    for i, chunk in enumerate(chunks):
        article_chunks.append({
            "article_id": item["article_id"],
            "chunk_index": i,
            "text": chunk
        })

print(f"✅ Total article chunks created: {len(article_chunks)}")
print("Example chunk:", article_chunks[0])


✅ Total article chunks created: 68
Example chunk: {'article_id': 'www_gwi_com_blog_free-ai-tools-for-content-creation', 'chunk_index': 0, 'text': "Title: 15 Free AI Tools For Content Creation | GWI SHARE You’ve got five briefs due, a campaign launching tomorrow, and your designer is on leave. Stressful, right? That’s where AI tools can step in - to make the chaos more manageable. Marketers, strategists, and creatives are no longer just looking for tools to help them write faster, but are in desperate need of solutions that can do everything from analyzing audience behavior to creating visuals, optimizing content marketing, and more. That’s why we’ve pulled together 15 of the most useful free AI tools on the web. These tools don’t just churn out text, they help you understand your audience, streamline workflows, and make sure your content cuts through the noise. We like to think of AI as not replacing human creativity but rather supercharging it. Today’s tools can draft copy, generate i

In [12]:
#  ===========================================
#  Generate embeddings for each article chunk
#  ===========================================

def get_art_embedding(text):
    
    if len(text) > 8000:    # Ensure text isn’t too long
        text = text[:8000]
    try:
        response = client.embeddings.create(
            model="text-embedding-3-small",
            input=text
        )
        return response.data[0].embedding
    except Exception as e:
        print(f"❌ Embedding failed: {e}")
        return None

# Loop with a pause to respect rate limits
for chunk in tqdm(article_chunks):
    emb = get_art_embedding(chunk['text'])
    if emb:
        chunk['embedding'] = emb
    time.sleep(0.5)  # add small delay


# for i, chunk in enumerate(article_chunks):
#     chunk["embedding"] = embeddings[i]

print(f"✅ Embeddings generated for {len(article_chunks)} article chunks.")


100%|██████████| 68/68 [00:52<00:00,  1.29it/s]

✅ Embeddings generated for 68 article chunks.





In [None]:
#  ===========================================
#  Save the article embeddings
#  ===========================================

import json

output_file = "embedded_articles.json"     # Assign the file name to save the embedded article data

with open(output_file, "w", encoding="utf-8") as f:
    json.dump(article_chunks, f, ensure_ascii=False, indent=2)

print(f"✅ Embedded article data saved to {output_file}")


✅ Embedded article data saved to embedded_articles.json


In [None]:
# Importing the Youtube chunks

import json

file_path_1 = "/content/drive/MyDrive/Ironhack_final_project/embedded_yt_transcripts.json" # CHANGE THE FILE PATH IF YOU ARE USING THE VS CODE

with open(file_path_1, "r") as f:
      yt_embedded_chunks = json.load(f)

print(f"✅ Loaded {len(yt_embedded_chunks)} youtube transcript chunks")
print(yt_embedded_chunks[0])  # preview first chunk



# Importing the article chunks

file_path_2 = "/content/drive/MyDrive/Ironhack_final_project/embedded_articles.json" # CHANGE THE FILE PATH IF YOU ARE USING THE VS CODE

with open(file_path_2, "r") as f:
      article_embedded_chunks = json.load(f)

print(f"✅ Loaded {len(article_embedded_chunks)} article chunks")
print(article_embedded_chunks[0])  # preview first chunk


In [None]:
import os
# =========================================================================
# Comnbining the both embedded chunks into one named 'all_embedded_chunks'
# =========================================================================

all_embedded_chunks = yt_embedded_chunks + article_embedded_chunks

print(f"Total chunks: {len(all_embedded_chunks)}")
print("Example chunk:", all_embedded_chunks[45])

# Save to JSON

all_embedded_file_path = "/content/drive/MyDrive/Ironhack_final_project/"   # assign the folder path to save as JSON file.

# make sure the folder exists
os.makedirs(all_embedded_file_path, exist_ok=True)

# full file path
file_path = os.path.join(all_embedded_file_path, "all_embedded_chunks.json")

# save as json
with open(file_path, "w", encoding="utf-8") as f:
    json.dump(all_embedded_chunks, f, ensure_ascii=False, indent=4)

print(f"Saved {len(all_embedded_chunks)} chunks to {file_path}")

print("✅Saved as all_embedded_chunks.json")