# Project: TDS Virtual TA Requirement Checklist
- [X] Scrape "TDS Discourse posts with content from 1 Jan 2025 - 14 Apr 2025."
- [X] Scrape "Course content with content for TDS Jan 2025 as on 15 Apr 2025."


## Session handling
| Step | Description | Screenshot |
| --- | --- | --- |
| 1 | Inorder to update the cookies and user agents, I have used below script. <br> It is important to note that I have acquired the intial cookies together with request headers manually <br> I have stored those informations in the form of dictionaries in the files headers.json and cookies.json before running below script. <br> One more functionality i have added to the script is to rotate User-Agent to avoid any blocking of sessions from the server. <br> since the cookies contains confidential informations, i have added both the file names in `.gitignore`, <br> therefore any subsequesnt running from outside of my enviornment need this files added manually | ![image.png](attachment:image.png) |

In [238]:
import requests
import json
import random
import os

USER_AGENTS = [
    # Popular desktop UAs
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 Chrome/114.0 Safari/537.36",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 Chrome/113.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 Version/14.0 Safari/605.1.15",
    "Mozilla/5.0 (X11; Linux aarch64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/137.0.0.0 Safari/537.36 CrKey/1.54.250320"
]

def get_discourse_session(url: str = "https://discourse.onlinedegree.iitm.ac.in/search?q=%23courses%3Atds-kb%20after%3A2024-12-31%20before%3A2025-03-15&page=1"):
    session = requests.Session()
    headers = {}
    cookies = {}

    try:
        if os.path.exists("headers.json"):
            with open("headers.json") as f:
                headers = json.load(f)

        if os.path.exists("cookies.json"):
            with open("cookies.json") as f:
                cookies = json.load(f)

        headers["referer"] = url
        # Make request. Disables the certificate verification as i am getting a lot of failures around this
        response = session.get(url, verify=False, headers=headers, cookies=cookies)
        print(response)

        if response.status_code != 200:
            print("Error:", response.status_code)
            return {"error": "Failed to retrieve data"}

        return response.json()
    
    except requests.RequestException as e:
        print("Request error:", e)
        return {"error": str(e)}
    
    except json.JSONDecodeError as e:
        print("Failed to parse JSON file:", e)
        return {"error": "Invalid JSON in headers or cookies"}
    
    except Exception as e:
        print("Unexpected error:", e)
        return {"error": str(e)}

    finally:
        
        # Save cookies and headers
        # cookies["_t"] = session.cookies.get("_t")
        # if cookies["_t"]:
        #     with open("cookies.json", "w") as f:
        #         json.dump(cookies, f)

        # headers["user-agent"] = random.choice(USER_AGENTS)
        # headers["x-csrf-token"] = response.request.headers.get("x-csrf-token")
        # if headers["x-csrf-token"]:
        #     with open("headers.json", "w") as f:
        #         json.dump(headers, f)

        session.close()



In [None]:
# Test the session creation and closure functionality
print(get_discourse_session())
print(get_discourse_session())
print(get_discourse_session())
print(get_discourse_session())

## Scrape Discourse data
| Step | Description | Screenshot |
| --- | --- | --- |
| 1 | As first step i have filtered the Topics/posts as in the attached screenshot | ![image.png](attachment:image.png) |
| 2 | `95` results as search result. As per the inspections in the Chrome dev tools, <br> found that there are two url navigations triggered as part of the Search result page loading. | https://discourse.onlinedegree.iitm.ac.in/search?q=%23courses%3Atds-kb%20after%3A2024-12-31%20before%3A2025-03-15&page=1 <br> https://discourse.onlinedegree.iitm.ac.in/search?q=%23courses%3Atds-kb%20after%3A2024-12-31%20before%3A2025-03-15&page=2 |
| 3 | Found a json object with this structure.  | ![image-2.png](attachment:image-2.png) |

In [None]:
import requests
from urllib.parse import urlencode

baseurl = "https://discourse.onlinedegree.iitm.ac.in/search"
params1 = {
    "q": "#courses:tds-kb after:2024-12-31 before:2025-03-15",
    "page": 1
}
params2 = {
    "q": "#courses:tds-kb after:2024-12-31 before:2025-03-15",
    "page": 2
}

def scrape_data(url, params):
    # Construct the full URL with query parameters
    full_url = f"{url}?{urlencode(params)}"

    return get_discourse_session(full_url)

p1 = scrape_data(baseurl, params1)
p2 = scrape_data(baseurl, params2)

## p1 and p2 have structure similar to below image
![image.png](attachment:image.png)

In [78]:
# Script to verify the json structure
import json

def print_json_structure(data, prefix=""):
    if isinstance(data, dict):
        for key, value in data.items():
            new_prefix = f"{prefix}.{key}" if prefix else key
            print(f"{new_prefix} : {type(value).__name__}")
            print_json_structure(value, new_prefix)
    elif isinstance(data, list):
        print(f"{prefix}[] : list of {type(data[0]).__name__}" if data else f"{prefix}[] : list")
        if data:
            print_json_structure(data[0], prefix + "[]")

In [None]:
# Check structure
print_json_structure(p1)

In [None]:
# Verify the results
print(p1)
print(p2)

In [74]:
# Verify the count of posts in each page
countp1 = 0
countp2 = 0
for post in p1['posts']:
    countp1 += 1
for post in p2['posts']:
    countp2 += 1

print(f"Page 1 has {countp1} posts.")
print(f"Page 2 has {countp2} posts.")

# Verify the types of the results
print(type(p1))
print(type(p2))

Page 1 has 50 posts.
Page 2 has 45 posts.
<class 'dict'>
<class 'dict'>


## Combine all posts
| Step | Description | Screenshot |
| --- | --- | --- |
| 1 | If we expand `posts` key further, it contains a list of posts with attched key values pairs. | ![image.png](attachment:image.png) |
| 2 | Each post contains a post id and topic id, which probabably help to formulate the source urls. | Look above |
| 3 | In additin to that we have two seperate list of posts which we need to combine. | |


In [76]:
import json

merged = p1.copy()

for key, val in p2.items():
    if key == 'posts' and isinstance(val, list):
        # concatenate the two lists
        merged['posts'] = p1.get('posts', []) + val
    else:
        # for other keys, p2’s value wins
        merged[key] = val

json_data_str = json.dumps(merged, indent=4)


In [77]:
# Verifying after the merge the count of post is same as the sum of previous two post lists (50 + 45)
countmerged = 0
for post in merged['posts']:
    countmerged += 1

print(f"Merged data has {countmerged} posts.")

Merged data has 95 posts.


In [None]:
# Verify structure
print_json_structure(merged)

#### Remove unnecessary keys from the data

In [82]:
keys_to_remove = [
    "topics",
    "users",
    "categories",
    "tags",
    "groups",
    "grouped_search_result"
]

json_data = json.loads(json_data_str)
json_data_filtered = {k: v for k, v in json_data.items() if k not in keys_to_remove}

json_data_filtered_json = json.dumps(json_data_filtered, indent=2)

In [83]:
print_json_structure(json_data_filtered)

posts : list
posts[] : list of dict
posts[].id : int
posts[].name : str
posts[].username : str
posts[].avatar_template : str
posts[].created_at : str
posts[].like_count : int
posts[].blurb : str
posts[].post_number : int
posts[].topic_id : int


#### I thought of creating a list of set (topic_id, post_id), individual posts contains too less informations to formulate answers for a query. The answers will be a symantic formulation of whole messages within a post. In such cases it is really hard to point to a single post as refernce url. Rather i choose to use topic URL.


In [88]:
list_topic_id = [entry["topic_id"] for entry in json_data_filtered["posts"]]

In [89]:
print(list_topic_id)

[161071, 163247, 166189, 169029, 169283, 169888, 164277, 168916, 169807, 169369, 168832, 168506, 169393, 169352, 166576, 169247, 99838, 168449, 169045, 168537, 168987, 168901, 168943, 168825, 141413, 168476, 166651, 168515, 168567, 168384, 168458, 168482, 168011, 168310, 168303, 168142, 165959, 168143, 168057, 168017, 167878, 166816, 167679, 167344, 167415, 167471, 167410, 167699, 167172, 164214, 166891, 167072, 166738, 166647, 166634, 166498, 165416, 165433, 161120, 166303, 166357, 166349, 165687, 166100, 165396, 165922, 165830, 165746, 165593, 23335, 163158, 165142, 164869, 164737, 164462, 164460, 164291, 164147, 163765, 164205, 164089, 163381, 163241, 163224, 163147, 161072, 163144, 162425, 161214, 160251, 160773, 161083, 169456, 166866, 166593]


In [90]:
len(list_topic_id)

95

#### Traverse through each topic URL and record those data. Structure of the response json contains this structure.
![image.png](attachment:image.png)

#### Find number of posts under each topic


In [None]:


https://discourse.onlinedegree.iitm.ac.in/t/163247.json

In [None]:
for i in range(10):
    if post_id > 0:
        print(f"Fetching post {post_id} for topic {topic_id}")
    else:
        print(f"Fetching first post for topic {topic_id}")

In [None]:
def scrape_topic_data(topic_id):
    full_url = f"https://discourse.onlinedegree.iitm.ac.in/t/{topic_id}.json"

    return get_discourse_session(full_url)


#### Image understanding using gemini models. Need to set either GOOGLE_API_KEY or GEMINI_API_KEY as environemnte variable

In [288]:
from google import genai
from google.genai import types
from requests.exceptions import RequestException
import os
import mimetypes
from urllib.parse import urlparse

client = genai.Client(api_key=os.environ["GEMINI_API_KEY"])

def understand_image(image_path):
    print("Image:", image_path)
    try:
        path = urlparse(image_path).path
        mime_type, _ = mimetypes.guess_type(path)

        image_bytes = requests.get(image_path).content

        image = types.Part.from_bytes(
            data=image_bytes,mime_type=mime_type
        )
        response = client.models.generate_content(
            model="gemini-2.0-flash",
            contents=[image, "What is in this image?"],
        )
        return response.text
    
    except RequestException as e:
        print(f"[Warning] Failed to download image header: {e}")
        return None

Both GOOGLE_API_KEY and GEMINI_API_KEY are set. Using GOOGLE_API_KEY.


In [299]:
from google import genai
from google.genai import types
from requests.exceptions import RequestException
import os
import mimetypes
from urllib.parse import urlparse
import httpx
from openai import OpenAI

client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"),http_client=httpx.Client(verify=False))

def understand_image_openai(image_path):
    print("Image:", image_path)
    try: 
        question = [
            {"type": "text", "text": "What does this image show?"},
            {
                "type": "image_url",
                "image_url": {
                    "url": image_path,
                    "detail": "high"  # or "low"
                }
            }   
        ]

        response = client.chat.completions.create(
            model="gpt-4o",  # or "gpt-4"
            messages=[
                {"role": "user", "content": question}
            ],
            temperature=0.3,
            max_tokens=1000
        )

        return response.choices[0].message.content
    
    except RequestException as e:
        print(f"[Warning] Failed to download image header: {e}")
        return None

In [300]:

print(understand_image_openai("https://europe1.discourse-cdn.com/flex013/uploads/iitm/optimized/3X/e/b/ebc5f88e712a270b0763135c5a220d2fcd690c71_2_690x256.png"))

Image: https://europe1.discourse-cdn.com/flex013/uploads/iitm/optimized/3X/e/b/ebc5f88e712a270b0763135c5a220d2fcd690c71_2_690x256.png
The image shows a screenshot of a code editor with a Python script and an error message. The script is making a POST request to an API, likely from OpenAI, to list valid English words from a given set. The error message indicates a status code 429, which means "Too Many Requests." The message suggests that the user has exceeded their current quota and should check their plan and billing details. It also provides a link to OpenAI's documentation for more information.


In [290]:
path = urlparse("https://europe1.discourse-cdn.com/flex013/uploads/iitm/original/3X/9/8/98ee116ce238aa6d9ea75357ff3194592c56a173.gif").path
print(path)

mime_type, _ = mimetypes.guess_type(path)
print(mime_type)



/flex013/uploads/iitm/original/3X/9/8/98ee116ce238aa6d9ea75357ff3194592c56a173.gif
image/gif


#### Testing topic data scraping

In [None]:
from bs4 import BeautifulSoup

topic_data = scrape_topic_data(161120)
question = 0
for post in topic_data.get("post_stream", {}).get("posts", []):
    r = post.get('cooked', '<p>No content available<p>')
    soup = BeautifulSoup(r, 'html.parser')

    print(topic_data.get("slug"))
    print(topic_data.get("title"))
    if question == 0:
        print(f"Question: {soup.get_text(separator=' ',strip=True)}\n")
    else:
        print(f"Response {question}: {soup.get_text(separator=' ',strip=True)}\n")
    images = soup.find_all('img')
    if images:
        for img in images:
            if img.get('src'):
                if "slight_smile.png" not in img.get('src'):
                    print(understand_image(img.get('src')))
    question += 1

#### Testing the delay setting while scraping

In [96]:

from datetime import datetime, timedelta
import time

now_local = time.time()
print(now_local)
time.sleep(5)
print(time.time() - now_local)

1750020999.1067069
5.005321025848389


In [97]:
if all(sub not in "https://emoji.discourse-cdn.com/google/clap.png?v=12" for sub in ["slight_smile.png","emoji"]):
    print(1)

In [None]:
topic_data = {}

In [None]:
topic_data_summary = {}
for topic_id in list_topic_id:
    print(topic_id,"\n")
    topic_data = scrape_topic_data(topic_id)
    topic_data_summary[topic_id] = {
        "title": topic_data.get("title", ' '),
        "posts_count": topic_data.get("posts_count", 0),
        "highest_post_number": topic_data.get("highest_post_number", 0),
        "chunk_size": topic_data.get("chunk_size", 0),
        "slug": topic_data.get("slug", ' ')
    }

In [240]:
topic_data_summary

{161071: {'title': 'Which subject to choose in jan term',
  'posts_count': 5,
  'highest_post_number': 5,
  'chunk_size': 20,
  'slug': 'which-subject-to-choose-in-jan-term',
  'posts': [{'id': 575209,
    'post_number': 1,
    'cooked': '<p>Hii all…i know this may not be the correct platform to ask this question.<br>\nActually i have 3 diploma subjects to cover - MLP, JAVA, TDS. So which 2 subject should i choose for this term?</p>',
    'post_url': '/t/which-subject-to-choose-in-jan-term/161071/1',
    'created_at': '2025-01-01T16:04:48.996Z',
    'updated_at': '2025-01-01T16:04:48.996Z',
    'accepted_answer': False},
   {'id': 575247,
    'post_number': 2,
    'cooked': '<p>Don’t take TDS, this subject should be taken in the end else you might have some problem</p>',
    'post_url': '/t/which-subject-to-choose-in-jan-term/161071/2',
    'created_at': '2025-01-01T16:49:57.552Z',
    'updated_at': '2025-01-01T16:49:57.552Z',
    'accepted_answer': False},
   {'id': 575349,
    'post_

In [231]:
def scrape_post_data(topic_id,post_id):
    full_url = f"https://discourse.onlinedegree.iitm.ac.in/t/{topic_id}/{post_id}.json"

    return get_discourse_session(full_url)

In [258]:
temp_post_details = get_discourse_session("https://discourse.onlinedegree.iitm.ac.in/t/169807/5.json")



<Response [200]>


In [261]:
print(json.dumps(temp_post_details, indent=4))

{
    "post_stream": {
        "posts": [
            {
                "id": 606330,
                "name": "AnandMurti",
                "username": "22ds2000011",
                "avatar_template": "/user_avatar/discourse.onlinedegree.iitm.ac.in/22ds2000011/{size}/14423_2.png",
                "created_at": "2025-03-13T15:57:41.320Z",
                "cooked": "<p>Hi Carlton,<br>\nI am writing to confirm the basic requirements for successfully completing the course.</p>\n<p>As I understand, the criteria for passing are:</p>\n<ol>\n<li>Achieving a score greater than 40 marks in the graded assignments.</li>\n<li>Securing more than 40 marks in the end-term examination.</li>\n</ol>\n<p>Could you kindly confirm if this understanding is accurate? Additionally, please let me know if there are any other prerequisites to be met.</p>",
                "post_number": 1,
                "post_type": 1,
                "posts_count": 2,
                "updated_at": "2025-03-13T15:57:41.320Z",


In [None]:
for k, v in topic_data_summary.items():
    temp_post_uid_list = []
    temp_post_list = []
    for i in range(5, v['highest_post_number']+1, 20):
        post_details = scrape_post_data(k, i)
        if post_details and 'post_stream' in post_details:
            for j in post_details['post_stream']['posts']:
                post_id = j['id']
                if post_id not in temp_post_uid_list:
                    temp_post_dict = {
                        "id": post_id,
                        "post_number": j['post_number'],
                        "cooked": j['cooked'],
                        "post_url": j['post_url'],
                        "created_at": j['created_at'],
                        "updated_at": j['updated_at'],
                        "accepted_answer": j['accepted_answer']
                    }
                    temp_post_list.append(temp_post_dict)
                    temp_post_uid_list.append(post_id)
    topic_data_summary[k]['posts'] = temp_post_list

In [243]:
for k, v in topic_data_summary.items():
    print(f"Topic ID: {k}, Post Count: {v['posts_count']}, Actual Posts Count: {len(v['posts'])}")

Topic ID: 161071, Post Count: 5, Actual Posts Count: 5
Topic ID: 163247, Post Count: 149, Actual Posts Count: 149
Topic ID: 166189, Post Count: 21, Actual Posts Count: 21
Topic ID: 169029, Post Count: 690, Actual Posts Count: 690
Topic ID: 169283, Post Count: 44, Actual Posts Count: 44
Topic ID: 169888, Post Count: 27, Actual Posts Count: 27
Topic ID: 164277, Post Count: 614, Actual Posts Count: 611
Topic ID: 168916, Post Count: 23, Actual Posts Count: 20
Topic ID: 169807, Post Count: 2, Actual Posts Count: 0
Topic ID: 169369, Post Count: 28, Actual Posts Count: 28
Topic ID: 168832, Post Count: 113, Actual Posts Count: 113
Topic ID: 168506, Post Count: 5, Actual Posts Count: 5
Topic ID: 169393, Post Count: 2, Actual Posts Count: 0
Topic ID: 169352, Post Count: 2, Actual Posts Count: 0
Topic ID: 166576, Post Count: 103, Actual Posts Count: 103
Topic ID: 169247, Post Count: 3, Actual Posts Count: 0
Topic ID: 99838, Post Count: 3, Actual Posts Count: 0
Topic ID: 168449, Post Count: 80, Ac

In [281]:
outstanding_topics = {}
for k, v in topic_data_summary.items():
    print(f"Topic ID: {k}, Post Count: {v['posts_count']}, Actual Posts Count: {len(v['posts'])}")
    if len(v['posts']) < v['posts_count']:
        outstanding_topics[k] = topic_data_summary[k]


Topic ID: 161071, Post Count: 5, Actual Posts Count: 5
Topic ID: 163247, Post Count: 149, Actual Posts Count: 149
Topic ID: 166189, Post Count: 21, Actual Posts Count: 21
Topic ID: 169029, Post Count: 690, Actual Posts Count: 690
Topic ID: 169283, Post Count: 44, Actual Posts Count: 44
Topic ID: 169888, Post Count: 27, Actual Posts Count: 27
Topic ID: 164277, Post Count: 614, Actual Posts Count: 0
Topic ID: 168916, Post Count: 23, Actual Posts Count: 0
Topic ID: 169807, Post Count: 2, Actual Posts Count: 2
Topic ID: 169369, Post Count: 28, Actual Posts Count: 28
Topic ID: 168832, Post Count: 113, Actual Posts Count: 113
Topic ID: 168506, Post Count: 5, Actual Posts Count: 5
Topic ID: 169393, Post Count: 2, Actual Posts Count: 2
Topic ID: 169352, Post Count: 2, Actual Posts Count: 2
Topic ID: 166576, Post Count: 103, Actual Posts Count: 103
Topic ID: 169247, Post Count: 3, Actual Posts Count: 3
Topic ID: 99838, Post Count: 3, Actual Posts Count: 3
Topic ID: 168449, Post Count: 80, Actua

In [285]:
for k, v in outstanding_topics.items():
    if len(v['posts']) < v['posts_count']:
        print(f"Topic ID: {k}, Post Count: {v['posts_count']}, Actual Posts Count: {len(v['posts'])}")

In [284]:
len(outstanding_topics)

7

In [280]:
len(topic_data_summary.keys())

95

In [283]:
for k, v in outstanding_topics.items():
    temp_post_uid_list = []
    temp_post_list = []
    for i in range(0, v['highest_post_number']+1, 15):
        post_details = scrape_post_data(k, i)
        if post_details and 'post_stream' in post_details:
            for j in post_details['post_stream']['posts']:
                post_id = j['id']
                if post_id not in temp_post_uid_list:
                    temp_post_dict = {
                        "id": post_id,
                        "post_number": j['post_number'],
                        "cooked": j['cooked'],
                        "post_url": j['post_url'],
                        "created_at": j['created_at'],
                        "updated_at": j['updated_at'],
                        "accepted_answer": j['accepted_answer']
                    }
                    temp_post_list.append(temp_post_dict)
                    temp_post_uid_list.append(post_id)
    outstanding_topics[k]['posts'] = temp_post_list




<Response [200]>




<Response [200]>




<Response [200]>




<Response [200]>




<Response [200]>




<Response [200]>




<Response [200]>




<Response [200]>




<Response [200]>




<Response [200]>




<Response [200]>




<Response [200]>




<Response [200]>




<Response [200]>




<Response [200]>




<Response [200]>




<Response [200]>




<Response [200]>




<Response [200]>




<Response [200]>




<Response [200]>




<Response [200]>




<Response [200]>




<Response [200]>




<Response [200]>




<Response [200]>




<Response [200]>




<Response [200]>




<Response [200]>




<Response [200]>




<Response [200]>




<Response [200]>




<Response [200]>




<Response [200]>




<Response [200]>




<Response [200]>




<Response [200]>




<Response [200]>




<Response [200]>




<Response [200]>




<Response [200]>




<Response [200]>




<Response [200]>




<Response [200]>




<Response [200]>




<Response [200]>




<Response [200]>




<Response [200]>




<Response [200]>




<Response [200]>




<Response [200]>




<Response [200]>




<Response [200]>




<Response [200]>




<Response [200]>




<Response [200]>




<Response [200]>




<Response [200]>




<Response [200]>




<Response [200]>




<Response [200]>




<Response [200]>




<Response [200]>




<Response [200]>




<Response [200]>




<Response [200]>




<Response [200]>




<Response [200]>




<Response [200]>


In [286]:
for k, v in outstanding_topics.items():
    if len(v['posts']) == v['posts_count']:
        print(f"Topic ID: {k}, Post Count: {v['posts_count']}, Actual Posts Count: {len(v['posts'])}")
        topic_data_summary[k] = outstanding_topics[k]

Topic ID: 164277, Post Count: 614, Actual Posts Count: 614
Topic ID: 168916, Post Count: 23, Actual Posts Count: 23
Topic ID: 168449, Post Count: 80, Actual Posts Count: 80
Topic ID: 166651, Post Count: 1, Actual Posts Count: 1
Topic ID: 165396, Post Count: 24, Actual Posts Count: 24
Topic ID: 23335, Post Count: 42, Actual Posts Count: 42
Topic ID: 161083, Post Count: 131, Actual Posts Count: 131


In [241]:
topic_data_summary

{161071: {'title': 'Which subject to choose in jan term',
  'posts_count': 5,
  'highest_post_number': 5,
  'chunk_size': 20,
  'slug': 'which-subject-to-choose-in-jan-term',
  'posts': [{'id': 575209,
    'post_number': 1,
    'cooked': '<p>Hii all…i know this may not be the correct platform to ask this question.<br>\nActually i have 3 diploma subjects to cover - MLP, JAVA, TDS. So which 2 subject should i choose for this term?</p>',
    'post_url': '/t/which-subject-to-choose-in-jan-term/161071/1',
    'created_at': '2025-01-01T16:04:48.996Z',
    'updated_at': '2025-01-01T16:04:48.996Z',
    'accepted_answer': False},
   {'id': 575247,
    'post_number': 2,
    'cooked': '<p>Don’t take TDS, this subject should be taken in the end else you might have some problem</p>',
    'post_url': '/t/which-subject-to-choose-in-jan-term/161071/2',
    'created_at': '2025-01-01T16:49:57.552Z',
    'updated_at': '2025-01-01T16:49:57.552Z',
    'accepted_answer': False},
   {'id': 575349,
    'post_

In [308]:
image_dictionary = {}
for topic_id in [161071,163247,166189]:
    image_dictionary[topic_id] = []
    for post in topic_data_summary[topic_id].get("posts", []):
        image_dictionary[topic_id].append(post.get("id"))

In [309]:
image_dictionary

{161071: [575209, 575247, 575349, 576005, 635402],
 163247: [579668,
  579673,
  580013,
  580073,
  581443,
  581855,
  582119,
  582598,
  582639,
  582722,
  582744,
  582749,
  582810,
  583185,
  583854,
  583913,
  583919,
  584032,
  584038,
  584042,
  584257,
  584261,
  584413,
  584421,
  584453,
  584459,
  586911,
  586942,
  587025,
  587062,
  587070,
  587175,
  587180,
  587188,
  587193,
  587196,
  587325,
  587371,
  587379,
  587575,
  587577,
  587882,
  588058,
  588067,
  588214,
  588213,
  588228,
  588278,
  588283,
  588333,
  588423,
  588469,
  588481,
  588484,
  588749,
  589234,
  589391,
  589614,
  589632,
  590138,
  590143,
  590322,
  590325,
  590342,
  590350,
  590386,
  590397,
  590398,
  590422,
  590530,
  590614,
  590620,
  590626,
  590627,
  590666,
  590677,
  590682,
  590701,
  590709,
  590740,
  590786,
  590788,
  590789,
  590795,
  590796,
  590800,
  590801,
  590803,
  590804,
  590811,
  590813,
  590838,
  590867,
  590869,
 

In [311]:
from bs4 import BeautifulSoup
import time

now_local = 5

for topic_id, topic_data in topic_data_summary.items():
    print(topic_id,"\n")
    if topic_id in [161071,163247,166189]:
        try:
            file_exists = os.path.exists(f'discourse/{topic_id}.md')
            with open(f'discourse/{topic_id}.md', 'a', encoding='utf-8') as f:
                if file_exists:
                    f.write("\n")
                else:
                    image_dictionary[topic_id] = []
                    print(f"Title: {topic_data.get('title' , ' ')}\n")
                    f.write(f"# Title: {topic_data.get('title' , ' ')}\n")
                    print(f"source_url: https://discourse.onlinedegree.iitm.ac.in/t/{topic_data.get('slug' , '')}/{topic_id}\n\n")
                    f.write(f"[View on Discourse](https://discourse.onlinedegree.iitm.ac.in/t/{topic_data.get('slug' , '')}/{topic_id})\n\n")

                for post in topic_data.get("posts", []):
                    if post.get('id') not in image_dictionary[topic_id]:
                        r = post.get('cooked', '<p>No content available<p>')
                        soup = BeautifulSoup(r, 'html.parser')

                        f.write(f"Post {post.get('post_number', '')}: {soup.get_text(separator=' ',strip=True)}\n")
                        images = soup.find_all('img')

                        if images:
                            for img in images:
                                if img.get('src'):
                                    if all(sub not in img.get('src') for sub in ["slight_smile.png","emoji","avatar","favicon","logo"]):
                                        if (time.time() - now_local) < 5:
                                            time.sleep(5)
                                        now_local = time.time()
                                        path = urlparse(img.get('src')).path
                                        if path.endswith(('.png', '.webp', '.jpeg', '.jpg')):
                                            image_def = understand_image_openai(img.get('src'))
                                            if image_def:
                                                f.write(f"![{image_def}]({img.get('src')})\n")

                        f.write(f"[View on Discourse](https://discourse.onlinedegree.iitm.ac.in/t/{topic_data.get('slug' , '')}/{topic_id}/{post.get('post_number', '')})\n\n")
                        f.write("\n")
                        question += 1
                        image_dictionary[topic_id].append(post.get('id', ''))
                    else:
                        print(f"Post '{post.get('id', '')}' already updated; will not overwrite.")
        except Exception as e:
            print(f"Error processing file '{topic_id}.md': {e}")

161071 

Title: Which subject to choose in jan term

source_url: https://discourse.onlinedegree.iitm.ac.in/t/which-subject-to-choose-in-jan-term/161071


163247 

Title: GA3 - Large Language Models - Discussion Thread [TDS Jan 2025]

source_url: https://discourse.onlinedegree.iitm.ac.in/t/ga3-large-language-models-discussion-thread-tds-jan-2025/163247


Image: https://europe1.discourse-cdn.com/flex013/uploads/iitm/optimized/3X/1/d/1d37c6ff7591a3175f7be06068d9025f2627e65b_2_690x314.png
Image: https://europe1.discourse-cdn.com/flex013/uploads/iitm/original/3X/a/c/ac8e969c93aa57f9b61d8e5a90ddf2a6174220e5.png
Image: https://europe1.discourse-cdn.com/flex013/uploads/iitm/original/3X/4/0/4014d114b8ab5a993183871727062efe6a839400.png
Image: https://europe1.discourse-cdn.com/flex013/uploads/iitm/optimized/3X/a/a/aa81c404ee3eb793693a5bc6406886bd079e1635_2_690x347.png
Image: https://europe1.discourse-cdn.com/flex013/uploads/iitm/optimized/3X/e/b/ebc5f88e712a270b0763135c5a220d2fcd690c71_2_690x256.

In [305]:
topic_data_summary[169029]

{'title': 'Project 2 - TDS Solver - Discussion Thread',
 'posts_count': 690,
 'highest_post_number': 715,
 'chunk_size': 20,
 'slug': 'project-2-tds-solver-discussion-thread',
 'posts': [{'id': 602836,
   'post_number': 1,
   'cooked': '<p>Please post any questions related to <a href="https://tds.s-anand.net/#/project-2">Project 2 - TDS Solver</a>.</p>\n<p>Deadline: <span class="discourse-local-date" data-date="2025-03-31" data-email-preview="2025-03-31T18:29:00Z UTC" data-format="LLLL" data-time="23:59:00" data-timezone="Asia/Calcutta">Monday, March 31, 2025 6:29 PM</span></p>',
   'post_url': '/t/project-2-tds-solver-discussion-thread/169029/1',
   'created_at': '2025-03-03T03:42:19.146Z',
   'updated_at': '2025-03-03T03:42:19.146Z',
   'accepted_answer': False},
  {'id': 602837,
   'post_number': 2,
   'cooked': '',
   'post_url': '/t/project-2-tds-solver-discussion-thread/169029/2',
   'created_at': '2025-03-03T03:43:13.472Z',
   'updated_at': '2025-03-03T03:43:13.472Z',
   'accept

In [None]:
from bs4 import BeautifulSoup
import time

now_local = 5

for topic_id in list_topic_id:
    print(topic_id,"\n")
    try:
        with open(f'discourse/{topic_id}.txt', 'x', encoding='utf-8') as f:
            topic_data = scrape_topic_data(topic_id)
            if "error" not in topic_data:
                print(f"Title: {topic_data.get('title' , ' ')}\n")
                f.write(f"Title: {topic_data.get('title' , ' ')}\n")
                print(f"source_url: https://discourse.onlinedegree.iitm.ac.in/t/{topic_data.get('slug' , '')}/{topic_id}\n\n")
                f.write(f"source_url: https://discourse.onlinedegree.iitm.ac.in/t/{topic_data.get('slug' , '')}/{topic_id}\n\n")
                question = 0
                for post in topic_data.get("post_stream", {}).get("posts", []):
                    r = post.get('cooked', '<p>No content available<p>')
                    soup = BeautifulSoup(r, 'html.parser')
                    if question == 0:
                        f.write(f"Question: {soup.get_text(separator=' ',strip=True)}\n")
                    else:
                        f.write(f"Response {question}: {soup.get_text(separator=' ',strip=True)}\n")
                    
                    f.write("\n")
                    images = soup.find_all('img')

                    if images:
                        for img in images:
                            if img.get('src'):
                                if all(sub not in img.get('src') for sub in ["slight_smile.png","emoji","avatar","favicon","logo"]):
                                    if (time.time() - now_local) < 5:
                                        time.sleep(5)
                                    now_local = time.time()
                                    image_def = understand_image(img.get('src'))
                                    if image_def:
                                        f.write(f"Image explanation: {image_def}\n")
                    question += 1
            else:
                print(f"Failed to retrieve data for topic ID {topic_id}: {topic_data['error']}")
    except FileExistsError:
        print(f"File '{topic_id}.txt' already exists; will not overwrite.") 

In [None]:
# Ignore below script, it is used to rename files in the discourse directory in certain circumstances earlier.
import os

directory = "discourse/"
for filename in os.listdir(directory):
    if filename.lower().endswith('.txt') and ' ' in filename:
        original_path = os.path.join(directory, filename)
        new_name = filename.split(' ')[0]
        new_path = os.path.join(directory, new_name)
        print(original_path, new_path)

        try:
            os.rename(original_path, new_path)
            print(f"Renamed '{filename}' to '{new_name}'.")
        except OSError as e:
            print(f"Error renaming '{filename}': {e}")
        

In [None]:
# This script is not required as the source urls and Title update is done in the previous script.
for topic_id in list_topic_id:

    path = f"discourse/{topic_id}.txt"
    topic_data = scrape_topic_data(topic_id)

    if "error" not in topic_data:
        # 1. Read the existing content
        with open(path, 'r', encoding='utf-8') as f:
            old = f.read()

        # 2. Re-open in write mode (this truncates the file) and write header + old
        with open(path, 'w', encoding='utf-8') as f:
            print(f"Title: {topic_data.get('title' , ' ')}\n")
            f.write(f"Title: {topic_data.get('title' , ' ')}\n")
            print(f"source_url: https://discourse.onlinedegree.iitm.ac.in/t/{topic_data.get('slug' , '')}/{topic_id}\n\n")
            f.write(f"source_url: https://discourse.onlinedegree.iitm.ac.in/t/{topic_data.get('slug' , '')}/{topic_id}\n\n")
            f.write(old)


In [99]:

import nltk

nltk.download('punkt', quiet=True)
nltk.download('punkt_tab')

def getchunks(text: str, max_chunk_chars: int = 1000, overlap_chars: int = 200):
    length = len(text)
    # Determine step size, ensure it's positive
    step = max_chunk_chars - overlap_chars if max_chunk_chars > overlap_chars else max_chunk_chars
    chunks = []
    # Slide window across the text
    for start in range(0, length, step):
        end = start + max_chunk_chars
        chunks.append(text[start:end])
        if end >= length:
            break
    return chunks

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/weigfhx/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [104]:

import nltk

nltk.download('punkt', quiet=True)
nltk.download('punkt_tab')

def semantic_split_text(text: str, max_chunk_chars: int = 1000):
    normalized = text.replace('\n\n', ' <NL> ')
    normalized = normalized.replace('\n', ' <NL> ')

    return getchunks(normalized)

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/weigfhx/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [None]:
with open("discourse/160773.txt", 'r', encoding='utf-8') as file:
    text = file.read()

semantic_split_text(text)

In [106]:
import re

def find_images_from_markdown(markdown_text):
    # Match inline image syntax ![alt](url)
    inline_img_pattern = r'!\[.*?\]\((.*?)\)'
    
    # Match reference-style image labels ![alt][label]
    ref_label_pattern = r'!\[.*?\]\[(.*?)\]'
    
    # Match the actual reference definitions [label]: url
    ref_def_pattern = r'\[(.*?)\]:\s*(\S+)'

    # Find inline image URLs
    inline_imgs = re.findall(inline_img_pattern, markdown_text)

    # Extract reference definitions
    ref_def_dict = dict(re.findall(ref_def_pattern, markdown_text))

    # Find all reference image labels
    ref_labels = re.findall(ref_label_pattern, markdown_text)

    # Resolve labels to actual URLs
    ref_imgs = [ref_def_dict.get(label) for label in ref_labels if label in ref_def_dict]

    # Combine all found image URLs
    all_images = inline_imgs + ref_imgs
    return all_images


In [None]:
for dirpath, _, filenames in os.walk("tds_backup"):
        for fname in filenames:
            if fname.lower().endswith('.md'):
                path = os.path.join(dirpath, fname)
                with open(path, 'r', encoding='utf-8') as f:
                    text = f.read()

                print(path," = ", find_images_from_markdown(text))

In [123]:
import re

def find_youtube_links(markdown_text):
    # Pattern to match YouTube links (both plain and inside markdown links)
    youtube_pattern = re.compile(
        r'https?://(?:www\.)?(?:youtube\.com/watch\?v=|youtu\.be/)[\w\-]+(?:[&?=\w\-]*)'
    )
    return youtube_pattern.findall(markdown_text)



In [117]:
# Extract video ID from URL
def extract_video_id(url):
    import re
    match = re.search(r"(?:v=|youtu\.be/)([a-zA-Z0-9_-]{11})", url)
    return match.group(1) if match else None

In [137]:
from youtube_transcript_api import YouTubeTranscriptApi

def get_youtube_transcript(video_url):
    # Extract video ID from the URL
    video_id = extract_video_id(video_url)
    try:
        transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)
        transcript = transcript_list.find_transcript(['en'])
        transcript_data = transcript.fetch()
        # full_text = "\n".join([item['text'] for item in transcript_data])
        return transcript_data
    except Exception as e:
        print(f"Error fetching transcript for {video_url}: {e}")
        return []


In [None]:
t = get_youtube_transcript('https://youtu.be/pqNCD_5r0IU')
print(t)

In [147]:
print(t.snippets[0].text)
print(type(t))

now we are live so we can start decision
<class 'youtube_transcript_api._transcripts.FetchedTranscript'>


In [152]:
yt = []
if t:
    for i in t.snippets:
        yt.append(i.text)

print(" ".join(yt))




In [124]:
full_yt_links = []
for dirpath, _, filenames in os.walk("tds_backup"):
    for fname in filenames:
        if fname.lower().endswith('.md'):
            path = os.path.join(dirpath, fname)
            with open(path, 'r', encoding='utf-8') as f:
                text = f.read()

            full_yt_links.extend(find_youtube_links(text))

In [None]:
full_yt_links

In [None]:
from tqdm import tqdm

full_yt_transcripts = []
for video_url in tqdm(full_yt_links):
    t = get_youtube_transcript(video_url)
    yt = []
    if t:
        for i in t.snippets:
            yt.append(i.text)
    full_yt_transcripts.append((video_url, " ".join(yt)))


In [157]:
total_chunks = []
for t in full_yt_transcripts:
    total_chunks.append(t[1])

import tiktoken
total_chunks_string = " ".join(total_chunks)

encoding = tiktoken.encoding_for_model("gpt-4o")
tokens = encoding.encode(total_chunks_string)
print(f"Number of tokens: {len(tokens)}")

Number of tokens: 1135137


In [162]:
def get_no_tokens(text: str):
    encoding = tiktoken.encoding_for_model("gpt-4o")
    tokens = encoding.encode(text)
    return len(tokens)

test_string ="Hello! It seems you haven't provided any text for me to summarize. Could you please share the text you'd like summarized?"
print(f"Number of tokens: {get_no_tokens(test_string)}")

Number of tokens: 24


In [156]:
len(total_chunks)

235

In [158]:
import openai
import httpx

client = openai.OpenAI(http_client=httpx.Client(verify=False))
final_yt_transcripts = []
for t in tqdm(full_yt_transcripts):
    response = client.chat.completions.create(
        model="gpt-4.0-mini",  # or "gpt-4"
        messages=[
            {"role": "developer", "content": "You are a helpful assistant. Summarize the following text without losing any important information."},
            {"role": "user", "content": t[1]},
        ],
        temperature=0.7
    )
    final_yt_transcripts.append((t[0], response.choices[0].message.content))

100%|██████████| 235/235 [25:21<00:00,  6.48s/it]


In [None]:
final_yt_transcripts

In [164]:
final_yt_transcripts_map = {}
for t in final_yt_transcripts:
    final_yt_transcripts_map[t[0]] = t[1]


In [None]:
final_yt_transcripts_map

In [175]:
# Replacement function
def replace_with_summary(match):
    full_url = match.group(1)
    summary = final_yt_transcripts_map.get(full_url)

    if summary:
        return f"{full_url}\n<youtube_summary>{summary}</youtube_summary>\n"
    else:
        return full_url

In [None]:
youtube_pattern = re.compile(r"(https?://(?:www\.)?(?:youtube\.com/watch\?v=|youtu\.be/)([\w-]+))")
with open("tds_backup/data-preparation-in-the-shell.md", "r", encoding="utf-8") as f:
    md_content = f.read()
updated_md = youtube_pattern.sub(replace_with_summary, md_content)

print(md_content)
print(updated_md)

In [None]:
youtube_pattern = re.compile(r"(https?://(?:www\.)?(?:youtube\.com/watch\?v=|youtu\.be/)([\w-]+))")
for dirpath, _, filenames in os.walk("tds"):
    for fname in filenames:
        if fname.lower().endswith('.md'):
            path = os.path.join(dirpath, fname)
            with open(path, 'r', encoding='utf-8') as f:
                text = f.read()
            links = find_youtube_links(text)
            print(links)

            for link in links:
                if link in final_yt_transcripts_map.keys():
                    if get_no_tokens(final_yt_transcripts_map[link]) > 30:
                        replacement = f"{link}\n<youtube_summary>{final_yt_transcripts_map[link]}</youtube_summary>\n"

                        with open(path, "r", encoding="utf-8") as f:
                            md_content = f.read()

                        updated_md = md_content.replace(link, replacement)
                        with open(path, "w", encoding="utf-8") as f:
                            f.write(updated_md)


In [178]:
print(links)

[]


In [180]:

import nltk
import re

nltk.download('punkt', quiet=True)
nltk.download('punkt_tab')

def semantic_split_markdown(md: str, max_chunk_chars: int = 1000):
    # 1. Remove front matter & comments
    md = re.sub(r'^---.*?---\s*', '', md, flags=re.S)
    md = re.sub(r'<!--.*?-->', '', md, flags=re.S)

    # 2. Inline link & image normalization
    md = re.sub(r'!\[(.*?)\]\((.*?)\)', r'Image: \1 (\2)', md)
    md = re.sub(r'\[(.*?)\]\((.*?)\)', r'\1 (\2)', md)

    # 3. Heading markers
    md = re.sub(r'^(#{1,3})\s*(.+)$', lambda m: f"<H{len(m.group(1))}> {m.group(2)}", md, flags=re.M)

    # 4. Flatten whitespace
    md = md.replace('\n\n', ' <PARA> ')
    md = md.replace('\n', '<NL>')
    md = re.sub(r'\s+', ' ', md)
    
    return getchunks(md.strip(), max_chunk_chars)

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/weigfhx/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [None]:
with open("tds/cors.md", 'r', encoding='utf-8') as file:
    text = file.read()

semantic_split_markdown(text)
# for i in semantic_split_markdown(text):
#     print(i)

In [None]:
for dirpath, _, filenames in os.walk('tds'):
    print(dirpath,filenames)

In [None]:
from google import genai
from google.genai import types
import certifi
from requests.exceptions import RequestException
from openai import OpenAI
from google.auth.transport.requests import Request

def get_embedding(chunk: str, model: str = "text-embedding-3-small"):
    client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

    response = client.embeddings.create(
        input=chunk,
        model=model
    )

    return response.data[0].embedding

In [None]:
# with open("tds/cors.md", 'r', encoding='utf-8') as file:
    # text = file.read()

# print(get_embedding(semantic_split_markdown(text)))
print(get_embedding('abc'))
# import ssl
# print(ssl.get_default_verify_paths())


In [None]:
topic_data = scrape_topic_data(topic_id)
if "error" not in topic_data:
    print(f"Title: {topic_data.get('title' , ' ')}\n")
    f.write(f"Title: {topic_data.get('title' , ' ')}\n")
    print(f"source_url: https://discourse.onlinedegree.iitm.ac.in/t/{topic_data.get('slug' , '')}/{topic_id}\n\n")
    f.write(f"source_url: https://discourse.onlinedegree.iitm.ac.in/t/{topic_data.get('slug' , '')}/{topic_id}\n\n")

In [None]:
topic_data_summary[int("161120")]

In [350]:
import time

def process_files_new(root_dir: str, max_chunk_chars: int = 10000):
    now_local = 5
    all_chunks = []
    all_embeddings = []
    all_source_urls = []

    all_files = []
    # First, collect all files to show accurate tqdm
    for dirpath, _, filenames in os.walk(root_dir):
        for filename in filenames:
            all_files.append(os.path.join(dirpath, filename))

    for filepath in tqdm(all_files, desc="Scanning Markdown files"):
        if filepath.lower().endswith('.md'):
            with open(filepath, 'r', encoding='utf-8') as file:
                text = file.read()
            url = ""
            chunks = semantic_split_markdown(text, max_chunk_chars)
            if root_dir == "discourse":
                topic_id = os.path.basename(filepath).removesuffix('.md')
                topic_slug = topic_data_summary[int(topic_id)].get('slug', '')
                url = f"https://discourse.onlinedegree.iitm.ac.in/t/{topic_slug}/{topic_id}"
            if  root_dir == "tds":
                topic_id = os.path.basename(filepath).removesuffix('.md')
                url = f"https://tds.s-anand.net/#/{topic_id}"

            
            for chunk in chunks:
                all_chunks.append(chunk)
                if (time.time() - now_local) < 5:
                    time.sleep(5)
                now_local = time.time()
                all_embeddings.append(get_embedding(chunk))
                all_source_urls.append(url)
        else:
            print("Skipping: ", path)

    return all_chunks, all_embeddings, all_source_urls

In [333]:
a, b, c = process_files_new("test")

Scanning Markdown files: 100%|██████████| 1/1 [00:13<00:00, 13.23s/it]


In [334]:
c

['https://tds.s-anand.net/#/bbc-weather-api-with-python',
 'https://tds.s-anand.net/#/bbc-weather-api-with-python',
 'https://tds.s-anand.net/#/bbc-weather-api-with-python']

In [185]:

import time

def process_files(root_dir: str, max_chunk_chars: int = 1000):
    now_local = 5
    all_chunks = []
    all_embeddings = []
    all_source_urls = []

    for dirpath, _, filenames in os.walk(root_dir):
        for fname in filenames:
            path = os.path.join(dirpath, fname)
            if fname.lower().endswith('.txt') or fname.lower().endswith('.md'):
                print("Procesing: ", path)
                with open(path, 'r', encoding='utf-8') as file:
                    text = file.read()
                
                url = ""
                if fname.lower().endswith('.txt'):
                    print("Trigger Text splitting")
                    chunks = semantic_split_text(text, max_chunk_chars)
                    topic_id = fname.removesuffix('.txt')
                    topic_data = scrape_topic_data(topic_id)
                    topic_slug = topic_data.get('slug', '')
                    url = f"https://discourse.onlinedegree.iitm.ac.in/t/{topic_slug}/{topic_id}"
                if fname.lower().endswith('.md'):
                    print("Trigger Markdown splitting")
                    chunks = semantic_split_markdown(text, max_chunk_chars)
                    topic_id = fname.removesuffix('.md')
                    url = f"https://tds.s-anand.net/#/{topic_id}"

                print(f"\nFile: {path}\nTotal chunks: {len(chunks)}")
                for i, chunk in enumerate(chunks[:3], 1):
                    print(f"  Chunk {i} preview: {repr(chunk[:80])}...")
                
                for chunk in chunks:
                    all_chunks.append(chunk)
                    if (time.time() - now_local) < 5:
                        time.sleep(5)
                    now_local = time.time()
                    all_embeddings.append(get_embedding(chunk))
                    all_source_urls.append(url)
            else:
                print("Skipping: ", path)

    return all_chunks, all_embeddings, all_source_urls

In [15]:
def test_process_files(root_dir: str, max_chunk_chars: int = 1000):
    for dirpath, _, filenames in os.walk(root_dir):
        for fname in filenames:
            path = os.path.join(dirpath, fname)
            print(path)
            print(fname)
            print(fname.lower().endswith('.txt'))
            print(fname.lower().endswith('.md'))
            # if fname.lower().endswith('.txt'):
            #     chunks = semantic_split_text(path, max_chunk_chars)
            # elif fname.lower().endswith('.md'):
            #     chunks = semantic_split_markdown(path, max_chunk_chars)
            # print(f"\nFile: {path}\nTotal chunks: {len(chunks)}")
            # for i, chunk in enumerate(chunks[:3], 1):
            #     print(f"  Chunk {i} preview: {repr(chunk[:80])}...")

In [None]:
tds_chunks,tds_embeddings,tds_sourse_urls = process_files_new("tds")
discourse_chunks,discourse_embeddings,discourse_sourse_urls = process_files_new("discourse")

In [335]:
discourse_chunks,discourse_embeddings,discourse_sourse_urls = process_files_new("discourse")

Scanning Markdown files: 100%|██████████| 95/95 [3:38:08<00:00, 137.78s/it]    


In [347]:
discourse_chunks_new,discourse_embeddings_new,discourse_source_urls_new = process_files_new("discourse")
tds_chunks_new,tds_embeddings_new,tds_source_urls_new = process_files_new("tds")

Scanning Markdown files: 100%|██████████| 95/95 [24:22<00:00, 15.40s/it]   
Scanning Markdown files:  10%|█         | 16/155 [01:43<14:58,  6.47s/it]

Skipping:  /flex013/uploads/iitm/optimized/3X/2/9/29b6332cf814991a2cdae74355e67478bcc57c57_2_690x356.png


Scanning Markdown files:  14%|█▎        | 21/155 [04:39<29:44, 13.32s/it]  


KeyboardInterrupt: 

In [348]:
discourse_source_urls_new

['https://discourse.onlinedegree.iitm.ac.in/t/for-project-1/166593',
 'https://discourse.onlinedegree.iitm.ac.in/t/revised-dates-tds-jan-2025/168506',
 'https://discourse.onlinedegree.iitm.ac.in/t/where-are-my-bonus-marks/160773',
 'https://discourse.onlinedegree.iitm.ac.in/t/project-1-casual-banter/167344',
 'https://discourse.onlinedegree.iitm.ac.in/t/inconsistent-information-in-the-grading-document-and-the-website/167679',
 'https://discourse.onlinedegree.iitm.ac.in/t/can-i-take-the-end-term-exam-without-submitting-assignments-if-i-missed-it-due-to-an-emergency/161072',
 'https://discourse.onlinedegree.iitm.ac.in/t/solving-roe-realtime/168943',
 'https://discourse.onlinedegree.iitm.ac.in/t/drop-course-window-for-tds/164737',
 'https://discourse.onlinedegree.iitm.ac.in/t/concerns-regarding-tds-course-difficulty-and-grading-fairness/168476',
 'https://discourse.onlinedegree.iitm.ac.in/t/concerns-regarding-tds-course-difficulty-and-grading-fairness/168476',
 'https://discourse.onlinede

In [351]:
tds_chunks_new,tds_embeddings_new,tds_source_urls_new = process_files_new("tds")

Scanning Markdown files:  10%|█         | 16/155 [01:43<14:51,  6.41s/it]

Skipping:  /flex013/uploads/iitm/optimized/3X/2/9/29b6332cf814991a2cdae74355e67478bcc57c57_2_690x356.png


Scanning Markdown files:  15%|█▌        | 24/155 [05:10<30:01, 13.75s/it]  

Skipping:  /flex013/uploads/iitm/optimized/3X/2/9/29b6332cf814991a2cdae74355e67478bcc57c57_2_690x356.png
Skipping:  /flex013/uploads/iitm/optimized/3X/2/9/29b6332cf814991a2cdae74355e67478bcc57c57_2_690x356.png


Scanning Markdown files:  21%|██▏       | 33/155 [06:04<15:56,  7.84s/it]

Skipping:  /flex013/uploads/iitm/optimized/3X/2/9/29b6332cf814991a2cdae74355e67478bcc57c57_2_690x356.png


Scanning Markdown files:  30%|███       | 47/155 [07:34<14:50,  8.25s/it]

Skipping:  /flex013/uploads/iitm/optimized/3X/2/9/29b6332cf814991a2cdae74355e67478bcc57c57_2_690x356.png


Scanning Markdown files:  47%|████▋     | 73/155 [10:25<09:17,  6.79s/it]

Skipping:  /flex013/uploads/iitm/optimized/3X/2/9/29b6332cf814991a2cdae74355e67478bcc57c57_2_690x356.png


Scanning Markdown files:  95%|█████████▌| 148/155 [19:02<00:48,  6.98s/it]

Skipping:  /flex013/uploads/iitm/optimized/3X/2/9/29b6332cf814991a2cdae74355e67478bcc57c57_2_690x356.png
Skipping:  /flex013/uploads/iitm/optimized/3X/2/9/29b6332cf814991a2cdae74355e67478bcc57c57_2_690x356.png
Skipping:  /flex013/uploads/iitm/optimized/3X/2/9/29b6332cf814991a2cdae74355e67478bcc57c57_2_690x356.png
Skipping:  /flex013/uploads/iitm/optimized/3X/2/9/29b6332cf814991a2cdae74355e67478bcc57c57_2_690x356.png
Skipping:  /flex013/uploads/iitm/optimized/3X/2/9/29b6332cf814991a2cdae74355e67478bcc57c57_2_690x356.png


Scanning Markdown files: 100%|██████████| 155/155 [19:08<00:00,  7.41s/it]

Skipping:  /flex013/uploads/iitm/optimized/3X/2/9/29b6332cf814991a2cdae74355e67478bcc57c57_2_690x356.png





In [338]:
print(tds_sourse_urls)

['https://tds.s-anand.net/#/hybrid-rag-typesense', 'https://tds.s-anand.net/#/hybrid-rag-typesense', 'https://tds.s-anand.net/#/hybrid-rag-typesense', 'https://tds.s-anand.net/#/hybrid-rag-typesense', 'https://tds.s-anand.net/#/hybrid-rag-typesense', 'https://tds.s-anand.net/#/hybrid-rag-typesense', 'https://tds.s-anand.net/#/hybrid-rag-typesense', 'https://tds.s-anand.net/#/llm-agents', 'https://tds.s-anand.net/#/llm-agents', 'https://tds.s-anand.net/#/llm-agents', 'https://tds.s-anand.net/#/llm-agents', 'https://tds.s-anand.net/#/llm-agents', 'https://tds.s-anand.net/#/llm-agents', 'https://tds.s-anand.net/#/llm-agents', 'https://tds.s-anand.net/#/llm-agents', 'https://tds.s-anand.net/#/llm-agents', 'https://tds.s-anand.net/#/llm-agents', 'https://tds.s-anand.net/#/llm-agents', 'https://tds.s-anand.net/#/live-session-2025-01-30', 'https://tds.s-anand.net/#/live-session-2025-01-30', 'https://tds.s-anand.net/#/live-session-2025-01-30', 'https://tds.s-anand.net/#/live-session-2025-01-30

In [None]:
print(tds_sourse_urls)

In [336]:
print(len(tds_chunks))
print(len(discourse_chunks))
print(len(tds_embeddings))
print(len(discourse_embeddings))
print(len(tds_sourse_urls))
print(len(discourse_sourse_urls))

1529
2267
1529
2267
1529
2267


In [356]:
print(len(tds_chunks_new))
print(len(discourse_chunks_new))
print(len(tds_embeddings_new))
print(len(discourse_embeddings_new))
print(len(tds_source_urls_new))
print(len(discourse_source_urls_new))

196
251
196
251
196
251


In [190]:
list1 = ["a",3,[1,2,3]]
list2 = ["b",4,[4,5,6]]
list3 = list1 + list2
print(list3)

['a', 3, [1, 2, 3], 'b', 4, [4, 5, 6]]


In [339]:
final_chunks = tds_chunks + discourse_chunks
final_embeddings = tds_embeddings + discourse_embeddings
final_source_urls = tds_sourse_urls + discourse_sourse_urls


In [355]:
final_chunks_new = tds_chunks_new + discourse_chunks_new
final_embeddings_new = tds_embeddings_new + discourse_embeddings_new
final_source_urls_new = tds_source_urls_new + discourse_source_urls_new

In [340]:
print(len(tds_chunks))
print(len(discourse_chunks))
print(len(tds_embeddings))
print(len(discourse_embeddings))
print(len(tds_sourse_urls))
print(len(discourse_sourse_urls))
print(len(final_chunks))
print(len(final_embeddings))
print(len(final_source_urls))

1529
2267
1529
2267
1529
2267
3796
3796
3796


In [357]:
print(len(final_chunks_new))
print(len(final_embeddings_new))
print(len(final_source_urls_new))

447
447
447


In [354]:
print(final_chunks_new[0])
print(final_embeddings_new[0])
print(final_source_urls_new[0])

<H2> Hybrid Retrieval Augmented Generation (Hybrid RAG) with TypeSense <PARA> Hybrid RAG combines semantic (vector) search with traditional keyword search to improve retrieval accuracy and relevance. By mixing exact text matches with embedding-based similarity, you get the best of both worlds: precision when keywords are present, and semantic recall when phrasing varies. TypeSense (https://typesense.org/) makes this easy with built-in hybrid search and automatic embedding generation. <PARA> Below is a fully self-contained Hybrid RAG tutorial using TypeSense, Python, and the command line. <PARA> <H3> Install and run TypeSense <PARA> Install TypeSense (https://typesense.org/docs/guide/install-typesense.html). <PARA> ```bash<NL>mkdir typesense-data <PARA> docker run -p 8108:8108 \<NL> -v typesense-data:/data typesense/typesense:28.0 \<NL> --data-dir /data \<NL> --api-key=secret-key \<NL> --enable-cors<NL>``` <PARA> - **`docker run`**: spins up a containerized TypeSense server on port 8108

In [342]:
print(final_chunks[2])

` that has one `{id, content}` JSON object per line. <PARA> TypeSense supports automatic embedding of documents (https://typesense.org/docs/28.0/api/vector-search.html#option-b-auto-embedding-generation-within-typesense). We'll use that capability. <PARA> Save the following as `addnotes.py` and run it with `uv run addnotes.py`. <PARA> ```python<NL><H1> /// script<NL><H1> requires-python = ">=3.13"<NL><H1> dependencies = ["httpx"]<NL><H1> ///<NL>import json<NL>import httpx<NL>import os <PARA> headers = {"X-TYPESENSE-API-KEY": "secret-key"} <PARA> schema = {<NL> "name": "notes",<NL> "fields": [<NL> {"name": "id", "type": "string", "facet": False},<NL> {"name": "content", "type": "string", "facet": False},<NL> {<NL> "name": "embedding",<NL> "type": "float[]",<NL> "embed": {<NL> "from": ["content"],<NL> "model_config": {<NL> "model_name": "openai/text-embedding-3-small",<NL> "api_key": os.getenv("OPENAI_API_KEY"),<NL> },<NL> },<NL> },<NL> ],<NL>} <PARA> with open("chunks.json", "r") as f:<

In [358]:
import numpy as np

np.savez("embeddings.npz", chunks=final_chunks_new, embeddings=final_embeddings_new, source_urls=final_source_urls_new)

In [198]:
ranked_text = """To address the user question "How to convert log files to CSV?", the snippets should be ranked based on their relevance to the process of converting log files to CSV format. Here is the reranked order:

1. Snippet 5: This snippet directly addresses converting logs to CSV using `sed` to replace characters, which is highly relevant to the user's question about converting log files to CSV.

2. Snippet 8: This snippet provides a specific command using `sed` to convert log files to CSV format, which is directly relevant to the user's question.

3. Snippet 2: This snippet discusses using `sed` to convert log files into a CSV-like format, which is relevant to the user's question about converting log files to CSV.

4. Snippet 1: This snippet mentions downloading a `log.csv` file and discusses issues with CSV formatting, which is somewhat relevant to the user's question.

5. Snippet 3: This snippet includes code for parsing log files, which could be relevant for understanding how to extract data from logs before converting to CSV.

6. Snippet 9: This snippet mentions using shell commands like `sed` and `awk` for text processing, which could be relevant for converting log files to CSV.

7. Snippet 4: This snippet discusses converting data to various formats, including CSV, but is less specific about log files.

8. Snippet 6: This snippet is about downloading log files, which is not directly relevant to converting them to CSV.

9. Snippet 10: This snippet discusses handling corrupted CSV files and creating large CSV datasets, which is not directly relevant to converting log files to CSV.

10. Snippet 7: This snippet is about converting JSON to CSV and text manipulation in Excel, which is not directly relevant to converting log files to CSV.
"""
print(re.findall(r"(?:\*\*|^|\n)?(?:\d+\.\s*)?Snippet\s+(\d+)", ranked_text, flags=re.IGNORECASE))

['5', '8', '2', '1', '3', '9', '4', '6', '10', '7']


In [None]:
print(final_chunks[1230])
print(final_chunks[1228])

In [206]:
import numpy as np

# Load the .npz archive
data = np.load('embeddings.npz', allow_pickle=True)
final_chunks_new = data['chunks']           # shape (N, D)
final_embeddings_new = data['embeddings']
final_source_urls_new = data['source_urls']   


# Define a condition to filter out unwanted rows
# For example, remove rows where the source contains the word "project"
mask = np.array(['project' not in str(src).lower() for src in final_source_urls_new])

# Apply the mask
filtered_final_chunks_new = final_chunks_new[mask]
filtered_final_embeddings_new = final_embeddings_new[mask]
filtered_final_source_urls_new = final_source_urls_new[mask]

# Save the filtered results back to a new .npz file (or overwrite if needed)
len(final_chunks_new), len(final_embeddings_new), len(final_source_urls_new)

(1971, 1971, 1971)

In [207]:
len(filtered_final_chunks_new), len(filtered_final_embeddings_new), len(filtered_final_source_urls_new)

(1866, 1866, 1866)

In [208]:
np.savez("embeddings.npz", chunks=filtered_final_chunks_new, embeddings=filtered_final_embeddings_new, source_urls=filtered_final_source_urls_new)

In [209]:
print(type(final_source_urls_new))

<class 'numpy.ndarray'>
