In [1]:
import requests
import json
import os
from urllib.parse import urlparse
from concurrent.futures import ThreadPoolExecutor

In [2]:
import requests
import json
import os
import re
from urllib.parse import urlparse

def sanitize_filename(filename):
    # Remove invalid characters and replace spaces with underscores
    return re.sub(r'[<>:"/\\|?*&]', '', filename).replace(' ', '_')

def download_file(url, folder, filename):
    response = requests.get(url)
    if response.status_code == 200:
        os.makedirs(folder, exist_ok=True)
        file_path = os.path.join(folder, sanitize_filename(filename))
        with open(file_path, 'wb') as file:
            file.write(response.content)
        print(f"Downloaded: {file_path}")
    else:
        print(f"Failed to download: {url}")

def process_subsection(subsection, course_folder, course_id):
    if 'pdfUrl' in subsection:
        pdf_url = subsection['pdfUrl']
        pdf_filename = f"{course_id}_{subsection.get('fileName', os.path.basename(urlparse(pdf_url).path))}"
        download_file(pdf_url, os.path.join(course_folder, 'pdfs'), pdf_filename)
    
    if 'videoUrl' in subsection:
        video_url = subsection['videoUrl']
        video_filename = f"{course_id}_{subsection.get('title', 'video')}.mp4"
        download_file(video_url, os.path.join(course_folder, 'videos'), video_filename)

def process_course(course):
    course_id = course['_id']
    course_name = sanitize_filename(course['courseName'])
    course_folder = os.path.join('studynotiondata', f"{course_id}_{course_name}")
    
    print(f"Processing course: {course_name} (ID: {course_id})")
    
    for section in course.get('courseContent', []):
        for subsection in section.get('subSection', []):
            process_subsection(subsection, course_folder, course_id)
    
    # Extract course description
    description = course.get('courseDescription', '')
    if description:
        description_file = os.path.join(course_folder, f"{course_id}_description.txt")
        with open(description_file, 'w', encoding='utf-8') as f:
            f.write(description)
        print(f"Saved course description: {description_file}")
    
    print(f"Finished processing course: {course_name} (ID: {course_id})")

def main():
    url = "https://studynotion-backend-z1s7.onrender.com/api/v1/course/getAllCoursesPopulated"
    
    response = requests.get(url)
    if response.status_code == 200:
        data = response.json()
        courses = data.get('data', [])
        
        os.makedirs('studynotiondata', exist_ok=True)
        
        print(f"Found {len(courses)} courses. Processing...")
        
        for course in courses:
            process_course(course)
        
        print("All courses processed successfully!")
    else:
        print(f"Failed to fetch data from API. Status code: {response.status_code}")

if __name__ == "__main__":
    main()

Found 3 courses. Processing...
Processing course: Test (ID: 66e28db3e03af0d0f22e0ecd)
Downloaded: studynotiondata\66e28db3e03af0d0f22e0ecd_Test\videos\66e28db3e03af0d0f22e0ecd_x_clone.mp4
Downloaded: studynotiondata\66e28db3e03af0d0f22e0ecd_Test\pdfs\66e28db3e03af0d0f22e0ecd_Internshala_web_development_course_certificate.pdf
Saved course description: studynotiondata\66e28db3e03af0d0f22e0ecd_Test\66e28db3e03af0d0f22e0ecd_description.txt
Finished processing course: Test (ID: 66e28db3e03af0d0f22e0ecd)
Processing course: Artificial_Intelligence__Machine_Learning_in_Finance_ (ID: 66e3ca71fec7d8a2197d52e3)
Downloaded: studynotiondata\66e3ca71fec7d8a2197d52e3_Artificial_Intelligence__Machine_Learning_in_Finance_\videos\66e3ca71fec7d8a2197d52e3_Objective_.mp4
Downloaded: studynotiondata\66e3ca71fec7d8a2197d52e3_Artificial_Intelligence__Machine_Learning_in_Finance_\videos\66e3ca71fec7d8a2197d52e3_Intro-1.mp4
Downloaded: studynotiondata\66e3ca71fec7d8a2197d52e3_Artificial_Intelligence__Machine_L

FileNotFoundError: [Errno 2] No such file or directory: 'studynotiondata\\66e4189d7f1beb57e8423402__Machine_Learning_Basics\\66e4189d7f1beb57e8423402_description.txt'

In [5]:
import os
import PyPDF2
import cohere
from pinecone import Pinecone, ServerlessSpec
from tqdm import tqdm

# Initialize Cohere and Pinecone clients
cohere_api_key = "OVQ8g0SvIGz59Jxyuj2YKPKmrV17IqvxPpUlxL23"
pinecone_api_key = "b1492bce-247e-46d9-b61e-25a4abe015ca"
pinecone_environment = "us-east-1"
pinecone_index_name = "studynotion-project"


co = cohere.Client(cohere_api_key)
pc = Pinecone(api_key=pinecone_api_key)

# Function to get embedding dimension
def get_embedding_dimension(text):
    response = co.embed(
        texts=[text],
        model="embed-english-v3.0",
        input_type="search_document"
    )
    return len(response.embeddings[0])

def extract_text_from_pdf(pdf_path):
    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        text = ""
        for page in reader.pages:
            text += page.extract_text()
    return text

def process_course_folder(course_folder):
    pdf_folder = os.path.join(course_folder, 'pdfs')
    if not os.path.exists(pdf_folder):
        print(f"No PDF folder found in {course_folder}")
        return

    course_texts = []
    for pdf_file in os.listdir(pdf_folder):
        if pdf_file.endswith('.pdf'):
            pdf_path = os.path.join(pdf_folder, pdf_file)
            text = extract_text_from_pdf(pdf_path)
            course_texts.append((pdf_file, text))
    
    return course_texts

def embed_and_store(course_id, course_texts):
    # Get the embedding dimension
    sample_text = course_texts[0][1] if course_texts else "Sample text"
    embedding_dim = get_embedding_dimension(sample_text)

    # Check if the index exists, if not create it
    if pinecone_index_name not in pc.list_indexes().names():
        pc.create_index(
            name=pinecone_index_name,
            dimension=embedding_dim,
            metric='cosine',
            spec=ServerlessSpec(
                cloud='aws',
                region=pinecone_environment
            )
        )
    
    # Connect to the index
    index = pc.Index(pinecone_index_name)

    # Check if the index dimension matches the embedding dimension
    index_stats = index.describe_index_stats()
    if index_stats['dimension'] != embedding_dim:
        raise ValueError(f"Index dimension ({index_stats['dimension']}) does not match embedding dimension ({embedding_dim})")

    for pdf_name, text in tqdm(course_texts, desc=f"Processing {course_id}"):
        # Generate embeddings
        response = co.embed(
            texts=[text],
            model="embed-english-v3.0",
            input_type="search_document"
        )
        embedding = response.embeddings[0]

        # Store in Pinecone
        try:
            index.upsert(vectors=[
                (f"{course_id}_{pdf_name}", embedding, {"text": text[:1000]})  # Truncate text if needed
            ])
        except Exception as e:
            print(f"Error upserting vector for {pdf_name}: {str(e)}")

def main():
    base_folder = 'studynotiondata'
    for course_folder in os.listdir(base_folder):
        course_path = os.path.join(base_folder, course_folder)
        if os.path.isdir(course_path):
            course_id = course_folder.split('_')[0]
            course_texts = process_course_folder(course_path)
            if course_texts:
                embed_and_store(course_id, course_texts)

if __name__ == "__main__":
    main()

Processing 66e28db3e03af0d0f22e0ecd: 100%|███████████████████████████████████████████████| 1/1 [00:01<00:00,  1.41s/it]
Processing 66e3ca71fec7d8a2197d52e3: 100%|███████████████████████████████████████████████| 7/7 [00:19<00:00,  2.75s/it]


In [7]:
import cohere
import pinecone
import numpy as np

# Initialize the Cohere client
co = cohere.Client('OVQ8g0SvIGz59Jxyuj2YKPKmrV17IqvxPpUlxL23')  # Replace with your Cohere API key

# Initialize the Pinecone client
pc = pinecone.Pinecone(api_key='b1492bce-247e-46d9-b61e-25a4abe015ca', environment='us-east-1')  # Replace with your Pinecone API key

# Connect to your Pinecone index
index_name = 'studynotion-project'  # Replace with your Pinecone index name
index = pc.Index(index_name)

def get_query_embedding(query):
    # Generate the embedding for the query
    response = co.embed(
        texts=[query],
        model='embed-english-v3.0',
        input_type='search_query',
        truncate='END'
    )
    return response.embeddings[0]

def query_pinecone(query, top_k=5):
    # Get the embedding for the query
    embedding = get_query_embedding(query)
    
    # Query the Pinecone index
    response = index.query(vector=embedding, top_k=top_k, include_metadata=True)
    
    # Return the results
    return response['matches']

# The input query
query = "What caused the 1929 Great Depression?"

# Fetch similar data from Pinecone
matches = query_pinecone(query)

# Print the results
for match in matches:
    print(f"Score: {match['score']:.2f}, Text: {match['metadata']['text']}")


Score: 0.19, Text:  
   
 
 
 
 
 
 
 
   
 
 
  
 
   
Artificial intelligence and machine learning in financial services  
Market developments and financial stability implications  
 
        
1 November  2017  
 
 
  
  
 
                
 
The Financial Stability Board (FSB) is established to coordinate at the international level the 
work of national financial authorities and international standard -setting bodies in order to 
develop and promote the implementation of effective regulatory, sup ervisory and other 
financial sector policies. Its mandate is set out in the FSB Charter, which governs the policymaking and related activities of the FSB. These activities, including any decisions reached in their context, shall not be binding or give rise  to any legal rights or obligations under 
the FSB’s Articles of Association.  
 
 
Contacting the Financial Stability Board  
Sign up for e -mail alerts: www.fsb.org/emailalert  
Follow the FSB on Twitter: @FinStbBoard  
E-mail the FSB 