<a href="https://colab.research.google.com/github/Falakejaz786/scraping/blob/main/Scraping.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -q groq beautifulsoup4 requests chromadb sentence-transformers textstat gtts playwright
!playwright install chromium

In [None]:
import requests
from bs4 import BeautifulSoup
import textstat
from gtts import gTTS
from IPython.display import Audio, Image, display
from getpass import getpass
from groq import Groq
import chromadb
from chromadb.config import Settings
from sentence_transformers import SentenceTransformer
from playwright.async_api import async_playwright
import asyncio
import random

In [None]:
# Read the API key interactively with getpass so it is not echoed into the cell output,
# then build the Groq client that the writer/reviewer functions receive as `client`.
groq_api_key = getpass("Enter your Groq API Key: ")
client = Groq(api_key=groq_api_key)
print("Groq Client initialized.")

In [None]:
# Model id used as the default for every chat-completion call in this notebook.
# The earlier "llama3-8b-8192" id has been retired by Groq; "llama-3.1-8b-instant"
# is the suggested replacement (see Groq's deprecations page to confirm the current id).
GROQ_MODEL = "llama-3.1-8b-instant"
print(f"Using model: {GROQ_MODEL}")

In [None]:
async def take_screenshot(url, filename="chapter_screenshot.png"):
    """Load ``url`` in Chromium via Playwright and save a full-page screenshot.

    Args:
        url: Address of the page to open.
        filename: Path the screenshot is written to.

    Raises:
        Any error raised by Playwright during navigation or capture. The browser
        is closed before the error propagates.
    """
    async with async_playwright() as p:
        browser = await p.chromium.launch()
        try:
            page = await browser.new_page()
            await page.goto(url)
            await page.screenshot(path=filename, full_page=True)
        finally:
            # Release the browser even when navigation or capture fails.
            await browser.close()
    print(f"Screenshot saved: {filename}")

def show_screenshot(filename="chapter_screenshot.png"):
    """Display the image stored at ``filename`` inline in the notebook.

    Args:
        filename: Path of the image file to show.

    The path is passed through the ``filename`` keyword so that a missing file
    fails when the image is loaded instead of the path string being treated as
    raw image data.
    """
    display(Image(filename=filename))

In [None]:
# Ask for the page to capture, render it, then show the resulting image inline.
url_for_screenshot = input("Enter URL to screenshot: ").strip()
# take_screenshot is a coroutine; the notebook kernel allows awaiting it at cell level.
await take_screenshot(url_for_screenshot)
# Both helpers default to "chapter_screenshot.png", so this shows the file just written.
show_screenshot()

In [None]:
def scrape_text(url, timeout=30):
    """Download ``url`` and return the text content of the parsed HTML.

    Args:
        url: Page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled server would block the cell indefinitely.

    Returns:
        The page's text nodes joined with newline separators.

    Raises:
        requests.HTTPError: If the server responds with a 4xx/5xx status.
        requests.RequestException: On connection failures or timeouts.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on error responses instead of scraping the error page as content.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')
    return soup.get_text(separator="\n")

def reward_scraping(text):
    """Score scraped text from 0 to 2 with two simple heuristics.

    One point when the text is longer than 500 characters, and one point when
    it contains more than three blank-line separators (``'\\n\\n'``).
    """
    score = 0
    if len(text) > 500:
        score += 1
    if text.count('\n\n') > 3:
        score += 1
    return score

In [None]:
# Fetch the page the user points at; `raw_text` feeds the later cells.
scrape_url = input("Enter URL to scrape: ").strip()
raw_text = scrape_text(scrape_url)

# Show only the first 500 characters as a preview.
print("\nScraped Text Preview (first 500 chars):\n", raw_text[:500], sep="\n")

# Score the scrape with the length/paragraph heuristic.
scraping_reward = reward_scraping(raw_text)
print(f"\nScraping Reward: {scraping_reward}")

In [None]:
def ai_writer(text, client, model=GROQ_MODEL):
    """Ask the chat model to rewrite ``text`` in a modern style.

    Args:
        text: Chapter text placed after the instruction in the prompt.
        client: Object exposing ``chat.completions.create``.
        model: Model id passed to the completion call.

    Returns:
        The message content of the first returned choice.
    """
    messages = [{
        "role": "user",
        "content": f"Rewrite this chapter in modern style:\n\n{text}",
    }]
    completion = client.chat.completions.create(model=model, messages=messages)
    first_choice = completion.choices[0]
    return first_choice.message.content

def ai_reviewer(text, client, model=GROQ_MODEL):
    """Ask the chat model to improve the clarity, grammar and readability of ``text``.

    Args:
        text: Draft placed after the instruction in the prompt.
        client: Object exposing ``chat.completions.create``.
        model: Model id passed to the completion call.

    Returns:
        The message content of the first returned choice.
    """
    messages = [{
        "role": "user",
        "content": f"Improve clarity, grammar, and readability:\n\n{text}",
    }]
    completion = client.chat.completions.create(model=model, messages=messages)
    first_choice = completion.choices[0]
    return first_choice.message.content

In [None]:
# Single writer pass over the first 2000 characters of the scraped text.
spun_text = ai_writer(raw_text[:2000], client)

# Show only the first 500 characters of the rewrite.
print("\nAI Writer Output Preview (first 500 chars):\n", spun_text[:500], sep="\n")

In [None]:
# Single reviewer pass over the writer's output.
reviewed_text = ai_reviewer(spun_text, client)

# Show only the first 500 characters of the reviewed text.
print("\nAI Reviewer Output Preview (first 500 chars):\n", reviewed_text[:500], sep="\n")

In [None]:
def reward_novelty(text):
    """Return 1 when ``text`` has more than 100 whitespace-separated words, else 0."""
    word_count = len(text.split())
    if word_count > 100:
        return 1
    return 0

def reward_readability(text):
    """Return 1 when the Flesch reading-ease score of ``text`` is above 50, else 0."""
    ease_score = textstat.flesch_reading_ease(text)
    if ease_score > 50:
        return 1
    return 0

In [None]:
# Score the single-pass outputs: word count on the rewrite, reading ease on the review.
novelty_reward = reward_novelty(spun_text)
readability_reward = reward_readability(reviewed_text)

print(
    f"Novelty Reward: {novelty_reward}",
    f"Readability Reward: {readability_reward}",
    sep="\n",
)

In [None]:
def simulate_rl_loop(original_text, client, n_iters=3):
    """Run repeated write -> review passes and record a heuristic reward per pass.

    Each iteration rewrites the current text, reviews the rewrite, sums the three
    reward heuristics, and feeds the reviewed text into the next iteration.

    Args:
        original_text: Text used as input to the first iteration.
        client: Chat client handed to ``ai_writer`` and ``ai_reviewer``.
        n_iters: Number of write/review passes.

    Returns:
        A ``(text, logs)`` tuple: the reviewed text of the last pass (or
        ``original_text`` when ``n_iters`` is 0) and a list of
        ``{"iteration": i, "reward": total}`` dicts.
    """
    history = []
    text = original_text
    for step in range(n_iters):
        rewritten = ai_writer(text, client)
        polished = ai_reviewer(rewritten, client)

        # Scraping reward is taken on this pass's input, the others on its outputs.
        score = sum((
            reward_scraping(text),
            reward_novelty(rewritten),
            reward_readability(polished),
        ))
        history.append({"iteration": step, "reward": score})

        print(f"\nIteration {step} Reward: {score}")
        text = polished

    return text, history

In [None]:
# Cap the input at the first 2000 characters, matching the single-pass writer cell:
# a whole scraped page can exceed the model's context/request limits and make the
# first API call of the loop fail.
final_ai_text, rl_logs = simulate_rl_loop(raw_text[:2000], client, n_iters=3)

# One dict per iteration: {"iteration": i, "reward": total}.
print("\nRL Reward Logs:")
for log in rl_logs:
    print(log)

In [None]:
def human_edit(text):
    """Show a preview of ``text`` and let the user replace it.

    Prints the first 500 characters, then reads one line of input.

    Returns:
        The stripped input when it is non-empty, otherwise ``text`` unchanged.
    """
    print("\nAI Reviewed Text Preview:")
    print(text[:500])
    replacement = input("\nEdit the text or press Enter to accept:\n").strip()
    # An empty (or whitespace-only) answer means "accept as is".
    return replacement or text

In [None]:
# Give the user the chance to replace the loop's output; Enter keeps it unchanged.
final_text = human_edit(final_ai_text)

# Show only the first 500 characters of the accepted text.
print("\nFinal Text Preview (first 500 chars):\n", final_text[:500], sep="\n")

In [None]:
def text_to_speech(text, filename="final_audio.mp3"):
    """Convert ``text`` to speech with gTTS, save it, and return a notebook player.

    Args:
        text: Text to synthesize.
        filename: Path the audio file is saved to.

    Returns:
        An ``IPython.display.Audio`` object created from ``filename``.
    """
    gTTS(text).save(filename)
    player = Audio(filename)
    return player

In [None]:
# Synthesize speech for the final text (saved under the helper's default filename)
# and render the returned player in the cell output.
audio = text_to_speech(final_text)
print("\nPlaying Generated Audio:")
display(audio)

In [None]:
# Chroma client built from default settings; holds the stored text versions.
chroma_client = chromadb.Client(Settings())
# get_or_create_collection keeps this cell re-runnable: create_collection raises
# when a collection named "books_version" already exists.
collection = chroma_client.get_or_create_collection("books_version")
# Sentence-embedding model used for both stored documents and search queries.
embedder = SentenceTransformer('all-MiniLM-L6-v2')

def add_version(name, text):
    """Store ``text`` in the collection under the id ``name``.

    The text is embedded with the module-level ``embedder`` and written with
    ``upsert`` so that re-running the cell replaces an existing entry with the
    same id instead of leaving the old text in place.

    Args:
        name: Id of the version (for example ``"original"``).
        text: Document text to embed and store.
    """
    vec = embedder.encode(text).tolist()
    collection.upsert(ids=[name], documents=[text], embeddings=[vec])
    print(f"Added version: {name}")

def search_versions(query, n_results=2):
    """Embed ``query`` and return the collection's query result for it.

    Args:
        query: Search text to embed with the module-level ``embedder``.
        n_results: Number of results requested from the collection.

    Returns:
        Whatever ``collection.query`` returns for the single query embedding.
    """
    query_embedding = embedder.encode(query).tolist()
    matches = collection.query(
        query_embeddings=[query_embedding],
        n_results=n_results,
    )
    return matches

In [None]:
# Store the three stages of the text under fixed ids so they can be searched later:
# the scraped page, the output of the write/review loop, and the text after the human edit.
add_version("original", raw_text)
add_version("ai_final", final_ai_text)
add_version("human_final", final_text)

In [None]:
# Embed the user's query and look up the closest stored versions.
query = input("\nEnter a semantic search query: ").strip()
results = search_versions(query)

print("\nSearch Results (IDs):")
print(results['ids'])

print("\nDocuments Preview:")
# search_versions sends a single query embedding, so index 0 holds its hits.
for doc in results['documents'][0]:
    # Print at most 300 characters of each hit, followed by a separator line.
    print(doc[:300] + "\n---\n")
