<a href="https://colab.research.google.com/github/Alexrosulek/Cs50/blob/main/makearticle.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!curl https://ollama.ai/install.sh | sh

!echo 'debconf debconf/frontend select Noninteractive' | sudo debconf-set-selections
!sudo apt-get update && sudo apt-get install -y cuda-drivers

import os

# Point LD_LIBRARY_PATH at /usr/lib64-nvidia so the system NVIDIA libraries can be found.
os.environ.update({'LD_LIBRARY_PATH': '/usr/lib64-nvidia'})

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0100 13269    0 13269    0     0  61602      0 --:--:-- --:--:-- --:--:-- 61716
>>> Cleaning up old version at /usr/local/lib/ollama
>>> Installing ollama to /usr/local
>>> Downloading Linux amd64 bundle
####################################################################################           90.7%

In [None]:
!nohup ollama serve &

!ollama pull deepseek-r1:32b

!ollama pull deepseek-r1:14b
!ollama pull deepseek-r1:7b
!ollama pull qwen2.5:3b
!pip install ollama
#!unzip -q /content/processed_shops.zip

In [None]:

import asyncio
import re
import json
import os
import subprocess
import hashlib
from urllib.parse import urlparse
from pathlib import Path
import ollama
import httpx
import requests
import asyncio
########################################################################
# Configuration
########################################################################
MODEL_SEMAPHORE = asyncio.Semaphore(1)  # Ensures only one model runs at a time
CHECKPOINT_FILE = "progress.checkpoint2"  # File storing the index of the next shop to process
BASE_INPUT_DIR = "processed_shops"      # Root directory containing input shop data
BASE_OUTPUT_DIR = "generated_articles"  # Root directory for generated articles
STATES = ["Florida"]                    # Target states to process
MIN_CONTENT_LENGTH = 1000                # Minimum combined content length to process
CHUNK_SIZE = 8000                       # Maximum text chunk size (in characters) for summarization
SHOPDD = False                          # Abort flag: main() leaves its processing loop when this is true
########################################################################
# AI Client Classes
########################################################################


class QwenClient:
    """Summarization client backed by the local Ollama ``qwen2.5:3b`` model."""

    async def fast_summarize(self, text):
        """Extract the key service details from ``text`` as terse bullet points.

        The call is serialized through ``MODEL_SEMAPHORE`` so that only one
        model request is in flight at a time.

        Args:
            text: Raw shop text to condense.

        Returns:
            The model's stripped reply. If the reply is empty, or the request
            raises, the first ``CHUNK_SIZE // 4`` characters of ``text`` are
            returned instead so the caller always gets something to work with.

        Side effects:
            On any exception the module-level ``SHOPDD`` flag is set to
            ``True`` so the main loop can stop the run.
        """
        # Without this declaration the assignment in the except branch would
        # only create a function-local name and the abort flag would never
        # be visible to main().
        global SHOPDD

        async with MODEL_SEMAPHORE:
            # NOTE(review): "REMOVE COOKIE AND PRIVACY POLICY" and the fragment
            # after it are concatenated without any separator; the prompt text
            # is kept exactly as it was.
            prompt = (
                "Your task is to extract key service details from this text\n"
                "STRICTLY IGNORE:\n"
                "- Business name/address (already known do not worry about)\n"
                "- Marketing fluff ('best in town')\n"
                "- Customer reviews\n"
                "- Repeated information\n\n"
                "FORMAT REQUIREMENTS:\n"
                "- Bullet points only\n"
                "No extra spaceing\n"
                "- No complete sentences\n"
                "- Preserve exact numbers/special & unique info\n\n"
                "Keep all concrete facts.\n"
                "No one is talking to you, you are extracting ALL info.\n"
                "REMOVE COOKIE AND PRIVACY POLICY"
                "If error or empty or unsure don't respond. "
                f"RAW INPUT TEXT:\n{text}"
            )

            try:
                reply = ollama.chat(
                    model='qwen2.5:3b',
                    messages=[{'role': 'user', 'content': prompt}],
                )
                summary = reply['message']['content'].strip()

                # Debug line for you—feel free to remove if you want fewer logs:
                print(f"[AI] QwenClient.fast_summarize response: {summary}")

                # Fallback if response is empty
                if not summary:
                    print("Empty summary response, returning partial original text.")
                    return text[: CHUNK_SIZE // 4]  # integer slice

                return summary

            except Exception as e:
                print(f"Qwen error: {e}")
                # Signal the main loop to stop, then fall back to a raw prefix.
                SHOPDD = True
                return text[: CHUNK_SIZE // 4]  # integer slice


async def chunked_summarize(qwen_client, text, chunk_size=CHUNK_SIZE):
    """
    Summarize 'text', chunking it first when it is longer than chunk_size.

    Text of at most chunk_size characters is summarized in a single call.
    Longer text is cut into pieces of chunk_size // 2 characters, each piece
    is summarized on its own, and the newline-joined partial summaries are
    summarized once more to give the final result.
    """
    text_length = len(text)

    # Short input: a single summarization pass is all that is needed.
    if text_length <= chunk_size:
        single_pass = await qwen_client.fast_summarize(text)
        print(f"Text fits in one chunk (length={text_length}). No further chunking needed.")
        return single_pass

    # Step A: cut the text into half-chunk-sized pieces.
    piece_len = chunk_size // 2
    pieces = [
        text[start : start + piece_len]
        for start in range(0, text_length, piece_len)
    ]
    print(f"Split into {len(pieces)} chunks ~{piece_len} chars each.")

    # Step B: summarize the pieces one after another.
    partials = []
    for position, piece in enumerate(pieces, start=1):
        print("starting", position)
        partial = await qwen_client.fast_summarize(piece)
        if not partial:
            print("nosum")
            # Empty model output: keep a short raw prefix of the piece instead.
            partial = piece[: piece_len // 4]
        partials.append(partial)

    # Step C: condense the joined partial summaries into the final summary.
    return await qwen_client.fast_summarize("\n".join(partials))

class DeepSeekClient:
    """Client for the local Ollama DeepSeek-R1 models.

    ``deepseek-r1:7b`` is used for SEO analysis; ``deepseek-r1:14b`` is used
    for article generation, validation and repair. Every request is
    serialized through ``MODEL_SEMAPHORE``.
    """

    async def seo_analysis(self, text):
        """Ask ``deepseek-r1:7b`` for SEO keyword suggestions for ``text``.

        Returns:
            The cleaned model reply, or ``""`` if the request raises.
        """
        async with MODEL_SEMAPHORE:
            prompt = f"Analyze this text for SEO for keywords, not for the webpage but for us to make an article about this webpage for findthatshop.com; Keep aligned with the shop info and ensure the writer know that he must write an article using this info about the shop specifically for findthatshop.com, and not to mention seo. most likely to be typed in google search. no emojis. You are a writer for findthatshop.com writing what these shops have. shortly suggest seo for us to rank for their keywords ect:\n{text}"

            try:
                clean_response = self._chat('deepseek-r1:7b', prompt)
                print(f"[AI] DeepSeekClient.seo_analysis response: {clean_response}")
                return clean_response
            except Exception as e:
                print(f"DeepSeek SEO error: {e}")
                return ""

    async def generate_article(self, prompt):
        """Generate an article from ``prompt`` with ``deepseek-r1:14b``.

        Returns:
            The cleaned model reply, or ``""`` if the request raises.
        """
        async with MODEL_SEMAPHORE:
            try:
                clean_response = self._chat('deepseek-r1:14b', prompt)
                print(f"[AI] DeepSeekClient.generate_article response: {clean_response}")
                return clean_response
            except Exception as e:
                print(f"DeepSeek Article error: {e}")
                return ""

    async def validate_article(self, article):
        """Ask ``deepseek-r1:14b`` whether ``article`` is publishable.

        Returns:
            A ``(is_valid, verdict_text)`` tuple. ``is_valid`` is ``True``
            only when the cleaned reply contains the standalone word "valid"
            and no rejection wording ("bad", "invalid", "not valid"). If the
            request raises, ``(False, "validation error")`` is returned.
        """
        async with MODEL_SEMAPHORE:
            prompt = (
                "You are an article checker. Please check the following article for these requirements. The article must be a 1-3 paragraphs, "
                "YOU ARE WORK FOR FINDTHATSHOP.COM CHECKING ARTICLES WRITTEN ABOUT WHAT SHOPS HAVE TO OFFER OFF GIVEN WEBSITE INFO WE GATHERED ON THEM FOR THE MAIN PAGE OF FINDTHATSHOP.COM!\n"
                "MAKE SURE IT IS FRIENDY CONSUMER LANGUAGE READABLE ARTICLE ABOUT THE SPECIFIC SHOP SERVICES"
                "IF ITS ABOUT SOME HOLIDAY OR DEAL OR ISNT WHAT A READER WOULD EXPECT ON A REPORT ABOUT A SHOPS SERVICES SAY 'BAD'"
                "Article must be a article about a shop not anything else or somthing random."
                "ENSURE THAT IT IS A PROPER ARTICLE BASED ON A SHOP. "
                "DENY ARTICLES WITH BAD FORMATTING OR HTML. "
                "INVALIDATE COOKIE AND PRIVACY POLICY ARTICLES"
                "must be aligned with writing from findthatshop.com about this shops details. "
                "NO markdown formatting, no *, extra commentary, or additional text beyond the core article content. NO EXTRA COMMENTS BEFORE OR AFTER THE ARTICLE, JUST THE ARTICLE. "
                "article must not mention seo tactics or anything bad or anything caused by its ai generation which is noticable and bad for a reader, must be consumer readable article."
                "must be based on a shop"
                "FORBIDDEN PHRASES:\n"
                "- 'Here is the article'\n"
                "- 'As a business'\n"
                "- 'In conclusion'\n"
                "- 'We recommend'\n\n"
                "- 'Deals"
                "- 'deal saturday'\n\n"
                "ONLY If the article meets these criteria, you MUST say 'VALID', IF GOOD SAY exactly 'VALID' ELSE SAY 'BAD'.  "
                "\n\n"
                f"Article:\n{article}\n"
            )
            try:
                clean_response = self._chat('deepseek-r1:14b', prompt)
                return self._is_valid_verdict(clean_response), clean_response
            except Exception as e:
                print(f"Validation error: {e}")
                return False, "validation error"

    async def fixer(self, article, validation_response):
        """Ask ``deepseek-r1:14b`` to repair ``article`` using validator feedback.

        Args:
            article: The article text that failed validation.
            validation_response: The validator's reply, passed to the model
                as the list of issues to fix.

        Returns:
            The cleaned repaired article, or the unchanged ``article`` if the
            request raises.
        """
        async with MODEL_SEMAPHORE:
            prompt = (
                "ARTICLE REPAIR PROTOCOL\n\n"
                "YOU WORK FOR FINDTHATSHOP.COM WRITING ABOUT WHAT SHOPS HAVE TO OFFER OFF GIVEN WEBSITE INFO WE GATHERED ON THEM FOR THE MAIN PAGE OF FINDTHATSHOP.COM!\n"
                "MAKE SURE IT IS FRIENDY CONSUMER LANGUAGE READABLE ARTICLE ABOUT THE SPECIFIC SHOP SERVICES"
                "FIX THESE SPECIFIC ISSUES:\n"
                f"{validation_response}\n\n"
                "REPAIR RULES:\n"
                "MAKE SURE IT IS CONSUMER LANGUAGE READABLE ARTICLE"
                " MOST IMPORTANT! Remove ALL text before/after article content, RETURN NOTHING BUT THE ARTICLE\n"
                "ENSURE THAT IT IS A PROPER ARTICLE BASED ON A SHOP"
                " Preserve all business details\n"
                " Use natural consumer language\n\n"
                "FORBIDDEN PHRASES:\n"
                "- 'Here is the article'\n"
                "- 'As a business'\n"
                "- 'In conclusion'\n"
                "- 'We recommend'\n\n"
                "- 'deals'\n\n"
                "BAD EXAMPLE TO AVOID:\n"
                "'Here's your article: ...'\n\n"
                "GOOD EXAMPLE FORMAT:\n"
                "'ABC Shop at 123 Main St offers... Their services include...'\n\n"
                "DEFECTIVE ARTICLE:\n"
                f"{article}\n\n"
                "RETURN ONLY THE FIXED ARTICLE, NO EXTRA REMARKS OR COMMENTS, DO NOT SAY 'HERE IS YOUR ARTICLE', GIVE ONLY THE ARTICLE!"
            )
            try:
                return self._chat('deepseek-r1:14b', prompt)
            except Exception as e:
                print(f"Fixer error: {e}")
                return article

    def _chat(self, model, prompt):
        """Send ``prompt`` as one user message to ``model``; return the cleaned reply."""
        reply = ollama.chat(
            model=model,
            messages=[{'role': 'user', 'content': prompt}],
        )
        return self._clean_output(reply['message']['content'].strip())

    @staticmethod
    def _is_valid_verdict(verdict):
        """Return True when ``verdict`` approves the article.

        A plain substring test for "valid" also matches "INVALID" and
        "not valid", so whole words are matched instead and any rejection
        wording wins over an approval.
        """
        lowered = verdict.lower()
        if re.search(r"\b(?:bad|invalid|not\s+valid)\b", lowered):
            return False
        return re.search(r"\bvalid\b", lowered) is not None

    def _clean_output(self, text):
        """Remove ``<think>...</think>`` sections from ``text`` and strip whitespace."""
        return re.sub(r'<think>.*?</think>', '', text, flags=re.DOTALL).strip()

########################################################################
# Helper Functions
########################################################################
def clean_filename(name):
    """Drop the characters \\ / * ? : " < > | from ``name`` and trim surrounding whitespace."""
    forbidden = str.maketrans("", "", '\\/*?:"<>|')
    return name.translate(forbidden).strip()

async def save_checkpoint(index):
    """Write ``index`` as text to CHECKPOINT_FILE, replacing any previous content."""
    checkpoint_text = str(index)
    with open(CHECKPOINT_FILE, mode="w") as handle:
        handle.write(checkpoint_text)

async def load_checkpoint():
    """Load the shop index to resume from.

    Returns:
        The non-negative integer stored in CHECKPOINT_FILE, or 0 when the
        file does not exist or does not contain a usable integer.
    """
    try:
        with open(CHECKPOINT_FILE, "r") as f:
            raw = f.read().strip()
    except FileNotFoundError:
        return 0

    try:
        # A negative index would make the caller's slice count from the end
        # of the shop list, so clamp it to 0.
        return max(int(raw), 0)
    except ValueError:
        # The checkpoint is written by truncating the file first, so an
        # interrupted run can leave it empty or partial; restart from 0.
        print(f"Ignoring unreadable checkpoint {raw!r}; starting from 0")
        return 0

def validate_shop_data(shop_data):
    """Check that a shop record has the required metadata and enough text.

    Returns a ``(ok, message)`` tuple: ``ok`` is False when any of the
    metadata fields name/city/state/website is missing or empty, or when the
    combined name, city, state, content and extracted visual text is shorter
    than MIN_CONTENT_LENGTH characters.
    """
    meta = shop_data.get('metadata', {})

    # Collect every required field that is absent or falsy.
    missing = []
    for field in ('name', 'city', 'state', 'website'):
        if not meta.get(field):
            missing.append(field)
    if missing:
        return False, f"Missing metadata fields: {missing}"

    content = shop_data.get('content', '')
    visual_text = ' '.join(
        entry.get('extracted_text', '')
        for entry in shop_data.get('visual_analysis', [])
    )
    name, city, state = (meta.get(key, '') for key in ('name', 'city', 'state'))
    combined = f"{name} {city} {state} {content} {visual_text}"

    combined_length = len(combined)
    if combined_length < MIN_CONTENT_LENGTH:
        return False, f"Content too short ({combined_length} chars)"

    return True, "Valid shop data"

########################################################################
# Processing Functions
########################################################################
import math

def split_text_evenly_dynamic(text, max_chunk_size):
    """
    Split the text into the fewest chunks of near-equal length such that no
    chunk exceeds max_chunk_size.

    Args:
        text: The string to split.
        max_chunk_size: Upper bound on the length of each chunk; must be positive.

    Returns:
        A list of consecutive slices of ``text`` (all the same length except
        possibly the last, shorter one). Empty text yields an empty list.

    Raises:
        ValueError: If max_chunk_size is not positive.
    """
    if max_chunk_size <= 0:
        raise ValueError("max_chunk_size must be positive")

    total_length = len(text)
    # Empty input would otherwise give zero chunks and a division by zero below.
    if total_length == 0:
        return []

    # Number of chunks needed so that each chunk is at most max_chunk_size.
    n_chunks = math.ceil(total_length / max_chunk_size)
    # Even chunk length for that many chunks.
    chunk_size = math.ceil(total_length / n_chunks)

    return [
        text[start:start + chunk_size]
        for start in range(0, total_length, chunk_size)
    ]

def _build_article_prompt(name, address, summary):
    """Assemble the article-generation prompt for one shop."""
    return (
        "ROLE: Professional writer for findthatshop.com  writing about shop services\n"
        "TASK: Create article about this shop using ONLY the provided data\n\n"
        "YOU DO NOT GIVE BACK HTML OR SHIT WE WANT AN ARTICLE ON SERVICES BASED ON INFO, NOT SHIT"
        "KEEP SEO IN MIND FOR MOST LIKELY SEARCHED TERMS FOR ARTICLE FOR FINDTHATSHOP.COM\n"
        "INCLUDE TERMS LIKE 'NEAR ME' ECT FOR FINDTHATSHOP.COM AND YOU WRITE IN A STYLE LIKE YOU ARE REPORTING ON THEIR SERVICES PERSPECTIVE AS A FINDTHATSHOP.COM WRITER, YOU WORK FOR FINDTHATSHOP.COM WRITING ABOUT WHAT SHOPS HAVE TO OFFER OFF GIVEN WEBSITE INFO WE GATHERED ON THEM FOR THE MAIN PAGE OF FINDTHATSHOP.COM!\n"
        "STRICT RULES:\n"
        "MAKE SURE IT IS FRIENDY CONSUMER LANGUAGE READABLE ARTICLE ABOUT THE SPECIFIC SHOP SERVICES"
        "1. BEGIN IMMEDIATELY WITH ARTICLE CONTENT. "
        "You must only provide the article wanted and no other extra text, no *. NO EXTRA COMMENTS BEFORE OR AFTER THE ARTICLE, JUST RESPOND WITH ONLY THE ARTICLE."
        "2. NO text before/after the article\n"
        "3. NO markdown, asterisks, or special formatting\n"
        "4. INCLUDE NO URLS\n\n"
        "You write about the shop not the webpage."
        "REPORT ON THE SHOP GENERALLY as a findthatshop.com writer"
        "CONTENT STRUCTURE:\n"
        "ENSURE THAT IT IS A PROPER ARTICLE BASED ON A SHOP"
        "2-3 PARAGRAPHS ON SERVICES IF NOT SERVICES LOCATION ECT, NOT SPECIFIC DEALS OR HOLIDAY STUFF"
        "Do not say 'explore our' YOU WORK FOR FINDTHATSHOP.COM, they arent our things its theirs and we are reporting"
        "FORBIDDEN PHRASES:\n"
        "- 'Here is the article'\n"
        "- 'As a business'\n"
        "- 'In conclusion'\n"
        "- 'We recommend'\n\n"
        "- 'Deals"
        "- 'explore our"
        "- 'deal saturday'\n\n"
        "BAD EXAMPLE (REJECT THIS):\n"
        "'Here's your article about Example Shop: They provide...'\n\n"
        "'Here's your article about...' [WRONG START, ONLY PROVIDE THE ARTICLE]\n"
        "GOOD EXAMPLE (COPY THIS FORMAT):\n"
        "'ABC Auto Repair in downtown Miami offers... Their team specializes in...'\n"
        "'123 Main Street offers... Their services include...'\n\n"
        "BUSINESS DATA:\n"
        f"Name: {name}\n"
        f"Address: {address}\n"
        f"Services: {summary}\n"
        "- Natural language with location keywords\n"
        "- Address reader as 'you'\n\n"
        "- No markdown, bullets, or special characters\n"
        "IMPERATIVE: RESPOND ONLY WITH THE RAW ARTICLE TEXT - NO OTHER CONTENT OR REMARKS!"
    )


async def process_shop(shop_path, ai_client, index, total):
    """Process a single shop JSON file into a generated article.

    Steps: load and validate the JSON, skip shops whose article file already
    exists, summarize the shop text, generate an article, validate it (with
    one repair attempt), write it under BASE_OUTPUT_DIR/<state>/<city>/ and,
    when the metadata carries a slug, POST it to the findthatshop.com update
    endpoint.

    Args:
        shop_path: Path of the shop JSON file.
        ai_client: Object exposing ``qwen`` and ``deepseek`` model clients.
        index: Zero-based position of this shop in the overall run.
        total: Total number of shops in the run.

    Returns:
        None. Problems with a shop are printed and the shop is skipped; an
        exception raised by the final HTTP POST is not caught here.
    """
    print(f"Processing shop {index + 1}/{total}: {shop_path}")

    try:
        with open(shop_path, 'r', encoding='utf-8', errors='replace') as f:
            shop_data = json.load(f)
    except Exception as e:
        print(f"Error loading {shop_path}: {e}")
        return

    is_valid, validation_msg = validate_shop_data(shop_data)
    if not is_valid:
        print(f"Skipping {shop_path}: {validation_msg}")
        return

    meta = shop_data['metadata']
    # slug, address and content are not among the fields validate_shop_data
    # requires, so read them defensively instead of risking a KeyError.
    slug = meta.get('slug')
    address = meta.get('address', '')
    print(meta)
    content = shop_data.get('content', '')
    visual_text = ' '.join([v.get('extracted_text', '')
                          for v in shop_data.get('visual_analysis', [])])
    output_path = Path(BASE_OUTPUT_DIR) / meta['state'].lower() / meta['city'].lower()
    output_path.mkdir(parents=True, exist_ok=True)
    filename = output_path / f"{clean_filename(meta['name'])}.txt"
    if filename.exists():
        print(f"Skipping {shop_path}: Generated article already exists at {filename}")
        return

    combined_text = (
        f"Business Name: {meta['name']}\n"
        f"Location: {address}\n"
        f"Services: {content}\n"
        f"Additional Details: {visual_text}"
    )

    try:
        summary = await chunked_summarize(ai_client.qwen, combined_text)
        print(f"[AI] Final summary: {summary}")
    except Exception as e:
        print(f"Summarization failed: {e}")
        return

    article_prompt = _build_article_prompt(meta['name'], address, summary)

    try:
        article = await ai_client.deepseek.generate_article(article_prompt)
        print(f"[AI] Generated article: {article}")
    except Exception as e:
        print(f"Article generation failed: {e}")
        return

    # generate_article returns "" on failure; there is nothing to validate or
    # repair in that case, and asking the fixer to "repair" an empty article
    # would invite an invented one.
    if not article:
        print("Article generation returned no content")
        return

    is_valid, validation_msg = await ai_client.deepseek.validate_article(article)
    if not is_valid:
        print("Article failed validation. Attempting fix...")

        article = await ai_client.deepseek.fixer(article, validation_msg)

        # validate_article returns a (bool, text) tuple; testing the tuple
        # itself is always truthy, so unpack the flag before checking it.
        is_valid, _ = await ai_client.deepseek.validate_article(article)
        if not is_valid:
            print("final validation failed")
            return

    if not article:
        return

    print("ARTICLE VALID")
    print(article)
    with open(filename, 'w', encoding='utf-8') as f:
        f.write(article)

    if slug:
        print("HERE WITH SLUG")
        payload = {"slug": slug, "description": article}
        endpoint = "https://www.findthatshop.com/e/s/s/update/"

        async with httpx.AsyncClient() as client:
            response = await client.post(endpoint, json=payload, timeout=50)
            print(response)


########################################################################
# Main Execution
########################################################################
class AIClient:
    """Holds one instance of each model client used by the pipeline."""

    def __init__(self):
        # Summarization client.
        self.qwen = QwenClient()
        # SEO analysis, article generation, validation and repair client.
        self.deepseek = DeepSeekClient()

async def main():
    """Process every shop JSON file for the configured states, resuming from the checkpoint.

    Shop files are collected from BASE_INPUT_DIR/<state>/<city>/*.json for
    the states listed in STATES (matched case-insensitively). After each
    processed shop the index of the next shop is saved as the checkpoint.
    The loop stops early, without advancing the checkpoint, when the
    module-level SHOPDD flag is true after a shop has been processed.
    """
    start_index = await load_checkpoint()
    wanted_states = {s.lower() for s in STATES}

    # The checkpoint is a position in shop_files, so the list has to be built
    # in the same order on every run; os.listdir() returns entries in
    # arbitrary order, hence the sorting at each level.
    shop_files = []
    for state in sorted(os.listdir(BASE_INPUT_DIR)):
        if state.lower() not in wanted_states:
            continue
        state_path = Path(BASE_INPUT_DIR) / state
        if not state_path.is_dir():
            continue
        for city in sorted(os.listdir(state_path)):
            city_path = state_path / city
            if city_path.is_dir():
                shop_files.extend(
                    city_path / shop_file
                    for shop_file in sorted(os.listdir(city_path))
                    if shop_file.endswith('.json')
                )

    total_shops = len(shop_files)
    print(f"Found {total_shops} shops to process")

    ai_client = AIClient()
    print("starting", start_index)

    aborted = False
    for current_index, shop_path in enumerate(shop_files[start_index:], start=start_index):
        await process_shop(shop_path, ai_client, current_index, total_shops)
        if SHOPDD:
            # Leave the checkpoint on this shop so it is retried next run.
            aborted = True
            break
        await save_checkpoint(current_index + 1)

    if aborted:
        print("Processing stopped early because the abort flag was set")
    else:
        print("Processing completed successfully")

# NOTE(review): a bare top-level ``await`` is only accepted by hosts that
# support it (e.g. an IPython/Jupyter cell); run as a plain ``python`` script
# this statement is a SyntaxError.
if __name__ == "__main__":
    await main()