In [None]:
! pip install -q newspaper3k litellm requests bs4 lancedb tantivy "ai21>=2" instructor

In [None]:
import os
import ast
from litellm import completion

# Placeholder credentials: replace each value with a real key before running.
# NOTE(review): these assignments overwrite any value already exported in the
# environment, and real keys should not be committed with the notebook.
os.environ["OPENROUTER_API_KEY"] = "OPENROUTER_KEY"  # Replace with your OpenRouter API key (MODEL_NAME below is an "openrouter/..." model)
os.environ["OPENAI_API_KEY"] = "OPENAI KEY"  # Replace with your OpenAI API key

os.environ["SERP_API_KEY"] = "SERP.DEV"  # Replace with your Serper API key (sent as X-API-KEY to google.serper.dev)
os.environ["AI21_API_KEY"] = "AI21 Key"  # Replace with your AI21 API key (passed to AI21Client)


# Model identifier handed to litellm's completion() by the prompt helpers below.
MODEL_NAME = "openrouter/anthropic/claude-3-opus"  # Replace with your model name

In [None]:
def get_search_Terms(topic):
    """Ask the LLM for search terms to research an article about ``topic``.

    Args:
        topic: Subject of the article; interpolated verbatim into the prompt.

    Returns:
        list[str]: The non-empty search terms parsed from the model's reply.

    Raises:
        ValueError: If the reply does not contain a parseable Python list.
    """
    SYSTEM_MESSAGE = """
    You are a world-class Indian Legal journalist. 
    Generate a list of 5 search terms to search for to research and write an article about the topic.
    """

    USER_MESSAGE = f"""
    Please provide a list of 5 search terms related to '{topic}' for researching and writing an article.
    Respond with the search terms in a Python-parseable list, separated by commas.
    """
    messages = [{"content":SYSTEM_MESSAGE,"role":"system"},
                { "content": USER_MESSAGE,"role": "user"}]
    print(MODEL_NAME)
    response = completion(model = MODEL_NAME, messages=messages)
    response_text = response['choices'][0]['message']['content'] or ""

    # Models frequently wrap the list in prose or a ``` fence. Evaluate only the
    # outermost [...] span when one exists, otherwise fall back to the whole reply.
    candidate = response_text.strip()
    start, end = candidate.find("["), candidate.rfind("]")
    if start != -1 and end > start:
        candidate = candidate[start:end + 1]

    try:
        parsed = ast.literal_eval(candidate)
    except (ValueError, SyntaxError) as exc:
        raise ValueError(
            f"Could not parse search terms from model response: {response_text!r}"
        ) from exc

    if not isinstance(parsed, (list, tuple)):
        raise ValueError(
            f"Expected a list of search terms, got {type(parsed).__name__}: {response_text!r}"
        )

    # Normalise to clean strings and drop blanks so callers never search for "".
    search_terms = [str(term).strip() for term in parsed]
    return [term for term in search_terms if term]

In [None]:
import requests
import json

def search_serper(query):
    """Run a Google search for ``query`` through the Serper API.

    Args:
        query: Search string, sent as the ``q`` field of the JSON payload.

    Returns:
        list: The ``organic`` entries of the JSON response, or an empty list
        when the response contains none.

    Raises:
        requests.HTTPError: If the API answers with a non-2xx status
            (for example an invalid or exhausted API key).
        requests.RequestException: On connection problems or timeout.
    """
    print(f"Searching SERP API for '{query}'")
    url = "https://google.serper.dev/search"
    payload = json.dumps({
        "q": query
    })
    headers = {
        'X-API-KEY': os.environ["SERP_API_KEY"],
        'Content-Type': 'application/json'
    }

    # requests has no default timeout; without one a stalled connection would
    # hang the notebook indefinitely.
    response = requests.post(url, data=payload, headers=headers, timeout=30)
    # Surface auth/quota errors as HTTP errors instead of a confusing KeyError below.
    response.raise_for_status()
    data = response.json()
    # A query with no organic hits has no "organic" key; treat that as "no results".
    return data.get("organic", [])

In [None]:
def get_relevant_articles(search_results):
    """Let the LLM pick the most relevant entries out of ``search_results``.

    The model is shown a 1-based numbered list of result links and asked to
    answer with the numbers of the useful ones.

    Args:
        search_results: Sequence of result dicts, each with a ``link`` key.

    Returns:
        list: The chosen result dicts, in the order the model listed them,
        without duplicates. Empty when ``search_results`` is empty.

    Raises:
        ValueError: If the reply does not contain a parseable list of numbers.
    """
    # Nothing to choose from: skip the (paid) LLM round-trip.
    if not search_results:
        return []

    SYSTEM_PROMPT = "You are a journalist assistant. From the given search results, select the URLs that seem most relevant and informative for writing an article on the topic."
    search_results_text = "\n".join(
        f"{position}. {result['link']}"
        for position, result in enumerate(search_results, start=1)
    )
    USER_PROMPT = f"""
    Search Results:\n{search_results_text}\n\nPlease select the numbers of the URLs that seem most relevant and informative for writing an article on the topic. 
    Ignore the Urls which has PDFs or not relevant to the topic.
    Respond with the numbers in a Python-parseable list, separated by commas."
    """
    messages = [{"content":SYSTEM_PROMPT,"role":"system"},
                { "content": USER_PROMPT,"role": "user"}]

    response = completion(model=MODEL_NAME, messages=messages)
    response_text = response['choices'][0]['message']['content'] or ""

    # Evaluate only the outermost [...] span so surrounding prose or a code
    # fence in the reply does not break parsing.
    candidate = response_text.strip()
    start, end = candidate.find("["), candidate.rfind("]")
    if start != -1 and end > start:
        candidate = candidate[start:end + 1]

    try:
        parsed = ast.literal_eval(candidate)
    except (ValueError, SyntaxError) as exc:
        raise ValueError(
            f"Could not parse selected result numbers from model response: {response_text!r}"
        ) from exc

    if isinstance(parsed, int) and not isinstance(parsed, bool):
        parsed = [parsed]
    elif not isinstance(parsed, (list, tuple, set)):
        raise ValueError(
            f"Expected a list of result numbers, got {type(parsed).__name__}: {response_text!r}"
        )

    selected_articles = []
    seen_numbers = set()
    for item in parsed:
        if isinstance(item, str) and item.strip().isdigit():
            item = int(item.strip())
        if isinstance(item, bool) or not isinstance(item, int):
            continue
        # Reject out-of-range numbers: 0 would silently wrap around to the last
        # result (index -1) and anything above len() would raise IndexError.
        if not 1 <= item <= len(search_results):
            continue
        if item in seen_numbers:
            continue
        seen_numbers.add(item)
        selected_articles.append(search_results[item - 1])
    return selected_articles

In [None]:
from newspaper import Article
from newspaper.article import ArticleException

def get_article_text(url):
    """Download and parse the page at ``url`` and return its text.

    Any failure is reported on stdout and turned into a ``None`` result, so a
    single bad URL never aborts the caller.

    Args:
        url: Address of the article to fetch.

    Returns:
        The parsed article text, or ``None`` if downloading or parsing failed.
    """
    print(f"Fetching article from URL: {url}")
    text = None
    try:
        page = Article(url)
        page.download()
        page.parse()
        text = page.text
    except ArticleException as err:
        # Failures raised by newspaper itself.
        print(f"Error downloading or parsing article: {err!s}")
    except Exception as err:
        # Anything else is treated the same way: log and give up on this URL.
        print(f"Error fetching article: {err!s}")
    return text

In [None]:
from ai21 import AI21Client
from ai21.models import DocumentType

def segment_text_and_collect_results(text, min_length=30):
    """Split ``text`` into segments with the AI21 segmentation endpoint.

    Args:
        text: Raw document text. ``None`` is treated like text that is too short.
        min_length: Minimum number of characters required before the text is
            sent for segmentation (default 30, the original hard-coded value).

    Returns:
        list[dict] | None: One ``{"segment_text": ..., "segment_type": ...}``
        dict per returned segment, or ``None`` when the text is too short.

    Raises:
        KeyError: If ``AI21_API_KEY`` is missing from the environment and the
            text is long enough to be segmented.
    """
    # Check the input first: rejected text needs neither an API key nor a client.
    if text is None or len(text) < min_length:
        print("Text is too short.")
        return None

    client = AI21Client(
        api_key=os.environ["AI21_API_KEY"],
    )
    response = client.segmentation.create(
        source=text,
        source_type=DocumentType.TEXT,
    )

    return [
        {
            "segment_text": segment.segment_text,
            "segment_type": segment.segment_type,
        }
        for segment in response.segments
    ]

In [None]:
import uuid
import lancedb
from tqdm import tqdm
from litellm import embedding

# Embedding model identifier passed to litellm's embedding() call.
model_name = 'text-embedding-3-small'

def generate_embedding(text):
    """Return the embedding vector for a single piece of text.

    The text is sent as a one-element batch, so the vector is read from the
    first (and only) entry of the response's ``data`` list.
    """
    result = embedding(model=model_name, input=[text])
    first_entry = result["data"][0]
    return first_entry["embedding"]

def embed_chunks(chunks, uri="data/sample-lancedb"):
    """Embed every chunk and store the vectors in a new LanceDB table.

    Args:
        chunks: Iterable of text chunks to embed.
        uri: LanceDB location to connect to (defaults to the original
            hard-coded ``data/sample-lancedb``).

    Returns:
        The newly created table; each row has a ``vector`` and a ``text`` field.

    Raises:
        ValueError: If ``chunks`` is empty, since there would be nothing to
            embed or to search later.
    """
    chunks = list(chunks)
    if not chunks:
        # Fail early with a clear message instead of handing empty data to LanceDB.
        raise ValueError("embed_chunks() needs at least one text chunk, got none.")

    # One embedding request per chunk; tqdm shows progress for long runs.
    vector_data = [
        {"vector": generate_embedding(chunk), "text": chunk}
        for chunk in tqdm(chunks, desc="Processing Chunks")
    ]

    db = lancedb.connect(uri)
    # A random table name means every call creates a fresh table.
    return db.create_table(str(uuid.uuid4()), data=vector_data)


In [None]:
def search(query, table, limit=30):
    """Return the stored text chunks closest to ``query``.

    Args:
        query: Free-text query; it is embedded with ``generate_embedding``.
        table: Table object to search (as returned by ``embed_chunks``).
        limit: Maximum number of rows to retrieve (default 30, the original
            hard-coded value).

    Returns:
        list: The ``text`` field of every returned row, in result order.
    """
    query_vector = generate_embedding(query)
    rows = table.search(query_vector).limit(limit).to_list()
    return [row["text"] for row in rows]

In [None]:
def prepare_outlines(query, search_results):
    """Ask the LLM to draft an article outline from retrieved text segments.

    Args:
        query: Topic (or outline request) interpolated into both prompts.
        search_results: Iterable of text segments; joined with blank lines and
            passed to the model as source material.

    Returns:
        str: The model's reply, i.e. the outline as free-form text.
    """
    segments = "\n\n".join(search_results)
    # The original system prompt ended with "Respond with the numbers in a
    # Python-parseable list", copied from the URL-selection prompt. That
    # contradicts the outline task, so it is replaced with a matching instruction.
    OUTLINE_SYSTEM_PROMPT = f"""You are a journalist assistant. 
    Generate an outline for an detailed article on the topic of {query} based on the segmented text.

    Respond with a numbered, hierarchical outline of sections and sub-sections.
    """

    # The segment block is now closed with a proper </TextSegment> tag so the
    # model can tell where the source material ends.
    OUTLINE_USER_PROMPT = f"""
    Please provide an outline for an article on the topic of {query} based on the segmented text.

    <TextSegment>
    {segments}
    </TextSegment>
    """

    messages = [{"content":OUTLINE_SYSTEM_PROMPT,"role":"system"},
                { "content": OUTLINE_USER_PROMPT,"role": "user"}]
    response = completion(model=MODEL_NAME, messages=messages)
    response_text = response['choices'][0]['message']['content']
    return response_text


In [None]:
import instructor
from openai import OpenAI
from pydantic import BaseModel,Field
from typing import List,Optional

# instructor.patch() wraps the OpenAI client so that
# client.chat.completions.create accepts the response_model keyword
# (parse_outline relies on it to get List[OutlineItem] back).
# NOTE(review): OpenAI() is built without arguments; presumably it picks up
# OPENAI_API_KEY from the environment. Confirm.
client = instructor.patch(OpenAI())

class OutlineItem(BaseModel):
    """One entry of a parsed article outline."""

    text: str = Field(description="The text of the outline item with its children with newlines separating them")
    # default=None makes the field genuinely optional. Without a default,
    # Pydantic v2 treats Optional[str] as required-but-nullable and rejects
    # items where the model leaves "number" out.
    number: Optional[str] = Field(default=None, description="The number of the outline item, if available")

def parse_outline(outlines: str) -> List[OutlineItem]:
    """Turn free-form outline text into structured ``OutlineItem`` objects.

    The text is embedded in a parsing instruction and sent to the
    instructor-patched ``client`` with ``response_model=List[OutlineItem]``.

    Args:
        outlines: The outline as plain text.

    Returns:
        Whatever the patched client returns for that response model.
    """
    instruction = f"""
    Parse the following text into a structured outline:
    
    {outlines}
    
    Respond with a JSON representation of the outline, where each item has the following fields:
    - text: The text of the outline item with its children with newlines separating them
    - number: The number of the outline item, if available
    
    Use double quotes for string values and follow the JSON format strictly.
    """

    chat_messages = [{"role": "user", "content": instruction}]
    return client.chat.completions.create(
        model="gpt-3.5-turbo",
        response_model=List[OutlineItem],
        messages=chat_messages,
    )

In [None]:
def outline_to_article(outline, query, table):
    """Write the article section for one outline item.

    Text related to the outline item is retrieved from ``table`` via
    ``search`` and handed to the LLM as notes next to the outline itself.

    Args:
        outline: Text of the outline item to expand.
        query: Overall article topic, mentioned in the system prompt.
        table: Table passed on to ``search`` to look up supporting notes.

    Returns:
        The model's reply: the written section as text.
    """
    # Supporting material for this outline item, one note per retrieved chunk.
    retrieved_chunks = search(outline, table)
    notes = "\n\n".join(retrieved_chunks)

    system_prompt = f"""
    You are a journalist assistant. You will be given a part of the outline for an article on the topic of {query}. 
    You will also be provided with notes on the outline items. Your task is to write the article based on the outline and notes.
    <outline>
    {outline}
    </outline>
    """

    user_prompt = f"""
    Please write an article based on the provided outline and notes.
    <notes>
    {notes}
    </notes>
    Notes may not be arranged in the same order as the outline items. Please ensure that the notes are used to provide additional information and context to the article.
    """

    reply = completion(
        model=MODEL_NAME,
        messages=[
            {"content": system_prompt, "role": "system"},
            {"content": user_prompt, "role": "user"},
        ],
    )
    first_choice = reply['choices'][0]
    return first_choice['message']['content']

In [32]:
class ResearchPipeline:
    """End-to-end research-and-write pipeline for a single article topic.

    Stages, in the order ``run()`` executes them: web search, article
    selection, text extraction, segmentation, embedding, outline generation,
    per-section drafting and a final rewrite. Each stage stores its result on
    the instance so intermediate state can be inspected.

    Note: the methods ``search`` and ``embed_chunks`` share their names with
    module-level functions. Inside method bodies the bare names refer to the
    module-level functions, not to these methods.
    """

    def __init__(self, topic):
        """Store the topic and fetch its search terms.

        Construction calls ``get_search_Terms``, so it already makes one LLM
        request.
        """
        self.topic = topic
        self.search_terms = get_search_Terms(topic)
        self.search_results = []
        self.selected_articles = []
        self.selected_texts = []
        self.segmented_texts = []
        self.table = None
        # Holds the outline text returned by prepare_outlines (a string).
        self.outlines = ""
        self.completed_article = ""
        # Initialised here so the attribute exists before write_article() runs.
        self.edited_article = ""

    def search(self):
        """Run a web search for every search term and collect all results."""
        print(f"Search google for these terms: {self.search_terms}")
        # Start from a clean list so re-running the stage does not duplicate results.
        self.search_results = []
        for term in self.search_terms:
            self.search_results.extend(search_serper(term))
        print(f"Found {len(self.search_results)} search results.")

    def select_articles(self):
        """Keep only the search results the LLM judges relevant."""
        self.selected_articles = get_relevant_articles(self.search_results)
        print(f"We have selected {len(self.selected_articles)} articles for further processing.")

    def extract_text(self):
        """Fetch the text of every selected article (``None`` for failures)."""
        self.selected_texts = [get_article_text(article["link"]) for article in self.selected_articles]

    def segment_text(self):
        """Segment every fetched text and gather the segment strings."""
        # Reset first so the stage is idempotent when re-run.
        self.segmented_texts = []
        for text in self.selected_texts:
            if text is None:
                # Article could not be fetched; nothing to segment.
                continue
            segments = segment_text_and_collect_results(text)
            if segments is None:
                # Text was rejected by the segmentation helper.
                continue
            self.segmented_texts.extend(segment["segment_text"] for segment in segments)

    def embed_chunks(self):
        """Embed all segments and keep the resulting table."""
        self.table = embed_chunks(self.segmented_texts)

    def search_table(self):
        """Retrieve segments relevant to the topic and draft the outline."""
        search_query = f"Prepare a detailed Outline for {self.topic}"
        search_results = search(search_query, self.table)
        print(f"Found {len(search_results)} search results.")
        self.outlines = prepare_outlines(search_query, search_results)
        print("Getting Outlines...")
        print(self.outlines)

    def parse_outlines(self):
        """Parse the outline into items and draft one section per item."""
        outline_list = parse_outline(str(self.outlines))
        # The original called prepare_outlines("Got All the Outlines", ...) here
        # and threw the result away, a wasted LLM request. A log line is all
        # that is needed.
        print(f"Got all {len(outline_list)} outline items.")
        # Rebuild the draft from scratch so re-running does not append twice.
        self.completed_article = ""
        for outline in outline_list:
            print("Writing Article on {}...".format(outline.text))
            article = outline_to_article(outline.text, self.topic, self.table)
            self.completed_article += article + "\n\n"

    def write_article(self):
        """Rewrite the concatenated section drafts into the final article.

        The result is stored in ``self.edited_article``.
        """
        WRITER_SYSTEM_PROMPT = f"""
        You are a journalist. Write a high-quality, NYT-worthy article on the given topic based on the provided article texts.
        The article should be well-structured, informative, and engaging. Expect output should be in Markdown format.
        """

        WRITER_USER_PROMPT = f"""
        Topic: {self.topic}

        Article Texts:
        {self.completed_article}

        Please write a high-quality, NYT-worthy article on the topic based on the provided article texts.
        The article should be well-structured, informative, and engaging.
        Ensure the length is at least as long as a NYT cover story -- at a minimum, 15 paragraphs.
        """

        messages = [
            {"content": WRITER_SYSTEM_PROMPT, "role": "system"},
            {"content": WRITER_USER_PROMPT, "role": "user"}
        ]

        # Use the shared MODEL_NAME setting instead of a second hard-coded model
        # string, so changing the model in one place affects every stage.
        response = completion(model=MODEL_NAME, messages=messages)
        self.edited_article = response['choices'][0]['message']['content']

    def run(self):
        """Execute every stage in order."""
        self.search()
        self.select_articles()
        self.extract_text()
        self.segment_text()
        self.embed_chunks()
        self.search_table()
        self.parse_outlines()
        self.write_article()


# Run the whole pipeline for one topic.
# Constructing ResearchPipeline already calls get_search_Terms (an LLM request);
# run() then executes every remaining stage in order.
topic = "Money Laundering in India"
pipeline = ResearchPipeline(topic)
pipeline.run()

openrouter/anthropic/claude-3-opus
Search google for these terms: ['money laundering in india', 'indian anti-money laundering laws', 'financial crimes enforcement network india', 'india financial intelligence unit', 'india enforcement directorate money laundering']
Searching SERP API for 'money laundering in india'
Searching SERP API for 'indian anti-money laundering laws'
Searching SERP API for 'financial crimes enforcement network india'
Searching SERP API for 'india financial intelligence unit'
Searching SERP API for 'india enforcement directorate money laundering'
Found 46 search results.
We have selected 24 articles for further processing.
Fetching article from URL: https://economictimes.indiatimes.com/definition/money-laundering
Fetching article from URL: https://sanctionscanner.com/Aml-Guide/anti-money-laundering-aml-in-india-87
Fetching article from URL: https://www.statista.com/statistics/1321843/india-number-of-money-laundering-cases-investigated-by-type-of-offence/
Fetching 

Processing Chunks: 100%|██████████| 436/436 [03:01<00:00,  2.40it/s]


Found 30 search results.
Getting Outlines...
Here is the outline for the article on Money Laundering in India:

1. The Crime of Money Laundering and Criminal Enforcement
   1.1 What is the legal authority to prosecute money laundering at the national level?
   1.2 Describe anti-money laundering enforcement priorities or areas of particular focus for enforcement.

2. The Prevention of Money Laundering Act (PMLA) of 2002
   2.1 Enactment and key provisions of the PMLA
   2.2 Establishment of specialized enforcement agencies
   2.3 Reporting requirements for financial institutions and intermediaries
   2.4 India's membership in the Financial Action Task Force (FATF)
   2.5 Government agencies/competent authorities responsible for compliance and enforcement

3. Process of Money Laundering
   3.1 The three-step process: placement, layering, and integration

4. Penalties for Money Laundering in India
   4.1 Imprisonment
   4.2 Fines
   4.3 Confiscation of proceeds
   4.4 Attachment of proper

In [33]:
# edited_article is assigned by ResearchPipeline.write_article(), the last stage of run().
print(pipeline.edited_article)

# India's Resolute Stance Against Money Laundering: A Comprehensive Crackdown

## The Gravity of Money Laundering

In the shadowy realm of financial crimes, money laundering stands as a pervasive and pernicious threat, not only undermining the integrity of economic systems but also enabling a vast array of illicit activities, from drug trafficking to terrorism. As a rapidly ascending global power and a hub for financial transactions, India has found itself on the frontlines of this battle, confronting the scourge of money laundering with an unwavering commitment and a comprehensive approach.

## The Cornerstone: The Prevention of Money Laundering Act

India's response to the money laundering menace is anchored in the Prevention of Money Laundering Act (PMLA) of 2002, a robust and comprehensive legislative framework that criminalizes the concealment of illicit proceeds and provides the legal basis for tracing, seizing, and confiscating assets derived from criminal activities.

The PMLA 