# ChatBot

## Step-1: Choosing a DataSet

First, the model will be trained on Wikipedia data.

To avoid additional computation and bandwidth costs, the data is streamed through the Wikipedia API instead of being downloaded all at once.

### A) Streaming Data

In [17]:
import requests
import json
import os
import re
import random

In [18]:
# Report the versions of the libraries this notebook relies on.
print("Libraries versions:\n")
print(
    f"requests\t{requests.__version__}",
    f"json\t\t{json.__version__}",
    sep="\n",
)

Libraries versions:

requests	2.32.3
json		2.0.9


In [19]:
# Path of the JSON file that persists fetch progress between runs
# (read by load_progress, written by save_progress).
PROGRESS_FILE = 'progress.json'

In [21]:
def load_progress():
    '''Load the saved fetch progress from PROGRESS_FILE.

    Returns:
        dict: Always contains ``"last_title"`` (``None`` when nothing has
        been stored) and ``"fetched_titles"`` (a ``set`` of titles). Any
        other keys present in the file are passed through unchanged.
    '''
    progress = {"last_title": None, "fetched_titles": set()}
    if os.path.exists(PROGRESS_FILE):
        with open(PROGRESS_FILE, 'r') as file:
            # Values from the file override the defaults; keys the file
            # lacks keep their default instead of raising KeyError later.
            progress.update(json.load(file))

    # JSON has no set type, so stored titles come back as a list. Normalise
    # to a set so callers get the same type on a first run and on a resumed
    # run (``.add`` and fast membership tests work in both cases).
    progress["fetched_titles"] = set(progress["fetched_titles"] or ())
    return progress

def save_progress(progress):
    '''Write the progress dictionary to PROGRESS_FILE as JSON.

    Args:
        progress: JSON-serialisable mapping. ``set``/``frozenset`` values
            (at any nesting level) are written as JSON arrays.

    Raises:
        TypeError: If ``progress`` contains a value that cannot be encoded.
            The existing progress file is left untouched in that case.
    '''
    def _encode_set(value):
        # JSON has no set type; store sets as arrays.
        if isinstance(value, (set, frozenset)):
            return list(value)
        raise TypeError(
            f"Object of type {type(value).__name__} is not JSON serializable"
        )

    # Serialise before opening the file: opening with 'w' truncates it, so an
    # encoding error raised mid-dump would otherwise leave a corrupt file.
    payload = json.dumps(progress, default=_encode_set)
    with open(PROGRESS_FILE, 'w') as file:
        file.write(payload)

def fetch_wikipedia_articles(limit=100, batch_size=10, output_dir='data_chunks'):
    '''Fetch Wikipedia articles via the ``allpages`` listing, resuming from the saved position.

    Titles already recorded in the progress file are skipped. Each newly
    fetched article is written to ``output_dir`` and the progress file is
    updated before returning, including when an error interrupts the run.

    Args:
        limit: Maximum number of new articles to fetch in this call.
        batch_size: Number of titles requested per listing call (``aplimit``).
        output_dir: Directory for the article text files; created if missing.

    Returns:
        list: Text content of the articles fetched by this call.

    Raises:
        requests.RequestException: On a network failure, timeout, or HTTP
            error status from the listing request.
    '''
    progress = load_progress()
    last_title = progress["last_title"]
    # The progress file stores titles as a JSON list; copy into a set so that
    # ``.add`` works and membership tests are O(1).
    fetched_titles = set(progress["fetched_titles"])

    base_url = "https://en.wikipedia.org/w/api.php"
    params = {
        "action": "query",
        "format": "json",
        "list": "allpages",
        "aplimit": batch_size,
        "apfilterredir": "nonredirects",
    }

    if last_title:
        params["apfrom"] = last_title

    articles = []
    os.makedirs(output_dir, exist_ok=True)

    try:
        while len(articles) < limit:
            response = requests.get(base_url, params=params, timeout=30)
            response.raise_for_status()
            data = response.json()

            for page in data["query"]["allpages"]:
                page_title = page["title"]
                if page_title not in fetched_titles:
                    page_content = fetch_page_content(page_title)
                    # Write the file before marking the title as fetched so
                    # a failed write does not leave it recorded as done.
                    save_article_to_disk(page_title, page_content, output_dir)
                    articles.append(page_content)
                    fetched_titles.add(page_title)

                # Track the last title actually handled, not the last title
                # of the batch: stopping at ``limit`` mid-batch must not skip
                # the remaining titles on the next run.
                last_title = page_title

                if len(articles) >= limit:
                    break

            if "continue" not in data:
                break

            params.update(data["continue"])
    finally:
        # Persist whatever was completed, even if a request failed part-way.
        progress["last_title"] = last_title
        progress["fetched_titles"] = list(fetched_titles)
        save_progress(progress)

    return articles

def fetch_page_content(page_title):
    '''Fetch the plain-text extract of a single Wikipedia page.

    Args:
        page_title: Title of the page to look up.

    Returns:
        str: The page's ``extract`` text, or an empty string when the
        response contains no extract for the title.

    Raises:
        requests.RequestException: On a network failure, timeout, or HTTP
            error status.
    '''
    base_url = "https://en.wikipedia.org/w/api.php"
    params = {
        "action": "query",
        "format": "json",
        "titles": page_title,
        "prop": "extracts",
        "explaintext": True
    }

    response = requests.get(base_url, params=params, timeout=30)
    response.raise_for_status()
    data = response.json()

    # The result is keyed by page id; a single title was requested, so the
    # first entry is the one of interest.
    pages = data.get("query", {}).get("pages", {})
    page = next(iter(pages.values()), {})

    # Entries without an "extract" key would otherwise raise KeyError and
    # abort the whole fetch run.
    return page.get("extract", "")

def sanitize_filename(filename):
    '''Return ``filename`` with the characters \\ / : * ? " < > | removed.'''
    # Map every forbidden character to "delete" and strip them in one pass.
    forbidden = str.maketrans('', '', '\\/:*?"<>|')
    return filename.translate(forbidden)

def save_article_to_disk(page_title, page_content, output_dir):
    '''Write an article's text to ``<output_dir>/<sanitized title>.txt`` as UTF-8.'''
    target_name = sanitize_filename(page_title) + ".txt"
    target_path = os.path.join(output_dir, target_name)
    with open(target_path, 'w', encoding='utf-8') as out:
        out.write(page_content)

def fetch_random_wikipedia_articles(limit=100, output_dir='data_chunks'):
    '''Fetch random main-namespace Wikipedia articles and save them to disk.

    A single ``list=random`` request asks for ``limit`` titles. Titles that
    are already recorded in the progress file are skipped, so fewer than
    ``limit`` articles may be returned.

    Args:
        limit: Number of random titles to request (``rnlimit``).
        output_dir: Directory for the article text files; created if missing.

    Returns:
        list: Text content of the articles fetched by this call.

    Raises:
        requests.RequestException: On a network failure, timeout, or HTTP
            error status from the listing request.
    '''
    progress = load_progress()
    fetched_titles = set(progress["fetched_titles"])

    base_url = "https://en.wikipedia.org/w/api.php"
    params = {
        "action": "query",
        "format": "json",
        "list": "random",
        "rnlimit": limit,
        "rnnamespace": 0,  # Main namespace
    }

    articles = []
    os.makedirs(output_dir, exist_ok=True)

    response = requests.get(base_url, params=params, timeout=30)
    response.raise_for_status()
    pages = response.json()["query"]["random"]

    try:
        for page in pages:
            page_title = page["title"]
            if page_title in fetched_titles:
                continue

            page_content = fetch_page_content(page_title)
            # Write the file before marking the title as fetched so a failed
            # write does not leave it recorded as done.
            save_article_to_disk(page_title, page_content, output_dir)
            articles.append(page_content)
            fetched_titles.add(page_title)
    finally:
        # Persist whatever was completed, even if a fetch failed part-way.
        progress["fetched_titles"] = list(fetched_titles)
        save_progress(progress)

    return articles

In [50]:
# Alternative (currently disabled): walk the allpages listing, resuming from
# the last title stored in the progress file.
# articles = fetch_wikipedia_articles(limit=10, batch_size=3)

# Request 5 random main-namespace articles; titles already recorded in the
# progress file are skipped, so fewer than 5 may be returned.
random_articles = fetch_random_wikipedia_articles(limit=5)


In [52]:
# Count how many article titles have been recorded so far.
with open("progress.json", "r") as handle:
    file = json.load(handle)
print(len(file['fetched_titles']))

130


## Step-2: Training the Model