# In [10]:  (notebook cell prompt, kept as a comment so the file parses as Python)
import streamlit as st
import nltk
import re
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Fetch the NLTK resources this script relies on.
# - 'punkt' / 'punkt_tab': sentence and word tokenizer models. NLTK 3.9+ loads
#   'punkt_tab' instead of the older pickled 'punkt', so both are requested to
#   keep sent_tokenize / word_tokenize working on either side of that change.
# - 'stopwords': the English stop-word list used by preprocess().
# - 'wordnet': kept from the original script (not referenced in this file).
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)

# Read the knowledge-base document from disk
@st.cache_data
def load_text():
    """Return the full contents of ``kenya_finance_guide.txt`` as a string.

    The file is opened relative to the current working directory and decoded
    as UTF-8. The function is wrapped in ``st.cache_data``.
    """
    with open("kenya_finance_guide.txt", "r", encoding="utf-8") as guide_file:
        contents = guide_file.read()
    return contents

# Text normalisation tuned for the Kenya finance corpus
def preprocess(text):
    """Split *text* into sentences and reduce each one to its content words.

    Steps, in order:
      1. Strip boilerplate: ``Page <n>`` markers (through the end of that
         line) and ``Kenya Revenue Authority ... <4 digits>`` runs.
      2. Sentence-tokenize the remaining text.
      3. For every sentence: lowercase it, blank out every character that is
         not ``a-z`` or whitespace, word-tokenize, then drop English stop
         words, the domain-specific noise words below, and any token of
         two characters or fewer.
      4. Keep only sentences that still have at least four tokens.

    Returns:
        A list of strings, each the surviving tokens of one sentence joined
        by single spaces.
    """
    # Header/footer patterns, removed in this order.
    for boilerplate in (r"Page \d+.*", r"Kenya Revenue Authority.*\d{4}"):
        text = re.sub(boilerplate, "", text)

    # Words that occur all over this corpus and therefore carry little signal.
    domain_noise = {
        "kenya", "kenyan", "kes", "ksh", "kshs", "ltd", "limited",
        "chapter", "section", "article", "act", "page", "pages", "www", "http",
        "kra", "nse", "cbk", "nss", "brs", "government", "authority", "commission"
    }
    ignored = domain_noise | set(stopwords.words("english"))

    cleaned = []
    for raw_sentence in sent_tokenize(text):
        # Lowercase first so the a-z filter below does not erase capitals.
        letters_only = re.sub(r"[^a-z\s]", " ", raw_sentence.lower())
        kept = [
            token
            for token in word_tokenize(letters_only)
            if len(token) > 2 and token not in ignored
        ]
        # Very short leftovers are too sparse to be useful matches.
        if len(kept) < 4:
            continue
        cleaned.append(" ".join(kept))

    return cleaned

# Fit the TF-IDF representation of the knowledge base
@st.cache_resource
def build_model(sentences):
    """Fit a TF-IDF vectorizer on *sentences* and vectorize them.

    The vectorizer uses 1- to 3-word n-grams and ``min_df=2``. The function
    is wrapped in ``st.cache_resource``.

    Args:
        sentences: iterable of sentence strings to fit on.

    Returns:
        ``(vectorizer, tfidf)`` - the fitted ``TfidfVectorizer`` and the
        matrix produced by ``fit_transform`` for *sentences*.
    """
    fitted = TfidfVectorizer(min_df=2, ngram_range=(1, 3))
    sentence_matrix = fitted.fit_transform(sentences)
    return fitted, sentence_matrix

# Retrieve the knowledge-base sentence closest to the query
def get_best_match(query, vectorizer, tfidf_matrix, sentences):
    """Return the sentence most similar to *query* and its similarity score.

    The query is lowercased and every character outside ``a-z``/whitespace is
    replaced by a space before it is vectorized with *vectorizer*. Cosine
    similarity against *tfidf_matrix* picks the winner.

    Args:
        query: the user's question.
        vectorizer: fitted object exposing ``transform``.
        tfidf_matrix: matrix compared against the query vector.
        sentences: sequence indexed by the winning position
            (assumes it lines up with the rows of *tfidf_matrix* - confirm
            against the caller).

    Returns:
        ``(sentence, score)`` for the highest-scoring position.
    """
    normalized = re.sub(r"[^a-z\s]", " ", query.lower())
    scores = cosine_similarity(
        vectorizer.transform([normalized]), tfidf_matrix
    ).flatten()
    winner = np.argmax(scores)
    return sentences[winner], scores[winner]

# Chatbot response
def chatbot_response(user_input, sentences, vectorizer, tfidf_matrix, min_score=0.12):
    """Build the bot's reply to *user_input*.

    Args:
        user_input: raw text typed by the user.
        sentences: knowledge-base sentences passed through to
            ``get_best_match``.
        vectorizer: fitted vectorizer passed through to ``get_best_match``.
        tfidf_matrix: matrix passed through to ``get_best_match``.
        min_score: similarity below which the match is considered too weak
            and a fallback hint is returned instead. Defaults to ``0.12``.

    Returns:
        A string: a prompt to rephrase when the stripped input is shorter
        than three characters, a fallback hint when the best score is below
        *min_score*, otherwise the best-matching sentence, capitalized and
        terminated with punctuation.
    """
    if len(user_input.strip()) < 3:
        # The emoji is written as an escape so it cannot be corrupted by a
        # wrong file encoding (the literal had degraded into mojibake).
        return "Please ask a proper question \U0001F60A"

    response, score = get_best_match(user_input, vectorizer, tfidf_matrix, sentences)

    if score < min_score:
        return ("I'm not sure I have clear info on that. Try asking about: "
                "taxes, savings, investing in NSE, pension, VAT, property investment, "
                "or starting a business in Kenya.")

    # Make response more natural: leading capital and closing punctuation.
    response = response.capitalize()
    if not response.endswith(('.', '!', '?')):
        response += "."
    return response

# Streamlit UI
def main():
    """Render the chatbot page and handle one round of user interaction.

    Configures the page, loads and indexes the knowledge base, replays the
    chat history stored in ``st.session_state.messages`` and, when the user
    submits a prompt, appends both the prompt and the bot's reply to that
    history.

    If the knowledge-base file is missing, or preprocessing leaves no usable
    sentences, an error message is shown and the script run is stopped
    instead of surfacing a raw traceback.
    """
    st.set_page_config(page_title="Kenya Finance Bot", layout="centered")

    # Kenya flag written as regional-indicator escapes (K + E) so the title
    # cannot be corrupted by a wrong file encoding.
    st.title("\U0001F1F0\U0001F1EA Kenya Finance & Investment Chatbot")
    st.markdown("""
    **Hi @DavyStanley!**
    Ask me anything about personal finance, taxes, investing, or doing business in Kenya.
    Powered by official KRA, CBK, NSE & NSSF guides.
    """)

    # Load data
    with st.spinner("Loading Kenya financial knowledge base..."):
        try:
            raw_text = load_text()
        except FileNotFoundError:
            st.error(
                "Knowledge base file not found. Make sure "
                "kenya_finance_guide.txt is in the working directory."
            )
            st.stop()

        sentences = preprocess(raw_text)
        if not sentences:
            # Nothing to fit the vectorizer on; bail out with a clear message.
            st.error("The knowledge base contains no usable sentences.")
            st.stop()

        vectorizer, tfidf_matrix = build_model(sentences)

    st.success(f"Ready! Knowledge base loaded with {len(sentences):,} sentences.")

    # Chat history, seeded with a greeting the first time through.
    if "messages" not in st.session_state:
        st.session_state.messages = [
            {"role": "assistant", "content": "Hello! How can I help you with finance in Kenya today?"}
        ]

    # Display chat
    for msg in st.session_state.messages:
        with st.chat_message(msg["role"]):
            st.write(msg["content"])

    # User input
    if prompt := st.chat_input("Ask about tax, investment, savings, business in Kenya..."):
        st.session_state.messages.append({"role": "user", "content": prompt})
        with st.chat_message("user"):
            st.write(prompt)

        with st.chat_message("assistant"):
            with st.spinner("Thinking..."):
                reply = chatbot_response(prompt, sentences, vectorizer, tfidf_matrix)
            st.write(reply)
            st.session_state.messages.append({"role": "assistant", "content": reply})

# Entry point: build the UI only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()

# Captured cell output (log line):
# 2025-11-17 07:54:56.644 No runtime found, using MemoryCacheStorageManager
