# 1: LLM integration
The task involves building an AI capable of language translation.

In [None]:
!pip install python-dotenv



In [None]:
import os
import google.generativeai as genai
from dotenv import load_dotenv
import numpy as np

# Load environment variables from the .env file
load_dotenv("/content/drive/MyDrive/Colab Notebooks/untitled.env")

# Read the API key from the environment; stop immediately if it is missing or empty
API_KEY = os.getenv("API_KEY")
if not API_KEY:
    raise ValueError("API_KEY không được tìm thấy. Vui lòng kiểm tra tệp .env hoặc biến môi trường.")

# Configure the google.generativeai library with the API key
genai.configure(api_key=API_KEY)

In [None]:
# Generative model handle used by the translation helpers defined below.
model = genai.GenerativeModel('gemini-pro')


## 1.1 Single Text Translation:

In [None]:
def translate_single_text(json_input):
    """
    Translate one piece of text described by a JSON-like dict.

    Args:
        json_input: Mapping with two keys:
            'text' - the text to translate;
            'dest_language' - the target language, inserted verbatim into
            the prompt (the examples in this notebook use 'vi').

    Returns:
        The model's reply with surrounding whitespace stripped, or "" when
        'text' is None, empty or whitespace-only (no request is sent then).

    Raises:
        KeyError: if 'text' or 'dest_language' is missing from json_input.
    """
    text = json_input['text']
    dest_language = json_input['dest_language']

    # Nothing to translate: skip the API round-trip instead of sending a
    # prompt with an empty payload and returning whatever the model invents.
    if text is None or not str(text).strip():
        return ""

    # Prompt asking the module-level generative model for the translation
    translate_prompt = f"Translate the following text to {dest_language}:\n{text}"
    response = model.generate_content(translate_prompt)
    return response.text.strip()

# Example usage
if __name__ == "__main__":

    # Translate a single text whose input is English
    print("Translating into Vietnamese 1:")
    json_1 = {
        'text': 'Hello!',
        'dest_language': 'vi'
    }
    result_1 = translate_single_text(json_1)
    print(f"Translated single text: {result_1}")

    # Translate a single text whose input is already Vietnamese
    print("Translating into Vietnamese 2:")
    json_2 = {
        'text': 'Xin chào!',
        'dest_language': 'vi'
    }
    result_2 = translate_single_text(json_2)
    print(f"Translated single text: {result_2}")


Translating into Vietnamese 1:
Translated single text: Xin chào!
Translating into Vietnamese 2:
Translated single text: Xin chào!


## 1.2 Multiple Texts Translation:

In [None]:
def translate_multiple_texts(json_input):
    """
    Translate every text listed in a JSON-like dict.

    Args:
        json_input: Mapping with two keys:
            'text' - an iterable of texts to translate (a single string is
            accepted and treated as a one-element list);
            'dest_language' - the target language, inserted verbatim into
            each prompt (the example in this notebook uses 'vi').

    Returns:
        A list with one translated string per input text, in input order.
        Entries that are None, empty or whitespace-only yield "" without
        an API request.

    Raises:
        KeyError: if 'text' or 'dest_language' is missing from json_input.
    """
    texts = json_input['text']
    dest_language = json_input['dest_language']

    # A bare string would otherwise be iterated character by character,
    # issuing one translation request per character.
    if isinstance(texts, str):
        texts = [texts]

    translated_texts = []
    for text in texts:
        # Blank entries keep their slot in the result but cost no request.
        if text is None or not str(text).strip():
            translated_texts.append("")
            continue

        # One prompt per text, sent to the module-level generative model
        translate_prompt = f"Translate the following text to {dest_language}:\n{text}"
        response = model.generate_content(translate_prompt)
        translated_texts.append(response.text.strip())

    return translated_texts
if __name__ == "__main__":
  # Translate a list of texts (the last entry is already Vietnamese)
    json_3 = {
        'text': ["Hello", "I am John", "Tôi là sinh viên"],
        'dest_language': 'vi'
    }
    result_3 = translate_multiple_texts(json_3)
    print(f"Translated multiple texts: {result_3}")

Translated multiple texts: ['Xin chào', 'Tôi là John', 'Tôi là sinh viên']


# 2: Chatbot Development

## 2.1 Data Access and Indexing

In [None]:
import requests
from bs4 import BeautifulSoup

# URL of the page to scrape
url = "https://www.presight.io/privacy-policy.html"

# Fetch the page's HTML. The timeout keeps the cell from hanging
# indefinitely if the server never responds.
response = requests.get(url, timeout=30)
response.encoding = 'utf-8'
if response.status_code != 200:
    # Raise instead of calling exit(): inside a notebook exit() ends the
    # interpreter session rather than reporting the failure in this cell.
    raise RuntimeError(f"Failed to fetch the webpage. Status code: {response.status_code}")
print("Successfully fetched the webpage.")

# Parse the HTML with BeautifulSoup
soup = BeautifulSoup(response.text, "html.parser")

Successfully fetched the webpage.


In [None]:
# Accumulator for the scraped policy sections; the cells below append {"title", "body"} dicts to it.
sections=[]

In [None]:
# Every heading element (h1-h6) found in the parsed page.
headers=soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6'])

In [None]:
# Opening section: the first heading supplies the title; the body is the second
# heading's text followed by the text of the first <p> found after that heading.
title=headers[0].get_text(strip=True)
body=headers[1].get_text(strip=True)+' '+headers[1].find_next('p').get_text(strip=True)

In [None]:
# Store the opening section. Re-running this cell appends the same entry again.
sections.append({"title": title, "body": body})

In [None]:
# Build one section per remaining heading: the title is the heading text and the
# body is gathered from the siblings that follow it, up to the next heading.
for header in headers[2:]:
    # NOTE(review): the heading at index 6 is deliberately skipped; the reason is
    # not visible here -- confirm against the page before changing it.
    if header==headers[6]:
        continue

    title = header.get_text(strip=True)
    body = []
    sibling = header.find_next_sibling()

    # Walk forward until the next heading or until there are no more siblings.
    while sibling and sibling.name not in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6'):
        # <style> elements are ignored.
        if sibling.name != 'style':
            nested = sibling.find_all(['i', 'p', 'li'])
            if nested:
                # Join the non-empty texts of the nested <i>/<p>/<li> elements, one per line.
                content = [
                    text_content
                    for text_content in (sub_element.get_text(strip=True) for sub_element in nested)
                    if text_content
                ]
                body.append("\n".join(content))
            else:
                # No nested text elements: take the sibling's own text.
                body.append(sibling.get_text(strip=True))

        sibling = sibling.find_next_sibling()

    sections.append({"title": title, "body": ' '.join(body).strip()})


In [None]:
# Collect the text of each location paragraph inside the locations container.
# NOTE(review): the 'css-...' class suffixes look auto-generated and may change
# when the site is rebuilt -- verify the selectors if this starts failing.
div = soup.find('div', class_='chakra-stack css-1rr8l2z')
if div is None:
    # Fail with an explicit message instead of an AttributeError on None.
    raise ValueError("Location container 'chakra-stack css-1rr8l2z' was not found in the page.")
cities = [p.get_text() for p in div.find_all('p', class_='chakra-text css-1y9pbx2')]

In [None]:
# Add the scraped locations as one extra section. Re-running this cell appends it again.
sections.append({"title": "Location of Presight", "body": ' '.join(cities).strip()})

In [None]:
sections

[{'title': 'PRIVACY POLICY',
  'body': 'Last updated 15 Sep 2023 At Presight, we are committed to protecting the privacy of our customers and visitors to our website. This Privacy Policy explains how we collect, use, and disclose information about our customers and visitors.'},
 {'title': 'Information Collection and Use',
  'body': 'We collect several different types of information for various purposes to provide and improve our Service to you.'},
 {'title': 'Types of Data Collected',
  'body': 'Personal Data\nWhile using our Service, we may ask you to provide us with certain personally identifiable information that can be used to contact or identify you ("Personal Data"). Personally identifiable information may include, but is not limited to:\nEmail address\nFirst name and last name\nPhone number\nAddress, State, Province, ZIP/Postal code, City\nCookies and Usage Data\nUsage Data\nWe may also collect information that your browser sends whenever you visit our Service or when you access

### Embedding data

In [None]:
# Container for one embedding vector per section (filled in by the embedding loop below).
sections_embeddings = []
# Name of the embedding model passed to genai.embed_content.
model_embed = "models/text-embedding-004"

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Embed every section (body followed by title) as a retrieval document.
# The vectors go into a fresh local list so that this cell can be re-run:
# appending to `sections_embeddings` directly fails on a second run because
# the last line rebinds that name to a numpy array, which has no .append().
embedding_rows = []
for section in sections:
    content=section["body"]+' '+section["title"]
    result = genai.embed_content(model=model_embed, content=content,task_type="retrieval_document")
    embedding_rows.append(result['embedding'])

# Convert to a numpy array for the similarity computations
sections_embeddings = np.array(embedding_rows)

### Find the paragraph closest to the user's query

In [None]:
def get_best_match(user_query, sections, sections_embeddings, model):
    """
    Return the section whose embedding is most similar to the query.

    Args:
        user_query: Text of the user's question.
        sections: Sequence of section records, index-aligned with the rows
            of `sections_embeddings`.
        sections_embeddings: 2-D array with one embedding row per section.
        model: Name of the embedding model handed to genai.embed_content
            (the callers in this notebook pass `model_embed`).

    Returns:
        A `(section, similarity)` tuple: the best-matching record and its
        cosine similarity to the query.

    Raises:
        ValueError: if `sections` is empty.
    """
    if len(sections) == 0:
        raise ValueError("sections must contain at least one entry")

    # Use the model the caller asked for (previously the argument was ignored
    # in favour of a global), and embed the text as a retrieval *query* so it
    # pairs with sections embedded as "retrieval_document".
    query_embedding_result = genai.embed_content(
        model=model,
        content=user_query,
        task_type="retrieval_query",
    )
    query_embedding = np.array(query_embedding_result['embedding']).reshape(1, -1)

    # cosine_similarity returns shape (1, n_sections); keep the single row.
    similarities = cosine_similarity(query_embedding, sections_embeddings)[0]
    best_index = int(np.argmax(similarities))
    return sections[best_index], similarities[best_index]

In [None]:
# Quick check of the retrieval step: print the best (section, similarity) pair for a sample question.
user_query ="Where is the company located?"
best_match = get_best_match(user_query,sections,sections_embeddings,model_embed)
print(f"Best Passage: {best_match}")

Best Passage: ({'title': 'Location of Presight', 'body': 'Ho Chi Minh City, Vietnam Singapore Seattle, WA, USA'}, 0.5622150117056195)


In [None]:
# Re-create the generative model handle and re-apply the API key
# (both were already set up in earlier cells of this notebook).
model=genai.GenerativeModel('gemini-pro')
genai.configure(api_key=API_KEY)

## 2.2 Chatbot Development

In [None]:
def generate_answer(question, context,model):
    """
    Ask the generative model to answer `question` from a reference passage.

    Args:
        question: The user's question, inserted verbatim into the prompt.
        context: Reference text, inserted verbatim into the prompt.
        model: Object exposing `generate_content(prompt, generation_config=...)`.

    Returns:
        The generated answer text. If generation fails for any reason, the
        error is printed and a fixed apology string is returned instead.

    Side effects:
        Prints the answer (or the error message) to stdout.
    """
    prompt = f"You are a helpful and informative bot that answers questions using text from the reference passage included below. \
  Be sure to respond in a complete sentence, being comprehensive, including all relevant background information. \
  However, you are talking to a non-technical audience, so be sure to break down complicated concepts and \
  strike a friendly and converstional tone. \
  If the passage is irrelevant to the answer, you may ignore it.\
  Based on the following paragraph {context}, please provide a safe and factual answer to the following question: {question}"
    try:
        response=model.generate_content(
            prompt,
            generation_config=genai.types.GenerationConfig(
                max_output_tokens=100,
                temperature=1,
            ),
        )
        # Read .text inside the try block so a failure while accessing it
        # is handled the same way as a failed request.
        answer = response.text
    except Exception as e:
        print(f"Error generating answer: {e}")
        return "Sorry, I couldn't generate an answer at this moment."

    # Keep printing for the notebook output, but also return the text:
    # previously the success path returned None, so callers got no answer.
    print(answer)
    return answer


In [None]:
def clean_input(text):
    """
    Mask prohibited words in `text` (a simulated content filter).

    Every occurrence of a prohibited word, in any letter case and also when
    embedded in a longer word, is replaced with "[filtered]".

    Args:
        text: The string to sanitise.

    Returns:
        The sanitised string.
    """
    import re  # local import so this cell does not rely on the notebook's import cell

    # Example block-list of sensitive or dangerous words
    prohibited_words = ["dangerous", "expired", "illegal", "violent"]

    # Case-insensitive match: plain str.replace() let "Illegal" or "DANGEROUS"
    # through. Substring matching is kept, as in the original.
    pattern = re.compile("|".join(re.escape(word) for word in prohibited_words), re.IGNORECASE)
    return pattern.sub("[filtered]", text)

In [None]:
def chatbot(user_query):
    """
    Answer a question from the single best-matching policy section.

    The query is passed through clean_input, the closest section is looked up
    with get_best_match, and its "title: body" text (also passed through
    clean_input) is handed to generate_answer as the reference passage.

    Args:
        user_query: The user's question.

    Returns:
        Whatever generate_answer returns for the cleaned query and context.
    """
    sanitized_query = clean_input(user_query)
    section, _score = get_best_match(sanitized_query, sections, sections_embeddings, model_embed)
    context = clean_input(f"{section['title']}: {section['body']}")
    return generate_answer(sanitized_query, context, model)

In [None]:
import time

In [None]:
# Time one chatbot call. perf_counter() is a monotonic clock meant for
# measuring elapsed time; time.time() can jump if the system clock changes.
start = time.perf_counter()
user_query = "How to contact?"
answer = chatbot(user_query)
end = time.perf_counter()
print(f"Running time: {end-start:.4f} seconds")

You can get in touch with Presight about their Privacy Policy through their customer portal or by sending an email to presight@presight.io.
Running time: 3.2946 seconds


In [None]:
# Time one chatbot call. perf_counter() is a monotonic clock meant for
# measuring elapsed time; time.time() can jump if the system clock changes.
start = time.perf_counter()
user_query = "Where is the location of Presight?"
answer = chatbot(user_query)
end = time.perf_counter()
print(f"Running time: {end-start:.4f} seconds")

Presight is located in Ho Chi Minh City, Vietnam.
Running time: 2.7273 seconds


In [None]:
# Time one chatbot call. perf_counter() is a monotonic clock meant for
# measuring elapsed time; time.time() can jump if the system clock changes.
start = time.perf_counter()
user_query = "When was the privacy policy last updated?"
answer = chatbot(user_query)
end = time.perf_counter()
print(f"Running time: {end-start:.4f} seconds")

Unfortunately the reference passage you provided does not specify when the privacy policy was last updated.
Running time: 2.5569 seconds


#### Nhận xét:

 Việc chỉ chọn 1 đoạn văn gần với câu hỏi của người dùng nhất khiến xác suất mà kết quả hàm get_best_match sẽ chọn phải đoạn văn không chứa thông tin cần thiết cao nên ta lựa chọn phương án chọn 3 đoạn văn gần nhất để chatbot có thể có đầy đủ thông tin để trả lời chính xác hơn.

### Find the three paragraphs closest to the user's query

In [None]:
def get_top_matches(user_query, sections, sections_embeddings, model_embed, top_n=3):
    """
    Return the `top_n` sections most similar to the query, best first.

    Args:
        user_query: Text of the user's question.
        sections: Sequence of section records, index-aligned with the rows
            of `sections_embeddings`.
        sections_embeddings: 2-D array with one embedding row per section.
        model_embed: Name of the embedding model handed to genai.embed_content.
        top_n: Maximum number of matches to return (default 3).

    Returns:
        A list of `{"section": record, "similarity": score}` dicts ordered by
        decreasing cosine similarity. The list is empty when `sections` is
        empty or `top_n` is not positive; no embedding request is made then.
    """
    # Nothing to rank. A negative top_n would also slice wrongly below
    # (e.g. [:-1] drops only the last item).
    if len(sections) == 0 or top_n <= 0:
        return []

    # Embed the query as a retrieval *query* so it pairs with sections
    # embedded as "retrieval_document".
    query_embedding_result = genai.embed_content(
        model=model_embed,
        content=user_query,
        task_type="retrieval_query",
    )
    query_embedding = np.array(query_embedding_result['embedding']).reshape(1, -1)

    # Cosine similarity against every section; [0] flattens to a 1-D array
    similarities = cosine_similarity(query_embedding, sections_embeddings)[0]

    # Indices sorted by decreasing similarity, truncated to the top_n best
    top_indices = np.argsort(similarities)[::-1][:top_n]

    return [{"section": sections[i], "similarity": similarities[i]} for i in top_indices]


In [None]:
# Quick check of the retrieval step: print the three best matches for a sample question.
user_query ="When was the privacy policy last updated?"
best_match = get_top_matches(user_query,sections,sections_embeddings,model_embed,top_n=3)
print(f"Best Passage: {best_match}")

Best Passage: [{'section': {'title': 'Changes to Privacy Policy', 'body': 'We may update this Privacy Policy from time to time. The updated Privacy Policy will be posted on our website.'}, 'similarity': 0.602447592354112}, {'section': {'title': 'PRIVACY POLICY', 'body': 'Last updated 15 Sep 2023 At Presight, we are committed to protecting the privacy of our customers and visitors to our website. This Privacy Policy explains how we collect, use, and disclose information about our customers and visitors.'}, 'similarity': 0.5661056300205725}, {'section': {'title': 'Contact Us', 'body': 'If you have any questions about this Privacy Policy, please contact us through the customer portal or by email atpresight@presight.io.'}, 'similarity': 0.5479795390576668}]


In [None]:
# Re-create the generative model handle and re-apply the API key
# (both were already set up in earlier cells of this notebook).
model=genai.GenerativeModel('gemini-pro')
genai.configure(api_key=API_KEY)

### Chatbot

In [None]:
def chatbot3(user_query, top_n=3):
    """
    Answer a question from the `top_n` best-matching policy sections.

    The query is passed through clean_input, the closest sections are looked
    up with get_top_matches, and their "title: body" lines (joined with
    newlines and also passed through clean_input) are handed to
    generate_answer as the reference passage.

    Args:
        user_query: The user's question.
        top_n: Number of sections to include as context (default 3, the
            value that used to be hard-coded).

    Returns:
        Whatever generate_answer returns for the cleaned query and context.
    """
    sanitized_query = clean_input(user_query)
    matches = get_top_matches(sanitized_query, sections, sections_embeddings, model_embed, top_n=top_n)
    context = "\n".join(
        f"{match['section']['title']}: {match['section']['body']}" for match in matches
    )
    context = clean_input(context)
    return generate_answer(sanitized_query, context, model)

In [None]:
# Time one chatbot3 call. perf_counter() is a monotonic clock meant for
# measuring elapsed time; time.time() can jump if the system clock changes.
start = time.perf_counter()
user_query = "When was the privacy policy last updated?"
answer = chatbot3(user_query)
end = time.perf_counter()
print(f"Running time: {end-start:.4f} seconds")

According to the passage, the privacy policy was last updated on September 15th, 2023.
Running time: 2.7694 seconds


In [None]:
# Time one chatbot3 call. perf_counter() is a monotonic clock meant for
# measuring elapsed time; time.time() can jump if the system clock changes.
start = time.perf_counter()
user_query = "Where is the location of Presight?"
answer = chatbot3(user_query)
end = time.perf_counter()
print(f"Running time: {end-start:.4f} seconds")

Presight has offices in Ho Chi Minh City, Vietnam and Seattle, Washington in the United States.
Running time: 2.7090 seconds


In [None]:
# Time one chatbot3 call. perf_counter() is a monotonic clock meant for
# measuring elapsed time; time.time() can jump if the system clock changes.
start = time.perf_counter()
user_query = "What happens when a user authenticates with Google Workspace?"
answer = chatbot3(user_query)
end = time.perf_counter()
print(f"Running time: {end-start:.4f} seconds")

When users authenticate with Google Workspace, Google does not retain or use their data to develop, improve, or train AI or ML models. Additionally, Google does not transfer user data to third parties for the purpose of developing AI or ML models.
Running time: 3.2106 seconds


# Báo cáo quy trình thực hiện:

## 1. Cào dữ liệu bằng BeautifulSoup

## 2. Chia dữ liệu thành các đoạn gồm tiêu đề và nội dung

## 3. Embedding dữ liệu và lưu trong mảng

## 4. Hàm get_best_match trả về đoạn văn bản phù hợp với câu hỏi nhất dựa vào giá trị đã embedding

## 5. Khởi tạo hàm chatbot dựa vào hàm get_best_match để tìm câu trả lời

## 6. Tương tự với hàm get_top_matches và hàm chatbot3 dựa vào 3 đoạn văn bản để tìm câu trả lời