In [5]:
! pip install langchain
! pip install chromadb
! pip install sentence_transformers
! pip install pypdf
!pip install langchain-huggingface
!pip install langchain-groq
!pip install -U langchain-community

Collecting langchain-community
  Downloading langchain_community-0.3.2-py3-none-any.whl.metadata (2.8 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain-community)
  Downloading pydantic_settings-2.5.2-py3-none-any.whl.metadata (3.5 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading marshmallow-3.22.0-py3-none-any.whl.metadata (7.2 kB)
Collecting typing-inspect<1,>=0.4.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading typing_inspect-0.9.0-py3-none-any.whl.metadata (1.5 kB)
Collecting mypy-extensions>=0.3.0 (from typing-inspect<1,>=0.4.0->dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading mypy_extensions-1.0.0-py3-none-any.whl.metadata (1.1 kB)
Downloading langchain_community-0.3.2-py3-none-any.whl (2.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━

In [8]:
import os
import requests
from lxml import html
import pandas as pd
from langchain_groq import ChatGroq
from langchain import PromptTemplate, LLMChain
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from time import sleep
import markdown

In [9]:
base_url = 'https://muqawil.org/en/contractors?page={}'

contractors_data = []
total_pages = 5  # Change this number to fetch more pages

for page in range(1, total_pages + 1):
    response = requests.get(base_url.format(page))
    print(f"Fetching page {page}: {response.status_code}")

    tree = html.fromstring(response.content)
    contractors = tree.xpath('//*[@id="all_contractor"]/div')

    for contractor in contractors:
        company_name = contractor.xpath('.//h4/a/text()')
        company_name = company_name[0].strip() if company_name else 'N/A'

        membership_value = contractor.xpath('.//div[@class="info-name" and text()="Membership Number"]/following-sibling::div[@class="info-value"]/text()')
        membership_value = membership_value[0].strip() if membership_value else 'N/A'

        city_name = contractor.xpath('.//div[@class="info-name" and text()="City - Region"]/following-sibling::div[@class="info-value"]/text()')
        city_name = city_name[0].strip() if city_name else 'N/A'

        email = contractor.xpath('/html/body/main/div/div/div/div[1]/div/div[2]/div[2]/div/div[7]/div/div[2]/div[2]/a/text()')
        email = email[0].strip() if email else 'N/A'

        contractors_data.append({
            'company_name': company_name,
            'membership_value': membership_value,
            'city_name': city_name,
            'email': email
        })

    sleep(2)

df = pd.DataFrame(contractors_data)
df.to_csv('Test.csv', index=False, encoding='utf-8-sig')
df.replace('N/A', pd.NA, inplace=True)


Fetching page 1: 200
Fetching page 2: 200
Fetching page 3: 200
Fetching page 4: 200
Fetching page 5: 200


In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 105 entries, 0 to 104
Data columns (total 4 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   company_name      100 non-null    object
 1   membership_value  100 non-null    object
 2   city_name         99 non-null     object
 3   email             0 non-null      object
dtypes: object(4)
memory usage: 3.4+ KB


In [11]:
df.isna().sum()

Unnamed: 0,0
company_name,5
membership_value,5
city_name,6
email,105


In [12]:
df.drop(columns=['email'], inplace=True)

In [13]:
df['company_name'].fillna('Unknown', inplace=True)
df['membership_value'].fillna(0, inplace=True)
df['city_name'].fillna('Unknown', inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['company_name'].fillna('Unknown', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['membership_value'].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting va

In [14]:
df.to_csv('Our_Data.csv', index=False, encoding='utf-8-sig')

# i try to use Selenium and i get good results also ✅

In [15]:
directory = 'data/markdown_files'
os.makedirs(directory, exist_ok=True)

for i in range(len(df)):
    title = df['company_name'].iloc[i]
    content = f"Membership Value: {df['membership_value'].iloc[i]}\nCity Name: {df['city_name'].iloc[i]}"
    markdown_content = f"# {title}\n\n{content}\n\n"

    with open(f'{directory}/{i}.md', 'w', encoding='utf-8') as file:
        file.write(markdown_content)

html_texts = []
for filename in os.listdir(directory):
    if filename.endswith(".md"):
        with open(os.path.join(directory, filename), 'r', encoding='utf-8') as file:
            markdown_content = file.read()
            html_content = markdown.markdown(markdown_content)
            html_texts.append(html_content)

text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
documents = text_splitter.create_documents(html_texts)

embedding_function = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
document_texts = [doc.page_content for doc in documents]
embeddings = embedding_function.embed_documents(document_texts)

db = Chroma.from_documents(documents, embedding_function, persist_directory="./chroma_db")

def query_chroma_db(query, db, top_k=5):
    """
    Query the Chroma database for documents similar to the given query.

    Parameters:
        query (str): The search query.
        db (Chroma): The Chroma database instance to search in.
        top_k (int): The number of top similar documents to retrieve (default is 5).

    Returns:
        list: A list of the content of the top similar documents.
    """
    docs = db.similarity_search(query, k=top_k)
    results = [doc.page_content for doc in docs]
    return results

print("the database ready for queries!!")

  embedding_function = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
  from tqdm.autonotebook import tqdm, trange
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]



1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

the database ready for queries!!


In [16]:
groq_api_key = "****************************" # here ur Key
llm = ChatGroq(temperature=0, groq_api_key=groq_api_key, model_name="llama3-8b-8192")

In [17]:
# Define the prompt template string
PROMPT_TEMPLATE = """
Provide a brief and informative answer based on the following context:
Context: {context}
Question: {question}
Your answer should be concise, relevant, and aligned with the contracting industry.
"""


In [18]:
prompt_template = PromptTemplate(
    input_variables=["context", "question"],
    template=PROMPT_TEMPLATE,
)


In [19]:
MODEL = LLMChain(llm=llm, prompt=prompt_template, verbose=True)

  MODEL = LLMChain(llm=llm, prompt=prompt_template, verbose=True)


In [20]:
def query_rag(query: str):
    """
  Query the Chroma database for documents similar to the given query.

    Parameters:
        query (str): The search query.
        db (Chroma): The Chroma database instance to search in.
        top_k (int): The number of top similar documents to retrieve (default is 5).

    Returns:
        list: A list of the content of the top similar documents.
    """
    similarity_search_results = db.similarity_search_with_score(query, k=4)
    context_text = "\n\n".join([doc.page_content for doc, _score in similarity_search_results])
    rag_response = MODEL.invoke({"context": context_text, "question": query})
    return rag_response

In [21]:
query_string = "How many times was Riyadh repeated?"
response = query_rag(query_string)



[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3m
Provide a brief and informative answer based on the following context:
Context: <h1>Sham Najd International Co Ltd</h1>
<p>Membership Value: 200006630
City Name: RIYADH
- Riyadh</p>

<h1>Alrajhi Building and Construction Co</h1>
<p>Membership Value: 100005210
City Name: RIYADH
- Riyadh</p>

<h1>Arab Builders for Telecommunications and Security Services</h1>
<p>Membership Value: 100004019
City Name: RIYADH
- Riyadh</p>

<h1>Safety Arabian Company Ltd.</h1>
<p>Membership Value: 10002387
City Name: RIYADH
- Riyadh</p>
Question: How many times was Riyadh repeated?
Your answer should be concise, relevant, and aligned with the contracting industry.
[0m

[1m> Finished chain.[0m


In [22]:
print(f'Text: \n{response["text"]}')

Text: 
Based on the provided context, the city name "Riyadh" was repeated 4 times.
