In [1]:
from langchain_community.llms import Ollama
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler

In [2]:
llm = Ollama(
    model="llama3",
    num_gpu = 0, # To use CPU only
    callback_manager=CallbackManager([StreamingStdOutCallbackHandler()]),
    # format="json",
    temperature=1,
    base_url="http://192.168.3.32:11000"
)

In [3]:
llm.invoke("hi, who are you ?")

Nice to meet you! I'm LLaMA, an AI assistant developed by Meta AI that can understand and respond to human input in a conversational manner. My primary function is to provide helpful and accurate responses to your questions and engage in productive conversations.

I've been trained on a massive dataset of text from the internet and can generate human-like responses to a wide range of topics, including but not limited to:

* Chatting about current events or news
* Sharing interesting facts or trivia
* Offering language translation services
* Playing games like 20 Questions or Would You Rather
* Assisting with writing or proofreading tasks

I'm constantly learning and improving my responses based on user interactions, so please feel free to share your thoughts, ask questions, or just chat with me!

"Nice to meet you! I'm LLaMA, an AI assistant developed by Meta AI that can understand and respond to human input in a conversational manner. My primary function is to provide helpful and accurate responses to your questions and engage in productive conversations.\n\nI've been trained on a massive dataset of text from the internet and can generate human-like responses to a wide range of topics, including but not limited to:\n\n* Chatting about current events or news\n* Sharing interesting facts or trivia\n* Offering language translation services\n* Playing games like 20 Questions or Would You Rather\n* Assisting with writing or proofreading tasks\n\nI'm constantly learning and improving my responses based on user interactions, so please feel free to share your thoughts, ask questions, or just chat with me!"

In [4]:
from langchain_community.chat_models import ChatOllama
from langchain_core.messages import HumanMessage, SystemMessage
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate

from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
import time
import json

In [5]:
llm_1 = ChatOllama(model = "llama3",
                 format = "json",
                 temperature = 1,
                 num_gpu = 0, # To use CPU only
                 # callback_manager = CallbackManager([StreamingStdOutCallbackHandler()]),
                 max_new_tokens = 255,
                 base_url="http://192.168.3.32:11000"
                )

In [6]:
messages = [
        (
        "system","""You are an agent tasked with extracting contact information (emails, phone and fax numbers, addresses) from an HTML code. For example:
                    HTML Code Example:
                        <html>
                        <head>
                            <title>Web Site Title</title>
                        </head>
                        <body>
                            <h1>Contact Details</h1>
                            <p>Emails: <span> abdelghaffour@gmail.com </span> <span> mouhsine.abdo@usmba.ac.ma </span></p>
                            <p>Phones: 06 82 10 33 81 / 07 21 23 54 12</p>
                            <p>Address: 123 Main St, City, Country</p>
                        </body>
                        </html>
                    Your task is to extract the contact information and format it into the following JSON object:
                        [
                             "emails": ["abdelghaffour@gmail.com", "mouhsine.abdo@usmba.ac.ma"],
                             "phones": ["06 82 10 33 81", "07 21 23 54 12"],
                             "addresses": ["123 Main St, City, Country"]
                        ]
                    Make sure to identify and extract the relevant information accurately.
                        """
    ),
    (
        "human","""Extract the contact information and format it into a JSON object:"""
    ),
    (
        "human","{htmlCode}"
    ),
]

html_source_code = """
<html>
    <head>
        <title>Contact Information</title>
    </head>
    <body>
        Adresse :20-26, Rue Bassatine – Immeuble Myr – Etage 5 – Bd de la Résistance – Casablanca
        :الهاتف
        00 212 539 93 63 63

        212.539.94.90.37
        الهاتف:
        +212 682103381
        212661988374 – 0021266339713
        
        البريد الإلكتروني
        info@medi1.com

        : للحصول على وظيفة أو تدريب
        rh@medi1.com


        : لإلتحاق بهيئة التحرير
        redaction@medi1.com


        : لطلب الإعلان أو الشراكة
        pub@medi1.com


        :لإعطاء ملاحظات حول القناة أو برامجها
        info@medi1.com


        الإميل: infos.amtm@gmail.com

    </body>
</html>
"""
template = ChatPromptTemplate.from_messages(messages)

In [7]:
chain_1 = template | llm_1 | StrOutputParser()

In [8]:
start = time.perf_counter()
response_1 = chain_1.invoke({ "htmlCode": html_source_code })
end = time.perf_counter()

In [9]:
response_1

'{"emails": ["info@medi1.com", "rh@medi1.com", "redaction@medi1.com", "pub@medi1.com", "info@medi1.com", "infos.amt@gmail.com"],\n"phones": ["00 212 539 93 63 63", "212.539.94.90.37", "+212 682103381", "212661988374", "0021266339713"],\n"addresses": ["20-26, Rue Bassatine – Immeuble Myr – Etage 5 – Bd de la Résistance – Casablanca"]}'

In [10]:
print(f"Execution time : {end - start :.8} seconds | {(end - start)/60 :.8} min")

Execution time : 91.815159 seconds | 1.5302527 min


In [11]:
# Parse the JSON string
parsed_json = json.loads(response_1)

# Now we can access the values in the JSON
print(parsed_json["emails"]) 
print(parsed_json["phones"]) 
print(parsed_json["addresses"])

['info@medi1.com', 'rh@medi1.com', 'redaction@medi1.com', 'pub@medi1.com', 'info@medi1.com', 'infos.amt@gmail.com']
['00 212 539 93 63 63', '212.539.94.90.37', '+212 682103381', '212661988374', '0021266339713']
['20-26, Rue Bassatine – Immeuble Myr – Etage 5 – Bd de la Résistance – Casablanca']


In [12]:
llm_2 = ChatOllama(model = "llama3:70b",
                 format = "json",
                 temperature = 1,
                 num_gpu = 0, # To use CPU only
                 # callback_manager = CallbackManager([StreamingStdOutCallbackHandler()]),
                 max_new_tokens = 255,
                 base_url="http://192.168.3.32:11000"
                )

In [13]:
chain_2 = template | llm_2 | StrOutputParser()

In [14]:
start = time.perf_counter()
response_2 = chain_2.invoke({ "htmlCode": html_source_code })
end = time.perf_counter()

In [15]:
response_2

'{\n    "addresses": ["20-26, Rue Bassatine – Immeuble Myr – Etage 5 – Bd de la Résistance – Casablanca"],\n    "phones": ["00 212 539 93 63 63", "212.539.94.90.37", "+212 682103381", "212661988374", "0021266339713"],\n    "emails": ["info@medi1.com", "rh@medi1.com", "redaction@medi1.com", "pub@medi1.com", "infos.amtm@gmail.com"]\n}'

In [16]:
print(f"Execution time : {end - start :.8} seconds | {(end - start)/60 :.8} min")

Execution time : 318.30638 seconds | 5.3051063 min


In [17]:
# Parse the JSON string
parsed_json = json.loads(response_2)

# Now we can access the values in the JSON
print(parsed_json["emails"]) 
print(parsed_json["phones"]) 
print(parsed_json["addresses"])

['info@medi1.com', 'rh@medi1.com', 'redaction@medi1.com', 'pub@medi1.com', 'infos.amtm@gmail.com']
['00 212 539 93 63 63', '212.539.94.90.37', '+212 682103381', '212661988374', '0021266339713']
['20-26, Rue Bassatine – Immeuble Myr – Etage 5 – Bd de la Résistance – Casablanca']


# Class ContactLLM

In [55]:
from langchain_community.chat_models import ChatOllama
from langchain_core.messages import HumanMessage, SystemMessage
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate

from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
import time
import json

class ContactLLM:

    def __init__(self, llm_name = "llama3"):
        self.llm_name = llm_name
        self.llm = ChatOllama(model = self.llm_name,
                     format = "json",
                     temperature = 1,
                     num_gpu = 0, # To use CPU only
                     # callback_manager = CallbackManager([StreamingStdOutCallbackHandler()]),
                     max_new_tokens = 255,
                     base_url="http://192.168.3.32:11000"
                )
        self.messages = [
                (
                    "system","""You are an agent tasked with extracting contact information (emails, phone and fax and whatsapp numbers) from a text of contact page. For example:
                                Example (text of contact page) :
                                    * Lanarkshire G32 8FG Telephone Number |&| 06 82 10 33 81 |&| News desk |&| 0141 302 7002 |&| abdelghaffour@gmail.com |&| fax : +971-3-3447111 |&| business with. or Newspaper Sales |&| mouhsine.abdo@usmba.ac.ma |&| Advertising - Kirsty McKinney - +212 682103381 | Craig Anderson - 0141 302 7133 | abdo@gmail.com |&| Digital Subscriptions - 0800 731 4900 (Mon-Fri, 9-5.30pm)*
                                
                                Your task is to extract the contact information and format it into the following JSON object:
                                    [
                                         "emails": ["abdelghaffour@gmail.com", "mouhsine.abdo@usmba.ac.ma", "abdo@gmail.com"],
                                         "phones": ["06 82 10 33 81", "0141 302 7002", "+971-3-3447111", "+212 682103381", "0141 302 7133", "0800 731 4900"]
                                    ]
                                => Make sure the value of the key "phones" contain a phone, fax and whatsapp numbers.
                                => Make sure to identify and extract the relevant information accurately.
                                    """
                ),
                (
                    "human","""Extract the contact information (emails, phones) and format it into a JSON object from the following text (Make sure  don't create an other key, juste 'emails' and 'phones' and the value of the key "phones" contain all phone, fax and whatsapp numbers in the text contact page) :"""
                ),
                (
                    "human","{htmlCode}"
                )
            ]

        self.template = ChatPromptTemplate.from_messages(self.messages)
        self.chain = self.template | self.llm | StrOutputParser()
        
    def predict(self, html_source_code, execution_time=False):
        start = time.perf_counter()
        response = self.chain.invoke({ "htmlCode": html_source_code })
        end = time.perf_counter()
        parsed_json = json.loads(response)
        if execution_time :
            parsed_json["execution_time"] = f"{end - start} s | {(end - start)/60} min"
        return parsed_json

In [1]:
from contactLLM.contactLLM import ContactLLM
from contactLLM.pageProcessing import PageProcessing
from contactLLM.sentenceProcessing import SentenceProcessing

In [2]:
pageProcessing = PageProcessing()
sentenceProcessing = SentenceProcessing()
contactLLM = ContactLLM(_base_url="http://192.168.3.32:11000")

In [3]:
chunk = """23,2024 CONNECT WITH US Contact Us Head Office EMAIL US info@nation.com.pk Shahrah e Fatima Jinnah, 
            Lahore - Pakistan CALL US T: +92(42)36367580 F: +92(42)36367005 Branch Offices Karachi Office info@nation.com.pk 
            Shamsheer, Phase V, D. H. S., Karachi - Pakistan T: +92(21)58437203 F: +92(21)5854932 Multan Office 
            info@nation.com.pk Nawaiwaqt House Abdali Road, Multan - Pakistan T: +92(61)5455714 F: +92(61)580958 
            Islamabad Office info@nation.com.pk Nawaiwaqt House Zero Point, Islamabad - Pakistan T: +92(51)22026414 
            F: +92(42)22026456 Contact Us Form Loading... Hajj 2024:31,057 Pakistani pilgrims reach Saudi Arabia 1:34 
            PM | May 23,202412:28 PM | May 23,202412:08 PM | May 23,202412:01 NIPCO House, 4- Shaharah e Fatima Jinnah, 
            Lahore, Pakistan Tel: +924236367580 | Fax : +924236367005'
            """

In [4]:
contactLLM.predict(chunk, execution_time=True)

{'emails': ['info@nation.com.pk'],
 'phones': ['+92(42)36367580',
  '+92(42)36367005',
  '+92(21)58437203',
  '+92(21)5854932',
  '+92(61)5455714',
  '+92(61)580958',
  '+92(51)22026414',
  '+92(42)22026456'],
 'execution_time': '109.35778345912695 s | 1.8226297243187826 min'}

In [3]:
url = "https://www.nation.com.pk/contact-us"
clean_html_text = pageProcessing.get_clean_html_from_url(url)

Start get_source_page_from_url
Execution time of get_source_page_from_url : 3.173569 seconds
number of words in the source_page : 7774
number of characters in the source_page : 110854
----------------------------------------------------------------------------------------------------
Start get_clean_html_from_source_page
-----------> Vider le contenu de la balise <head>: -1782 deleted words
-----------> supprimer les commentaires : 0 deleted words
-----------> Supprimer les balises avec leur contenu : script, noscript, style, ul, form, table, footer :  -1392 deleted words
-----------> Sélection et filtrer de toutes les balises <a>: -1806 deleted words
Execution time of get_clean_html_from_source_page : 0.352242 seconds
number of words in the clean_html : 263
number of characters in the clean_html : 2808
----------------------------------------------------------------------------------------------------


In [4]:
chunks = sentenceProcessing.get_chunks_from_clean_html_text(clean_html_text)
print(len(chunks))
print(chunks)

1
['23,2024 CONNECT WITH US Contact Us Head Office EMAIL US info@nation.com.pk Shahrah e Fatima Jinnah, Lahore - Pakistan CALL US T: +92(42)36367580 F: +92(42)36367005 Branch Offices Karachi Office info@nation.com.pk Shamsheer, Phase V, D. H. S., Karachi - Pakistan T: +92(21)58437203 F: +92(21)5854932 Multan Office info@nation.com.pk Nawaiwaqt House Abdali Road, Multan - Pakistan T: +92(61)5455714 F: +92(61)580958 Islamabad Office info@nation.com.pk Nawaiwaqt House Zero Point, Islamabad - Pakistan T: +92(51)22026414 F: +92(42)22026456 Contact Us Form Loading... Hajj 2024:31,057 Pakistani pilgrims reach Saudi Arabia 1:34 PM | May 23,202412:28 PM | May 23,202412:08 PM | May 23,202412:01 NIPCO House, 4- Shaharah e Fatima Jinnah, Lahore, Pakistan Tel: +924236367580 | Fax : +924236367005']


In [5]:
contactLLM.predict(chunks[0], execution_time=True)

{'emails': ['info@nation.com.pk'],
 'phones': ['+92(42)36367580',
  '+92(21)58437203',
  '+92(61)5455714',
  '+92(51)22026414',
  '+92(42)36367005',
  '+92(21)5854932',
  '+92(61)580958',
  '+924236367580'],
 'execution_time': '63.807145146653056 s | 1.0634524191108843 min'}

## PageProcessing (clean html text)

In [57]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

from bs4 import BeautifulSoup, Comment

import time
import csv

import pandas as pd

class PageProcessing:
    def __init__(self):
        # IP address and port and server of the Selenium hub and browser options
        self.HUB_HOST = "192.168.3.32"
        self.HUB_PORT = 4444
        self.server = f"http://{self.HUB_HOST}:{self.HUB_PORT}/wd/hub"
        self.options = webdriver.ChromeOptions()
        self.driver = None
        
    def count_numbers(self, word):
        i=0
        word = word.split(' ')
        word = ''.join(word)
        for char in word:
            if char.isdigit():
                i += 1
        return i
        
    def get_source_page_from_url(self, url):
        try:
            self.driver = webdriver.Remote(command_executor=self.server, options=self.options)

            self.driver.get(url)

            # Attendre que la page soit complètement chargée
            wait = WebDriverWait(self.driver, 5)
            wait.until(EC.presence_of_element_located((By.TAG_NAME, "body")))

            source_page = self.driver.page_source
            
            self.driver.quit()
            
            return source_page
        except Exception as e:
            print(f"Errooooooor : Une exception s'est produite : {e}")
            
    def get_clean_html_from_source_page(self, source_page):
        soup = BeautifulSoup(source_page, 'html.parser')

        # Vider le contenu de la balise <head>
        before = len(str(soup).split())
        head_tag = soup.find('head')
        head_tag.clear()
        after = len(str(soup).split())
        print(f"-----------> Vider le contenu de la balise <head>: {after - before} deleted words")

        # supprimer les commentaires
        before = len(str(soup).split())
        comments = soup.find_all(string=lambda text: isinstance(text, Comment))
        for comment in comments:
            comment.extract()
        after = len(str(soup).split())
        print(f"-----------> supprimer les commentaires : {after - before} deleted words")
        
        # Supprimer les balises script... avec leur contenu
        before = len(str(soup).split())
        for script in soup(["script", "noscript", "style", "img","input", "textarea"]):
            script.extract()
        after = len(str(soup).split())
        print(f"-----------> Supprimer les balises avec leur contenu : script, noscript, style, ul, form, table, footer :  {after - before} deleted words")

        # Supprimer les balises de formatage de text sans suppression du contenu de balise
        # before = len(str(soup).split())
        # for tag in soup(["b", "strong", "i", "em", "u", "s", "sub", "sup", "small", "abbr", "mark", "del", "ins", "span"]):
        #     tag.unwrap()
        # after = len(str(soup).split())
        # print(f"-----------> Supprimer les balises de formatage de text sans suppression du contenu de balise: {after - before} deleted words")

        # # Sélection de toutes les balises <a>
        before = len(str(soup).split())
        for tag in soup.find_all('a'):
            if '@' not in tag.get_text() and '@' not in tag.get('href', '') and self.count_numbers(tag.get_text()) <= 8:
                tag.extract()
        after = len(str(soup).split())
        print(f"-----------> Sélection et filtrer de toutes les balises <a>: {after - before} deleted words")

        # Récupérer le HTML nettoyé
        return soup.text

    def get_clean_html_from_url(self, url):
        print("Start get_source_page_from_url")
        start = time.perf_counter()
        source_page = self.get_source_page_from_url(url)
        end = time.perf_counter()
        print(f"Execution time of get_source_page_from_url : {end - start:.6f} seconds")
        print(f"number of words in the source_page : {len(str(source_page).split())}")
        print(f"number of characters in the source_page : {len(str(source_page))}\n{'-'*100}")

        print("Start get_clean_html_from_source_page")
        start = time.perf_counter()
        clean_html = self.get_clean_html_from_source_page(source_page)
        end = time.perf_counter()
        print(f"Execution time of get_clean_html_from_source_page : {end - start:.6f} seconds")
        print(f"number of words in the clean_html : {len( str(clean_html).split() )}")
        print(f"number of characters in the clean_html : {len(str(clean_html))}\n{'-'*100}")

        return clean_html

## SentenceProcessing (and chunking)

In [58]:
class SentenceProcessing:
    def __init__(self):
        self.MIN_LENGTH_PHONE = 8
        
    def contains_letters(self, word):
        return any(char.isalpha() for char in word)
    
    def contains_numbers(self, word):
        return any(char.isdigit() for char in word)
        
    def count_numbers(self, word):
        i=0
        word = word.split(' ')
        word = ''.join(word)
        for char in word:
            if char.isdigit():
                i += 1
        return i
        
    def is_phone(self, word):
        return self.contains_numbers(word) and self.count_numbers(word) > self.MIN_LENGTH_PHONE
        
    def is_email(self,word):
        return self.contains_letters(word) and '@' in word and '.' in word
        
    def get_text_elements(self, text, max_words_before_phone_number_or_email = 5):
        splited_text = [ n.strip() for n in text.split('\n') if (n.strip() != '')]
        number_of_words_before_phone_or_email = 0
        p_text_liste = []
        for index, elem in enumerate(splited_text):
            if (self.is_phone(elem) or ('@' in elem and '.' in elem)): # if elem contains phone number or email
                if(number_of_words_before_phone_or_email==0):
                    p_text_liste.append(elem)
                else:
                    last_elems_tokens = (" ".join([elem_ for elem_ in splited_text[index-number_of_words_before_phone_or_email:index]])).split()
                    last_min = min(max_words_before_phone_number_or_email, len(last_elems_tokens))
                    p_text_liste.append(" ".join(last_elems_tokens[-last_min:]))
                    p_text_liste.append(elem)
                number_of_words_before_phone_or_email = 0
            else:
                number_of_words_before_phone_or_email += 1
        return p_text_liste

    def word_process(self, word):
        new_word = ""
        if (self.contains_letters(word) or (not self.contains_numbers(word))) and ('@' not in word):
            if word[0].isdigit():
                last_type = "digit"
            elif word[0].isalpha():
                last_type = "alpha"
            else:
                last_type = "other"
                
            for i, char in enumerate(word):
                if (char.isdigit() and last_type == "digit") or (char.isalpha() and last_type == "alpha"): # the same type
                    new_word += char
                elif char.isdigit() and last_type != "digit":
                    new_word += " " + char
                    last_type = "digit"
                elif char.isalpha() and last_type != "alpha":
                    new_word += " " + char
                    last_type = "alpha"
                    
                else : # char is other
                    if last_type == "alpha" and (char not in ["+","-","("]):
                        new_word += char
                        last_type = "other"
                        
                    elif last_type == "digit" :
                        new_word += char
                        last_type = "other"
                        
                    elif last_type == "other" and (len(new_word)>0 and (new_word[-1] not in [":",",","،",".","?","!"]) or (char in [":",",","،",".","?","!"])):
                        new_word += char
                    else:
                        new_word += " " + char
                        last_type = "other"
        else:
            new_word = word
        return new_word.strip()
        
    def sentence_process(self, text_elements) :
        new_text_elements = []
        for sentence in text_elements:
            words = sentence.split(' ')
            new_words = []
            for word in words:
                if word != '':
                    new_word = self.word_process(word)
                    if new_word != '':
                        for wrd in new_word.split(' '):
                            new_words.append(wrd)
                        
            new_sentence = ""
            for index, word in enumerate(new_words):
                if not self.contains_letters(word):  # word = number or [+ ( ) - . , ...]
                    if len(new_sentence) > 0 :
                        if self.contains_letters(new_sentence.split(' ')[-1]) : # or (not self.contains_numbers(new_sentence.split(' ')[-1])) or (not self.contains_numbers(word))
                            new_sentence += " "+word
                        # if two numbers
                        elif self.contains_numbers(new_sentence.split(' ')[-1]) and self.count_numbers(new_sentence.split(' ')[-1]) > self.MIN_LENGTH_PHONE and self.count_numbers(word) > self.MIN_LENGTH_PHONE :
                            new_sentence += " "+word
                        # if number with number or with [+ ( ) - . , ...]
                        else:
                            new_sentence += word
                    else:
                        new_sentence += word
                else :
                    if len(new_sentence) > 0 :
                        new_sentence += " "+word
                    else :
                        new_sentence += word
            
            new_text_elements.append(new_sentence.strip())
        return new_text_elements
        
    def clean_phone(self,text_elements):
        new_text_elements = []
        for sentence in text_elements:
            sentence_list = sentence.split(' ')
            new_sentence_list = []
            for word in sentence_list:
                new_phone_number = word
                if self.is_phone(word): # word == phone number
                    start_index = -1;
                    end_index = -1
                    len_phone = len(word)
                    for i, char in enumerate(word):
                        if ( char.isdigit() or char in ["+","(","-"] ) and start_index == -1:
                            start_index = i
                            
                        if ( word[len_phone-1 - i].isdigit() or word[len_phone-1 - i] == ")" ) and end_index == -1:
                            end_index = len_phone - i
                            
                        if start_index != -1 and end_index != -1:
                            break
                            
                    new_phone_number= word[start_index:end_index]
                    if start_index != 0:
                        new_phone_number = word[0:start_index] + " " + new_phone_number
                    if end_index != len_phone :
                        new_phone_number += " " + word[end_index:len_phone]
                        
                new_sentence_list.append(new_phone_number)
                
            new_text_elements.append((" ".join(new_sentence_list)).strip())
        return new_text_elements
    
    def reduce_words_before_phone_and_email(self, liste_elements, max_words_bitween_phones_emails = 10):
        new_liste_elements = []
        number_of_words_before_phone_or_email = 0
        for elem in liste_elements:
            liste_words_at_elem = elem.split()
            new_liste_words_at_elem = []
            for index, word in enumerate(liste_words_at_elem):
                if self.is_phone(word) or self.is_email(word) : # word in elem is a phone or an email
                    if (number_of_words_before_phone_or_email <= index) : # add juste words in the same elem
                        min_number_before = min(number_of_words_before_phone_or_email, max_words_bitween_phones_emails)
                        for wrd in liste_words_at_elem[index-min_number_before: index+1]:
                            new_liste_words_at_elem.append(wrd)
                    else:  # add words in the same elem and in the previous elem ( if the previous elem does not contain phone or email )
                        if index >= max_words_bitween_phones_emails:
                            for wrd in liste_words_at_elem[index-max_words_bitween_phones_emails: index+1]:
                                new_liste_words_at_elem.append(wrd)
                            if not (self.is_phone(new_liste_elements[-1]) or self.is_email(new_liste_elements[-1])):
                                new_liste_elements.pop()
                        else:
                            for wrd in liste_words_at_elem[0: index+1]:
                                new_liste_words_at_elem.append(wrd)
                                
                    number_of_words_before_phone_or_email = 0
                else:                                           # word in elem is not a phone and not an email
                    number_of_words_before_phone_or_email += 1
                    
            if not new_liste_words_at_elem:
                if new_liste_elements :
                    if not any([ self.is_phone(wrd) or self.is_email(wrd) for wrd in new_liste_elements[-1].split()]) :
                        new_liste_elements.pop()
                new_liste_elements.append(elem)
            else:
                new_liste_elements.append(" ".join(new_liste_words_at_elem))
        return new_liste_elements
        
    def extract_chunks(self, p_text_liste, max_len_chunk = 200, pad = 6):
        new_text = ' |&| '.join(p_text_liste)
        text_list = new_text.split()
        len_text_list = len(text_list)
        n_chunks= int(len_text_list / max_len_chunk) + 1
        chunks = []
        for i in range(n_chunks):
            if i == 0:
                start = i*max_len_chunk
                end = min((i+1)*max_len_chunk,len_text_list)
            else:
                start = end - pad
                end = min(((i+1)*max_len_chunk) - pad , len_text_list)
                
            chunk = text_list[start:end]
            chunks.append(' '.join(chunk))
            
            if i == n_chunks-1 and end != len_text_list and len(chunk)==max_len_chunk:
                start = end - pad
                end = len_text_list
                chunk = text_list[start:end]
                chunks.append(' '.join(chunk))
        return chunks

    def get_chunks_from_clean_html_text(self, clean_html):
        text_elements = self.get_text_elements(clean_html)
        text_elements = self.sentence_process(text_elements)
        text_elements = self.clean_phone(text_elements)
        text_elements = self.reduce_words_before_phone_and_email(text_elements, max_words_bitween_phones_emails=10)
        chunks = self.extract_chunks(text_elements)
        return chunks

In [59]:
pageProcessing = PageProcessing()

In [5]:
urls = ["https://gulfnews.com/contact-us",
        "https://www.heraldscotland.com/contact/",
        "https://www.koreatimes.co.kr/www2/common/contactus.asp",
        "https://menafn.com/mf_contact.aspx",
        "https://middle-east-online.com/contact",
        "https://www.miningweekly.com/page/about-us",
        "https://www.mirror.co.uk/contact-us/",
        "https://www.monitor.co.ug/uganda/contact-us-1934576",
        "https://www.nation.com.pk/contact-us"]

In [102]:
j = 8
clean_html = pageProcessing.get_clean_html_from_url(urls[j])
print(urls[j])
clean_html

Start get_source_page_from_url
Execution time of get_source_page_from_url : 3.288771 seconds
number of words in the source_page : 7774
number of characters in the source_page : 110842
----------------------------------------------------------------------------------------------------
Start get_clean_html_from_source_page
-----------> Vider le contenu de la balise <head>: -1782 deleted words
-----------> supprimer les commentaires : 0 deleted words
-----------> Supprimer les balises avec leur contenu : script, noscript, style, ul, form, table, footer :  -1392 deleted words
-----------> Sélection et filtrer de toutes les balises <a>: -1806 deleted words
Execution time of get_clean_html_from_source_page : 0.358920 seconds
number of words in the clean_html : 263
number of characters in the clean_html : 2808
----------------------------------------------------------------------------------------------------
https://www.nation.com.pk/contact-us


'             nawaiwaqt group                                      Thursday, May 23, 2024                                                                                                                             CONNECT WITH US                                                                               Contact Us           Head Office      EMAIL US       info@nation.com.pk      VISIT US       NIPCO House 23 Shahrah e Fatima Jinnah, Lahore - Pakistan      CALL US       T: +92 (42) 3636 7580 F: +92 (42) 3636 7005       Branch Offices      Karachi Office       info@nation.com.pk        Khayaban-e-Shamsheer, Phase V, D.H.S., Karachi - Pakistan        T: +92 (21) 5843 7203 F: +92 (21) 5854 932      Multan Office       info@nation.com.pk        Nawaiwaqt House Abdali Road, Multan - Pakistan        T: +92 (61) 545 5714 F: +92 (61) 580 958      Islamabad Office       info@nation.com.pk        Nawaiwaqt House Zero Point, Islamabad - Pakistan        T: +92 (51) 2202 6414 F: +92 (42) 2202 645

In [103]:
sentenceProcessing = SentenceProcessing()

In [104]:
chunks = sentenceProcessing.get_chunks_from_clean_html_text(clean_html)
print(len(chunks))
print(chunks)

1
['23,2024 CONNECT WITH US Contact Us Head Office EMAIL US info@nation.com.pk Shahrah e Fatima Jinnah, Lahore - Pakistan CALL US T: +92(42)36367580 F: +92(42)36367005 Branch Offices Karachi Office info@nation.com.pk Shamsheer, Phase V, D. H. S., Karachi - Pakistan T: +92(21)58437203 F: +92(21)5854932 Multan Office info@nation.com.pk Nawaiwaqt House Abdali Road, Multan - Pakistan T: +92(61)5455714 F: +92(61)580958 Islamabad Office info@nation.com.pk Nawaiwaqt House Zero Point, Islamabad - Pakistan T: +92(51)22026414 F: +92(42)22026456 Contact Us Form Loading... Hajj 2024:31,057 Pakistani pilgrims reach Saudi Arabia 1:34 PM | May 23,202412:28 PM | May 23,202412:08 PM | May 23,202412:01 NIPCO House, 4- Shaharah e Fatima Jinnah, Lahore, Pakistan Tel: +924236367580 | Fax : +924236367005']


In [82]:
contactLLM = ContactLLM()

In [90]:
contactLLM.predict(chunks[0], execution_time=True)

{'emails': ['info@nation.com.pk'],
 'phones': ['+92(42)36367580',
  '+92(21)58437203',
  '+92(61)5455714',
  '+92(51)22026414',
  '+924236367580',
  '+924236367005'],
 'execution_time': '47.79198436066508 s | 0.7965330726777514 min'}