In [345]:
import requests
from bs4 import BeautifulSoup
import re
from sklearn.decomposition import LatentDirichletAllocation


In [346]:
# Number of clusters/topics requested from every model in this notebook
# (passed as KMeans n_clusters and as LDA n_components).
num_clusters = 10

In [347]:
def url_reader(url, keyword='Claims', timeout=30):
    """Download a web page and return its claim-like sentences.

    The page text (with all <table> elements removed) is split into sentences.
    Starting from the first sentence that contains ``keyword``, a sentence is
    kept when it ends with a period, contains at least one assertion word, and
    does not contain the page title parts, "Fig" or "Claims (".

    Args:
        url: Address of the page to fetch.
        keyword: Marker text; sentences before its first occurrence are ignored.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A list of stripped sentences. The list is empty when the page cannot
        be fetched, so callers can always extend/iterate the result.
    """
    assertion_words = ["claim", "assert", "argue", "propose", "conclude",
                       "suggest", "state", "believe", "affirm", "maintain",
                       "contend", "insist", "apparatus"]

    # Send a GET request to the URL; a timeout keeps a stalled server from
    # hanging the notebook forever.
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        print("Failed to fetch the webpage. Error:", exc)
        return []

    # Check if the request was successful
    if response.status_code != 200:
        print("Failed to fetch the webpage. Status code:", response.status_code)
        return []

    # Parse the HTML content
    soup = BeautifulSoup(response.content, "html.parser")

    # The page title is split on " - "; the second and third parts are used
    # below to drop sentences that merely repeat the page heading.
    page_title = soup.title.string if soup.title and soup.title.string else ""
    components = page_title.split(" - ")
    title = components[1] if len(components) > 1 else None
    type_patent = components[2] if len(components) > 2 else None

    # Empty fragments are skipped: "" is a substring of every string and
    # would otherwise reject every sentence.
    excluded_fragments = [fragment
                          for fragment in (title, type_patent, "Fig", "Claims (")
                          if fragment]

    # Exclude content within table elements
    for table in soup.find_all('table'):
        table.extract()

    # Find all text on the webpage and split it into sentences after ., ! or ?
    all_text = soup.get_text()
    sentences = re.split(r'(?<=[.!?])\s+', all_text)

    keyword_sentences = []
    add_sentences = False  # becomes True once the keyword has been seen
    for sentence in sentences:
        if keyword in sentence:
            add_sentences = True
        if not add_sentences:
            continue

        stripped = sentence.strip()
        lowered = sentence.lower()
        if (stripped.endswith(".")
                and any(word in lowered for word in assertion_words)
                and not any(fragment in sentence for fragment in excluded_fragments)):
            keyword_sentences.append(stripped)

    return keyword_sentences


In [348]:


# Function to assign topic to a sentence based on occurrence of keywords
def assign_topic(sentences, topic_keywords):
    """Label every sentence with the topic whose keywords it matches best.

    For each sentence the score of a topic is the fraction of that topic's
    keywords that occur (case-insensitively, as substrings) in the sentence.
    The topic with the highest score wins; on a tie the topic listed first in
    ``topic_keywords`` is kept. A sentence matching no keyword at all is
    labelled "Other".

    Args:
        sentences: Sequence of text strings to label. It is not modified.
        topic_keywords: Mapping of topic name -> list of keyword strings.

    Returns:
        A list of ``(topic, index)`` tuples, one per sentence and in input
        order, where ``index`` is the 0-based position of the sentence in
        ``sentences``.
    """
    assigned = []
    # Iterate over each sentence; enumerate gives the position directly, which
    # stays correct even when the same text appears more than once.
    for index, sentence in enumerate(sentences):
        lowered = sentence.lower()
        best_topic = "Other"
        best_score = 0  # Track the highest score for the assigned topic
        # Iterate over each topic and calculate the score for the sentence
        for topic, keywords in topic_keywords.items():
            if not keywords:
                # A topic without keywords cannot match (and would divide by zero).
                continue
            matches = sum(keyword.lower() in lowered for keyword in keywords)
            score = matches / len(keywords)  # Calculate the suitability score
            if best_score < score:
                best_score = score
                best_topic = topic
        assigned.append((best_topic, index))

    return assigned


In [349]:
def topic_render(text):
    """Assign one of the predefined topics to each text in ``text``.

    Args:
        text: List of strings to label.

    Returns:
        The list of ``(topic, index)`` tuples produced by ``assign_topic``.
    """
    # Define keywords for each topic. Keywords are written in lower case
    # because they are compared against lower-cased text; a capitalised
    # keyword such as "Efficient" could never match.
    topic_keywords = {
        "Wireless": ["wireless", "loss"],
        "Telephone": ["telephone", "phone"],
        "Call": ["call", "indication", "conversation"],
        "Functionality": ["functionality", "efficient", "switching", "quality of service"],
        "Communication": ["communication", "source", "network"],
        "Device": ["device"],
        "Service": ["service"],
        "Protocols": ["protocols"],
    }

    # Assign topics to sentences
    return assign_topic(text, topic_keywords)

In [350]:
def split_test_claims(urls):
    """Collect the claim sentences of several pages into one flat list.

    Args:
        urls: Iterable of page addresses, each passed to ``url_reader``.

    Returns:
        A list with the sentences of all pages, in the order of ``urls``.
    """
    claim_sentences = []

    # Data Collection and Preprocessing
    for url in urls:
        # Fetch and process the text from the URL
        text = url_reader(url)
        # Skip pages that yielded nothing (a failed fetch gives no sentence
        # list); extending with a missing result would raise TypeError.
        if text:
            claim_sentences.extend(text)
    return claim_sentences

In [351]:
# Pages whose claim sentences form the data set clustered below
urls = [
    "https://patents.google.com/patent/GB2478972A/en?q=(phone)&oq=phone",
    "https://patents.google.com/patent/US9980046B2/en?oq=US9980046B2",
    "https://patents.google.com/patent/US9634864B2/en?oq=US9634864B2",
]
test = split_test_claims(urls)

In [352]:
def print_the_results(labels, sentences):
    """Print a topic title and the number of sentences for each cluster.

    Sentences are grouped by their cluster label, the sentences of each
    cluster are joined into one text, and ``topic_render`` is asked to title
    those texts.

    Args:
        labels: Cluster label of each sentence (same length/order as
            ``sentences``).
        sentences: The clustered sentences.
    """
    # Group sentences by cluster; dict order keeps clusters in first-seen order.
    cluster_dict = {}
    for cluster_num, sentence in zip(labels, sentences):
        cluster_dict.setdefault(cluster_num, []).append(sentence)

    # Parallel lists: one joined text and one sentence count per cluster.
    all_text = [" ".join(cluster_sentences) for cluster_sentences in cluster_dict.values()]
    array_num_of_claims = [len(cluster_sentences) for cluster_sentences in cluster_dict.values()]

    titles = topic_render(all_text)
    for topic, text_index in titles:
        # text_index is a 0-based position in all_text, so it indexes the
        # count list directly (subtracting 1 would report another cluster).
        print(f"title: {topic}, numbers of claims: {array_num_of_claims[text_index]}")
    print()

    


In [353]:
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans

In [354]:
# Function to generate sentence embeddings using SentenceTransformer
def generate_embeddings(sentences, model_name='bert-base-nli-mean-tokens'):
    """Encode ``sentences`` with the SentenceTransformer model ``model_name``.

    A new model instance is created on every call; the value returned by its
    ``encode`` method is handed back unchanged.
    """
    return SentenceTransformer(model_name).encode(sentences)

In [355]:
def model_KMeansST(test_claim_sentences):
    """Cluster sentences with SentenceTransformer embeddings and K-means.

    Prints the cluster of every sentence (1-based) and then the per-cluster
    summary produced by ``print_the_results``.

    Args:
        test_claim_sentences: List of sentences to cluster.
    """
    # K-means cannot be fitted on no data; report it instead of crashing.
    if len(test_claim_sentences) == 0:
        print("No claim sentences to cluster.")
        return

    model_name = 'bert-base-nli-mean-tokens'
    test_embeddings = generate_embeddings(test_claim_sentences, model_name)

    # Topic Modeling (K-means clustering). KMeans raises when asked for more
    # clusters than there are samples, so cap the requested number.
    n_clusters = min(num_clusters, len(test_claim_sentences))
    kmeans_model = KMeans(n_clusters=n_clusters, random_state=42, init='k-means++', n_init=10)
    predicted_clusters = kmeans_model.fit_predict(test_embeddings)

    # Print the predicted clusters for each test sentence
    print("Predicted Clusters for New Data:")
    for i, cluster_label in enumerate(predicted_clusters):
        print(f"Document {i + 1}: Cluster = {cluster_label + 1}")
    print_the_results(predicted_clusters, test_claim_sentences)


In [356]:
# Cluster the collected claim sentences using sentence embeddings + K-means
model_KMeansST(test)



Predicted Clusters for New Data:
Document 1: Cluster = 8
Document 2: Cluster = 5
Document 3: Cluster = 8
Document 4: Cluster = 1
Document 5: Cluster = 1
Document 6: Cluster = 10
Document 7: Cluster = 10
Document 8: Cluster = 9
Document 9: Cluster = 9
Document 10: Cluster = 9
Document 11: Cluster = 1
Document 12: Cluster = 7
Document 13: Cluster = 8
Document 14: Cluster = 2
Document 15: Cluster = 2
Document 16: Cluster = 2
Document 17: Cluster = 2
Document 18: Cluster = 5
Document 19: Cluster = 5
Document 20: Cluster = 5
Document 21: Cluster = 2
Document 22: Cluster = 7
Document 23: Cluster = 2
Document 24: Cluster = 5
Document 25: Cluster = 5
Document 26: Cluster = 2
Document 27: Cluster = 2
Document 28: Cluster = 2
Document 29: Cluster = 2
Document 30: Cluster = 2
Document 31: Cluster = 3
Document 32: Cluster = 6
Document 33: Cluster = 1
Document 34: Cluster = 1
Document 35: Cluster = 6
Document 36: Cluster = 3
Document 37: Cluster = 6
Document 38: Cluster = 4
Document 39: Cluster = 4

In [357]:
# Convert each sentence to a vector representation by averaging word vectors
def sentence_to_vector(sentence, model):
    """Return the mean word vector of ``sentence``.

    The sentence is split on whitespace and only tokens present in
    ``model.wv`` contribute. When no token is known, a zero vector of length
    ``model.vector_size`` is returned instead.
    """
    known_tokens = [token for token in sentence.split() if token in model.wv]
    if known_tokens:
        return np.mean([model.wv[token] for token in known_tokens], axis=0)
    return np.zeros(model.vector_size)

In [358]:
def test_model_KMeansW2V(kmeans_model , test_claim_sentences , word2vec_model):
    new_data_vectors = np.array([sentence_to_vector(sentence, word2vec_model) for sentence in test_claim_sentences])

    if len(new_data_vectors) > 0:
        new_data_clusters = kmeans_model.predict(new_data_vectors)
        
        # Calculate distances to cluster centers for probability-like measure
        distances = kmeans_model.transform(new_data_vectors)

        # Print the predicted topics for each document
        print("Predicted Clusters for New Data:")
        for i, cluster_label in enumerate(new_data_clusters):
            print(f"Document {i+1}: Cluster = {cluster_label + 1}")
        print_the_results(new_data_clusters, test_claim_sentences)

In [359]:
from gensim.models import Word2Vec
from sklearn.cluster import KMeans
import numpy as np


In [360]:
def model_KMeansW2V(test_claim_sentences):
    """Cluster sentences with averaged Word2Vec vectors and K-means.

    A Word2Vec model is trained on the given sentences themselves, every
    sentence is turned into the mean of its word vectors, and the vectors are
    clustered. Prints the cluster of every sentence (1-based) and then the
    per-cluster summary produced by ``print_the_results``.

    Args:
        test_claim_sentences: List of sentences to cluster.
    """
    # Check for empty input before any model is built: neither Word2Vec nor
    # KMeans can be trained on no data.
    if len(test_claim_sentences) == 0:
        print("No claim sentences to cluster.")
        return

    # Train Word2Vec model
    tokenized_sentences = [sentence.split() for sentence in test_claim_sentences]
    word2vec_model = Word2Vec(sentences=tokenized_sentences, vector_size=100, window=5, min_count=1, workers=4)

    test_vectors = np.array([sentence_to_vector(sentence, word2vec_model) for sentence in test_claim_sentences])

    # Topic Modeling (K-means clustering). The cluster count is capped at the
    # number of samples, and n_init is pinned to the value the
    # SentenceTransformer variant uses so both models run the same procedure.
    n_clusters = min(num_clusters, len(test_vectors))
    kmeans_model = KMeans(n_clusters=n_clusters, random_state=42, init='k-means++', n_init=10)
    new_data_clusters = kmeans_model.fit_predict(test_vectors)

    # Print the predicted topics for each document
    print("Predicted Clusters for New Data:")
    for i, cluster_label in enumerate(new_data_clusters):
        print(f"Document {i+1}: Cluster = {cluster_label + 1}")
    print_the_results(new_data_clusters, test_claim_sentences)


In [361]:
# Cluster the collected claim sentences using averaged Word2Vec vectors + K-means
model_KMeansW2V(test)

Predicted Clusters for New Data:
Document 1: Cluster = 9
Document 2: Cluster = 9
Document 3: Cluster = 9
Document 4: Cluster = 5
Document 5: Cluster = 8
Document 6: Cluster = 5
Document 7: Cluster = 5
Document 8: Cluster = 8
Document 9: Cluster = 8
Document 10: Cluster = 9
Document 11: Cluster = 5
Document 12: Cluster = 5
Document 13: Cluster = 9
Document 14: Cluster = 6
Document 15: Cluster = 3
Document 16: Cluster = 7
Document 17: Cluster = 3
Document 18: Cluster = 1
Document 19: Cluster = 6
Document 20: Cluster = 6
Document 21: Cluster = 1
Document 22: Cluster = 9
Document 23: Cluster = 7
Document 24: Cluster = 6
Document 25: Cluster = 6
Document 26: Cluster = 3
Document 27: Cluster = 7
Document 28: Cluster = 6
Document 29: Cluster = 6
Document 30: Cluster = 3
Document 31: Cluster = 4
Document 32: Cluster = 4
Document 33: Cluster = 4
Document 34: Cluster = 4
Document 35: Cluster = 4
Document 36: Cluster = 10
Document 37: Cluster = 4
Document 38: Cluster = 2
Document 39: Cluster = 2


In [362]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

In [363]:
def model_LDA(test_claim_sentences):
    """Assign each sentence to its most probable LDA topic.

    The sentences are turned into word counts (English stop words removed), an
    LDA model with ``num_clusters`` components is fitted on them, and every
    sentence is labelled with the 1-based index of its highest-probability
    topic. Prints the label of every sentence and then the per-cluster summary
    produced by ``print_the_results``.

    Args:
        test_claim_sentences: List of sentences to model.
    """
    # The vectorizer cannot build a vocabulary from no documents.
    if len(test_claim_sentences) == 0:
        print("No claim sentences to model.")
        return

    # Preprocessing
    vectorizer = CountVectorizer(stop_words='english')
    X = vectorizer.fit_transform(test_claim_sentences)

    # Topic Modeling (LDA)
    lda_model = LatentDirichletAllocation(n_components=num_clusters, random_state=42)
    lda_model.fit(X)

    # The model is applied to the same sentences it was fitted on, so the
    # count matrix is reused instead of vectorizing the text a second time.
    test_cluster_distributions = lda_model.transform(X)

    # argmax returns the first maximum, i.e. the same topic the former manual
    # scan selected, without carrying state from one document to the next.
    test_clusters = [int(distribution.argmax()) + 1
                     for distribution in test_cluster_distributions]

    # Print the predicted topics for each document
    print("Predicted Topics for New Data:")
    for i, cluster in enumerate(test_clusters):
        print(f"Document {i+1}: Cluster: {cluster}")
    print_the_results(test_clusters, test_claim_sentences)



In [364]:
# Assign the collected claim sentences to LDA topics
model_LDA(test)

Predicted Topics for New Data:
Document 1: Cluster: 9
Document 2: Cluster: 9
Document 3: Cluster: 3
Document 4: Cluster: 3
Document 5: Cluster: 9
Document 6: Cluster: 9
Document 7: Cluster: 10
Document 8: Cluster: 9
Document 9: Cluster: 9
Document 10: Cluster: 9
Document 11: Cluster: 3
Document 12: Cluster: 3
Document 13: Cluster: 9
Document 14: Cluster: 8
Document 15: Cluster: 2
Document 16: Cluster: 1
Document 17: Cluster: 1
Document 18: Cluster: 3
Document 19: Cluster: 5
Document 20: Cluster: 5
Document 21: Cluster: 2
Document 22: Cluster: 2
Document 23: Cluster: 1
Document 24: Cluster: 5
Document 25: Cluster: 5
Document 26: Cluster: 1
Document 27: Cluster: 1
Document 28: Cluster: 8
Document 29: Cluster: 2
Document 30: Cluster: 1
Document 31: Cluster: 3
Document 32: Cluster: 7
Document 33: Cluster: 7
Document 34: Cluster: 7
Document 35: Cluster: 7
Document 36: Cluster: 7
Document 37: Cluster: 7
Document 38: Cluster: 5
Document 39: Cluster: 5
Document 40: Cluster: 5
Document 41: Clus