In [246]:
import requests
from bs4 import BeautifulSoup
import re
from sklearn.decomposition import LatentDirichletAllocation


In [247]:
num_clusters = 10 # Number of topics to extract

In [248]:
def url_reader(url):
    # Send a GET request to the URL
    keyword = 'Claims'
    response = requests.get(url)

    # Check if the request was successful
    if response.status_code == 200:
        # Parse the HTML content
        soup = BeautifulSoup(response.content, "html.parser")
        
        topic_title = soup.title.string

        components = topic_title.split(" - ")

        title = components[1]

        type_patent = components[2]
        # Exclude content within table elements
        for table in soup.find_all('table'):
            table.extract()
            
        # Find all text on the webpage
        all_text = soup.get_text()

        # Split the text into sentences with periods as separators
        sentences = re.split(r'(?<=[.!?])\s+', all_text)

        # Initialize a flag to indicate when to start adding sentences
        add_sentences = False
        assertion_words = ["claim", "assert", "argue", "propose", "conclude",
                        "suggest", "state", "believe", "affirm", "maintain", "contend", "insist" , "apparatus"]
        
        # List to store sentences containing the keyword
        keyword_sentences = []
        # Iterate through the sentences
        for sentence in sentences:
            # Check if the keyword is found in the sentence
            if keyword in sentence:
                # Set the flag to True to start adding sentences
                add_sentences = True


            # Add the sentence if the flag is True and it's not empty
            if add_sentences and any(word.lower() in sentence.lower() for word in assertion_words) and sentence.strip().endswith(".") and title not in sentence and type_patent not in sentence and "Fig" not in sentence and "Claims (" not in sentence:
                keyword_sentences.append(sentence.strip())

        return keyword_sentences
    else:
        print("Failed to fetch the webpage. Status code:", response.status_code)


In [249]:


# Function to assign topic to a sentence based on occurrence of keywords
def assign_topic(sentences, topic_keywords):
    assigned = []
    remaining_sentences = sentences.copy()  
    # Iterate over each sentence
    for topic, keywords in topic_keywords.items():
        if not remaining_sentences:
            break
        best_topic = "Other"
        best_score = 0 # Track the highest score for the assigned topic
        score = 0 
        # Iterate over each topic and calculate the score for the sentence
        for sentence in remaining_sentences:
            matches = sum(keyword in sentence.lower() for keyword in keywords)
            score = matches / len(keywords)  # Calculate the suitability score
            if best_score < score:
                best_score = score
                best_topic = topic
        id = sentences.index(sentence)
        assigned.append((best_topic, id))
        remaining_sentences.remove(sentence)

    return assigned


In [250]:
def topic_render(text):
    # Define keywords for each topic
    topic_keywords = {
        "Wireless": ["wireless", "loss"],
        "Telephone": ["telephone", "phone"],
        "Call": ["call", "indication" , "conversation"],
        "Functionality": ["functionality","Efficient" , "switching" , "quality of service"],
        "Communication": ["communication", "source" , "network"],
        "Device": ["device"]
    }

    # Assign topics to sentences
    titles = assign_topic(text , topic_keywords)
    
    return titles

In [251]:
def split_test_train(urls):
    train_claim_sentences = []
    test_claim_sentences = []

    # Data Collection and Preprocessing
    for i, url in enumerate(urls):
        # Fetch and process the text from the URL
        text = url_reader(url)
        if i == 2:
            test_claim_sentences = text
        else:
            train_claim_sentences.extend(text)
    return train_claim_sentences , test_claim_sentences

In [252]:
urls = ["https://patents.google.com/patent/GB2478972A/en?q=(phone)&oq=phone","https://patents.google.com/patent/US9980046B2/en?oq=US9980046B2", "https://patents.google.com/patent/US9634864B2/en?oq=US9634864B2"]
train , test = split_test_train(urls)

In [253]:
def print_the_results(labels, sentences):
    cluster_dict = {}
    for cluster_num, sentence in zip(labels, sentences):
        if cluster_num not in cluster_dict:
            cluster_dict[cluster_num] = [sentence]
        else:
            cluster_dict[cluster_num].append(sentence)

    all_text = []
    array_num_of_claims = []
    text = ""
    # Print the cluster dictionary
    for cluster_num, cluster_sentences in cluster_dict.items():
        number_of_claims = 0
        for sentence in cluster_sentences:
            number_of_claims += 1
            text += " " + sentence
        all_text.append(text)
        array_num_of_claims.append(number_of_claims)
        text = ""

    titles = topic_render(all_text)
    for title in titles:
        print(f"title: {title[0]}, numbers of claims: {array_num_of_claims[title[1]-1]}")
    print()

    


In [254]:
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans

In [255]:
# Function to generate sentence embeddings using SentenceTransformer
def generate_embeddings(sentences, model_name='bert-base-nli-mean-tokens'):
    model = SentenceTransformer(model_name)
    embeddings = model.encode(sentences)
    return embeddings

In [256]:
# Function to test KMeans model on new sentences
def test_model_KMeansST(kmeans_model, test_claim_sentences, model_name):
    test_embeddings = generate_embeddings(test_claim_sentences, model_name)
    predicted_clusters = kmeans_model.predict(test_embeddings)
    
    # Print the predicted clusters for each test sentence
    print("Predicted Clusters for New Data:")
    for i, cluster_label in enumerate(predicted_clusters):
        print(f"Document {i + 1}: Cluster = {cluster_label + 1}")
    print_the_results(predicted_clusters, test_claim_sentences)

In [257]:
def model_KMeansST(train_claim_sentences , test_claim_sentences):

    model_name = 'bert-base-nli-mean-tokens'

    # Generate embeddings for training sentences
    train_embeddings = generate_embeddings(train_claim_sentences)

    # Topic Modeling (K-means clustering)
   
    kmeans_model = KMeans(n_clusters=num_clusters, random_state=42, init='k-means++', n_init=10)
    kmeans_model.fit(train_embeddings)

    # Test the KMeans model on new sentences
    test_model_KMeansST(kmeans_model, test_claim_sentences , model_name)


In [258]:
model_KMeansST(train , test)



title: Wireless, numbers of claims: 3
title: Other, numbers of claims: 4
title: Call, numbers of claims: 6
title: Functionality, numbers of claims: 2
title: Communication, numbers of claims: 1



In [259]:
# Convert each sentence to a vector representation by averaging word vectors
def sentence_to_vector(sentence, model):
    words = [word for word in sentence.split() if word in model.wv]
    if not words:
        return np.zeros(model.vector_size)
    vectors = [model.wv[word] for word in words]
    return np.mean(vectors, axis=0)

In [260]:
def test_model_KMeansW2V(kmeans_model , test_claim_sentences , word2vec_model):
    new_data_vectors = np.array([sentence_to_vector(sentence, word2vec_model) for sentence in test_claim_sentences])

    if len(new_data_vectors) > 0:
        new_data_clusters = kmeans_model.predict(new_data_vectors)
        
        # Calculate distances to cluster centers for probability-like measure
        distances = kmeans_model.transform(new_data_vectors)

        # Print the predicted topics for each document
        print("Predicted Clusters for New Data:")
        for i, cluster_label in enumerate(new_data_clusters):
            print(f"Document {i+1}: Cluster = {cluster_label + 1}")
        print_the_results(new_data_clusters, test_claim_sentences)

In [261]:
from gensim.models import Word2Vec
from sklearn.cluster import KMeans
import numpy as np


In [262]:
def model_KMeansW2V(train_claim_sentences, test_claim_sentences):

    # Train Word2Vec model
    word2vec_model = Word2Vec(sentences=[sentence.split() for sentence in train_claim_sentences], vector_size=100, window=5, min_count=1, workers=4)

    train_vectors = np.array([sentence_to_vector(sentence, word2vec_model) for sentence in train_claim_sentences])

    # Topic Modeling (K-means clustering)
    kmeans_model = KMeans(n_clusters=num_clusters, random_state=42)
    kmeans_model.fit(train_vectors)

    test_model_KMeansW2V(kmeans_model , test_claim_sentences , word2vec_model)



In [263]:
model_KMeansW2V(train , test)

Predicted Clusters for New Data:
Document 1: Cluster = 4
Document 2: Cluster = 9
Document 3: Cluster = 10
Document 4: Cluster = 1
Document 5: Cluster = 4
Document 6: Cluster = 4
Document 7: Cluster = 4
Document 8: Cluster = 10
Document 9: Cluster = 10
Document 10: Cluster = 10
Document 11: Cluster = 4
Document 12: Cluster = 5
Document 13: Cluster = 4
Document 14: Cluster = 4
Document 15: Cluster = 2
Document 16: Cluster = 10
title: Wireless, numbers of claims: 1
title: Other, numbers of claims: 1
title: Call, numbers of claims: 5
title: Functionality, numbers of claims: 1
title: Communication, numbers of claims: 7
title: Other, numbers of claims: 1



In [264]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

In [265]:
def test_model_LDA(test_claim_sentences, vectorizer, lda_model):
    # Preprocess the new data
    new_data_vectorized = vectorizer.transform(test_claim_sentences)
    # Predict cluster for the new data
    new_data_cluster_distributions = lda_model.transform(new_data_vectorized)
    
    predict = 0
    cluster = 0
    new_data_clusters = []
    # Print the predicted topics for each document
    print("Predicted Topics for New Data:")
    for i, cluster_distribution in enumerate(new_data_cluster_distributions):
        for cluster_idx, prob in enumerate(cluster_distribution):
            if predict < prob:
                predict = prob
                cluster = cluster_idx + 1
        new_data_clusters.append(cluster)
        print(f"Document {i+1}: Cluster: {cluster}")
        predict = 0
    print_the_results(new_data_clusters, test_claim_sentences)

In [266]:
def model_LDA(train_claim_sentences , test_claim_sentences):
    # Preprocessing
    vectorizer = CountVectorizer(stop_words='english')
    X = vectorizer.fit_transform(train_claim_sentences)

    # Topic Modeling (LDA)
    lda_model = LatentDirichletAllocation(n_components=num_clusters, random_state=42)
    lda_model.fit(X)
    
    # # Store the sentence index and its corresponding topic
    sentence_topics = []
    for sentence_idx, (topic_idx, _) in enumerate(zip(lda_model.transform(X).argmax(axis=1), test_claim_sentences)):
        sentence_topics.append((sentence_idx + 1, topic_idx + 1))
  
    test_model_LDA(test_claim_sentences, vectorizer, lda_model)



In [267]:
model_LDA(train , test)

Predicted Topics for New Data:
Document 1: Cluster: 2
Document 2: Cluster: 2
Document 3: Cluster: 2
Document 4: Cluster: 2
Document 5: Cluster: 2
Document 6: Cluster: 8
Document 7: Cluster: 2
Document 8: Cluster: 2
Document 9: Cluster: 2
Document 10: Cluster: 2
Document 11: Cluster: 8
Document 12: Cluster: 8
Document 13: Cluster: 8
Document 14: Cluster: 8
Document 15: Cluster: 8
Document 16: Cluster: 8
title: Wireless, numbers of claims: 9
title: Other, numbers of claims: 7

