In [20]:
import requests
from bs4 import BeautifulSoup
import re
from sklearn.decomposition import LatentDirichletAllocation


In [21]:
# Number of clusters/topics requested from each model in this notebook
# (passed as KMeans n_clusters and as LatentDirichletAllocation n_components).
num_clusters = 10

In [22]:
def url_reader(url):
    """
    Scrape a patent webpage and collect assertion-like sentences from its claims part.

    The page <title> is split on " - ": the second component is used as the patent
    title and the third as the patent type. Table elements are removed, the remaining
    text is split into sentences, and from the first sentence containing "Claims"
    onwards a sentence is kept when it

    * contains at least one of the assertion words (case-insensitive),
    * ends with a period, and
    * does not contain the patent title, the patent type, "Fig" or "Claims (".

    :param url: Address of the webpage to fetch
    :return: List of the kept sentences (stripped); an empty list when the page
             could not be fetched
    """
    # Keyword to search to start from this line
    keyword = 'Claims'
    assertion_words = ["claim", "assert", "argue", "propose", "conclude",
                       "suggest", "state", "believe", "affirm", "maintain", "contend", "insist", "apparatus"]

    # Send a GET request; a timeout keeps a stalled server from hanging the notebook
    try:
        response = requests.get(url, timeout=30)
    except requests.RequestException as error:
        print("Failed to fetch the webpage. Error:", error)
        return []

    # Always hand back a list so callers can extend() with the result
    if response.status_code != 200:
        print("Failed to fetch the webpage. Status code:", response.status_code)
        return []

    # Parse the HTML content
    soup = BeautifulSoup(response.content, "html.parser")

    # Extract title and type of patent from the webpage title, tolerating pages
    # whose <title> is missing or has fewer " - " separated parts than expected
    title_tag = soup.title
    topic_title = title_tag.string if title_tag is not None and title_tag.string else ""
    components = topic_title.split(" - ")
    title = components[1] if len(components) > 1 else ""
    type_patent = components[2] if len(components) > 2 else ""

    # An empty string is a substring of every sentence, so only non-empty
    # fragments may be used as exclusion markers
    excluded_fragments = [fragment for fragment in (title, type_patent, "Fig", "Claims (") if fragment]

    # Exclude content within table elements
    for table in soup.find_all('table'):
        table.extract()

    # Split the page text into sentences after ., ! or ? followed by whitespace
    sentences = re.split(r'(?<=[.!?])\s+', soup.get_text())

    # Flag that flips once the claims section has been reached
    add_sentences = False
    keyword_sentences = []

    for sentence in sentences:
        if keyword in sentence:
            add_sentences = True
        if not add_sentences:
            continue

        stripped = sentence.strip()
        lowered = sentence.lower()
        if (stripped.endswith(".")
                and any(word in lowered for word in assertion_words)
                and not any(fragment in sentence for fragment in excluded_fragments)):
            keyword_sentences.append(stripped)

    return keyword_sentences


In [23]:


# Function to assign topic to a sentence based on occurrence of keywords
def assign_topic(text, topic_keywords):
    """
    Give each text a topic based on how many of the topic's keywords it contains.

    Topics are processed in dictionary order. For each topic the still-unassigned
    text with the highest score (fraction of the topic's keywords found in the
    lower-cased text) receives that topic and is taken out of the pool; on ties the
    earliest text wins. A topic whose keywords appear in no remaining text is not
    assigned. Texts left over once all topics were processed are labelled "Other".

    :param text: List of texts (strings) to label
    :param topic_keywords: Dict mapping topic name -> list of keyword strings
    :return: List of (topic, index) tuples, where index is the 0-based position of
             the text in ``text``; every text appears exactly once
    """
    assigned = []
    # Lower-case once; matching is case-insensitive on both sides
    lowered_texts = [sentence.lower() for sentence in text]
    # Work with positions rather than values so duplicate texts stay distinct
    remaining = list(range(len(text)))

    for topic, keywords in topic_keywords.items():
        # If there are no more texts to label
        if not remaining:
            break
        # A topic without keywords can never match (and would divide by zero)
        if not keywords:
            continue

        needles = [keyword.lower() for keyword in keywords]
        best_index = None
        best_score = 0  # Highest score seen for this topic so far

        for index in remaining:
            matches = sum(needle in lowered_texts[index] for needle in needles)
            # Suitability score: share of the topic's keywords present in the text
            score = matches / len(needles)
            if score > best_score:
                best_score = score
                best_index = index

        # Assign the topic to the text that actually scored best
        if best_index is not None:
            assigned.append((topic, best_index))
            remaining.remove(best_index)

    # Whatever could not be matched to a topic is still reported
    assigned.extend(("Other", index) for index in remaining)

    return assigned


In [24]:
def topic_render(text):
    """
    Label a list of texts with topic titles using a fixed keyword table.

    :param text: List of texts (one aggregated string per cluster)
    :return: Whatever assign_topic returns for the texts and the keyword table
             below: a list of (title, index) tuples
    """
    # Keywords for each title. They are written in lower case because
    # assign_topic searches for them inside the lower-cased text, so a keyword
    # containing an upper-case letter could never be found.
    topic_keywords = {
        "Wireless": ["wireless", "loss"],
        "Telephone": ["telephone", "phone"],
        "Call": ["call", "indication", "conversation", "second party"],
        "Functionality": ["functionality", "efficient", "switching", "quality of service"],
        "Communication": ["communication", "source", "network", "signal"],
        "Device": ["device", "tool", "machine"],
        "Service": ["service"],
        "Protocols": ["protocols"],
        "Microphones": ["microphones", "speaker", "sound sensors", "acoustic sensors", "sound output"],
    }

    # Assign titles to the texts
    return assign_topic(text, topic_keywords)

In [25]:
def extract_test_claims(urls):
    """
    Collect the claim sentences of several webpages into one flat list.

    :param urls: Iterable of webpage addresses, each passed to url_reader
    :return: List with the sentences of all pages, in the order of ``urls``
    """
    claim_sentences = []

    # Data Collection and Preprocessing
    for url in urls:
        # Fetch and process the text from the URL
        text = url_reader(url)
        # A page that could not be fetched may yield nothing (None or an empty
        # list); skip it instead of letting extend() raise on None
        if text:
            claim_sentences.extend(text)

    return claim_sentences

In [26]:
# The list of URLs provided in the homework
urls = [
    "https://patents.google.com/patent/GB2478972A/en?q=(phone)&oq=phone",
    "https://patents.google.com/patent/US9980046B2/en?oq=US9980046B2",
    "https://patents.google.com/patent/US9634864B2/en?oq=US9634864B2",
]
# Scrape every page and keep the combined claim sentences for the models below
test = extract_test_claims(urls)

In [27]:
def print_the_results(labels, sentences):
    """
    Prints, per topic title, how many sentences fall into the clusters given that title.

    Sentences are grouped by cluster label, the sentences of each cluster are joined
    into one text, topic_render assigns titles to those texts, and the sentence
    counts of the clusters sharing a title are summed and printed.

    :param labels: List of cluster labels assigned to each sentence
    :param sentences: List of sentences that were clustered (parallel to ``labels``)
    """
    # Group the sentences by cluster label, keeping first-seen cluster order
    cluster_dict = {}
    for cluster_num, sentence in zip(labels, sentences):
        cluster_dict.setdefault(cluster_num, []).append(sentence)

    # One aggregated text and one sentence count per cluster; both lists share
    # the same ordering, so position i in one matches position i in the other
    all_text = []
    array_num_of_claims = []
    for cluster_sentences in cluster_dict.values():
        # Every sentence is prefixed with a space, as in " s1 s2 ..."
        all_text.append("".join(" " + sentence for sentence in cluster_sentences))
        array_num_of_claims.append(len(cluster_sentences))

    # Get titles for each aggregated text: (title, index into all_text) tuples
    titles = topic_render(all_text)

    # Sum the number of claims per title. The index is already 0-based, so it is
    # used as-is; subtracting 1 would read the neighbouring cluster's count and
    # wrap index 0 around to the last cluster.
    agg_title = {}
    for topic, text_index in titles:
        agg_title[topic] = agg_title.get(topic, 0) + array_num_of_claims[text_index]

    # Print the aggregated titles and their numbers of claims
    for title, claims in agg_title.items():
        print(f"title: {title}, numbers of claims: {claims}")




In [28]:
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans

In [29]:
def generate_embeddings(sentences, model_name='bert-base-nli-mean-tokens'):
    """
    Encode sentences with a SentenceTransformer model.

    :param sentences: List of sentences to encode
    :param model_name: Name handed to SentenceTransformer (default: 'bert-base-nli-mean-tokens')
    :return: The result of the model's encode() call for ``sentences``
    """
    # Build the encoder for the requested model and encode in one go
    encoder = SentenceTransformer(model_name)
    return encoder.encode(sentences)

In [30]:
def model_KMeansST(test_claim_sentences):
    """
    Performs K-means clustering on the test claim sentences using SentenceTransformer embeddings.

    Prints the cluster of every sentence (1-based) and then the per-title summary
    produced by print_the_results. Nothing is clustered when the input is empty.

    :param test_claim_sentences: List of test claim sentences
    """
    # Nothing to embed or cluster (e.g. scraping produced no sentences)
    if len(test_claim_sentences) == 0:
        print("No claim sentences to cluster.")
        return

    # Define the SentenceTransformer model to use
    model_name = 'bert-base-nli-mean-tokens'

    # KMeans cannot form more clusters than there are samples
    n_clusters = min(num_clusters, len(test_claim_sentences))

    # Initialize KMeans model for clustering
    kmeans_model = KMeans(
        n_clusters=n_clusters,     # Number of clusters to form
        random_state=42,           # Seed for random number generation
        init='k-means++',          # Method for initializing centroids
        n_init=10                  # Number of times the algorithm will be run with different centroid seeds
    )

    # Generate sentence embeddings for the test claim sentences using SentenceTransformer
    test_embeddings = generate_embeddings(test_claim_sentences, model_name)

    # Fit KMeans model on the generated embeddings
    kmeans_model.fit(test_embeddings)

    # Predict clusters for the test claim sentences
    predicted_clusters = kmeans_model.predict(test_embeddings)

    # Print the predicted clusters for each test sentence (labels shown 1-based)
    print("Predicted Clusters for New Data:")
    for i, cluster_label in enumerate(predicted_clusters):
        print(f"Document {i + 1}: Cluster = {cluster_label + 1}")

    # Print the results of clustering
    print_the_results(predicted_clusters, test_claim_sentences)


In [31]:
model_KMeansST(test)



Predicted Clusters for New Data:
Document 1: Cluster = 8
Document 2: Cluster = 5
Document 3: Cluster = 8
Document 4: Cluster = 1
Document 5: Cluster = 1
Document 6: Cluster = 10
Document 7: Cluster = 10
Document 8: Cluster = 9
Document 9: Cluster = 9
Document 10: Cluster = 9
Document 11: Cluster = 1
Document 12: Cluster = 7
Document 13: Cluster = 8
Document 14: Cluster = 2
Document 15: Cluster = 2
Document 16: Cluster = 2
Document 17: Cluster = 2
Document 18: Cluster = 5
Document 19: Cluster = 5
Document 20: Cluster = 5
Document 21: Cluster = 2
Document 22: Cluster = 7
Document 23: Cluster = 2
Document 24: Cluster = 5
Document 25: Cluster = 5
Document 26: Cluster = 2
Document 27: Cluster = 2
Document 28: Cluster = 2
Document 29: Cluster = 2
Document 30: Cluster = 2
Document 31: Cluster = 3
Document 32: Cluster = 6
Document 33: Cluster = 1
Document 34: Cluster = 1
Document 35: Cluster = 6
Document 36: Cluster = 3
Document 37: Cluster = 6
Document 38: Cluster = 4
Document 39: Cluster = 4

In [32]:
from gensim.models import Word2Vec
from sklearn.cluster import KMeans
import numpy as np


In [33]:
def sentence_to_vector(sentence, model):
    """
    Represent a sentence as the average of the vectors of its known words.

    :param sentence: Sentence to convert; it is tokenized on whitespace
    :param model: Object exposing ``wv`` (membership test and lookup by word)
                  and ``vector_size``, e.g. a Word2Vec model
    :return: Mean of the word vectors, or a zero vector of length
             ``model.vector_size`` when no token is known to the model
    """
    embeddings = model.wv

    # Look up the vector of every whitespace token the model knows
    known_vectors = [embeddings[token] for token in sentence.split() if token in embeddings]

    # Average over the words when there is something to average
    if known_vectors:
        return np.mean(known_vectors, axis=0)

    # No known word at all: fall back to the zero vector
    return np.zeros(model.vector_size)


In [34]:
def model_KMeansW2V(test_claim_sentences):
    """
    Performs K-means clustering on the Word2Vec vectors of test claim sentences.

    A Word2Vec model is trained on the given sentences themselves, each sentence is
    averaged into one vector, and the vectors are clustered. Prints the cluster of
    every sentence (1-based) and then the per-title summary produced by
    print_the_results. Nothing is trained or clustered when the input is empty.

    :param test_claim_sentences: List of test claim sentences
    """
    # Check for empty input before any model is built: neither the Word2Vec
    # training nor KMeans.fit can work without data
    if len(test_claim_sentences) == 0:
        print("No claim sentences to cluster.")
        return

    # Word2Vec model
    word2vec_model = Word2Vec(
        sentences=[sentence.split() for sentence in test_claim_sentences],  # Convert sentences to word lists
        vector_size=100,  # Dimensionality of word vectors
        window=5,         # Maximum distance between the current and predicted word within a sentence
        min_count=1,      # Ignore words with a frequency lower than this
        workers=4         # Number of threads to train the model
    )

    # Generate Word2Vec vectors for test claim sentences
    test_vectors = np.array([sentence_to_vector(sentence, word2vec_model) for sentence in test_claim_sentences])

    # Cluster Modeling (K-means clustering); KMeans cannot form more clusters
    # than there are samples
    n_clusters = min(num_clusters, len(test_vectors))
    kmeans_model = KMeans(n_clusters=n_clusters, random_state=42)

    # Fit KMeans model on the Word2Vec vectors
    kmeans_model.fit(test_vectors)

    # Predict clusters for the test claim sentences
    new_data_clusters = kmeans_model.predict(test_vectors)

    # Print the predicted clusters for each document (labels shown 1-based)
    print("Predicted Clusters for New Data:")
    for i, cluster_label in enumerate(new_data_clusters):
        print(f"Document {i+1}: Cluster = {cluster_label + 1}")

    # Print the results of clustering
    print_the_results(new_data_clusters, test_claim_sentences)


In [35]:
model_KMeansW2V(test)

Predicted Clusters for New Data:
Document 1: Cluster = 9
Document 2: Cluster = 9
Document 3: Cluster = 9
Document 4: Cluster = 5
Document 5: Cluster = 8
Document 6: Cluster = 5
Document 7: Cluster = 5
Document 8: Cluster = 8
Document 9: Cluster = 8
Document 10: Cluster = 9
Document 11: Cluster = 5
Document 12: Cluster = 5
Document 13: Cluster = 9
Document 14: Cluster = 6
Document 15: Cluster = 3
Document 16: Cluster = 7
Document 17: Cluster = 3
Document 18: Cluster = 1
Document 19: Cluster = 6
Document 20: Cluster = 6
Document 21: Cluster = 1
Document 22: Cluster = 9
Document 23: Cluster = 7
Document 24: Cluster = 6
Document 25: Cluster = 6
Document 26: Cluster = 3
Document 27: Cluster = 7
Document 28: Cluster = 6
Document 29: Cluster = 6
Document 30: Cluster = 3
Document 31: Cluster = 4
Document 32: Cluster = 4
Document 33: Cluster = 4
Document 34: Cluster = 4
Document 35: Cluster = 4
Document 36: Cluster = 10
Document 37: Cluster = 4
Document 38: Cluster = 2
Document 39: Cluster = 2


In [36]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

In [37]:
def model_LDA(test_claim_sentences):
    """
    Performs Latent Dirichlet Allocation (LDA) topic modeling on the test claim sentences.

    The sentences are turned into word counts (English stop words removed), an LDA
    model is fitted on them, and each sentence is assigned the topic with the highest
    value in its topic distribution. Prints that topic for every sentence (1-based)
    and then the per-title summary produced by print_the_results. Nothing is
    modelled when the input is empty.

    :param test_claim_sentences: List of test claim sentences
    """
    # Nothing to vectorize or model (e.g. scraping produced no sentences)
    if len(test_claim_sentences) == 0:
        print("No claim sentences to model.")
        return

    # Preprocessing: Convert text into a numerical format
    vectorizer = CountVectorizer(stop_words='english')
    X = vectorizer.fit_transform(test_claim_sentences)

    # Topic Modeling (LDA)
    lda_model = LatentDirichletAllocation(n_components=num_clusters, random_state=42)
    lda_model.fit(X)

    # The model is applied to the same sentences it was fitted on, so the
    # count matrix is reused instead of vectorizing the sentences a second time
    test_cluster_distributions = lda_model.transform(X)

    test_clusters = []

    # Print the predicted topics for each document
    print("Predicted Topics for New Data:")
    for i, cluster_distribution in enumerate(test_cluster_distributions):
        # Topic with the highest value (first one on ties), reported 1-based
        cluster = int(cluster_distribution.argmax()) + 1
        test_clusters.append(cluster)
        print(f"Document {i+1}: Cluster: {cluster}")

    # Display the results of clustering
    print_the_results(test_clusters, test_claim_sentences)


In [38]:
model_LDA(test)

Predicted Topics for New Data:
Document 1: Cluster: 9
Document 2: Cluster: 9
Document 3: Cluster: 3
Document 4: Cluster: 3
Document 5: Cluster: 9
Document 6: Cluster: 9
Document 7: Cluster: 10
Document 8: Cluster: 9
Document 9: Cluster: 9
Document 10: Cluster: 9
Document 11: Cluster: 3
Document 12: Cluster: 3
Document 13: Cluster: 9
Document 14: Cluster: 8
Document 15: Cluster: 2
Document 16: Cluster: 1
Document 17: Cluster: 1
Document 18: Cluster: 3
Document 19: Cluster: 5
Document 20: Cluster: 5
Document 21: Cluster: 2
Document 22: Cluster: 2
Document 23: Cluster: 1
Document 24: Cluster: 5
Document 25: Cluster: 5
Document 26: Cluster: 1
Document 27: Cluster: 1
Document 28: Cluster: 8
Document 29: Cluster: 2
Document 30: Cluster: 1
Document 31: Cluster: 3
Document 32: Cluster: 7
Document 33: Cluster: 7
Document 34: Cluster: 7
Document 35: Cluster: 7
Document 36: Cluster: 7
Document 37: Cluster: 7
Document 38: Cluster: 5
Document 39: Cluster: 5
Document 40: Cluster: 5
Document 41: Clus