In [5]:
!pip install Flask
!pip install gensim
!pip install beautifulsoup4
!pip install requests
!pip install nltk
!pip install torch
!pip install transformers
!pip install scikit-learn
!pip install Werkzeug




In [None]:
from flask import Flask, request, render_template, jsonify
from gensim import corpora
from bs4 import BeautifulSoup
import requests
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import torch
import numpy as np
import re
from transformers import BertTokenizer, BertModel, pipeline
from sklearn.cluster import AgglomerativeClustering
import traceback
from werkzeug.serving import run_simple

# Flask application object; HTML templates are looked up in the
# 'my_templates' folder instead of Flask's default template folder.
app = Flask(__name__, template_folder='my_templates')
# Ask Flask to reload templates when they change on disk.
app.config['TEMPLATES_AUTO_RELOAD'] = True

# Single lemmatizer instance shared by preprocess_claims().
lemmatizer = WordNetLemmatizer()

# Load the pretrained 'bert-base-uncased' tokenizer and model once at import
# time; get_bert_embeddings() reuses these module-level objects on every call.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

def extract_patent_claims(url, timeout=10):
    """Download a patent page and return the text of its claim elements.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server, forwarded to
            ``requests.get``. Without a timeout a stalled server would
            block the calling request handler indefinitely.

    Returns:
        A list with one string per ``<div class="claim-text">`` element found
        in the page, in document order, each produced by
        ``get_text(strip=True)``. The list is empty when the page contains no
        such element.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of parsing an error page and silently
    # returning no claims.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')
    return [
        node.get_text(strip=True)
        for node in soup.find_all('div', class_='claim-text')
    ]

def preprocess_claims(claims):
    """Normalise raw claim strings into space-joined lemmatised tokens.

    Each claim is lower-cased, stripped of every character that is not a
    lowercase ASCII letter or whitespace, tokenised with NLTK, filtered
    against the English stop-word list, and lemmatised with the module-level
    ``lemmatizer``.

    Args:
        claims: Iterable of claim strings.

    Returns:
        A list of cleaned strings, one per input claim, in the same order.
    """
    english_stops = set(stopwords.words('english'))

    def _clean(raw):
        # Lower-case first so the regex only has to whitelist a-z.
        letters_only = re.sub(r'[^a-z\s]', '', raw.lower())
        kept = []
        for token in word_tokenize(letters_only):
            if token in english_stops:
                continue
            kept.append(lemmatizer.lemmatize(token))
        return ' '.join(kept)

    return [_clean(raw) for raw in claims]

def get_bert_embeddings(texts):
    """Embed each text with the module-level BERT ``tokenizer`` and ``model``.

    Every text is encoded on its own (truncated to at most 512 tokens) and
    represented by the final hidden state at the first token position.

    Args:
        texts: Iterable of strings to embed.

    Returns:
        A NumPy array built from one embedding vector per text, in input
        order. For an empty ``texts`` the result is an empty 1-D array.
    """
    def _first_token_vector(text):
        encoded = tokenizer(
            text,
            return_tensors='pt',
            truncation=True,
            padding=True,
            max_length=512,
        )
        # Inference only: skip gradient tracking.
        with torch.no_grad():
            hidden_states = model(**encoded).last_hidden_state
            return hidden_states[:, 0, :].squeeze().numpy()

    return np.array([_first_token_vector(text) for text in texts])

def generate_topic_title(claims, summarizer, *, max_words=512, chunk_size=256,
                         max_length=20, min_length=5):
    """Condense a group of claims into one short title using ``summarizer``.

    The claims are joined with single spaces. If the joined text has more
    than ``max_words`` whitespace-separated words it is split into chunks of
    ``chunk_size`` words, each chunk is summarised, and the chunk summaries
    are joined. This reduction is repeated while the joined summaries are
    still longer than ``max_words`` (and are actually getting shorter), so
    the final call never receives an unbounded concatenation of chunk
    summaries. The remaining text is then summarised once more to produce
    the title.

    Args:
        claims: Iterable of claim strings.
        summarizer: Callable invoked as
            ``summarizer(text, max_length=..., min_length=..., do_sample=False)``
            that returns a sequence whose first item is a mapping with a
            ``'summary_text'`` key.
        max_words: Largest word count passed to the summarizer in one piece.
        chunk_size: Number of words per chunk when the text must be split.
        max_length: ``max_length`` forwarded to every summarizer call.
        min_length: ``min_length`` forwarded to every summarizer call.

    Returns:
        The title string, or ``''`` when there is no text to summarise (the
        summarizer is not called in that case).

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be a positive integer")

    def _summarize(text):
        # Single place that knows the summarizer's calling convention.
        result = summarizer(
            text, max_length=max_length, min_length=min_length, do_sample=False
        )
        return result[0]['summary_text']

    text = ' '.join(claims)
    words = text.split()

    while len(words) > max_words:
        partials = [
            _summarize(' '.join(words[start:start + chunk_size]))
            for start in range(0, len(words), chunk_size)
        ]
        text = ' '.join(partials)
        reduced = text.split()
        if len(reduced) >= len(words):
            # The summarizer is not shrinking the text; stop reducing so the
            # loop is guaranteed to terminate and summarise what we have.
            break
        words = reduced

    if not text.strip():
        # Nothing meaningful to summarise; avoid calling the model on blanks.
        return ''
    return _summarize(text)

# Patent pages whose claims are scraped and clustered on every POST.
PATENT_URLS = (
    'https://patents.google.com/patent/US9634864B2/en?oq=US9634864B2',
    'https://patents.google.com/patent/US9980046B2/en?oq=US9980046B2',
    'https://patents.google.com/patent/GB2478972A/en?q=(phone)&oq=phone',
)

# Summarization pipeline, created lazily by _get_summarizer() and then reused.
_summarizer = None


def _get_summarizer():
    """Return the shared summarization pipeline, building it on first use."""
    global _summarizer
    if _summarizer is None:
        # Constructing the pipeline loads the model, so do it once rather
        # than on every request.
        _summarizer = pipeline('summarization', model="facebook/bart-large-cnn")
    return _summarizer


@app.route("/", methods=["GET", "POST"])
def index():
    """Serve the input form (GET) or cluster patent claims and show titles (POST).

    On POST the handler reads ``num_clusters`` from the submitted form,
    scrapes the claims of every page in ``PATENT_URLS``, preprocesses and
    embeds them, groups the embeddings with Ward-linkage agglomerative
    clustering, and generates one title per group.

    Returns:
        * GET: the rendered ``index.html`` template.
        * POST, success: the rendered ``results.html`` template, given
          ``num_clusters`` and ``groups`` (a list of dicts with the keys
          ``'title'`` and ``'number_of_claims'``).
        * POST, invalid ``num_clusters``: a plain-text message with status 400.
        * POST, any other failure: a plain-text message with status 500; the
          traceback is printed to the console.
    """
    if request.method != "POST":
        return render_template("index.html")

    # Validate user input up front so bad requests get a 400, not a 500.
    try:
        num_clusters = int(request.form.get('num_clusters', ''))
    except ValueError:
        return "num_clusters must be an integer.", 400
    if num_clusters < 1:
        return "num_clusters must be at least 1.", 400

    try:
        raw_claims = []
        for url in PATENT_URLS:
            raw_claims.extend(extract_patent_claims(url))

        # Clustering cannot produce more groups than there are samples.
        if num_clusters > len(raw_claims):
            return (
                f"num_clusters ({num_clusters}) exceeds the number of claims "
                f"found ({len(raw_claims)}).",
                400,
            )

        preprocessed_claims = preprocess_claims(raw_claims)
        embeddings = get_bert_embeddings(preprocessed_claims)

        clusterer = AgglomerativeClustering(
            n_clusters=num_clusters, metric='euclidean', linkage='ward'
        )
        cluster_labels = clusterer.fit_predict(embeddings)

        grouped_claims = {label: [] for label in range(num_clusters)}
        for label, text in zip(cluster_labels, preprocessed_claims):
            grouped_claims[label].append(text)

        summarizer = _get_summarizer()
        groups = [
            {
                'title': generate_topic_title(members, summarizer),
                'number_of_claims': len(members),
            }
            for members in grouped_claims.values()
        ]

        return render_template("results.html", num_clusters=num_clusters, groups=groups)
    except Exception:
        # Top-level boundary of the request: log the traceback and return a
        # generic error instead of leaking details to the client.
        traceback.print_exc()
        return "An error occurred, please check the console for details.", 500
        
# Start the Flask server only when this file is executed directly (not when
# it is imported). host='0.0.0.0' makes it listen on all network interfaces,
# on port 5000.
if __name__ == "__main__":
    app.run(host='0.0.0.0', port=5000)
