In [67]:
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize

import spacy

import torch
from transformers import DistilBertTokenizer, DistilBertModel

tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertModel.from_pretrained('distilbert-base-uncased')

# Load the pre-trained NLP model from spaCy
nlp = spacy.load("en_core_web_md")

# Define a function to determine if two sentences have the same meaning
def is_same_meaning(s1, s2):
    # Parse the two sentences using the NLP model
    doc1 = nlp(s1)
    doc2 = nlp(s2)
    
    # Calculate the similarity between the two parsed sentences
    similarity = doc1.similarity(doc2)
    
    # Determine if the sentences have the same meaning based on the similarity score
    if similarity > 0.95:
        return True
    else:
        return False


# define a meaning comparison function
def are_similar(sentence1, sentence2, threshold=0.95):
    encoded_sentence1 = tokenizer.encode_plus(sentence1, add_special_tokens=True, return_tensors='pt')
    encoded_sentence2 = tokenizer.encode_plus(sentence2, add_special_tokens=True, return_tensors='pt')
    with torch.no_grad():
        sentence1_output = model(encoded_sentence1['input_ids'], attention_mask=encoded_sentence1['attention_mask'])
        sentence2_output = model(encoded_sentence2['input_ids'], attention_mask=encoded_sentence2['attention_mask'])
    similarity = torch.cosine_similarity(sentence1_output.last_hidden_state.mean(dim=1), sentence2_output.last_hidden_state.mean(dim=1)).item()
    print(similarity)
    print(threshold)
    return similarity >= threshold


# define a function to extract nouns from a sentence
def extract_nouns(sent):
    tokens = nltk.word_tokenize(sent)
    return [word for (word, pos) in nltk.pos_tag(tokens) if pos.startswith('N')]

# compare not in 
def print_sentences_not_in_array(sentences, compare):
      for sentence in sentences:
        if sentence not in compare:
            print(sentence)


# input texts
text1 = "the cat jumped over the house. I am not tall"
text2 = " I am tall. the cat jumped over a house. teri mummy."

# tokenize into paragraphs
sent1 = sent_tokenize(text1)
sent2 = sent_tokenize(text2)

# extract nouns for each sentence in both texts
stripped_sentences1 = [extract_nouns(sent) for sent in sentences1]
stripped_sentences2 = [extract_nouns(sent) for sent in sentences2]

# find matched sentences between both texts
matched_sentences = []
for i, sent1 in enumerate(stripped_sentences1):
    for j, sent2 in enumerate(stripped_sentences2):
        if set(sent1) == set(sent2):
            matched_sentences.append((sentences1[i], sentences2[j]))




# print matched sentences
if matched_sentences:
    for sent1, sent2 in matched_sentences:
        print(f"{sent1} <--> {sent2}")
        if are_similar(sent1, sent2):
          print("The two sentences have the same meaning.")
        else:
          print("The two sentences do not have the same meaning.")
    else:
        print("No matches found.")


Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_projector.weight', 'vocab_layer_norm.bias', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


the cat jumped over the house. <--> the cat jumped over a house.
0.9711356163024902
0.95
The two sentences have the same meaning.
I am not tall <-->  I am tall.
0.9079421162605286
0.95
The two sentences do not have the same meaning.
No matches found.
All sentences matched.


In [None]:
!pip install flask-ngrok

from flask_ngrok import run_with_ngrok
from flask import Flask, request

app = Flask(__name__)
run_with_ngrok(app)

@app.route('/compare-texts', methods=['POST'])
def compare_texts():
    text1 = request.form['text1']
    text2 = request.form['text2']
    
    # logic from above would come here
    
    return {'output': 'your output here'}

app.run()


 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on http://127.0.0.1:5000
INFO:werkzeug:[33mPress CTRL+C to quit[0m


 * Running on http://94a3-35-238-39-112.ngrok.io
 * Traffic stats available on http://127.0.0.1:4040
 * Running on http://94a3-35-238-39-112.ngrok.io
 * Traffic stats available on http://127.0.0.1:4040


INFO:werkzeug:127.0.0.1 - - [25/Mar/2023 20:23:42] "[33mGET / HTTP/1.1[0m" 404 -
INFO:werkzeug:127.0.0.1 - - [25/Mar/2023 20:23:43] "[33mGET /favicon.ico HTTP/1.1[0m" 404 -
INFO:werkzeug:127.0.0.1 - - [25/Mar/2023 20:23:52] "[33mGET / HTTP/1.1[0m" 404 -
INFO:werkzeug:127.0.0.1 - - [25/Mar/2023 20:23:53] "[33mGET / HTTP/1.1[0m" 404 -
INFO:werkzeug:127.0.0.1 - - [25/Mar/2023 20:24:17] "[33mGET / HTTP/1.1[0m" 404 -
INFO:werkzeug:127.0.0.1 - - [25/Mar/2023 20:34:12] "[33mGET / HTTP/1.1[0m" 404 -
INFO:werkzeug:127.0.0.1 - - [25/Mar/2023 20:35:47] "[33mGET / HTTP/1.1[0m" 404 -
