# 1. Word Embeddings with Cosine Similarity using Word2Vec:

In [None]:
import gensim
from gensim.models import Word2Vec
from sklearn.metrics.pairwise import cosine_similarity

# Sample sentences
sentence1 = "The quick brown fox jumps over the lazy dog."
sentence2 = "A fast brown fox leaps over a sleeping canine."

# Pre-trained Word2Vec model (download or use your own)
model = gensim.models.KeyedVectors.load_word2vec_format('word2vec.bin', binary=True)

# Tokenize and average word embeddings for each sentence
tokens1 = sentence1.lower().split()
tokens2 = sentence2.lower().split()
vector1 = sum(model[token] for token in tokens1) / len(tokens1)
vector2 = sum(model[token] for token in tokens2) / len(tokens2)

# Calculate cosine similarity
similarity_score = cosine_similarity(vector1.reshape(1, -1), vector2.reshape(1, -1))[0, 0]
print(f"Cosine Similarity: {similarity_score}")

In [7]:
import gensim.downloader as api
from gensim.models import KeyedVectors
from sklearn.metrics.pairwise import cosine_similarity
import string

# Sample sentences
sentence1 = "The quick brown fox jumps over the lazy dog."
sentence2 = "A fast brown fox leaps over a sleeping canine."

# Download the pre-trained Word2Vec model (only required once)
model = api.load("word2vec-google-news-300")

# Tokenize and get embeddings for each sentence
translator = str.maketrans('', '', string.punctuation)
tokens1 = [token.translate(translator).lower() for token in sentence1.split()]
tokens2 = [token.translate(translator).lower() for token in sentence2.split()]

# Filter out empty strings after removing punctuation
tokens1 = list(filter(None, tokens1))
tokens2 = list(filter(None, tokens2))

# Calculate embeddings for each token
embeddings1 = [model[token] for token in tokens1 if token in model]
embeddings2 = [model[token] for token in tokens2 if token in model]

# Handle cases where no tokens are in the Word2Vec model
if not embeddings1 or not embeddings2:
    print("No embeddings found for one or both sentences.")
    exit()

# Average the embeddings for each sentence
vector1 = sum(embeddings1) / len(embeddings1)
vector2 = sum(embeddings2) / len(embeddings2)

# Calculate cosine similarity
similarity_score = cosine_similarity(vector1.reshape(1, -1), vector2.reshape(1, -1))
print(f"Cosine Similarity: {similarity_score[0, 0]}")


Cosine Similarity: 0.8298037648200989


# 2. BERT and Transformer-based Models:

In [5]:
from transformers import BertTokenizer, BertModel
import torch
from sklearn.metrics.pairwise import cosine_similarity

# Sample sentences
sentence1 = "The quick brown fox jumps over the lazy dog."
sentence2 = "A fast brown fox leaps over a sleeping canine."

# Load pre-trained BERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# Tokenize and get embeddings for each sentence
inputs1 = tokenizer(sentence1, return_tensors='pt', padding=True, truncation=True)
inputs2 = tokenizer(sentence2, return_tensors='pt', padding=True, truncation=True)

# Get BERT embeddings
with torch.no_grad():
    embeddings1 = model(**inputs1).pooler_output
    embeddings2 = model(**inputs2).pooler_output

# Calculate cosine similarity
similarity_score = cosine_similarity(embeddings1, embeddings2)
print(f"Cosine Similarity: {similarity_score.item()}")

Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Cosine Similarity: 0.9935073852539062


# Pros:
State-of-the-art performance in various NLP tasks due to deep contextual understanding.
Can handle out-of-vocabulary words effectively.
# Cons:
Computationally expensive and requires significant resources.
Slower compared to simpler methods.
May require a large amount of training data for fine-tuning.

# 3. Cosine Similarity with TF-IDF Vectors:

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Sample sentences
sentence1 = "The quick brown fox jumps over the lazy dog."
sentence2 = "A fast brown fox leaps over a sleeping canine."

# Create the TF-IDF vectorizer
vectorizer = TfidfVectorizer()

# Transform sentences into TF-IDF vectors
tfidf_matrix = vectorizer.fit_transform([sentence1, sentence2])

# Calculate cosine similarity
similarity_score = cosine_similarity(tfidf_matrix[0], tfidf_matrix[1])
print(f"Cosine Similarity: {similarity_score[0, 0]}")

Cosine Similarity: 0.2095424038071013


# 4.  Siamese Networks:

In [1]:
# import numpy as np
# from sklearn.metrics.pairwise import cosine_similarity
# import gensim.downloader as api


# # Sample sentences
# sentence1 = "The quick brown fox jumps over the lazy dog."
# sentence2 = "A fast brown fox leaps over a sleeping canine."

# # Pre-trained Word2Vec model (download or use your own)
# model = api.load("word2vec-google-news-300")

# # Tokenize and get embeddings for each sentence
# tokens1 = sentence1.lower().split()
# tokens2 = sentence2.lower().split()
# vector1 = np.array([model[token] for token in tokens1])
# vector2 = np.array([model[token] for token in tokens2])

# # Calculate cosine similarity
# similarity_score = cosine_similarity(vector1.reshape(1, -1), vector2.reshape(1, -1))
# print(f"Cosine Similarity: {similarity_score[0, 0]}")

In [2]:
import tkinter as tk

def generate_function():
    location = location_entry.get()
    property_type = property_type_entry.get()
    bhk = bhk_entry.get() if bhk_entry.get() else None
    sqft = sqft_entry.get() if sqft_entry.get() else None

    results = property_search(location, property_type, bhk, sqft)
    # Do something with the results, e.g., display them in a GUI or print them.

# Create the GUI
root = tk.Tk()
root.title("Property Search")

location_label = tk.Label(root, text="Location:")
location_label.pack()
location_entry = tk.Entry(root)
location_entry.pack()

property_type_label = tk.Label(root, text="Property Type:")
property_type_label.pack()
property_type_entry = tk.Entry(root)
property_type_entry.pack()

bhk_label = tk.Label(root, text="BHK:")
bhk_label.pack()
bhk_entry = tk.Entry(root)
bhk_entry.pack()

sqft_label = tk.Label(root, text="SQFT:")
sqft_label.pack()
sqft_entry = tk.Entry(root)
sqft_entry.pack()

search_button = tk.Button(root, text="Search", command=generate_function)
search_button.pack()
root.mainloop()

Exception in Tkinter callback
Traceback (most recent call last):
  File "/home/codetrade/anaconda3/lib/python3.10/tkinter/__init__.py", line 1921, in __call__
    return self.func(*args)
  File "/tmp/ipykernel_6458/131764056.py", line 9, in generate_function
    results = property_search(location, property_type, bhk, sqft)
NameError: name 'property_search' is not defined
Exception in Tkinter callback
Traceback (most recent call last):
  File "/home/codetrade/anaconda3/lib/python3.10/tkinter/__init__.py", line 1921, in __call__
    return self.func(*args)
  File "/tmp/ipykernel_6458/131764056.py", line 9, in generate_function
    results = property_search(location, property_type, bhk, sqft)
NameError: name 'property_search' is not defined
Exception in Tkinter callback
Traceback (most recent call last):
  File "/home/codetrade/anaconda3/lib/python3.10/tkinter/__init__.py", line 1921, in __call__
    return self.func(*args)
  File "/tmp/ipykernel_6458/131764056.py", line 9, in generate_fu

In [6]:
def property_search(question, location, property_type, details=None):
    # Validating the question type
    if question.lower() not in ["search", "create function"]:
        return "Invalid question. Please use 'search' or 'create function'."

    # Validating the property type
    property_type = property_type.lower()
    if property_type not in ["flat", "house"]:
        return "Invalid property type. Supported types are 'flat' or 'house'."

    # Action statements based on the provided statements
    if question.lower() == "search":
        if details is None:
            return f"Invalid details for {property_type} search."

        if "bhk" in details.lower() and property_type == "flat":
            return f"Searching for {details} {property_type} in {location}"
        elif "sqft" in details.lower() and property_type == "flat":
            return f"Searching for {property_type} with {details} in {location}"
        elif "sqft" in details.lower() and property_type == "house":
            return f"Searching for {property_type} with {details} in {location}"
        else:
            return f"Invalid details for {property_type} search."

    elif question.lower() == "create function":
        return """
def custom_property_search(location, property_type, details):
    # Custom function implementation based on location, property_type, and details
    pass
"""
    else:
        return "Invalid question. Please use 'search' or 'create function'."

# Example usage:
question = "search"
location = "New York"
property_type = "flat"
details = "2 bhk"
print(property_search(question, location, property_type, details))

question = "search"
location = "London"
property_type = "house"
details = "1500 sqft"
print(property_search(question, location, property_type, details))


Searching for 2 bhk flat in New York
Searching for house with 1500 sqft in London


In [7]:
def question_generate(location, property_type, area, price):
    if property_type.lower() == 'house':
        return f"What is the {property_type} size in sqft at {location}? It is {area} sqft and priced at ${price}."
    elif property_type.lower() == 'flat':
        return f"How many BHKs are there in the {property_type} located at {location}? It is a {area} BHK flat and priced at ${price}."
    else:
        return "Invalid property type. Please provide 'house' or 'flat'."

# Example usage:
location = "New York"
house_sqft = 2000
house_price = 300000
flat_bhk = 2
flat_price = 200000
print(question_generate(location, "house", house_sqft, house_price))
print(question_generate(location, "flat", flat_bhk, flat_price))

What is the house size in sqft at New York? It is 2000 sqft and priced at $300000.
How many BHKs are there in the flat located at New York? It is a 2 BHK flat and priced at $200000.


In [8]:
def question_generate(location, property_type, area, price):
    if property_type.lower() == 'house':
        return f"What is the {property_type} size in sqft at {location}? It is {area} sqft and priced at ${price}."
    elif property_type.lower() == 'flat':
        return f"How many BHKs are there in the {property_type} located at {location}? It is a {area} BHK flat and priced at ${price}."
    else:
        return "Invalid property type. Please provide 'house' or 'flat'."

# Ask the user for inputs
location = input("Enter the location: ")
property_type = input("Enter the property type (house or flat): ")
area = input("Enter the size in sqft (for house) or BHK (for flat): ")
price = input("Enter the price: ")

# Generate the question and print it
question = question_generate(location, property_type, area, price)
print("Generated Question:", question)


Enter the location: rajkot
Enter the property type (house or flat): flat
Enter the size in sqft (for house) or BHK (for flat): 2bhk
Enter the price: 900
Generated Question: How many BHKs are there in the flat located at rajkot? It is a 2bhk BHK flat and priced at $900.


In [9]:
import random

def question_generate(location, property_type, area, price):
    questions = []
    if property_type.lower() == 'house':
        questions.append(f"What is the {property_type} size in sqft at {location}? It is {area} sqft and priced at ${price}.")
        questions.append(f"How big is the {property_type} at {location}? It covers an area of {area} sqft and costs ${price}.")
    elif property_type.lower() == 'flat':
        questions.append(f"How many BHKs are there in the {property_type} located at {location}? It is a {area} BHK flat and priced at ${price}.")
        questions.append(f"What is the size of the {property_type} in {location}? It has {area} BHK and is priced at ${price}.")
    else:
        return "Invalid property type. Please provide 'house' or 'flat'."

    # Select a random question from the list
    return random.choice(questions)

# Ask the user for inputs
location = input("Enter the location: ")
property_type = input("Enter the property type (house or flat): ")
area = input("Enter the size in sqft (for house) or BHK (for flat): ")
price = input("Enter the price: ")

# Generate the question and print it
question = question_generate(location, property_type, area, price)
print("Generated Question:", question)

Enter the location: rajkot
Enter the property type (house or flat): flat
Enter the size in sqft (for house) or BHK (for flat): 2bhk
Enter the price: 9090
Generated Question: What is the size of the flat in rajkot? It has 2bhk BHK and is priced at $9090.


In [3]:
mwes = open('/home/codetrade/Downloads/CSV/hello/manyterms.lower.txt').read().lower().strip().split('\n')
print(mwes[44444:44456])
print(len(mwes), 'mwes')

['antonio superchi', 'antonio tarver', 'antonio torres jurado', 'antonio valdes', 'antonio valdes y fernandez bazan', 'antonio valdez', 'antonio valdés y bazán', 'antonio valdés y fernández bazán', 'antonio valente', 'antonio vitali', 'antonio vivaldi', 'antonio xavier machado e cerveira']
743274 mwes


In [13]:
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
import numpy as np
patent_texts = pd.read_csv("/home/codetrade/Downloads/CSV/hello/hearst_patterns.155.csv")

cvectorizer = CountVectorizer(ngram_range=(1, 4), stop_words="english", vocabulary=mwes, lowercase=True)
X = cvectorizer.fit_transform(patent_texts)
termdf_cv = pd.DataFrame(np.sum(X, axis=0), columns=cvectorizer.get_feature_names()).T
termdf_cv = termdf_cv.sort_values(by=0, ascending=False)
print(termdf_cv.head(25))




AttributeError: 'CountVectorizer' object has no attribute 'get_feature_names'