### Tokenizer
Run `nltk.download('punkt_tab')` once to download the Punkt tokenizer data used by `word_tokenize`.

In [None]:
import nltk

from nltk.tokenize import sent_tokenize, word_tokenize

# Split the sample sentence into tokens.
example_string = "Generate Values for user, book, car, bike and resources"
tokenized_words = word_tokenize(example_string)
# Show the tokens; without this the cell produces no visible output.
print(tokenized_words)


## Filter Stop Words
Run `nltk.download("stopwords")` once to download the stop-word lists used below.

In [None]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

example_string = "Generate Values for user, book, car, bike and resources"

# English stop words, held in a set for membership tests.
stop_words = set(stopwords.words("english"))
tokenized_words = word_tokenize(example_string)

# Keep only the tokens whose case-folded form is not a stop word.
filtered_list = [
    token
    for token in tokenized_words
    if token.casefold() not in stop_words
]

print(filtered_list)

## Remove symbols and Stemming

In [21]:

import re
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

example_string = "Generating Values for user, book, car, bike and resources"
stop_words = set(stopwords.words("english"))

# Remove every character that is not a letter, a digit or whitespace,
# then tokenize what is left.
clean_text = re.sub(r'[^a-zA-Z0-9\s]', '', example_string)
tokenized_words = word_tokenize(clean_text)

# Drop stop words (compared case-insensitively via casefold()).
filtered_list = [
    token
    for token in tokenized_words
    if token.casefold() not in stop_words
]

# Reduce each remaining token to its Porter stem.
stemmer = PorterStemmer()
stemmed_words = list(map(stemmer.stem, filtered_list))
print(stemmed_words)


['gener', 'valu', 'user', 'book', 'car', 'bike', 'resourc']


## Lemmatization
* nltk.download('wordnet')
* nltk.download('averaged_perceptron_tagger')
* nltk.download('averaged_perceptron_tagger_eng')

In [None]:
# `re` is imported here so the cell does not depend on an earlier cell
# having imported it.
import re

from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

example_string = "Generating geese Values for user, book, car, bike and resources"
# Remove every character that is not a letter, a digit or whitespace.
clean_text = re.sub(r'[^a-zA-Z0-9\s]', '', example_string)
tokenized_words = word_tokenize(clean_text)
# No stop-word filtering in this cell: every token is lemmatized.
filtered_list = list(tokenized_words)

lemma = WordNetLemmatizer()
for word in filtered_list:
    # lemmatize() is called without a part-of-speech argument here.
    print(word, ":", lemma.lemmatize(word))

## Parts of Speech Tagging

`pos_tag` must be given a list of tokens (for example the output of `word_tokenize`), not a raw string.

In [23]:
# `re` is imported here so the cell does not depend on an earlier cell
# having imported it.
import re

from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag

example_string = "Generating Values for user, book, car, bike and resources"
# Remove every character that is not a letter, a digit or whitespace.
clean_text = re.sub(r'[^a-zA-Z0-9\s]', '', example_string)
tokenized_words = word_tokenize(clean_text)
# pos_tag receives the token list and yields (token, tag) pairs.
tagged_string = pos_tag(tokenized_words)
print(tagged_string)

[('Generating', 'VBG'), ('Values', 'NNS'), ('for', 'IN'), ('user', 'NN'), ('book', 'NN'), ('car', 'NN'), ('bike', 'NN'), ('and', 'CC'), ('resources', 'NNS')]


## Chunking and Chinking

In [26]:
# `re` and `nltk` are imported here so the cell does not depend on other
# cells having imported them (nltk.RegexpParser and re.sub are used below).
import re

import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag

example_string = "Generating Values for user, book, car, bike and resources"
# Remove every character that is not a letter, a digit or whitespace.
clean_text = re.sub(r'[^a-zA-Z0-9\s]', '', example_string)
tokenized_words = word_tokenize(clean_text)
tagged_string = pos_tag(tokenized_words)

# Chunking: group an optional DT, any number of JJ and one NN into an NP chunk.
chunk_grammar = "NP: {<DT>?<JJ>*<NN>}"
chunk_parser = nltk.RegexpParser(chunk_grammar)
tree = chunk_parser.parse(tagged_string)
# tree.draw()  # uncomment to view the NP chunk tree

# Chinking: first chunk every tag sequence ({<.*>+}), then exclude the
# CC-tagged tokens from those chunks (}<CC>{).
chink_grammar = """
Chink: {<.*>+}
       }<CC>{"""
chink_parser = nltk.RegexpParser(chink_grammar)
chink_tree = chink_parser.parse(tagged_string)
chink_tree.draw()

## Named Entity Recognition
* nltk.download('maxent_ne_chunker_tab')
* nltk.download("words")

In [2]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag

# Sample text for the named-entity demo.
sample_string = "Apple is looking at buying U.K. startup for $1 billion. Steve Jobs founded the company."
# ne_chunk is fed POS-tagged tokens, so tokenize and tag the text first.
tokenized_words = word_tokenize(sample_string)
tagged_string = pos_tag(tokenized_words)
# Run the named-entity chunker on the tagged tokens and draw the result.
tree = nltk.ne_chunk(tagged_string)
tree.draw()

## Concordance
* import nltk
* nltk.download("book")

In [15]:
from nltk.corpus import gutenberg
from nltk.text import Text

# Load the word list of 'melville-moby_dick.txt' from the Gutenberg corpus
# and wrap it in a Text object.
corpus = gutenberg.words('melville-moby_dick.txt')
text = Text(corpus)
# Show each occurrence of the two-word sequence "honest man" in context.
text.concordance(["honest", "man"])

Displaying 1 of 1 matches:
r , sir ?'--' Soon enough for any honest man that goes a passenger .' Ha ! Jon


## Vector Embedding 

In [None]:
from sentence_transformers import SentenceTransformer
# Sentences to convert into embedding vectors.
sentences = ["This is an example sentence", "Each sentence is converted"]

# Load the all-MiniLM-L6-v2 sentence-embedding model.
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
# Encode all sentences in one call and print the result.
embeddings = model.encode(sentences)
print(embeddings)

## Vector Embedding 
### Semantic Similarity Search

pip install -U sentence-transformers

Common Distance Metrics used
* Euclidean
* Manhattan
* Minkowski
* Chebyshev
* Cosine Similarity
* Hamming

In [24]:
import numpy as np

from numpy.linalg import norm
from sentence_transformers import SentenceTransformer

# Load the sentence-embedding model (it is downloaded automatically if needed).
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# The small "dataset" that the query will be compared against.
sentences = [
  "That is a very happy person",
  "That is a happy dog",
  "Today is a sunny day"
]

# Vector embeddings for every sentence in the dataset.
embeddings = model.encode(sentences)

# Vector embedding of the query sentence, compared against each entry below.
query_embedding = model.encode("That is a happy person")

# Similarity metric used to rank sentences against the query.
def cosine_similarity(a, b):
    """Return the cosine of the angle between vectors ``a`` and ``b``.

    Computed as the dot product of the two vectors divided by the product
    of their norms.
    """
    dot_product = np.dot(a, b)
    magnitude_product = norm(a) * norm(b)
    return dot_product / magnitude_product

# Score every dataset sentence against the query and print the results.
print("Query: That is a happy person")
for sentence, sentence_embedding in zip(sentences, embeddings):
    score = cosine_similarity(sentence_embedding, query_embedding)
    print(sentence, " -> similarity score = ", score)


Query: That is a happy person
That is a very happy person  -> similarity score =  0.9429151
That is a happy dog  -> similarity score =  0.6945774
Today is a sunny day  -> similarity score =  0.25687614


## PgVector With Python
pip install psycopg2
+ 384 is the embedding dimension of all-MiniLM-L6-v2, which is why the column is declared as `VECTOR(384)`

In [39]:
import os

import psycopg2
import numpy as np
from sentence_transformers import SentenceTransformer


# Read connection settings from the environment so that no password is stored
# in the notebook. The database and user fall back to the previous values;
# if PGPASSWORD is unset, no password is passed to connect().
conn = psycopg2.connect(
    dbname=os.environ.get("PGDATABASE", "vector_db"),
    user=os.environ.get("PGUSER", "postgres"),
    password=os.environ.get("PGPASSWORD"),
)
cursor = conn.cursor()

# The VECTOR column type comes from the pgvector extension; make sure it is
# enabled in this database before creating the table.
cursor.execute("CREATE EXTENSION IF NOT EXISTS vector;")

# One row per stored embedding; 384 matches the model's embedding size.
create_table_query = """
CREATE TABLE IF NOT EXISTS embeddings (
    id SERIAL PRIMARY KEY,
    vector_embeddings VECTOR(384)
);
"""
cursor.execute(create_table_query)
conn.commit()
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

def create_embedding():
    """Encode three fixed sample sentences and store their vectors.

    Uses the module-level ``model``, ``cursor`` and ``conn``. One row is
    inserted into the ``embeddings`` table per sentence, after which the
    transaction is committed.
    """
    sample_sentences = [
    "That is a happy dog",
    "Today is a sunny day",
    "That is a very happy person"
    ]
    insert_sql = "INSERT INTO embeddings (vector_embeddings) VALUES (%s)"
    # Each parameter set is a 1-tuple (note the trailing comma) so that the
    # whole vector is bound to the single %s placeholder.
    parameter_sets = [
        (vector,) for vector in model.encode(sample_sentences).tolist()
    ]
    cursor.executemany(insert_sql, parameter_sets)
    conn.commit()

def query_embedding():
    """Embed a fixed query sentence and print its closest stored vectors.

    Uses the module-level ``model`` and ``cursor``. Rows are ordered by
    pgvector's ``<->`` operator, which is a distance (smaller means closer),
    and at most two rows are returned. Only the row id is available because
    the table does not store the sentence text.
    """
    query_sentence = "That is a happy person"
    # encode() is given a one-element list, so take row 0 and convert the
    # numpy array to a plain list that psycopg2 can adapt.
    query_vector = model.encode([query_sentence])[0].tolist()

    # Nearest-neighbour search with pgvector, closest rows first.
    cursor.execute("""
        SELECT id, vector_embeddings <-> %s::vector AS distance
        FROM embeddings
        ORDER BY distance
        LIMIT 2;
    """, (query_vector,))

    results = cursor.fetchall()
    print(f"Query: {query_sentence}")
    if not results:
        print("No stored embeddings found; run create_embedding() first.")
    for row_id, distance in results:
        # Label the columns for what they are: the row id and its distance.
        print(f"Id: {row_id}, Distance: {distance}")

# create_embedding()  # run once to populate the table; each run inserts new rows
try:
    query_embedding()
finally:
    # Release the cursor and connection even if the query raises.
    cursor.close()
    conn.close()

Query: That is a happy person
Sentence: 3, Embedding: 0.3378904596229104
