<a href="https://colab.research.google.com/github/AiMl-hub/Gists/blob/main/text_basics.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Tokenization

In [None]:
import nltk

# word_tokenize / sent_tokenize rely on the Punkt tokenizer models, which are
# downloaded separately from the nltk package. Recent NLTK releases look up
# "punkt_tab" while older ones look up "punkt", so make sure both are present.
# The download is skipped when the resource is already installed, which keeps
# re-running this cell cheap.
for resource in ("punkt", "punkt_tab"):
    try:
        nltk.data.find(f"tokenizers/{resource}")
    except LookupError:
        nltk.download(resource, quiet=True)


text = "Hello, world! This is a simple sentence. Tokenization is fun."

# Word Tokenization: split the text into word and punctuation tokens.
words = nltk.word_tokenize(text, language='english')
print("\nWord Tokenization:")
print(words)

# Sentence Tokenization: split the text into sentences. The language is passed
# explicitly so both tokenizers are configured the same way.
sentences = nltk.sent_tokenize(text, language='english')
print("\nSentence Tokenization:")
print(sentences)

# Keyword Search vs Semantic Search

In [None]:
# Keyword Search Example
import re


def keyword_search(query, docs):
    """Return ``(index, document)`` pairs for documents containing *query* as a whole word.

    Matching is case-insensitive. The query must not be directly preceded or
    followed by a word character, so "cat" matches "A cat." but not
    "category" or "concatenate". A blank query matches nothing.

    Args:
        query: Word or phrase to look for.
        docs: Iterable of document strings.

    Returns:
        List of ``(zero_based_index, document)`` tuples in input order.
    """
    term = query.strip()
    if not term:
        # An empty pattern would match every document; treat it as "no hits".
        return []
    # Lookarounds instead of \b so that queries starting or ending with
    # punctuation (e.g. "c++") are still anchored correctly.
    pattern = re.compile(rf"(?<!\w){re.escape(term)}(?!\w)", re.IGNORECASE)
    return [(idx, text) for idx, text in enumerate(docs) if pattern.search(text)]


print("--- Keyword Search Example ---")
documents = [
    "The quick brown fox jumps over the lazy dog.",
    "A dog barks at the cat.",
    "Semantic search uses embeddings.",
    "Keyword search relies on exact word matches.",
]
keyword_query = "dog"

print(f"Keyword search for: '{keyword_query}'")
keyword_results = keyword_search(keyword_query, documents)

if keyword_results:
    print("Found in documents:")
    for i, doc in keyword_results:
        # Document numbers are shown 1-based for readability.
        print(f"- Doc {i+1}: {doc}")
else:
    print("No matching documents found.")
print("\n")


# Semantic Search Example
# Sets up the embedding model used by the semantic search code below.
print("--- Semantic Search Example ---")

# Install (or upgrade, because of -U) the sentence-transformers package.
# The leading "!" is IPython shell-escape syntax, so this line only works
# when executed inside a notebook, not as a plain Python script.
!pip install -U sentence-transformers

# Imported after the install step above so the package is available.
from sentence_transformers import SentenceTransformer, util

# Load a pre-trained model. 'all-MiniLM-L6-v2' is a good general-purpose model.
model = SentenceTransformer('all-MiniLM-L6-v2')

# Small document collection that the semantic search below ranks against.
corpus = [
    "A cat sits on the mat.",
    "The dog runs in the park.",
    "Machine learning is a field of artificial intelligence.",
    "Natural Language Processing deals with text data.",
    "This document talks about pets and animals.",
    "Information retrieval methods include keyword and semantic search.",
]

# List every document with a 1-based number.
print("Corpus documents:")
for number, entry in enumerate(corpus, start=1):
    print(f"- {number}: {entry}")
print("\n")

# Embed every corpus document once, as a tensor.
corpus_embeddings = model.encode(corpus, convert_to_tensor=True)

# The free-text query to rank the corpus against.
semantic_query = "animals in a park"
print(f"Semantic search query: '{semantic_query}'")

# Embed the query with the same model.
query_embedding = model.encode(semantic_query, convert_to_tensor=True)

# cos_sim returns a (queries x documents) score matrix; row 0 holds the
# scores for our single query.
cosine_scores = util.cos_sim(query_embedding, corpus_embeddings)[0]

# Pair each score with its document, then rank best-first.
results = [
    {'corpus_id': idx, 'score': similarity.item(), 'text': corpus[idx]}
    for idx, similarity in enumerate(cosine_scores)
]
results.sort(key=lambda hit: hit['score'], reverse=True)

print("\nTop 3 semantic search results:")
for rank, hit in enumerate(results[:3], start=1):
    print(f"{rank}. Score: {hit['score']:.4f}, Document: {hit['text']}")

# Measuring Vector Distance

In [None]:
import numpy as np
from scipy.spatial import distance

def _report_similarity(vec_a, vec_b):
    """Print and return three similarity measures for a pair of vectors.

    Returns a tuple ``(euclidean_distance, cosine_similarity, dot_product)``.
    """
    # Euclidean distance: lower distance means higher similarity.
    euclid = distance.euclidean(vec_a, vec_b)
    print(f"Euclidean Distance: {euclid:.4f}")

    # Cosine similarity: ranges from -1 (opposite) through 0 (orthogonal)
    # to 1 (identical direction). scipy exposes a cosine *distance*, so it
    # is converted with 1 - distance.
    cos_sim = 1 - distance.cosine(vec_a, vec_b)
    print(f"Cosine Similarity: {cos_sim:.4f}")

    # Dot product: higher value means higher similarity (especially for
    # non-negative vectors).
    dot = np.dot(vec_a, vec_b)
    print(f"Dot Product Similarity: {dot:.4f}")

    return euclid, cos_sim, dot


# Three sample embeddings (vectors): the second is close to the first,
# the third is the exact negation of the first.
embedding1 = np.array([1.0, 2.0, 3.0, 4.0])
embedding2 = np.array([2.0, 3.0, 4.0, 5.0])
embedding3 = np.array([-1.0, -2.0, -3.0, -4.0])

print("Embedding 1:", embedding1)
print("Embedding 2:", embedding2)
print("Embedding 3:", embedding3)

print("\n-- Similarity between Embedding 1 and Embedding 2 --")
(
    euclidean_dist_1_2,
    cosine_similarity_1_2,
    dot_product_similarity_1_2,
) = _report_similarity(embedding1, embedding2)

print("\n-- Similarity between Embedding 1 and Embedding 3 --")
(
    euclidean_dist_1_3,
    cosine_similarity_1_3,
    dot_product_similarity_1_3,
) = _report_similarity(embedding1, embedding3)