# **M Anwaar Ur Rehman**
# **Roll No: 28**
# **BS: Data Science**

---
---
# **TF-IDF Feature Names & Matrix**

In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Toy corpus: three short sentences with overlapping vocabulary
corpus = [
    "Text mining is fun",
    "Mining data helps extract insights",
    "Data mining and text analytics are valuable",
]

# Fit the vectorizer on the corpus and build the document-term matrix in one step
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(corpus)

# Show the vocabulary the vectorizer extracted
print("TF-IDF Feature Names:", vectorizer.get_feature_names_out())

# Visual divider between the two outputs: the same rule on four consecutive lines
divider = "::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::"
print("\n".join([divider] * 4))

# Convert the fitted matrix to a dense array so every weight is printed
print("TF-IDF Matrix:\n", tfidf_matrix.toarray())


TF-IDF Feature Names: ['analytics' 'and' 'are' 'data' 'extract' 'fun' 'helps' 'insights' 'is'
 'mining' 'text' 'valuable']
::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
TF-IDF Matrix:
 [[0.         0.         0.         0.         0.         0.5844829
  0.         0.         0.5844829  0.34520502 0.44451431 0.        ]
 [0.         0.         0.         0.38376993 0.50461134 0.
  0.50461134 0.50461134 0.         0.29803159 0.         0.        ]
 [0.4261835  0.4261835  0.4261835  0.32412354 0.         0.
  0.         0.         0.         0.25171084 0.32412354 0.4261835 ]]


# **Cosine Similarity**

In [2]:
# Cosine Similarity Example

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Three short example sentences to compare pairwise
docs = [
    "Text mining is fun",
    "Text mining helps extract insights",
    "Data mining and text analytics are valuable",
]

# Turn each sentence into a vector of raw term counts
vectorizer = CountVectorizer()
tf_matrix = vectorizer.fit_transform(docs)

# With a single argument the rows are compared against each other,
# which is the same as passing the matrix twice
cos_sim = cosine_similarity(tf_matrix)
print("Cosine Similarity Matrix:\n", cos_sim)


Cosine Similarity Matrix:
 [[1.         0.4472136  0.37796447]
 [0.4472136  1.         0.3380617 ]
 [0.37796447 0.3380617  1.        ]]


# **Jaccard Similarity**

In [3]:
def jaccard_similarity(doc1, doc2):
    """Return the Jaccard similarity of two whitespace-tokenized documents.

    Each document is split on whitespace and reduced to its set of distinct
    tokens; the score is ``|A & B| / |A | B|``. Tokens are compared exactly
    as written (case-sensitive, punctuation kept).

    Parameters
    ----------
    doc1, doc2 : str
        The documents to compare.

    Returns
    -------
    float
        A value in ``[0.0, 1.0]``. When both documents contain no tokens
        (empty or whitespace-only) the two token sets are identical, so the
        result is ``1.0`` rather than a ``ZeroDivisionError``.
    """
    # Tokenize documents into sets of distinct tokens
    set1, set2 = set(doc1.split()), set(doc2.split())

    union = len(set1 | set2)
    if union == 0:
        # Both sets are empty: the ratio is undefined (0 / 0). Treat two
        # identical (empty) sets as fully similar instead of crashing.
        return 1.0

    return len(set1 & set2) / union

# Two example sentences to score against each other
doc_a = "Text mining is fun"
doc_b = "Text mining helps extract insights"

# Arguments are passed by keyword to make the pairing explicit
print("Jaccard Similarity:", jaccard_similarity(doc1=doc_a, doc2=doc_b))

Jaccard Similarity: 0.2857142857142857


# **Euclidean Distance**

In [4]:
from sklearn.feature_extraction.text import CountVectorizer
from scipy.spatial.distance import euclidean

# Convert documents into term frequency vectors.
# A vectorizer is created here instead of reusing the notebook-wide
# `vectorizer` name: that name is bound to a TfidfVectorizer in the TF-IDF
# cell and to a CountVectorizer in the Cosine Similarity cell, so relying on
# it would make this distance depend on which of those cells ran last.
# `docs` is the sample list defined in the Cosine Similarity cell.
count_vectorizer = CountVectorizer()
tf_matrix = count_vectorizer.fit_transform(docs).toarray()

# Compute Euclidean distance between the count vectors of the first two documents
euclidean_dist = euclidean(tf_matrix[0], tf_matrix[1])
print("Euclidean Distance:", euclidean_dist)

Euclidean Distance: 2.23606797749979


# **Semantic Similarity Using Word Embeddings**

In [7]:
import spacy

# Load the en_core_web_md model, downloading it only when it is not installed.
# Trying the load first avoids a network download on every run of this cell,
# and a failed download now surfaces as its own error instead of being printed
# and followed by a second, less informative failure from spacy.load().
MODEL_NAME = "en_core_web_md"

try:
    nlp = spacy.load(MODEL_NAME)
except OSError:
    # spacy.load() raises OSError when the model package cannot be found
    spacy.cli.download(MODEL_NAME)
    print(f"Model '{MODEL_NAME}' downloaded successfully!")
    # Load the model after installation
    nlp = spacy.load(MODEL_NAME)

print(f"Model '{MODEL_NAME}' loaded successfully!")


✔ Download and installation successful
You can now load the package via spacy.load('en_core_web_md')
⚠ Restart to reload dependencies
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.
Model 'en_core_web_md' downloaded successfully!
Model 'en_core_web_md' loaded successfully!


In [8]:
import spacy

# Load the en_core_web_md pipeline
nlp = spacy.load("en_core_web_md")

# Run both sample sentences through the pipeline
doc1, doc2 = (
    nlp(sentence)
    for sentence in ("Text mining is fun", "Text mining helps extract insights")
)

# Score how similar the two processed documents are
semantic_similarity = doc1.similarity(doc2)
print("Semantic Similarity:", semantic_similarity)

Semantic Similarity: 0.6180577720672877
