# **Load a Dataset**

In [1]:
import pandas as pd

# Toy corpus: one short sentence per document, kept in-memory so the
# notebook needs no external files.
corpus = [
    "Text mining is fun",
    "Text mining helps extract insights",
    "Data science involves text mining",
    "Fun with machine learning and mining",
    "Artificial intelligence drives innovation",
    "Machine learning is a subset of AI",
    "Deep learning uses neural networks",
    "Text mining helps in data extraction",
    "Data science bridges statistics and computing",
    "Python is popular for data analysis",
    "Big data requires scalable solutions",
    "Clustering groups similar data points",
    "Classification assigns labels to data",
    "Natural Language Processing is fascinating",
    "Sentiment analysis identifies emotions",
    "Recommendation systems suggest products",
    "Regression predicts continuous values",
    "Data visualization simplifies insights",
    "Statistics is the foundation of data science",
    "Cloud computing supports data storage",
    "Data preprocessing is crucial for modeling",
    "Tokenization splits text into words",
    "Vectorization represents text numerically",
    "Cosine similarity measures text similarity",
    "Word embeddings capture word meanings",
    "Neural networks learn from data",
    "Optimization minimizes loss functions",
    "Feature engineering improves model performance",
    "EDA uncovers hidden patterns in data",
]

# Single-column frame; the column name is what later cells look up.
data = {"documents": corpus}
df = pd.DataFrame(data)
df


Unnamed: 0,documents
0,Text mining is fun
1,Text mining helps extract insights
2,Data science involves text mining
3,Fun with machine learning and mining
4,Artificial intelligence drives innovation
5,Machine learning is a subset of AI
6,Deep learning uses neural networks
7,Text mining helps in data extraction
8,Data science bridges statistics and computing
9,Python is popular for data analysis


# **Preprocess the Text**

In [2]:
import re

def preprocess(text):
    """Normalize a raw document string for word-level comparison.

    Deletes every character that is neither an ASCII letter nor whitespace
    (so punctuation and digits are dropped, not replaced by a space),
    lowercases the remainder, and trims leading/trailing whitespace.

    Args:
        text: The raw document as a string.

    Returns:
        The cleaned, lowercased string.
    """
    letters_only = re.sub(r"[^a-zA-Z\s]", "", text)
    return letters_only.lower().strip()

# Run the cleaner over every raw document and keep the result alongside it.
df['cleaned_documents'] = df['documents'].map(preprocess)

# **Apply Cosine Similarity**

In [3]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Bag-of-words counts: one row per document, one column per vocabulary term.
vectorizer = CountVectorizer()
tf_matrix = vectorizer.fit_transform(df['cleaned_documents'])

# Cosine similarity between every pair of document rows.
cosine_sim_matrix = cosine_similarity(tf_matrix)

# Header followed by four identical separator rows, then the labelled matrix.
separator_row = "::::::::::::::::::::::::::::::::::::::::::::::::::::::::::"
print("Cosine Similarity Matrix:")
print("\n".join([separator_row] * 4))
print(pd.DataFrame(cosine_sim_matrix, columns=df.index, index=df.index))

Cosine Similarity Matrix:
::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
          0         1         2         3    4         5         6         7   \
0   1.000000  0.447214  0.447214  0.408248  0.0  0.204124  0.000000  0.408248   
1   0.447214  1.000000  0.400000  0.182574  0.0  0.000000  0.000000  0.547723   
2   0.447214  0.400000  1.000000  0.182574  0.0  0.000000  0.000000  0.547723   
3   0.408248  0.182574  0.182574  1.000000  0.0  0.333333  0.182574  0.166667   
4   0.000000  0.000000  0.000000  0.000000  1.0  0.000000  0.000000  0.000000   
5   0.204124  0.000000  0.000000  0.333333  0.0  1.000000  0.182574  0.000000   
6   0.000000  0.000000  0.000000  0.182574  0.0  0.182574  1.000000  0.000000   
7   0.408248  0.547723  0.547723  0.166667  0.0  0.000000  0.000000  1.000000   
8   0.000

# **Apply Jaccard Similarity**

In [4]:
def jaccard_similarity_set(doc1, doc2):
    """Return the Jaccard similarity of two documents' word sets.

    Each document is split on whitespace and its tokens de-duplicated; the
    score is the size of the intersection divided by the size of the union.

    Args:
        doc1: First document as a string.
        doc2: Second document as a string.

    Returns:
        A float in [0, 1]. When neither document contains any token the two
        (empty) sets are identical, so 1.0 is returned.
    """
    set1, set2 = set(doc1.split()), set(doc2.split())
    union = len(set1 | set2)
    if union == 0:
        # Both documents are empty (e.g. cleaning stripped everything);
        # without this guard the division below raises ZeroDivisionError.
        return 1.0
    return len(set1 & set2) / union

# Pairwise Jaccard similarity
# Pull the cleaned texts out positionally so the computation does not depend
# on df's index being 0..n-1 (the previous version looped over index labels
# but wrote with positional .iloc).
cleaned_docs = df['cleaned_documents'].tolist()

# Build the whole matrix in one construction with a float dtype; filling an
# empty frame cell by cell leaves it object-dtype and is much slower.
jaccard_sim_matrix = pd.DataFrame(
    [
        [jaccard_similarity_set(left, right) for right in cleaned_docs]
        for left in cleaned_docs
    ],
    index=df.index,
    columns=df.index,
    dtype=float,
)

print("Jaccard Similarity Matrix:")
print("...............................................................")
print(jaccard_sim_matrix)

Jaccard Similarity Matrix:
...............................................................
          0         1         2         3    4         5         6         7   \
0        1.0  0.285714  0.285714      0.25  0.0       0.1       0.0      0.25   
1   0.285714       1.0      0.25       0.1  0.0       0.0       0.0     0.375   
2   0.285714      0.25       1.0       0.1  0.0       0.0       0.0     0.375   
3       0.25       0.1       0.1       1.0  0.0  0.181818       0.1  0.090909   
4        0.0       0.0       0.0       0.0  1.0       0.0       0.0       0.0   
5        0.1       0.0       0.0  0.181818  0.0       1.0  0.090909       0.0   
6        0.0       0.0       0.0       0.1  0.0  0.090909       1.0       0.0   
7       0.25     0.375     0.375  0.090909  0.0       0.0       0.0       1.0   
8        0.0       0.0  0.222222  0.090909  0.0       0.0       0.0  0.090909   
9   0.111111       0.0       0.1       0.0  0.0  0.083333       0.0  0.090909   
10       0.0      

# **Apply Euclidean Distance**

In [5]:
from scipy.spatial.distance import euclidean

# Euclidean distance for term frequency vectors
# Densify the sparse count matrix once; converting a row inside the nested
# loop repeated the same conversion for every pair.
dense_tf = tf_matrix.toarray()

# Rows are taken positionally, so this no longer relies on df's index labels
# matching row positions, and the result is a float frame rather than an
# object-dtype frame filled cell by cell.
euclidean_dist_matrix = pd.DataFrame(
    [[euclidean(row_a, row_b) for row_b in dense_tf] for row_a in dense_tf],
    index=df.index,
    columns=df.index,
    dtype=float,
)

print("Euclidean Distance Matrix:")
print("..........................................................................")
print(euclidean_dist_matrix)

Euclidean Distance Matrix:
..........................................................................
          0         1         2         3         4         5         6   \
0        0.0  2.236068  2.236068   2.44949  2.828427  2.828427       3.0   
1   2.236068       0.0   2.44949       3.0       3.0  3.316625  3.162278   
2   2.236068   2.44949       0.0       3.0       3.0  3.316625  3.162278   
3    2.44949       3.0       3.0       0.0  3.162278  2.828427       3.0   
4   2.828427       3.0       3.0  3.162278       0.0  3.162278       3.0   
5   2.828427  3.316625  3.316625  2.828427  3.162278       0.0       3.0   
6        3.0  3.162278  3.162278       3.0       3.0       3.0       0.0   
7    2.44949  2.236068  2.236068  3.162278  3.162278  3.464102  3.316625   
8   3.162278  3.316625  2.645751  3.162278  3.162278  3.464102  3.316625   
9   2.828427  3.316625       3.0  3.464102  3.162278  3.162278  3.316625   
10       3.0  3.162278  2.828427  3.316625       3.0  3.316625

# **Apply Semantic Similarity**

import spacy

# Load a pre-trained word embeddings model
nlp = spacy.load("en_core_web_md")

# Parse every cleaned document exactly once. The previous version ran the
# pipeline on both documents for every (i, j) pair, i.e. 2 * n^2 parses
# instead of n.
parsed_docs = [nlp(text) for text in df['cleaned_documents']]

# Compute semantic similarity for all pairs
# Built positionally and as floats, so it does not depend on df's index being
# 0..n-1 and does not end up as an object-dtype frame.
semantic_sim_matrix = pd.DataFrame(
    [[left.similarity(right) for right in parsed_docs] for left in parsed_docs],
    index=df.index,
    columns=df.index,
    dtype=float,
)

print("Semantic Similarity Matrix:")
print(semantic_sim_matrix)