In [11]:
# Import necessary libraries
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Sample text corpus (articles).
# Each entry is one short "article"; most_similar_article() returns an index
# into this list, so the order of the entries matters to callers.
corpus = [
    # Short sentences about machine learning / artificial intelligence
    "Machine learning is great for understanding data",
    "Natural language processing is a part of artificial intelligence",
    "Artificial intelligence is transforming the future",
    "Machine learning is a core part of artificial intelligence",
    "Deep learning is a subset of machine learning",
    # Longer sentences mixing fashion, consumer technology and sports topics.
    # These keep their punctuation (trailing periods, hyphens, apostrophes).
    "Athleisure is dominating fashion trends this year with comfortable yet stylish outfits.",
    "The latest smartphones are pushing the boundaries of AI and camera technology.",
    "Fashion shows this season highlight sustainable clothing made from eco-friendly materials.",
    "Athletes are increasingly using data-driven techniques to optimize performance in competitive sports.",
    "Technology companies are focusing on AI-driven innovations for smart home devices.",
    "The future of fashion involves a mix of technology and sustainability, with smart fabrics playing a key role.",
    "Wearable fitness technology is becoming a crucial part of the fashion industry.",
    "Soccer teams are adopting wearable tech to track players' health and improve performance.",
]

# Function to get the most similar article using cosine similarity
def most_similar_article(query, corpus, vectorizer, verbose=True):
    """Find the corpus article whose vector is closest to the query's.

    The vectorizer is fitted on the corpus *plus* the query, so the query's
    terms are part of the fitted vocabulary. The query is appended last,
    which makes it the final row of the document-term matrix.

    Parameters
    ----------
    query : str
        Text to search for.
    corpus : sequence of str
        Candidate articles. Any iterable is accepted; it is copied into a
        list before use.
    vectorizer : object
        Must provide ``fit_transform(documents)`` and
        ``get_feature_names_out()`` (e.g. ``CountVectorizer`` or
        ``TfidfVectorizer``). It is (re)fitted by this call.
    verbose : bool, default True
        When true, print the feature names, the dense document-term matrix
        and its shape, exactly as before. Pass ``False`` to silence the dump.

    Returns
    -------
    tuple
        ``(index, score)`` where ``index`` is the position in ``corpus`` of
        the best match (as returned by ``np.argmax``, so ties resolve to the
        lowest index) and ``score`` is its cosine similarity to the query.

    Raises
    ------
    ValueError
        If ``corpus`` is empty, since there is nothing to compare against.
    """
    # Copy so that tuples/generators work with the concatenation below.
    corpus = list(corpus)
    if not corpus:
        raise ValueError("corpus must contain at least one article")

    # Combine query and corpus; the query goes last so it ends up in row -1.
    documents = corpus + [query]

    # Transform documents into numerical representation (BoW or TF-IDF)
    X = vectorizer.fit_transform(documents)

    if verbose:
        print(vectorizer.get_feature_names_out())
        # Densify once and reuse it for both the matrix and its shape.
        dense = X.toarray()
        print(dense)
        print(dense.shape)

    # Cosine similarity between the query row and every corpus row;
    # the result is indexed as similarities[0][article_index].
    similarities = cosine_similarity(X[-1], X[:-1])

    # There is a single query row, so the flat argmax is the article index.
    most_similar_index = np.argmax(similarities)

    return most_similar_index, similarities[0][most_similar_index]

# Query article (input for similarity search)
query = "Machine learning is for understanding data"

# Alternative sample queries to try in place of `query`.
# Kept as a real list rather than a bare triple-quoted string: a string left
# as the last expression of a cell is echoed back by Jupyter as the cell's
# output instead of being ignored.
queries = [
    "How is technology influencing modern fashion?",
    "What are the latest trends in sports performance optimization?",
    "How are sustainable materials changing the fashion industry?",
    "What role does AI play in modern smartphones?",
    "How are wearables impacting both fitness and fashion?",
]

In [12]:

# Experiment 1: Bag of Words (raw term counts)
print("Using Bag of Words (BoW)")

# Vectorizer with default settings
bow_vectorizer = CountVectorizer()

# Look up the closest corpus article to the query and report it
(most_similar_idx_bow, similarity_score_bow) = most_similar_article(
    query, corpus, bow_vectorizer
)
print(
    f"Query: {query}",
    f"Most similar article (BoW): {corpus[most_similar_idx_bow]}",
    f"Similarity score (BoW): {similarity_score_bow}\n",
    sep="\n",
)

Using Bag of Words (BoW)
['adopting' 'ai' 'and' 'are' 'artificial' 'athleisure' 'athletes'
 'becoming' 'boundaries' 'camera' 'clothing' 'comfortable' 'companies'
 'competitive' 'core' 'crucial' 'data' 'deep' 'devices' 'dominating'
 'driven' 'eco' 'fabrics' 'fashion' 'fitness' 'focusing' 'for' 'friendly'
 'from' 'future' 'great' 'health' 'highlight' 'home' 'improve' 'in'
 'increasingly' 'industry' 'innovations' 'intelligence' 'involves' 'is'
 'key' 'language' 'latest' 'learning' 'machine' 'made' 'materials' 'mix'
 'natural' 'of' 'on' 'optimize' 'outfits' 'part' 'performance' 'players'
 'playing' 'processing' 'pushing' 'role' 'season' 'shows' 'smart'
 'smartphones' 'soccer' 'sports' 'stylish' 'subset' 'sustainability'
 'sustainable' 'teams' 'tech' 'techniques' 'technology' 'the' 'this' 'to'
 'track' 'transforming' 'trends' 'understanding' 'using' 'wearable' 'with'
 'year' 'yet']
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [1 0 1 ... 0 0 0]
 [0 0 0 ..

In [13]:
# Experiment 2: TF-IDF weighted terms
print("Using TF-IDF")

# Vectorizer with default settings
tfidf_vectorizer = TfidfVectorizer()

# Look up the closest corpus article to the query and report it
(most_similar_idx_tfidf, similarity_score_tfidf) = most_similar_article(
    query, corpus, tfidf_vectorizer
)
print(
    f"Query: {query}",
    f"Most similar article (TF-IDF): {corpus[most_similar_idx_tfidf]}",
    f"Similarity score (TF-IDF): {similarity_score_tfidf}",
    sep="\n",
)

Using TF-IDF
['adopting' 'ai' 'and' 'are' 'artificial' 'athleisure' 'athletes'
 'becoming' 'boundaries' 'camera' 'clothing' 'comfortable' 'companies'
 'competitive' 'core' 'crucial' 'data' 'deep' 'devices' 'dominating'
 'driven' 'eco' 'fabrics' 'fashion' 'fitness' 'focusing' 'for' 'friendly'
 'from' 'future' 'great' 'health' 'highlight' 'home' 'improve' 'in'
 'increasingly' 'industry' 'innovations' 'intelligence' 'involves' 'is'
 'key' 'language' 'latest' 'learning' 'machine' 'made' 'materials' 'mix'
 'natural' 'of' 'on' 'optimize' 'outfits' 'part' 'performance' 'players'
 'playing' 'processing' 'pushing' 'role' 'season' 'shows' 'smart'
 'smartphones' 'soccer' 'sports' 'stylish' 'subset' 'sustainability'
 'sustainable' 'teams' 'tech' 'techniques' 'technology' 'the' 'this' 'to'
 'track' 'transforming' 'trends' 'understanding' 'using' 'wearable' 'with'
 'year' 'yet']
[[0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.        

In [17]:
# 1. Using Bag of Words (BoW) with n-grams (unigrams and bigrams)
print("Using Bag of Words (BoW) with n-grams")
bow_vectorizer = CountVectorizer(ngram_range=(1, 2))  # ngram_range=(1, 2): single words and two-word phrases (no trigrams)


# Get most similar article and similarity score using BoW
# NOTE: this rebinds bow_vectorizer / most_similar_idx_bow / similarity_score_bow
# from the plain BoW cell, so those names now hold the n-gram results.
most_similar_idx_bow, similarity_score_bow = most_similar_article(query, corpus, bow_vectorizer)
print(f"Query: {query}")
print(f"Most similar article (BoW): {corpus[most_similar_idx_bow]}")
print(f"Similarity score (BoW): {similarity_score_bow}\n")

Using Bag of Words (BoW) with n-grams
['adopting' 'adopting wearable' 'ai' 'ai and' 'ai driven' 'and'
 'and camera' 'and improve' 'and sustainability' 'are' 'are adopting'
 'are focusing' 'are increasingly' 'are pushing' 'artificial'
 'artificial intelligence' 'athleisure' 'athleisure is' 'athletes'
 'athletes are' 'becoming' 'becoming crucial' 'boundaries' 'boundaries of'
 'camera' 'camera technology' 'clothing' 'clothing made' 'comfortable'
 'comfortable yet' 'companies' 'companies are' 'competitive'
 'competitive sports' 'core' 'core part' 'crucial' 'crucial part' 'data'
 'data driven' 'deep' 'deep learning' 'devices' 'dominating'
 'dominating fashion' 'driven' 'driven innovations' 'driven techniques'
 'eco' 'eco friendly' 'fabrics' 'fabrics playing' 'fashion'
 'fashion industry' 'fashion involves' 'fashion shows' 'fashion trends'
 'fitness' 'fitness technology' 'focusing' 'focusing on' 'for' 'for smart'
 'for understanding' 'friendly' 'friendly materials' 'from' 'from eco'
 'future

In [15]:
# Experiment: TF-IDF over word n-grams of length 2 and 3 (no single words)
print("\nUsing TF-IDF with n-grams")
tfidf_vectorizer = TfidfVectorizer(
    ngram_range=(2, 3),  # bigrams and trigrams
)

# Look up the closest corpus article to the query and report it
(most_similar_idx_tfidf, similarity_score_tfidf) = most_similar_article(
    query, corpus, tfidf_vectorizer
)
print(
    f"Query: {query}",
    f"Most similar article (TF-IDF): {corpus[most_similar_idx_tfidf]}",
    f"Similarity score (TF-IDF): {similarity_score_tfidf}",
    sep="\n",
)


Using TF-IDF with n-grams
['adopting wearable' 'adopting wearable tech' 'ai and' 'ai and camera'
 'ai driven' 'ai driven innovations' 'and camera' 'and camera technology'
 'and improve' 'and improve performance' 'and sustainability'
 'and sustainability with' 'are adopting' 'are adopting wearable'
 'are focusing' 'are focusing on' 'are increasingly'
 'are increasingly using' 'are pushing' 'are pushing the'
 'artificial intelligence' 'artificial intelligence is' 'athleisure is'
 'athleisure is dominating' 'athletes are' 'athletes are increasingly'
 'becoming crucial' 'becoming crucial part' 'boundaries of'
 'boundaries of ai' 'camera technology' 'clothing made'
 'clothing made from' 'comfortable yet' 'comfortable yet stylish'
 'companies are' 'companies are focusing' 'competitive sports' 'core part'
 'core part of' 'crucial part' 'crucial part of' 'data driven'
 'data driven techniques' 'deep learning' 'deep learning is'
 'dominating fashion' 'dominating fashion trends' 'driven innovat