Document Similarity Using Similarity Measures Cosine and Jaccard Similarity

In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
doc1 = "Natural language processing enables computers to understand human language."
doc2 = "Machine learning algorithms help computers learn from data automatically."

In [5]:
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform([doc1,doc2])
cos_sim = cosine_similarity(tfidf_matrix[0],tfidf_matrix[1])[0][0]

In [7]:
def jaccard_similarity(a,b):
    set1 = set(a.split())
    set2 = set(b.split())
    intersection = len(set1.intersection(set2))
    union = len(set1)+len(set2)-intersection
    return intersection/union
jac_sim = jaccard_similarity(doc1,doc2)

In [9]:
print(f"Cosine Similariry : {cos_sim :.4f}")
print(f"Jaccard Similarity : {jac_sim :.4f}")

Cosine Similariry : 0.0536
Jaccard Similarity : 0.0588


Sentiment Analysis Using Pretrained NLP Libraries TextBlob And VADER

In [11]:
from textblob import TextBlob
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [12]:
texts = [
    "This product is absolutely wonderful!",
    "The service was terrible and slow.",
    "Python 3.11 includes new features."
]

In [13]:
analyzer = SentimentIntensityAnalyzer()

In [14]:
for text in texts:
    print(f"\nText: {text}")
    
    # TextBlob Analysis
    tb = TextBlob(text)
    print(f"TextBlob - Polarity: {tb.sentiment.polarity:.2f}, Subjectivity: {tb.sentiment.subjectivity:.2f}")
    
    # VADER Analysis
    vs = analyzer.polarity_scores(text)
    print(f"VADER - Polarity: {vs['compound']:.2f}")


Text: This product is absolutely wonderful!
TextBlob - Polarity: 1.00, Subjectivity: 1.00
VADER - Polarity: 0.65

Text: The service was terrible and slow.
TextBlob - Polarity: -0.65, Subjectivity: 0.70
VADER - Polarity: -0.48

Text: Python 3.11 includes new features.
TextBlob - Polarity: 0.14, Subjectivity: 0.45
VADER - Polarity: 0.00


Sentiment Analysis Using Bayesian Classification

In [15]:
from sklearn.datasets import load_files
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split

In [16]:
X = ["great movie", "terrible film", "awesome acting", "boring storyline"]
y = [1, 0, 1, 0]

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

In [18]:
# Feature extraction
bow = CountVectorizer().fit(X_train)
X_train_bow = bow.transform(X_train)
X_test_bow = bow.transform(X_test)

In [19]:
tfidf = TfidfVectorizer().fit(X_train)
X_train_tfidf = tfidf.transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

In [20]:
# Train classifiers
clf_bow = MultinomialNB().fit(X_train_bow, y_train)
clf_tfidf = MultinomialNB().fit(X_train_tfidf, y_train)

In [22]:
# Evaluate
print("Bag-of-Words Results:")
print(classification_report(y_test, clf_bow.predict(X_test_bow)))

print("\nTF-IDF Results:")
print(classification_report(y_test, clf_tfidf.predict(X_test_tfidf)))

Bag-of-Words Results:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       0.0
           1       0.00      0.00      0.00       1.0

    accuracy                           0.00       1.0
   macro avg       0.00      0.00      0.00       1.0
weighted avg       0.00      0.00      0.00       1.0


TF-IDF Results:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       0.0
           1       0.00      0.00      0.00       1.0

    accuracy                           0.00       1.0
   macro avg       0.00      0.00      0.00       1.0
weighted avg       0.00      0.00      0.00       1.0



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
