<a href="https://colab.research.google.com/github/AllisonDing/Natural-Language-Processing/blob/main/NLP_HW1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import nltk

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
import numpy as np

In [None]:
# Download the NLTK resources used by the preprocessing helpers below.
nltk.download('punkt')
# NOTE(review): recent NLTK releases make word_tokenize look up 'punkt_tab' instead of
# 'punkt'; fetching both keeps the notebook runnable on old and new versions.
# On a release that does not know the package, nltk.download is expected to just
# report it and carry on — confirm against the installed NLTK version.
nltk.download('punkt_tab')
nltk.download('stopwords')

# English stop-word list as a set for O(1) membership tests.
stop_words = set(stopwords.words('english'))
stop_words.update(['is', "n't"])  # Add 'is' and 'n't' to stopwords

# Shared Porter stemmer instance used by encode_sentence.
stemmer = PorterStemmer()

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
def encode_sentence(sentence, to_stem=False, to_remove_stop_words=False):
  """Lower-case and tokenize `sentence`, optionally filter and stem it, then re-join.

  Args:
    sentence (str): Raw text to process.
    to_stem (bool): When true, every remaining token is passed through the
      module-level `stemmer`.
    to_remove_stop_words (bool): When true, drop tokens that are in the
      module-level `stop_words` set, and also tokens that are not purely
      alphabetic (so punctuation is removed in this mode only).

  Returns:
    str: The surviving tokens joined by single spaces.
  """
  words = word_tokenize(sentence.lower())

  if to_remove_stop_words:
    kept = []
    for word in words:
      # Skip stop words and anything containing non-letters.
      if word in stop_words or not word.isalpha():
        continue
      kept.append(word)
    words = kept

  if to_stem:
    words = list(map(stemmer.stem, words))

  return ' '.join(words)

In [None]:
def cosine_similarity(A, B):
  """Return the cosine similarity between two vectors.

  Args:
    A: 1-D array-like of numbers.
    B: 1-D array-like of numbers with the same length as `A`.

  Returns:
    dot(A, B) / (||A|| * ||B||). When either vector has zero norm the ratio
    is undefined (0/0), so 0.0 is returned instead of NaN.
  """
  norm_product = np.linalg.norm(A) * np.linalg.norm(B)
  # Guard the degenerate case: an all-zero vector would otherwise produce NaN
  # together with a RuntimeWarning.
  if norm_product == 0:
    return 0.0
  return np.dot(A, B) / norm_product

In [None]:
def compare_sentences(sentence1, sentence2, encoding_type='one-hot', verbose=True):
  """Encode two sentences in a shared feature space and return their cosine similarity.

  Args:
    sentence1 (str): First sentence.
    sentence2 (str): Second sentence.
    encoding_type (str): One of 'one-hot', 'bag of words', 'tf' or 'tfXidf'.
    verbose (bool): When true (the default, matching the previous behavior),
      print the two encoded vectors before returning.

  Returns:
    The value produced by `cosine_similarity` for the two encoded rows.

  Raises:
    ValueError: If `encoding_type` is not one of the supported names.
  """
  # Choose the vectorizer based on encoding_type
  if encoding_type == 'one-hot':
    vectorizer = CountVectorizer(binary=True)  # presence/absence of each term
  elif encoding_type == 'bag of words':
    vectorizer = CountVectorizer(binary=False)  # raw term counts
  elif encoding_type == 'tf':
    # Term counts with each row L1-normalised. Cosine similarity ignores
    # per-row scaling, so this scores the same as 'bag of words' up to rounding.
    vectorizer = TfidfVectorizer(use_idf=False, norm='l1')
  elif encoding_type == 'tfXidf':
    vectorizer = TfidfVectorizer(use_idf=True, norm='l1')  # TF-IDF
  else:
    # Previously an unknown name fell through and failed later with an
    # UnboundLocalError on `vectorizer`; fail early with a clear message instead.
    raise ValueError(
        f"Unknown encoding_type {encoding_type!r}; expected one of "
        "'one-hot', 'bag of words', 'tf', 'tfXidf'"
    )

  # Fit on both sentences together so the two rows share one vocabulary.
  vectors = vectorizer.fit_transform([sentence1, sentence2]).toarray()
  if verbose:
    print(vectors)

  # calculate cosine similarity
  return cosine_similarity(vectors[0], vectors[1])

In [None]:
# Quick demo: TF-IDF similarity between a sentence and a negated variant of it.
sentence1 = "The engineer spoke about the importance of innovation and sustainability in building design."
sentence2 = "The engineer has not spoken about the importance of innovation and sustainability not in building design."
# Last expression of the cell, so the returned score is shown as the cell output.
compare_sentences(
    sentence1,
    sentence2,
    encoding_type='tfXidf',
)

[[0.07459644 0.07459644 0.07459644 0.07459644 0.07459644 0.
  0.07459644 0.07459644 0.07459644 0.         0.07459644 0.1048427
  0.         0.07459644 0.14919288]
 [0.0567477  0.0567477  0.0567477  0.0567477  0.0567477  0.07975691
  0.0567477  0.0567477  0.0567477  0.15951382 0.0567477  0.
  0.07975691 0.0567477  0.11349539]]


0.6888998213683907

In [None]:
# Experiment 1, example A — claim under test: stop-word removal increases recall.
# Stemming stays off; both comparisons use the one-hot encoding.
sentence1 = "The engineer spoke about the importance of innovation and sustainability in building design."
sentence2 = "The engineer has not spoken about the importance of innovation and sustainability not in building design."

# Apply identical preprocessing to both sentences (stop words removed, no stemming).
processed_sentence1, processed_sentence2 = (
    encode_sentence(raw, to_stem=False, to_remove_stop_words=True)
    for raw in (sentence1, sentence2)
)

# Score the raw pair first, then the preprocessed pair.
similarity_score_before = compare_sentences(sentence1, sentence2, encoding_type='one-hot')
similarity_score_after = compare_sentences(processed_sentence1, processed_sentence2, encoding_type='one-hot')

# One print call; sep="\n" emits the same lines as separate print statements.
print(
    f"Sentence1: {sentence1}",
    f"Sentence2: {sentence2}",
    f"Before Similarity Score: {similarity_score_before}",
    "Stop Words Removed",
    f"Processed Sentence1: {processed_sentence1}",
    f"Processed Sentence2: {processed_sentence2}",
    f"After Similarity Score: {similarity_score_after}",
    sep="\n",
)

Sentence1: The engineer spoke about the importance of innovation and sustainability in building design.
Sentence2: The engineer has not spoken about the importance of innovation and sustainability not in building design.
Before Similarity Score: 0.8486684247915056
Stop Words Removed
Processed Sentence1: engineer spoke importance innovation sustainability building design
Processed Sentence2: engineer spoken importance innovation sustainability building design
After Similarity Score: 0.857142857142857


In [None]:
# Experiment 1, example B — claim under test: stop-word removal increases recall.
# Stemming stays off; both comparisons use the one-hot encoding.
sentence1 = "A frog is jumping quickly onto the table"
sentence2 = "The frog jumps quick over on the table"

# Apply identical preprocessing to both sentences (stop words removed, no stemming).
processed_sentence1, processed_sentence2 = (
    encode_sentence(raw, to_stem=False, to_remove_stop_words=True)
    for raw in (sentence1, sentence2)
)

# Score the raw pair first, then the preprocessed pair.
similarity_score_before = compare_sentences(sentence1, sentence2, encoding_type='one-hot')
similarity_score_after = compare_sentences(processed_sentence1, processed_sentence2, encoding_type='one-hot')

# One print call; sep="\n" emits the same lines as separate print statements.
print(
    f"Sentence1: {sentence1}",
    f"Sentence2: {sentence2}",
    f"Before Similarity Score: {similarity_score_before}",
    "Stop Words Removed",
    f"Processed Sentence1: {processed_sentence1}",
    f"Processed Sentence2: {processed_sentence2}",
    f"After Similarity Score: {similarity_score_after}",
    sep="\n",
)

Sentence1: A frog is jumping quickly onto the table
Sentence2: The frog jumps quick over on the table
Before Similarity Score: 0.4285714285714285
Stop Words Removed
Processed Sentence1: frog jumping quickly onto table
Processed Sentence2: frog jumps quick table
After Similarity Score: 0.4472135954999579


In [None]:
# Experiment 2, example A — claim under test: stop-word removal decreases precision.
# The second sentence negates the first; stemming stays off, encoding is one-hot.
sentence1 = "The actor saw the man with the telescope."
sentence2 = "The actor has not seen the man with the telescope."

# Apply identical preprocessing to both sentences (stop words removed, no stemming).
processed_sentence1, processed_sentence2 = (
    encode_sentence(raw, to_stem=False, to_remove_stop_words=True)
    for raw in (sentence1, sentence2)
)

# Score the raw pair first, then the preprocessed pair.
similarity_score_before = compare_sentences(sentence1, sentence2, encoding_type='one-hot')
similarity_score_after = compare_sentences(processed_sentence1, processed_sentence2, encoding_type='one-hot')

# One print call; sep="\n" emits the same lines as separate print statements.
print(
    f"Sentence1: {sentence1}",
    f"Sentence2: {sentence2}",
    f"Before Similarity Score: {similarity_score_before}",
    "Stop Word Removed",
    f"Processed Sentence1: {processed_sentence1}",
    f"Processed Sentence2: {processed_sentence2}",
    f"After Similarity Score: {similarity_score_after}",
    sep="\n",
)

Sentence1: The actor saw the man with the telescope.
Sentence2: The actor has not seen the man with the telescope.
Before Similarity Score: 0.7216878364870323
Stop Word Removed
Processed Sentence1: actor saw man telescope
Processed Sentence2: actor seen man telescope
After Similarity Score: 0.75


In [None]:
# Experiment 2, example B — claim under test: stop-word removal decreases precision.
# One sentence is negated and the other is not; stemming stays off, encoding is one-hot.
sentence1 = "The cat has not run on the court."
sentence2 = "A cat is running on the court."

# Apply identical preprocessing to both sentences (stop words removed, no stemming).
processed_sentence1, processed_sentence2 = (
    encode_sentence(raw, to_stem=False, to_remove_stop_words=True)
    for raw in (sentence1, sentence2)
)

# Score the raw pair first, then the preprocessed pair.
similarity_score_before = compare_sentences(sentence1, sentence2, encoding_type='one-hot')
similarity_score_after = compare_sentences(processed_sentence1, processed_sentence2, encoding_type='one-hot')

# One print call; sep="\n" emits the same lines as separate print statements.
print(
    f"Sentence1: {sentence1}",
    f"Sentence2: {sentence2}",
    f"Before Similarity Score: {similarity_score_before}",
    "Stop Word Removed",
    f"Processed Sentence1: {processed_sentence1}",
    f"Processed Sentence2: {processed_sentence2}",
    f"After Similarity Score: {similarity_score_after}",
    sep="\n",
)

Sentence1: The cat has not run on the court.
Sentence2: A cat is running on the court.
Before Similarity Score: 0.6172133998483676
Stop Word Removed
Processed Sentence1: cat run court
Processed Sentence2: cat running court
After Similarity Score: 0.6666666666666667


In [None]:
# Experiment 3, example A — claim under test: stemming increases recall.
# Stop words are kept; both comparisons use the one-hot encoding.
sentence1 = "The company is expanding rapidly due to an increase in demand for its products."
sentence2 = "The corporation has decided to expand its operations due to rising demands."

# Apply identical preprocessing to both sentences (stemming on, stop words kept).
processed_sentence1, processed_sentence2 = (
    encode_sentence(raw, to_stem=True, to_remove_stop_words=False)
    for raw in (sentence1, sentence2)
)

# Score the raw pair first, then the stemmed pair.
similarity_score_before = compare_sentences(sentence1, sentence2, encoding_type='one-hot')
similarity_score_after = compare_sentences(processed_sentence1, processed_sentence2, encoding_type='one-hot')

# One print call; sep="\n" emits the same lines as separate print statements.
print(
    f"Sentence1: {sentence1}",
    f"Sentence2: {sentence2}",
    f"Before Similarity Score: {similarity_score_before}",
    "Stem Word",
    f"Processed Sentence1: {processed_sentence1}",
    f"Processed Sentence2: {processed_sentence2}",
    f"After Similarity Score: {similarity_score_after}",
    sep="\n",
)

Sentence1: The company is expanding rapidly due to an increase in demand for its products.
Sentence2: The corporation has decided to expand its operations due to rising demands.
Before Similarity Score: 0.3223291856101521
Stem Word
Processed Sentence1: the compani is expand rapidli due to an increas in demand for it product .
Processed Sentence2: the corpor ha decid to expand it oper due to rise demand .
After Similarity Score: 0.48349377841522817


In [None]:
# Experiment 3, example B — claim under test: stemming increases recall.
# Stop words are kept; both comparisons use the one-hot encoding.
sentence1 = "Many companies are integrating advanced artificial intelligence into daily operations."
sentence2 = "Several businesses have integrated advanced AI technologies for daily tasks."

# Apply identical preprocessing to both sentences (stemming on, stop words kept).
processed_sentence1, processed_sentence2 = (
    encode_sentence(raw, to_stem=True, to_remove_stop_words=False)
    for raw in (sentence1, sentence2)
)

# Score the raw pair first, then the stemmed pair.
similarity_score_before = compare_sentences(sentence1, sentence2, encoding_type='one-hot')
similarity_score_after = compare_sentences(processed_sentence1, processed_sentence2, encoding_type='one-hot')

# One print call; sep="\n" emits the same lines as separate print statements.
print(
    f"Sentence1: {sentence1}",
    f"Sentence2: {sentence2}",
    f"Before Similarity Score: {similarity_score_before}",
    "Stem Word",
    f"Processed Sentence1: {processed_sentence1}",
    f"Processed Sentence2: {processed_sentence2}",
    f"After Similarity Score: {similarity_score_after}",
    sep="\n",
)

Sentence1: Many companies are integrating advanced artificial intelligence into daily operations.
Sentence2: Several businesses have integrated advanced AI technologies for daily tasks.
Before Similarity Score: 0.19999999999999996
Stem Word
Processed Sentence1: mani compani are integr advanc artifici intellig into daili oper .
Processed Sentence2: sever busi have integr advanc ai technolog for daili task .
After Similarity Score: 0.29999999999999993


In [None]:
# Experiment 4, example A — claim under test: stemming decreases precision.
# Stop words are kept; both comparisons use the one-hot encoding.
sentence1 = "The developer creates innovative software solutions to optimize various business processes."
sentence2 = "Innovatively created software by developers optimizing business process solutions has shown significant improvements."

# Apply identical preprocessing to both sentences (stemming on, stop words kept).
processed_sentence1, processed_sentence2 = (
    encode_sentence(raw, to_stem=True, to_remove_stop_words=False)
    for raw in (sentence1, sentence2)
)

# Score the raw pair first, then the stemmed pair.
similarity_score_before = compare_sentences(sentence1, sentence2, encoding_type='one-hot')
similarity_score_after = compare_sentences(processed_sentence1, processed_sentence2, encoding_type='one-hot')

# One print call; sep="\n" emits the same lines as separate print statements.
print(
    f"Sentence1: {sentence1}",
    f"Sentence2: {sentence2}",
    f"Before Similarity Score: {similarity_score_before}",
    "Stem Word",
    f"Processed Sentence1: {processed_sentence1}",
    f"Processed Sentence2: {processed_sentence2}",
    f"After Similarity Score: {similarity_score_after}",
    sep="\n",
)

Sentence1: The developer creates innovative software solutions to optimize various business processes.
Sentence2: Innovatively created software by developers optimizing business process solutions has shown significant improvements.
Before Similarity Score: 0.25087260300212727
Stem Word
Processed Sentence1: the develop creat innov softwar solut to optim variou busi process .
Processed Sentence2: innov creat softwar by develop optim busi process solut ha shown signific improv .
After Similarity Score: 0.6689936080056726


In [None]:
# Experiment 4, example B — claim under test: stemming decreases precision.
# Stop words are kept; both comparisons use the one-hot encoding.
sentence1 = "Public health officials are urging people to get vaccinated against the flu."
sentence2 = "National Health authorities have urged flu vaccinations for the public."

# Apply identical preprocessing to both sentences (stemming on, stop words kept).
processed_sentence1, processed_sentence2 = (
    encode_sentence(raw, to_stem=True, to_remove_stop_words=False)
    for raw in (sentence1, sentence2)
)

# Score the raw pair first, then the stemmed pair.
similarity_score_before = compare_sentences(sentence1, sentence2, encoding_type='one-hot')
similarity_score_after = compare_sentences(processed_sentence1, processed_sentence2, encoding_type='one-hot')

# One print call; sep="\n" emits the same lines as separate print statements.
print(
    f"Sentence1: {sentence1}",
    f"Sentence2: {sentence2}",
    f"Before Similarity Score: {similarity_score_before}",
    "Stem Word",
    f"Processed Sentence1: {processed_sentence1}",
    f"Processed Sentence2: {processed_sentence2}",
    f"After Similarity Score: {similarity_score_after}",
    sep="\n",
)

Sentence1: Public health officials are urging people to get vaccinated against the flu.
Sentence2: National Health authorities have urged flu vaccinations for the public.
Before Similarity Score: 0.3651483716701107
Stem Word
Processed Sentence1: public health offici are urg peopl to get vaccin against the flu .
Processed Sentence2: nation health author have urg flu vaccin for the public .
After Similarity Score: 0.5477225575051661


In [None]:
# Experiment 5, example A — claim under test: bag of words has higher precision
# than one-hot encoding. Neither stemming nor stop-word removal is applied.
sentence1 = "A brown frog jumps quickly on the table."
sentence2 = "The brown frog is jumping quick onto the table."

# With both flags off, preprocessing only lower-cases, tokenizes and re-joins.
processed_sentence1, processed_sentence2 = (
    encode_sentence(raw, to_stem=False, to_remove_stop_words=False)
    for raw in (sentence1, sentence2)
)

# Same preprocessed pair, two encodings: one-hot first, then bag of words.
similarity_score_before = compare_sentences(processed_sentence1, processed_sentence2, encoding_type='one-hot')
similarity_score_after = compare_sentences(processed_sentence1, processed_sentence2, encoding_type='bag of words')

# One print call; sep="\n" emits the same lines as separate print statements.
print(
    f"Processed Sentence1: {processed_sentence1}",
    f"Processed Sentence2: {processed_sentence2}",
    f"Similarity Score with One-Hot Encoder: {similarity_score_before}",
    f"Similarity Score with Bag of Words: {similarity_score_after}",
    sep="\n",
)

Processed Sentence1: a brown frog jumps quickly on the table .
Processed Sentence2: the brown frog is jumping quick onto the table .
Similarity Score with One-Hot Encoder: 0.5345224838248487
Similarity Score with Bag of Words: 0.5698028822981898


In [None]:
# Experiment 5, example B — claim under test: bag of words has higher precision
# than one-hot encoding. Neither stemming nor stop-word removal is applied.
sentence1 = "The student read the book and the book was very interesting and informative."
sentence2 = "The book was read by the student and found to be interesting and informative."

# With both flags off, preprocessing only lower-cases, tokenizes and re-joins.
processed_sentence1, processed_sentence2 = (
    encode_sentence(raw, to_stem=False, to_remove_stop_words=False)
    for raw in (sentence1, sentence2)
)

# Same preprocessed pair, two encodings: one-hot first, then bag of words.
similarity_score_before = compare_sentences(processed_sentence1, processed_sentence2, encoding_type='one-hot')
similarity_score_after = compare_sentences(processed_sentence1, processed_sentence2, encoding_type='bag of words')

# One print call; sep="\n" emits the same lines as separate print statements.
print(
    f"Processed Sentence1: {processed_sentence1}",
    f"Processed Sentence2: {processed_sentence2}",
    f"Similarity Score with One-Hot Encoder: {similarity_score_before}",
    f"Similarity Score with Bag of Words: {similarity_score_after}",
    sep="\n",
)

Processed Sentence1: the student read the book and the book was very interesting and informative .
Processed Sentence2: the book was read by the student and found to be interesting and informative .
Similarity Score with One-Hot Encoder: 0.769800358919501
Similarity Score with Bag of Words: 0.8355044182110839


In [None]:
# Experiment 6, example A — claim under test: tf has higher precision than bag of words.
# Neither stemming nor stop-word removal is applied.
# NOTE(review): compare_sentences builds 'tf' as L1-normalised term counts, and cosine
# similarity is unaffected by rescaling a vector, so the two scores below are expected
# to agree up to floating-point rounding — confirm whether that is the intended comparison.
sentence1 = "A brown frog jumps quickly on the table."
sentence2 = "The brown frog is jumping quick onto the table."

# With both flags off, preprocessing only lower-cases, tokenizes and re-joins.
processed_sentence1, processed_sentence2 = (
    encode_sentence(raw, to_stem=False, to_remove_stop_words=False)
    for raw in (sentence1, sentence2)
)

# Same preprocessed pair, two encodings: bag of words first, then tf.
similarity_score_before = compare_sentences(processed_sentence1, processed_sentence2, encoding_type='bag of words')
similarity_score_after = compare_sentences(processed_sentence1, processed_sentence2, encoding_type='tf')

# One print call; sep="\n" emits the same lines as separate print statements.
print(
    f"Processed Sentence1: {processed_sentence1}",
    f"Processed Sentence2: {processed_sentence2}",
    f"Similarity Score with Bag of Words: {similarity_score_before}",
    f"Similarity Score with tf: {similarity_score_after}",
    sep="\n",
)

Processed Sentence1: a brown frog jumps quickly on the table .
Processed Sentence2: the brown frog is jumping quick onto the table .
Similarity Score with Bag of Words: 0.5698028822981898
Similarity Score with tf: 0.5698028822981899


In [None]:
# Experiment 6, example B — claim under test: tf has higher precision than bag of words.
# Neither stemming nor stop-word removal is applied.
# NOTE(review): compare_sentences builds 'tf' as L1-normalised term counts, and cosine
# similarity is unaffected by rescaling a vector, so the two scores below are expected
# to agree up to floating-point rounding — confirm whether that is the intended comparison.
sentence1 = "A new tech startup launched a revolutionary software product that is designed to enhance productivity and collaboration."
sentence2 = "The new software product was launched by a tech startup which is designed to improve productivity and teamwork."

# With both flags off, preprocessing only lower-cases, tokenizes and re-joins.
processed_sentence1, processed_sentence2 = (
    encode_sentence(raw, to_stem=False, to_remove_stop_words=False)
    for raw in (sentence1, sentence2)
)

# Same preprocessed pair, two encodings: bag of words first, then tf.
similarity_score_before = compare_sentences(processed_sentence1, processed_sentence2, encoding_type='bag of words')
similarity_score_after = compare_sentences(processed_sentence1, processed_sentence2, encoding_type='tf')

# One print call; sep="\n" emits the same lines as separate print statements.
print(
    f"Processed Sentence1: {processed_sentence1}",
    f"Processed Sentence2: {processed_sentence2}",
    f"Similarity Score with Bag of Words: {similarity_score_before}",
    f"Similarity Score with tf: {similarity_score_after}",
    sep="\n",
)

Processed Sentence1: a new tech startup launched a revolutionary software product that is designed to enhance productivity and collaboration .
Processed Sentence2: the new software product was launched by a tech startup which is designed to improve productivity and teamwork .
Similarity Score with Bag of Words: 0.6888467201936644
Similarity Score with tf: 0.6888467201936646


In [None]:
# Experiment 7, example A — claim under test: tfXidf has higher precision than tf.
# Neither stemming nor stop-word removal is applied.
# NOTE(review): the saved output of this cell shows three-word processed sentences,
# which this code does not appear to produce for these inputs — it looks stale;
# re-run the cell to refresh it.
sentence1 = "A brown frog jumps quickly on the table."
sentence2 = "The brown frog is jumping quick onto the table."

# With both flags off, preprocessing only lower-cases, tokenizes and re-joins.
processed_sentence1, processed_sentence2 = (
    encode_sentence(raw, to_stem=False, to_remove_stop_words=False)
    for raw in (sentence1, sentence2)
)

# Same preprocessed pair, two encodings: tf first, then tf-idf.
similarity_score_before = compare_sentences(processed_sentence1, processed_sentence2, encoding_type='tf')
similarity_score_after = compare_sentences(processed_sentence1, processed_sentence2, encoding_type='tfXidf')

# One print call; sep="\n" emits the same lines as separate print statements.
print(
    f"Processed Sentence1: {processed_sentence1}",
    f"Processed Sentence2: {processed_sentence2}",
    f"Similarity Score with tf: {similarity_score_before}",
    f"Similarity Score with tfXidf: {similarity_score_after}",
    sep="\n",
)

Processed Sentence1: a brown frog
Processed Sentence2: the brown frog
Similarity Score with tf: 0.8164965809277261
Similarity Score with tfXidf: 0.7092972666062739


In [None]:
# Experiment 7, example B — claim under test: tfXidf has higher precision than tf.
# Neither stemming nor stop-word removal is applied.
sentence1 = "The cat enjoys sitting on the mat very much."
sentence2 = "A cat likes to sit on a mat a lot."

# With both flags off, preprocessing only lower-cases, tokenizes and re-joins.
processed_sentence1, processed_sentence2 = (
    encode_sentence(raw, to_stem=False, to_remove_stop_words=False)
    for raw in (sentence1, sentence2)
)

# Same preprocessed pair, two encodings: tf first, then tf-idf.
similarity_score_before = compare_sentences(processed_sentence1, processed_sentence2, encoding_type='tf')
similarity_score_after = compare_sentences(processed_sentence1, processed_sentence2, encoding_type='tfXidf')

# One print call; sep="\n" emits the same lines as separate print statements.
print(
    f"Processed Sentence1: {processed_sentence1}",
    f"Processed Sentence2: {processed_sentence2}",
    f"Similarity Score with tf: {similarity_score_before}",
    f"Similarity Score with tfXidf: {similarity_score_after}",
    sep="\n",
)

Processed Sentence1: the cat enjoys sitting on the mat very much .
Processed Sentence2: a cat likes to sit on a mat a lot .
Similarity Score with tf: 0.3418817293789139
Similarity Score with tfXidf: 0.2095424038071012


In [None]:
# Alternative approach to compute tf, idf, and tfXidf from a suitable corpus

import math
from collections import defaultdict, Counter

def calculate_tf(document):
    """
    Compute the relative frequency of every distinct word in one document.

    Args:
    document (list of str): The document as a list of words.

    Returns:
    dict: Maps each distinct word to (occurrences of the word) / (total words).
          An empty document yields an empty dict.
    """
    n_tokens = len(document)

    tf_scores = {}
    # Counter yields each distinct word together with its number of occurrences.
    for term, occurrences in Counter(document).items():
        tf_scores[term] = occurrences / n_tokens
    return tf_scores

def calculate_idf(documents):
    """
    Calculate IDF (Inverse Document Frequency) for words in a collection of documents.

    The score of a word is log(N / df), where N is the number of documents and df is
    the number of documents containing the word. No smoothing is applied, so a word
    that occurs in every document scores 0.0.

    Args:
    documents (list of list of str): Each element is a list of words representing a document.

    Returns:
    dict: A dictionary where keys are words and values are their IDF scores. Keys appear
          in order of first occurrence across the corpus, so the result (and anything
          printed from it) is identical from one run to the next.
    """
    N = len(documents)
    word_doc_count = defaultdict(int)
    for doc in documents:
        # dict.fromkeys de-duplicates like set() but keeps first-occurrence order;
        # iterating a set of strings gives an order that can change between runs.
        for word in dict.fromkeys(doc):
            word_doc_count[word] += 1
    # Every counted word occurs in at least one document, so count >= 1 here.
    return {word: math.log(N / count) for word, count in word_doc_count.items()}

def calculate_tfidf(document, idfs):
    """
    Weight each word's term frequency in a document by a pre-computed IDF score.

    Args:
    document (list of str): The document as a list of words.
    idfs (dict): Word -> IDF score, computed beforehand from a corpus.

    Returns:
    dict: Maps each distinct word of the document to TF * IDF. A word that is
          missing from `idfs` is given an IDF of 0, so its score is 0.
    """
    tfidf_scores = {}
    for term, tf in calculate_tf(document).items():
        # Words unseen in the corpus fall back to an IDF of 0.
        tfidf_scores[term] = tf * idfs.get(term, 0)
    return tfidf_scores

def merge_sentences(document):
    """
    Flatten a document given as a list of sentences into one list of words.

    Args:
    document (list of list of str): Each element is a sentence represented as a list of words.

    Returns:
    list of str: All words of all sentences, in their original order.
    """
    # Nested comprehension: outer loop over sentences, inner loop over their words.
    return [word for sentence in document for word in sentence]


In [None]:
# Toy corpus: three documents, each given as a list of tokenized sentences.
corpus = [
    [
        ["the", "cat", "sat"],
        ["on", "the", "mat"],
    ],
    [
        ["a", "cat", "is"],
        ["sitting", "on", "the", "mat"],
    ],
    [
        ["there", "is", "a", "mat"],
        ["in", "the", "room"],
    ],
]

# Flatten every document into a single list of words.
merged_documents = list(map(merge_sentences, corpus))

# IDF table computed over the flattened documents.
IDFs = calculate_idf(merged_documents)

# Print one "word:score" line per entry of the table.
print("The IDF scores based on all merged documents: ")
for key in IDFs:
  value = IDFs[key]
  print(f"{key}:{value}")


The IDF scores based on all merged documents: 
the:0.0
sat:1.0986122886681098
on:0.4054651081081644
cat:0.4054651081081644
mat:0.0
sitting:1.0986122886681098
a:0.4054651081081644
is:0.4054651081081644
room:1.0986122886681098
there:1.0986122886681098
in:1.0986122886681098
