#### Tutorial on Term-Document Matrix, TF-IDF, Chunking, Named Entity Recognition (NER), and Word Sense Disambiguation (WSD) 


In [None]:
# Install these libraries, if not done already. 
# !pip install nltk spacy sklearn

In [None]:
!python -m spacy download en_core_web_sm

In [3]:
import nltk
import spacy
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

In [4]:
# Load SpaCy model.
nlp = spacy.load("en_core_web_sm")

# Sample Documents.
documents = [
    "Apple and banana are fruits.",
    "I like to eat apple pie.",
    "The banana pie is delicious."
]

#### Chunking

In [5]:
def chunking_example(doc):
    chunks = []
    for sent in doc.sents:
        for chunk in sent.noun_chunks:
            chunks.append(chunk.text)
    return chunks

#### Named Entity Recognition (NER)

In [17]:
def ner_example(doc):
    entities = [(ent.text, ent.label_) for ent in doc.ents]
    return entities

#### Word Sense Disambiguation 

In [6]:
def wsd_example(word, context):
    # Simplistic approach for demonstration
    if word == "apple":
        if "pie" in context:
            return "The tech company"
        else:
            return "The fruit"
    return None

#### Term-Document Matrix

In [32]:
# Create the Term-Document Matrix using raw counts.
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(documents)

# Create DataFrame for better visualization
df_term_doc = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())

print("Term-Document Matrix:\n", df_term_doc)

Term-Document Matrix:
    and  apple  are  banana  delicious  eat  fruits  is  like  pie  the  to
0    1      1    1       1          0    0       1   0     0    0    0   0
1    0      1    0       0          0    1       0   0     1    1    0   1
2    0      0    0       1          1    0       0   1     0    1    1   0


#### Term Frequency-Inverse Document Frequency (TF-IDF)

In [33]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

# Example Documents
documents = [
    "Apple and banana are fruits.",
    "I like to eat apple pie.",
    "The banana pie is delicious."
]

# Create the TF-IDF Matrix
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(documents)

# Create DataFrame for better visualization
df_tfidf = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())

print("TF-IDF Matrix:\n", df_tfidf)

TF-IDF Matrix:
         and     apple       are    banana  delicious       eat    fruits  \
0  0.490479  0.373022  0.490479  0.373022   0.000000  0.000000  0.490479   
1  0.000000  0.373022  0.000000  0.000000   0.000000  0.490479  0.000000   
2  0.000000  0.000000  0.000000  0.373022   0.490479  0.000000  0.000000   

         is      like       pie       the        to  
0  0.000000  0.000000  0.000000  0.000000  0.000000  
1  0.000000  0.490479  0.373022  0.000000  0.490479  
2  0.490479  0.000000  0.373022  0.490479  0.000000  


In [12]:
# Process sample documents.
docs_spacy = [nlp(doc) for doc in documents]
print(docs_spacy)

[Apple and banana are fruits., I like to eat apple pie., The banana pie is delicious.]


In [13]:
# Try chunking.
# Uses SpaCy to extract noun chunks from each document.
chunks = [chunking_example(doc) for doc in docs_spacy]
print("Chunks:", chunks)

Chunks: [['Apple', 'banana', 'fruits'], ['I', 'apple pie'], ['The banana pie']]


In [24]:
# Try NER.
# Identifies named entities such as persons, locations, and organizations in the documents.
ner_results = [ner_example(doc) for doc in docs_spacy]
print("Named Entities:",ner_results)

Named Entities: [[('Apple', 'ORG')], [], []]


- No named entities detected in documents 2 and 3. It is likely because "apple pie" and “banana pie” are not recognized as a named entity.
- It could be because of model limitations of the pre-trained SpaCy model (en_core_web_sm). Larger models (en_core_web_md or en_core_web_lg), have more extensive training data. So, you can try with them. By using a larger model or adjusting the text, you may get better results.

In [28]:
# Try WSD.
'''
A simplistic approach is used here to demonstrate 
how "apple" might be interpreted based on context.
'''

wsd_results = [wsd_example("apple", doc.text) for doc in docs_spacy]
print("WSD Results:", wsd_results)

WSD Results: ['The fruit', 'The tech company', 'The tech company']
