In [None]:
from nltk.corpus import stopwords 
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
# -----------------------------------------------------------
def tokenize(docs):
    """Lowercase and tokenize every document in ``docs``.

    Each value is replaced, in place, by the result of ``word_tokenize``
    applied to its lowercased text.

    Args:
        docs: Mapping of document id to raw text; every value must
            support ``.lower()``.

    Returns:
        The same ``docs`` object that was passed in, now mapping each id
        to its tokens.
    """
    # Only existing keys are reassigned, so iterating the dict while
    # writing to it is safe (its size never changes).
    for doc_id in docs:
        docs[doc_id] = word_tokenize(docs[doc_id].lower())
    return docs
# -----------------------------------------------------------
def stopword_rem(tokendocs):
    """Drop English stopwords from every tokenized document.

    Args:
        tokendocs: Mapping of document id to an iterable of tokens.

    Returns:
        The same ``tokendocs`` object, with each value replaced in place
        by a list of the tokens that are not in NLTK's English stopword
        list. Token order is preserved.
    """
    # A set gives constant-time membership tests for the filter below.
    english_stopwords = set(stopwords.words('english'))
    for doc_id, tokens in tokendocs.items():
        tokendocs[doc_id] = [tok for tok in tokens if tok not in english_stopwords]
    return tokendocs
# -----------------------------------------------------------
def stemming(tokened):
    """Stem every token of every document with a Porter stemmer.

    Args:
        tokened: Mapping of document id to an iterable of tokens.

    Returns:
        The same ``tokened`` object, with each value replaced in place by
        the list of stemmed tokens (same order, same length).
    """
    porter = PorterStemmer()
    for doc_id in tokened:
        tokened[doc_id] = list(map(porter.stem, tokened[doc_id]))
    return tokened
# -----------------------------------------------------------
def tdm_build(docs):
    """Build a binary term-document matrix from tokenized documents.

    Args:
        docs: Mapping of document id to an iterable of hashable tokens.

    Returns:
        A dict whose keys are the distinct tokens in sorted order. Each
        value is a list with one entry per document, in the iteration
        order of ``docs``: 1 if the document contains the token, else 0.
        An empty ``docs`` yields an empty dict.
    """
    # One set per document: membership tests become O(1) and the
    # vocabulary is simply the union of these sets.
    doc_vocabs = [set(doc) for doc in docs.values()]

    # Build the vocabulary once, after all documents have been seen, so an
    # empty corpus yields an empty matrix instead of an unbound name.
    vocabulary = sorted(set().union(*doc_vocabs))

    return {
        word: [int(word in vocab) for vocab in doc_vocabs]
        for word in vocabulary
    }
# -----------------------------------------------------------
def search_tdm_two_words(word1, word2, tdm):
    """Print the result of an AND query over two term-document matrix rows.

    Prints the row for ``word1``, the row for ``word2``, and then their
    element-wise ``&``, each on its own line. Nothing is returned.

    Args:
        word1: First query term; must be a key of ``tdm``.
        word2: Second query term; must be a key of ``tdm``.
        tdm: Mapping of term to a list of 0/1 flags, one per document.

    Raises:
        KeyError: If either term is not a key of ``tdm``.
    """
    row1 = tdm[word1]
    row2 = tdm[word2]
    print(row1)
    print(row2)
    # zip stops at the shorter row, so the result has min(len) entries.
    both = [left & right for left, right in zip(row1, row2)]
    print(both)
# -----------------------------------------------------------
def build_inverted_index(docs):
    """Build an inverted index from tokenized documents.

    Args:
        docs: Mapping of document id to a collection of tokens.

    Returns:
        A dict whose keys are the distinct tokens in sorted order. Each
        value is the list of ids of the documents containing that token,
        in the iteration order of ``docs``.
    """
    vocabulary = sorted({token for tokens in docs.values() for token in tokens})
    return {
        term: [doc_id for doc_id, tokens in docs.items() if term in tokens]
        for term in vocabulary
    }
# -----------------------------------------------------------
def search_inverted_two_words(word1, word2, inverted_index):
    """Print the result of an AND query over two postings lists.

    Prints the postings list for ``word1``, the postings list for
    ``word2``, and then the set of document ids common to both, each on
    its own line. Nothing is returned.

    Args:
        word1: First query term; must be a key of ``inverted_index``.
        word2: Second query term; must be a key of ``inverted_index``.
        inverted_index: Mapping of term to a list of document ids.

    Raises:
        KeyError: If either term is not a key of ``inverted_index``.
    """
    postings1 = inverted_index[word1]
    postings2 = inverted_index[word2]

    print(postings1)
    print(postings2)

    common_docs = set(postings1) & set(postings2)
    print(common_docs)


In [2]:
# Sample corpus: document id -> raw text.
docs = {
    "doc1": "I have fever and headache",
    "doc2": "He has fever and cough",
    "doc3": "Cough and headache are common",
}

# Preprocessing pipeline: tokenize -> remove stopwords -> stem.
docs = stemming(stopword_rem(tokenize(docs)))

# Build the term-document matrix and run an AND query on it.
tdm = tdm_build(docs)
print(tdm)
search_tdm_two_words("fever", "cough", tdm)


{'common': [0, 0, 1], 'cough': [0, 1, 1], 'fever': [1, 1, 0], 'headach': [1, 0, 1]}
[1, 1, 0]
[0, 1, 1]
[0, 1, 0]


In [60]:
# Build the inverted index from the preprocessed docs and display it.
inverted=build_inverted_index(docs)
inverted

{'common': ['doc3'],
 'cough': ['doc2', 'doc3'],
 'fever': ['doc1', 'doc2'],
 'headach': ['doc1', 'doc3']}

In [62]:
# AND query: print the postings of both terms and the ids they share.
search_inverted_two_words("fever","cough",inverted)

['doc1', 'doc2']
['doc2', 'doc3']
{'doc2'}


**Task 5 (Compare: TDM vs Inverted Index)**
In the results above for the AND query **“fever” AND “cough”**, both methods return the same matching document: **doc2**. With the **Term–Document Matrix (TDM)**, each word is represented as a full binary vector over *all documents* (e.g., “fever” → `[1,1,0]`, “cough” → `[0,1,1]`), and the AND search is done by a position-wise bitwise AND that produces `[0,1,0]`; the only 1 is in the second position, which corresponds to **doc2**, so only **doc2** satisfies both terms. With the **Inverted Index**, each word maps directly to the list of documents that contain it (e.g., “fever” → `[doc1, doc2]`, “cough” → `[doc2, doc3]`), and the AND search is done by intersecting these two lists, giving `{doc2}`. The key difference is efficiency and storage: **a TDM query touches one entry per document in the collection for each query term, whether or not the document contains that term, and the matrix stores a cell for every term–document pair, most of which are 0 (a large sparse matrix)**, while **an inverted index only processes the documents that actually contain the query terms (and stores compact postings lists)**—which scales much better as the number of documents grows.
