In [65]:
from gnewsclient import gnewsclient
from newspaper import Article
import re
from nltk.tokenize import word_tokenize
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from collections import OrderedDict

In [66]:
# Google News sections to query, and the `max_results` value passed to the client.
topics = ['Health', 'Entertainment', 'Science', 'Sports', 'Nation']
articles_per_topic = 20

# A single client is reused for all sections; only its `topic` attribute
# is switched before each request.
client = gnewsclient.NewsClient(
    language='English',
    location='india',
    topic=topics[0],
    max_results=articles_per_topic,
)

# news[k] holds whatever `get_news()` returned for topics[k].
news = []
for topic in topics:
    client.topic = topic
    news.append(client.get_news())

In [67]:
# Download and parse every fetched headline; `documents` collects the plain
# article text. A document's position in this list is its id in the index
# built later, so articles that fail to load simply get no id.
documents = []
total_categories = len(news)

for topic_results in news:
    for item in topic_results:
        url = item['link']
        try:
            article = Article(url)
            article.download()
            article.parse()
        except Exception as exc:
            # Best-effort crawl: skip articles that cannot be fetched or
            # parsed, but say so. Catching `Exception` (not a bare `except`)
            # lets KeyboardInterrupt / SystemExit still stop the cell.
            print("Skipping", url, "-", exc)
            continue
        documents.append(article.text)



In [68]:
# Number of article texts collected so far (articles that failed to load are not included).
len(documents)

100

# **Apply Preprocessing**
## **1. Remove Punctuation**<br>
## **2. Case Folding**

In [69]:
# Drop every character that is neither a word character nor whitespace,
# then lower-case the text. Slice assignment keeps the same list object.
documents[:] = [re.sub(r'[^\w\s]', '', text).lower() for text in documents]

In [70]:
# Stop words consulted by the token filter in the preprocessing loop.
# NOTE(review): `words()` is called without a language argument, so this
# presumably loads stop words for every language in the corpus - confirm
# that this is intended rather than `stopwords.words('english')`.
stopwrds = stopwords.words()

# One list of processed tokens per document, filled by the preprocessing loop.
tokens = []


## **3. Remove Stopwords**<br>
## **4. Perform Stemming**
## **5. Remove words from other languages, if present**

In [71]:
ps = PorterStemmer()
words = nltk.corpus.words.words()

# Membership is tested once per token, so look-ups go through sets instead of
# scanning the full word list / stop-word list each time. The filtering
# outcome is the same; only the look-up cost changes.
vocabulary = set(words)
stop_set = set(stopwrds)

# Rebuild `tokens` from scratch so that re-running this cell does not append
# a second copy of every document's tokens.
tokens = []
for text in documents:
    # Keep a token only if it is in the reference word list and is not a
    # stop word; the surviving tokens are stemmed.
    tokens.append([
        ps.stem(word)
        for word in word_tokenize(text)
        if word in vocabulary and word not in stop_set
    ])

print(len(tokens))
print(len(documents))

100
100


## **5. Creation of the Inverted Index**

In [72]:
# Inverted index: stemmed term -> list of ids of the documents containing it.
# NOTE: the name `dict` shadows the built-in type; it is kept because the
# following cells refer to the index by this name.
dict = {}
for i in range(len(documents)):
    for word in tokens[i]:
        postings = dict.setdefault(word, [])
        # Document ids are visited in ascending order, so a repeated term in
        # the same document can only duplicate the last posting; checking it
        # avoids scanning the whole list.
        if not postings or postings[-1] != i:
            postings.append(i)

## **6. Sorting the Index terms**

In [73]:
# Rebuild the index with its terms in ascending sorted order.
dict = OrderedDict((term, dict[term]) for term in sorted(dict))

In [74]:
# Dump the whole index, one "term --> [document ids]" line per term.
for term, doc_ids in dict.items():
    print(term, "-->", doc_ids)

aam --> [87]
abandon --> [65]
abil --> [6, 7, 14, 43, 60]
abl --> [17, 38, 42, 43, 44, 47, 48, 49, 50, 62, 65, 77, 78, 79]
abnorm --> [2]
aboard --> [55]
abras --> [45]
abroad --> [33]
absenc --> [33, 67]
absent --> [63]
absorb --> [59]
abu --> [49]
abund --> [32, 49]
academi --> [19]
acceler --> [44, 57, 79, 93]
accept --> [22, 60, 62, 81]
access --> [14, 25, 43, 45, 84]
accid --> [36]
accommod --> [99]
accompani --> [77]
accomplish --> [62]
accord --> [1, 2, 9, 10, 14, 17, 18, 26, 36, 44, 49, 52, 54, 59, 72, 75, 77, 81, 83, 85, 86, 88, 91, 93, 95, 97]
account --> [7, 14, 42, 54]
accuraci --> [60]
accus --> [21, 65, 87]
achiev --> [14, 95]
acknowledg --> [2, 57]
across --> [7, 13, 19, 28, 29, 32, 45, 46, 49, 52, 54, 57, 62, 80, 87, 97]
act --> [22, 25, 29, 36, 46, 57]
action --> [32, 41, 65, 67, 81]
activ --> [10, 16, 43, 52, 53, 59, 82, 89, 93, 95, 97]
actor --> [20, 22, 25, 29, 32, 33, 36]
actress --> [27, 30, 31, 39]
actual --> [22, 26, 44, 58]
acut --> [2, 13, 83]
ad --> [2, 13, 1

In [81]:
def boolean_check(term1="", term2="", term3="", operators=(1, 1, 1)):
    """Evaluate a conjunctive boolean query of up to three terms.

    Each term is normalised with ``normalizeTerm`` and looked up in the
    inverted index ``dict``. The per-term results are intersected (AND).

    Parameters
    ----------
    term1, term2, term3 : str
        Query terms. A term that is empty after normalisation is ignored,
        so one- and two-term queries work with the default arguments.
    operators : sequence of int
        ``operators[k] == 1`` requires term ``k+1`` to be present; any other
        value negates it (documents that do NOT contain the term).

    Returns
    -------
    set of int, or ``()``
        Ids of the matching documents. The empty tuple ``()`` is returned
        when nothing matches or when no usable term was supplied.
    """
    all_doc_ids = set(range(len(documents)))
    matches = None

    for position, raw_term in enumerate((term1, term2, term3)):
        term = normalizeTerm(raw_term)
        if not term:
            # Unused slot (default "") or a term that was only punctuation.
            continue

        # A term missing from the index occurs in no document, so its
        # posting set is empty and its negation is every document.
        postings = set(dict.get(term, ()))
        if operators[position] != 1:
            postings = all_doc_ids - postings

        matches = postings if matches is None else matches & postings

    # `()` (not an empty set) is kept as the "no result" value so existing
    # callers see the same thing as before.
    if not matches:
        return ()
    return matches


# **Normalize the terms of the query**

In [82]:
def normalizeTerm(term):
    """Return ``term`` with punctuation removed, lower-cased and stemmed.

    Applies the same punctuation regex and lower-casing as the document
    preprocessing, then stems the result with the shared ``ps`` stemmer.
    """
    without_punctuation = re.sub(r'[^\w\s]', '', term)
    return ps.stem(without_punctuation.lower())

In [85]:
# Example query: documents containing "abl" AND "accord" AND NOT "area"
# (an operator value of 1 requires the term; any other value negates it).
boolean_check("abl", "accord", "area", [1,1,0])

{17, 44, 49, 77}