In [1]:
from nltk.corpus import brown

# List the category labels of the Brown corpus; as the last expression of the
# cell, the returned list is what the notebook displays.
brown.categories()

['adventure',
 'belles_lettres',
 'editorial',
 'fiction',
 'government',
 'hobbies',
 'humor',
 'learned',
 'lore',
 'mystery',
 'news',
 'religion',
 'reviews',
 'romance',
 'science_fiction']

In [9]:
# Preview the word tokens of the Brown corpus texts filed under "humor".
# Relies on `brown` having been imported by an earlier cell.
brown.words(categories="humor")

['It', 'was', 'among', 'these', 'that', 'Hinkle', ...]

In [10]:
import nltk

# Chunk grammar: an NP is an optional <DT>, any number of <JJ>, then one <NN>.
gram = "NP: {<DT>?<JJ>*<NN>}"
chunking = nltk.RegexpParser(gram)

# Tokenize the example sentence, then tag each token with its part of speech.
sent = "last night i saw a black dog barking at a kid"
sent_token = nltk.word_tokenize(sent)
tagging = nltk.pos_tag(sent_token)

# Bare expression so the notebook renders the (token, tag) pairs.
tagging

[('last', 'JJ'),
 ('night', 'NN'),
 ('i', 'NN'),
 ('saw', 'VBD'),
 ('a', 'DT'),
 ('black', 'JJ'),
 ('dog', 'NN'),
 ('barking', 'NN'),
 ('at', 'IN'),
 ('a', 'DT'),
 ('kid', 'NN')]

## Model Construction

In [42]:
from sklearn.feature_extraction.text import CountVectorizer

# NOTE(review): `dict` is a notebook variable that shadows the builtin type.
# It is not defined in this part of the notebook — presumably the QA dataset
# loaded by an earlier cell, with the question texts under 'Question'; confirm.
raw = dict['Question']

# Term-count matrix over the questions, using scikit-learn's built-in English
# stop-word list.
vectorizer = CountVectorizer(stop_words='english')
X = vectorizer.fit_transform(raw)
# Feature names learned by the vectorizer during fit_transform.
word = vectorizer.get_feature_names_out()


In [43]:
from sklearn.feature_extraction.text import TfidfTransformer

# Re-weight the term counts in X (built by the CountVectorizer cell) with TF-IDF.
transformer = TfidfTransformer()
print(transformer)

tfidf = transformer.fit_transform(X)
# toarray() materialises the whole matrix as a dense array purely for display;
# the recorded preview shows only zeros in the corners that get printed.
print(tfidf.toarray())

TfidfTransformer()
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


## QA dataset preprocessing

In [71]:
import nltk

# Split each lower-cased question into runs of word characters (r"\w+"), which
# discards punctuation and whitespace.
tokenizer = nltk.RegexpTokenizer(r"\w+")

# One token list per question, in dataset order.
# NOTE(review): `dict` is a notebook variable shadowing the builtin; it is
# assumed to hold the question strings under 'Question' — confirm against the
# cell that loads the dataset.
tok_documents = [
    tokenizer.tokenize(question.lower()) for question in dict['Question']
]
print(tok_documents[0])


['how', 'african', 'americans', 'were', 'immigrated', 'to', 'the', 'us']


In [73]:
from nltk.corpus import stopwords

# Load the English stop-word list once. The original re-read it from the corpus
# and scanned it linearly for every single token; a set gives O(1) lookups.
english_stopwords = set(stopwords.words('english'))

# Same layout as tok_documents (one list per question), minus stop words.
filtered_documents = [
    [token for token in doc if token.lower() not in english_stopwords]
    for doc in tok_documents
]
print(filtered_documents[0])


['african', 'americans', 'immigrated', 'us']


In [74]:
from nltk.stem import WordNetLemmatizer

lemmer = WordNetLemmatizer()

# Lemmatize token by token with the lemmatizer's default arguments, keeping
# the one-list-per-question layout of filtered_documents.
# NOTE(review): the recorded output shows 'us' being reduced to 'u'.
lemed_documents = [
    [lemmer.lemmatize(token) for token in doc]
    for doc in filtered_documents
]

print(lemed_documents[0])

['african', 'american', 'immigrated', 'u']


In [75]:
# Unique lemmas in first-seen order.
# - A dict literal serves as an ordered set, so de-duplication is O(1) per
#   token instead of scanning the growing list on every lookup.
# - `dict.fromkeys` is deliberately avoided: `dict` is rebound to the dataset
#   in this notebook, so the builtin is not reachable under that name.
# - The comprehension keeps its loop names local, so the notebook-level `word`
#   (feature names from the CountVectorizer cell) is no longer overwritten.
vocabulary = list({token: None for doc in lemed_documents for token in doc})

print(len(vocabulary))

2304


In [79]:
import numpy as np

# Map every vocabulary term to its column once, instead of calling
# vocabulary.index() (a linear scan) for each token of each question.
# setdefault keeps the first position should a term ever appear twice,
# matching what list.index() returned.
column_of = {}
for position, term in enumerate(vocabulary):
    column_of.setdefault(term, position)

# bow[i] is the term-count vector of question i: one slot per vocabulary term,
# incremented once per occurrence of that term in the question.
# Loop names avoid `word`, which holds the CountVectorizer feature names.
bow = {}
for doc_id, doc in enumerate(lemed_documents):
    counts = np.zeros(len(vocabulary))
    for token in doc:
        counts[column_of[token]] += 1
    bow[doc_id] = counts
print(bow[10][10:20])
    

[0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]


## Query preprocessing

In [48]:
import nltk
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer



query = "how are antibodies used"

# Tokenize, tag (universal tagset) and stem the query, printing each stage.
tokenized_query = nltk.word_tokenize(query)
print(tokenized_query)
tagged_response = nltk.pos_tag(tokenized_query, tagset="universal")
print(tagged_response)
sb_stemmer = SnowballStemmer('english')
stemmed_response = [sb_stemmer.stem(word) for word in tokenized_query]
print(stemmed_response)

# Stop-word removal:
# - Use the English list only. stopwords.words() without a language returns
#   the lists of every bundled language, so English content words that happen
#   to be stop words elsewhere would be dropped from the query.
# - Load it once into a set instead of rebuilding the list for every token.
# - Decide on the original token, not its stem: stemming can alter a stop word
#   (e.g. "very" -> "veri") so that it no longer matches the list.
english_stopwords = set(stopwords.words('english'))
tokens_without_sw = [
    stem.lower()
    for token, stem in zip(tokenized_query, stemmed_response)
    if token.lower() not in english_stopwords
]
print(tokens_without_sw)



['how', 'are', 'antibodies', 'used']
[('how', 'ADV'), ('are', 'VERB'), ('antibodies', 'NOUN'), ('used', 'VERB')]
['how', 'are', 'antibodi', 'use']
['antibodi', 'use']


In [68]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer


query = "how are antibodies used"

tokenized_query = nltk.word_tokenize(query)
print(tokenized_query)

tagged_response = nltk.pos_tag(tokenized_query, tagset="universal")
print(tagged_response)

# Lower-case before lemmatizing so the query is normalised the same way as the
# dataset questions, which are lower-cased before tokenization.
lemmer = WordNetLemmatizer()
Lemed_response = [lemmer.lemmatize(word.lower()) for word in tokenized_query]
print(Lemed_response)

# Stop-word removal:
# - Use the English list only. stopwords.words() without a language returns
#   the lists of every bundled language, which can drop valid English terms.
# - Load it once into a set instead of rebuilding the list for every token.
# - Test the lower-cased original token rather than the lemma: the original
#   check was case-sensitive (so "How" survived), and the lemmatizer can strip
#   a trailing "s" (the dataset output shows 'us' -> 'u'), after which a stop
#   word no longer matches the list.
english_stopwords = set(stopwords.words('english'))
tokens_without_sw = [
    lemma.lower()
    for token, lemma in zip(tokenized_query, Lemed_response)
    if token.lower() not in english_stopwords
]
print(tokens_without_sw)



['how', 'are', 'antibodies', 'used']
[('how', 'ADV'), ('are', 'VERB'), ('antibodies', 'NOUN'), ('used', 'VERB')]
['how', 'are', 'antibody', 'used']
['antibody', 'used']
