In [2]:
import nltk
import pandas as pd

from nltk.tokenize import sent_tokenize,word_tokenize

### Reading csv from dataset

In [3]:
df=pd.read_csv('./nlp-getting-started/train.csv')

### Orginal Dataset 

In [4]:
text_df=df['text']
text_df.head()

0    Our Deeds are the Reason of this #earthquake M...
1               Forest fire near La Ronge Sask. Canada
2    All residents asked to 'shelter in place' are ...
3    13,000 people receive #wildfires evacuation or...
4    Just got sent this photo from Ruby #Alaska as ...
Name: text, dtype: object

# TASK 2

In [5]:
sent_df=text_df.apply(sent_tokenize)
sent_df.head(5)

0    [Our Deeds are the Reason of this #earthquake ...
1            [Forest fire near La Ronge Sask., Canada]
2    [All residents asked to 'shelter in place' are...
3    [13,000 people receive #wildfires evacuation o...
4    [Just got sent this photo from Ruby #Alaska as...
Name: text, dtype: object

### TASK 1


In [6]:
from nltk.tokenize import RegexpTokenizer

tokenizer = RegexpTokenizer(r'\w+')

word_df=text_df.apply(tokenizer.tokenize)
word_df.head(5)

0    [Our, Deeds, are, the, Reason, of, this, earth...
1        [Forest, fire, near, La, Ronge, Sask, Canada]
2    [All, residents, asked, to, shelter, in, place...
3    [13, 000, people, receive, wildfires, evacuati...
4    [Just, got, sent, this, photo, from, Ruby, Ala...
Name: text, dtype: object

### TASK 3

In [7]:
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))  

In [8]:
def filter_stop_words(wordList:list)-> list:
    ansLis=[]
    for i in wordList:
        if i not in stop_words:
            ansLis.append(i.lower())
    return ansLis


In [9]:
filtered_df=word_df.apply(filter_stop_words)
filtered_df.head(5)

0    [our, deeds, reason, earthquake, may, allah, f...
1        [forest, fire, near, la, ronge, sask, canada]
2    [all, residents, asked, shelter, place, notifi...
3    [13, 000, people, receive, wildfires, evacuati...
4    [just, got, sent, photo, ruby, alaska, smoke, ...
Name: text, dtype: object

### TASK 4

In [10]:
from nltk.stem import PorterStemmer


In [11]:
def word_stemmer(wordList:list)-> list:
    ansLis=[]
    for i in wordList:
        ansLis.append(PorterStemmer().stem(i))
    return ansLis

In [12]:
stem_df=word_df.apply(word_stemmer)
stem_df.head()

0    [our, deed, are, the, reason, of, thi, earthqu...
1         [forest, fire, near, la, rong, sask, canada]
2    [all, resid, ask, to, shelter, in, place, are,...
3    [13, 000, peopl, receiv, wildfir, evacu, order...
4    [just, got, sent, thi, photo, from, rubi, alas...
Name: text, dtype: object

In [13]:
word_corpus={}

index=0
for i in filtered_df:
    
    for j in i:
        if j not in word_corpus:
            word_corpus[j]=[]
            word_corpus[j].append(index)
        else:
            if index not in word_corpus[j]:
                word_corpus[j].append(index)
    
    index+=1
        
    

In [14]:
from prettyprinter import pprint
pprint(word_corpus)

{
    'our': [
        0,
        1371,
        1550,
        1645,
        1673,
        2220,
        2431,
        2855,
        2976,
        3099,
        3124,
        3172,
        3235,
        3345,
        3369,
        3618,
        3786,
        4024,
        4209,
        4281,
        4322,
        4630,
        4659,
        4987,
        6567,
        7157
    ],
    'deeds': [0, 4985],
    'reason': [
        0,
        304,
        305,
        317,
        319,
        746,
        763,
        781,
        894,
        1920,
        2112,
        2252,
        2747,
        4333,
        4843,
        5372,
        6232,
        6453,
        6459,
        7218
    ],
    'earthquake': [
        0,
        3027,
        3028,
        3029,
        3030,
        3032,
        3033,
        3034,
        3035,
        3037,
        3038,
        3039,
        3041,
        3043,
        3044,
        3046,
        3047,
        3048,
        3049,
        3050,
      

In [15]:
def merge(word1,word2,word_corpus):
    postings1=word_corpus[word1]
    postings2=word_corpus[word2]

    a,b=len(postings1),len(postings2)

    i,j=0,0
    results=[]
    while i<a and j<b:
        if postings1[i]==postings2[j]:
            results.append(postings1[i])
            i+=1
            j+=1
        elif postings1[i]<postings2[j]:
            i+=1
        else:
            j+=1
    return results


In [16]:
merge('got','sent',word_corpus=word_corpus)

[4]