# [Workshop] Textual Knowledge Processing

<img src="ws_img_001.png">

# 0. Package Installation (one-time job)

In [1]:
# !pip install -U spacy
# !python -m spacy download en_core_web_sm
# !pip install pandas
# !pip install gingerit
# !pip install gensim==3.8.0

# 1. Import Library

In [2]:
import spacy
import pandas as pd
from gingerit.gingerit import GingerIt

In [3]:
import nltk
import gensim
import gensim.downloader
from gensim.models import KeyedVectors
#nltk.download('word2vec_sample')

In [4]:
import nltk
import re
from nltk.corpus import stopwords
from nltk.corpus import wordnet

# 2. Text Preprocessing

## 2.1 Lower casing

In [5]:
def lower_casing(sentence):
    """Return a lower-cased copy of ``sentence``."""
    # Quiz: How to implement this function without using str.lower()?
    return sentence.lower()

In [6]:
def lower_casing_spacy(sentence):
    """Lower-case ``sentence`` (first step of the spaCy-based pipeline)."""
    lowered = sentence.lower()
    return lowered

## 2.2 Abbreviation expansion

In [7]:
def expand_abbriviation(sentence):
    """Expand common English contractions in ``sentence``.

    The rules are applied in order: the irregular forms first (``won't``,
    ``can't``, ``i'm``, ``ain't``), then the generic suffix rules (``'ll``,
    ``n't``, ``'ve``, ``'s``, ``'re``, ``'d``).

    The patterns are case-sensitive and written in lower case, so the input
    is expected to be lower-cased already.  Note that ``'s`` is always
    expanded to " is" and ``'d`` to " would", whatever the actual meaning.

    Args:
        sentence: The text to process.

    Returns:
        The text with every matching contraction expanded.
    """
    # Raw strings are used for both patterns and replacements: '\g<1>' inside
    # a normal string literal is an invalid escape sequence and triggers a
    # DeprecationWarning / SyntaxWarning on recent Python versions.
    replacement_patterns = [
        (r"won't", r'will not'),
        (r"can't", r'cannot'),
        (r"i'm", r'i am'),
        (r"ain't", r'is not'),
        (r"(\w+)'ll", r'\g<1> will'),
        (r"(\w+)n't", r'\g<1> not'),
        (r"(\w+)'ve", r'\g<1> have'),
        (r"(\w+)'s", r'\g<1> is'),
        (r"(\w+)'re", r'\g<1> are'),
        (r"(\w+)'d", r'\g<1> would')]

    new_sentence = sentence
    for regex, repl in replacement_patterns:
        # re.sub caches compiled patterns internally, so no explicit compile.
        new_sentence = re.sub(regex, repl, new_sentence)
    return new_sentence

In [8]:
# Word vectors queried by expand_abbriviation_spacy (via model.most_similar),
# loaded once from the local file 'wiki.kv'.
# NOTE(review): gensim's native load format is pickle-based as far as I know —
# only load vector files that come from a trusted source; confirm.
model = KeyedVectors.load('wiki.kv')
def expand_abbriviation_spacy(sentence):
    """Expand short upper-case acronyms using the global word-vector ``model``.

    For every whitespace-separated token that, after removing dots, is an
    upper-case string of 2 or 3 characters, the 200 nearest neighbours of the
    token are fetched from ``model``.  The first neighbour that is a
    hyphen-joined phrase with one part per acronym letter, and whose parts'
    initials spell the acronym exactly (case-sensitive), replaces the token,
    with hyphens turned into spaces.

    Tokens that are not in the model's vocabulary, or for which no neighbour
    matches, are left unchanged.  Because the input is split and re-joined,
    runs of whitespace collapse to single spaces.

    Args:
        sentence: The text to process; acronyms must still be in upper case.

    Returns:
        The sentence with recognised acronyms expanded.
    """
    expansions = {}

    for token in sentence.split():
        acronym = token.replace('.', '')

        # Only 2-3 character upper-case tokens are acronym candidates.
        # A single letter ("I", "A") is excluded: any one-word neighbour that
        # starts with that letter would otherwise count as its expansion.
        # isupper() also rejects caseless tokens such as "21" or "?".
        if not (2 <= len(acronym) <= 3 and acronym.isupper()):
            continue

        try:
            candidates = [entry[0] for entry in model.most_similar(acronym, topn=200)]
        except KeyError:
            # Out-of-vocabulary token: nothing to expand it with.
            continue

        for candidate in candidates:
            parts = candidate.split('-')
            if len(parts) != len(acronym):
                continue

            # Empty parts (e.g. from a doubled hyphen) contribute no initial.
            initials = ''.join(part[0] for part in parts if part)
            if initials == acronym:
                expansions[token] = candidate.replace('-', ' ')
                break

    return " ".join(expansions.get(token, token) for token in sentence.split())

## 2.3 Punctuation removal

In [9]:
def punctuation_removal(sentence):
    """Delete a fixed set of punctuation characters from ``sentence``.

    The removed characters are:
    , ! ? " < > ( ) [ ] { } @ # + = - _ ~ & * ^ % | $ / ` . '

    The apostrophe IS removed.  Colon, semicolon and backslash are not in
    the set and are therefore left untouched.

    Args:
        sentence: The text to clean.

    Returns:
        The text with the characters above deleted (nothing is inserted in
        their place).
    """
    # A translation table avoids the regex alternation, whose non-raw string
    # literal contained invalid escape sequences such as '\?' and '\('.
    removed_chars = ",!?\"<>()[]{}@#+=-_~&*^%|$/`.'"
    return sentence.translate(str.maketrans('', '', removed_chars))

In [10]:
def punctuation_removal_spacy(sentence):
    """Keep only alphabetic tokens and bare apostrophe tokens.

    The sentence is parsed with the global ``nlp`` pipeline; the texts of the
    surviving tokens are joined back together with single spaces.
    """
    kept = []
    for token in nlp(sentence):
        if token.is_alpha or token.text == "'":
            kept.append(token.text)
    return " ".join(kept)

## 2.4. Sentence tokenization

In [11]:
def tokenization(sentence):
    """Split ``sentence`` into tokens with ``nltk.word_tokenize``."""
    return nltk.word_tokenize(sentence)

## 2.5. Spelling correction

In [12]:
def spell_correction(sentence):
    """Return the ``'result'`` entry of ``GingerIt().parse(sentence)``."""
    checker = GingerIt()
    return checker.parse(sentence)['result']

## 2.6. Stopwords removal

In [13]:
def stopword_removal(sentence, stoplist_path='./stopwords.txt'):
    """Drop stop words from a list of tokens.

    The stop list is read from ``stoplist_path`` (one word per line) on every
    call.  Each line is stripped of surrounding whitespace and lower-cased;
    blank lines are ignored.  Tokens are compared as-is, so they are expected
    to be lower-cased already.

    Args:
        sentence: Iterable of tokens.
        stoplist_path: Path of the stop-word file; defaults to
            ``./stopwords.txt`` in the working directory.

    Returns:
        A list of the tokens that are not in the stop list, in order.
    """
    with open(stoplist_path) as file:
        # strip() rather than replace('\n', ''): a trailing space or stray
        # '\r' on a line would otherwise keep that stop word from matching.
        # A set gives constant-time membership tests.
        stoplist = {line.strip().lower() for line in file if line.strip()}

    return [word for word in sentence if word not in stoplist]

In [14]:
def stopword_removal_spacy(sentence):
    """Remove the tokens that the global ``nlp`` pipeline flags as stop words.

    The remaining token texts are joined with single spaces.
    """
    doc = nlp(sentence)
    return " ".join(token.text for token in doc if not token.is_stop)

## 2.7. Lemmatization

In [15]:
def get_wordnet_pos(word):
    """Map the NLTK POS tag of a single word to a WordNet POS constant.

    The word is tagged on its own with ``nltk.pos_tag``.  Tags starting with
    J, V, N or R map to ``wordnet.ADJ``, ``wordnet.VERB``, ``wordnet.NOUN``
    and ``wordnet.ADV`` respectively; any other tag yields ``None``.
    """
    tag = nltk.pos_tag([word])[0][1]

    # Dispatch on the first letter of the tag; the wordnet attribute is only
    # looked up once a matching letter has been found.
    attribute_by_initial = {'J': 'ADJ', 'V': 'VERB', 'N': 'NOUN', 'R': 'ADV'}
    attribute = attribute_by_initial.get(tag[:1])
    if attribute is None:
        return None
    return getattr(wordnet, attribute)


def lemmatization(sentence):
    """Lemmatize every token of ``sentence`` with NLTK's WordNetLemmatizer.

    Each token's POS comes from ``get_wordnet_pos``; when that returns a
    falsy value the token is lemmatized as a noun.  Returns a new list.
    """
    lemmatizer = nltk.stem.WordNetLemmatizer()

    lemmas = []
    for word in sentence:
        pos = get_wordnet_pos(word) or wordnet.NOUN
        lemmas.append(lemmatizer.lemmatize(word, pos))
    return lemmas

In [16]:
def lemmatization_spacy(sentence):
    """Replace every token by its spaCy lemma.

    The sentence is parsed with the global ``nlp`` pipeline and the
    ``lemma_`` of each token is joined back with single spaces.
    """
    lemmas = []
    for token in nlp(sentence):
        lemmas.append(token.lemma_)
    return " ".join(lemmas)

## 2.8. Integrate all the functions

In [17]:
def text_preprocessing(raw_sentence):
    """Run the NLTK/regex-based preprocessing pipeline on one sentence.

    Steps, in order: lower-casing, contraction expansion, punctuation
    removal, tokenization, stop-word removal, lemmatization.  The value
    returned by the last step is returned unchanged.
    """
    pipeline = (
        lower_casing,
        expand_abbriviation,
        punctuation_removal,
        tokenization,
        stopword_removal,
        lemmatization,
    )

    result = raw_sentence
    for step in pipeline:
        result = step(result)
    return result

In [18]:
def my_preprocessing(raw_sentence):
    """Run the spaCy-based preprocessing pipeline and return a list of tokens.

    Steps, in order: acronym expansion, lower-casing, punctuation removal,
    stop-word removal, lemmatization.  The resulting string is split on
    whitespace.

    Acronym expansion has to run BEFORE lower-casing:
    expand_abbriviation_spacy only considers tokens that are entirely upper
    case, so on already lower-cased text it never expands anything.

    Args:
        raw_sentence: The untouched input sentence.

    Returns:
        List of processed tokens.
    """
    sentence = expand_abbriviation_spacy(raw_sentence)
    sentence = lower_casing_spacy(sentence)
    sentence = punctuation_removal_spacy(sentence)
    sentence = stopword_removal_spacy(sentence)
    sentence = lemmatization_spacy(sentence)

    return sentence.split()

# 3. Let's have a try

In [19]:
# Read the raw question base: one entry per line, newline characters removed.
with open('./questionbase_raw.txt') as file:
    raw_sentences = []
    for line in file:
        raw_sentences.append(line.replace('\n', ''))

In [20]:
# spaCy pipeline read as the global `nlp` by the *_spacy helper functions.
nlp = spacy.load('en_core_web_sm')

# Run every sentence through both pipelines and print the results side by
# side, together with the tokens that only one of the pipelines produced.
i = 1  # running number of the printed sentences
for raw_sentence in raw_sentences:
    # Bare 'Q' / 'A' lines are never printed (presumably question/answer
    # markers in the question base — confirm), so skip them before doing any
    # preprocessing work on them.
    if raw_sentence in ('Q', 'A'):
        continue

    processed_sentence1 = text_preprocessing(raw_sentence)
    processed_sentence2 = my_preprocessing(raw_sentence)

    print(i, raw_sentence,"\n")
    print("NLTK: ",processed_sentence1,"\n")
    print("SPACY: ",processed_sentence2,"\n")
    print("NLTK extra tokens: ",set(processed_sentence1)-set(processed_sentence2),"\n")
    print("SPACY extra tokens:: ",set(processed_sentence2)-set(processed_sentence1),"\n")
    print('*'*100)
    i += 1

1 Hello 

NLTK:  [] 

SPACY:  ['hello'] 

NLTK extra tokens:  set() 

SPACY extra tokens::  {'hello'} 

****************************************************************************************************
2 Hello, I am ASD knowledge bot. Feel free to ask me anything about autism spectrum disorder (ASD). 

NLTK:  ['asd', 'knowledge', 'bot', 'feel', 'free', 'autism', 'spectrum', 'disorder', 'asd'] 

SPACY:  ['hello', 'asd', 'knowledge', 'bot', 'feel', 'free', 'ask', 'autism', 'spectrum', 'disorder', 'asd'] 

NLTK extra tokens:  set() 

SPACY extra tokens::  {'ask', 'hello'} 

****************************************************************************************************
3 What is definition of Autistic Spectrum Disorder? 

NLTK:  ['definition', 'autistic', 'spectrum', 'disorder'] 

SPACY:  ['definition', 'autistic', 'spectrum', 'disorder'] 

NLTK extra tokens:  set() 

SPACY extra tokens::  set() 

*************************************************************************************

20 People with ASD may also experience sleep problems and irritability. Although people with ASD experience many challenges, they may also have many strengths, including: 

NLTK:  ['people', 'asd', 'experience', 'sleep', 'irritability', 'people', 'asd', 'experience', 'challenge', 'strength', 'include', ':'] 

SPACY:  ['people', 'asd', 'experience', 'sleep', 'problem', 'irritability', 'people', 'asd', 'experience', 'challenge', 'strength', 'include'] 

NLTK extra tokens:  {':'} 

SPACY extra tokens::  {'problem'} 

****************************************************************************************************
21 Being able to learn things in detail and remember information for long periods of time. 

NLTK:  ['learn', 'detail', 'remember', 'period', 'time'] 

SPACY:  ['able', 'learn', 'thing', 'detail', 'remember', 'information', 'long', 'period', 'time'] 

NLTK extra tokens:  set() 

SPACY extra tokens::  {'able', 'thing', 'information', 'long'} 

**********************************

In [21]:
# Compare the two results and explain which one is better and why?
# Provide your answer here
# In my opinion, the preprocessing done by spaCy is better than that done by NLTK.
# The NLTK pipeline doesn't remove some unwanted tokens, such as digits or ':'.
# NLTK also makes some sentences too short after preprocessing. For example, in sentence 21, "information" and "long" seem like important tokens. spaCy retains such tokens.

---