In [1]:
import re
import nltk
from nltk.corpus import stopwords
from nltk import sent_tokenize, word_tokenize
from nltk.stem import PorterStemmer, SnowballStemmer, LancasterStemmer, WordNetLemmatizer

## Stemming & Lemmatization

In [2]:
def get_stem(sentence, stemmerName = "porter", return_token = False):
    """
    Stem every token of a sentence with the chosen NLTK stemmer.

    parameters :
    - sentence : str, tokenized with nltk.word_tokenize before stemming
    - stemmerName : which stemmer to use
        > "porter" (~PorterStemmer, default)
        > "snow" (~SnowballStemmer, english)
        > "lancaster" (~LancasterStemmer)
    - return_token : when truthy, return the list of stemmed tokens
      instead of one joined string

    return : the stemmed tokens joined by single spaces (str), or the
    list of stemmed tokens when return_token is truthy

    raises : ValueError when stemmerName is not one of the names above

    >>> get_stem("connecting connection connections", "snow")
    'connect connect connect'
    >>> get_stem("connecting connection connections", "snow", return_token=True)
    ['connect', 'connect', 'connect']
    """
    if stemmerName == "porter":
        stemmer = PorterStemmer()
    elif stemmerName == "snow":
        stemmer = SnowballStemmer('english')
    elif stemmerName == "lancaster":
        stemmer = LancasterStemmer()
    else:
        raise ValueError('please select from valid stemmers ["porter", "snow", "lancaster"]')

    stem_tokens = [stemmer.stem(token) for token in nltk.word_tokenize(sentence)]

    if return_token:
        return stem_tokens

    return " ".join(stem_tokens)
    

In [4]:
# Stem a sample sentence with the Snowball stemmer; the joined string is the cell output.
get_stem(
    'messaging be cleaned may involved some thing like adjacenting space tabs',
    stemmerName="snow",
)

'messag be clean may involv some thing like adjac space tab'

In [5]:
def get_lemma(sentence, pos = "n"):
    """
    Lemmatize every token of a sentence with NLTK's WordNetLemmatizer.

    parameters :
    - sentence : str, tokenized with nltk.word_tokenize before lemmatizing
    - pos : WordNet part-of-speech tag applied to every token
        > "n" noun (default)
        > "v" verb
        > "a" adjective
        > "r" adverb
        > "s" satellite adjective
      With the default "n", verb forms such as "going" are returned
      unchanged; pass pos="v" to reduce them to their base form.

    return : list of lemmatized tokens, in input order
    """
    wnl = WordNetLemmatizer()

    # One part-of-speech tag is used for the whole sentence; no per-token tagging is done.
    return [wnl.lemmatize(token, pos) for token in nltk.word_tokenize(sentence)]

In [6]:
# Lemmatize several forms of "go"; the resulting token list is the cell output.
get_lemma(sentence='go goes went going gonna')

['go', 'go', 'went', 'going', 'gon', 'na']

## BOW & TF-IDF

In [7]:
# Sample multi-sentence paragraph about NLP methods.
# NOTE(review): `text` is not referenced by any other cell visible here — confirm it is still needed.
text = """
A major drawback of statistical methods is that they require elaborate feature engineering. 
Since the early 2010s, the field has thus largely abandoned statistical methods and shifted to neural networks for machine learning. 
Popular techniques include the use of word embeddings to capture semantic properties of words, and an increase in end-to-end learning of a higher-level task (e.g., question answering) instead of relying on a pipeline of separate intermediate tasks (e.g., part-of-speech tagging and dependency parsing).
In some areas, this shift has entailed substantial changes in how NLP systems are designed, such that deep neural network-based approaches may be viewed as a new paradigm distinct from statistical natural language processing. 
For instance, the term neural machine translation (NMT) emphasizes the fact that deep learning-based approaches to machine translation directly learn sequence-to-sequence transformations, obviating the need for intermediate steps such as word alignment and language modeling that was used in statistical machine translation (SMT). 
Latest works tend to use non-technical structure of a given task to build proper neural network
"""

In [8]:
# Number of entries in NLTK's English stop-word list.
len(stopwords.words("english"))

179

### Cleaning the dataset

In [9]:
def clean_text(sentence_list, sentence_str =False):
    """
    Turn sentences into lowercase, stop-word-free, stemmed strings.

    parameters :
    - sentence_list : iterable of sentence strings by default, or a single
      str when sentence_str is truthy
    - sentence_str : set to True when sentence_list is one str; it is then
      split into sentences with sent_tokenize

    Each sentence has every non-letter character replaced by a space, is
    lowercased and split on whitespace; English stop words are dropped and
    the remaining words are stemmed with get_stem (its default stemmer).

    return : list of cleaned sentences (str), one per input sentence
    """
    if sentence_str:
        sentences = sent_tokenize(sentence_list)
    else:
        sentences = list(sentence_list)

    # Nothing to clean: skip loading the stop-word corpus.
    if not sentences:
        return []

    # Load the stop words once per call, as a set, instead of re-reading
    # the list for every single word.
    stop_words = set(stopwords.words('english'))

    corpus = []
    for sen in sentences:
        words = re.sub("[^a-zA-Z]", " ", sen).lower().split()
        kept = [get_stem(word) for word in words if word not in stop_words]
        corpus.append(" ".join(kept))

    return corpus

In [10]:
# Three short sample sentences used to demonstrate the vectorizers.
test = [
    "This document is the first document",
    "This document is the second document",
    "and this is the third one",
]

In [11]:
# Run the sample sentences through clean_text and display the cleaned corpus.
corpus = clean_text(sentence_list=test)
corpus

['document first document', 'document second document', 'third one']

### Bag of Words

In [12]:
from sklearn.feature_extraction.text import CountVectorizer

# Fit a CountVectorizer on the cleaned corpus and convert the result to a dense array.
cv = CountVectorizer()
bow = cv.fit_transform(corpus).toarray()

In [13]:
# Display the document-term count matrix (one row per cleaned sentence).
bow

array([[2, 1, 0, 0, 0],
       [2, 0, 0, 1, 0],
       [0, 0, 1, 0, 1]], dtype=int64)

In [14]:
# binary=True stores every non-zero term count as 1, so the matrix records
# presence/absence of a term instead of how often it occurs.
cv_binary = CountVectorizer(binary=True)
bow_binary = cv_binary.fit_transform(corpus).toarray()
bow_binary

array([[1, 1, 0, 0, 0],
       [1, 0, 0, 1, 0],
       [0, 0, 1, 0, 1]], dtype=int64)

### TF-IDF

In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit a TfidfVectorizer on the cleaned corpus and convert the result to a dense array.
tf = TfidfVectorizer()
tfidf = tf.fit_transform(corpus).toarray()

In [16]:
# Display the TF-IDF matrix (one row per cleaned sentence).
tfidf

array([[0.83559154, 0.54935123, 0.        , 0.        , 0.        ],
       [0.83559154, 0.        , 0.        , 0.54935123, 0.        ],
       [0.        , 0.        , 0.70710678, 0.        , 0.70710678]])