TEXT CLEANING

In [12]:
import re
import string
import nltk
nltk.download("stopwords")
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\MB511WS\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [13]:
def clean_text(text):
    # remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))

    # convert to lowercase
    text = text.lower()

    # remove stopwords
    stop_words = set(stopwords.words('english'))
    text = ' '.join([word for word in text.split() if word not in stop_words])

    # remove special character
    text = re.sub(r'\W+', ' ', text)

    return text


sample_text = "Client been very busy - unable to discuss Also still meeting with clients about onboarding them Awaiting further feedback"
cleaned_text = clean_text(sample_text)
print(cleaned_text)


client busy unable discuss also still meeting clients onboarding awaiting feedback


TOKENIZATION

In [14]:
from nltk.tokenize import sent_tokenize, word_tokenize

def tokenize_word(text) :
    word_token = word_tokenize(text)
    sent_token = sent_tokenize(text)

    return word_token, sent_token

word_token, sent_token = tokenize_word(cleaned_text)
print("word token", word_token)
print("sentence token", sent_token)

word token ['client', 'busy', 'unable', 'discuss', 'also', 'still', 'meeting', 'clients', 'onboarding', 'awaiting', 'feedback']
sentence token ['client busy unable discuss also still meeting clients onboarding awaiting feedback']


LEMMATIZATION

In [15]:
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
nltk.download('omw-1.4')

lemmentizer = WordNetLemmatizer()

def lemmantize_text(text):
    tokens = word_tokenize(text)
    lemmantize_token = [lemmentizer.lemmatize(token) for token in tokens]
    return ' '.join(lemmantize_token)

lemmatized_text = lemmantize_text(cleaned_text)
print("Lemmatized Text:", lemmatized_text)

Lemmatized Text: client busy unable discus also still meeting client onboarding awaiting feedback


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\MB511WS\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\MB511WS\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


TEXT REPRESENTATION

In [16]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Bag of Words
vectorizer = CountVectorizer()
X_bow = vectorizer.fit_transform([lemmatized_text])
print("Bag of Words representation:\n", X_bow.toarray())

# TF-IDF
tfidf_vectorizer = TfidfVectorizer()
X_tfidf = tfidf_vectorizer.fit_transform([lemmatized_text])
print("TF-IDF representation:\n", X_tfidf.toarray())


Bag of Words representation:
 [[1 1 1 2 1 1 1 1 1 1]]
TF-IDF representation:
 [[0.2773501 0.2773501 0.2773501 0.5547002 0.2773501 0.2773501 0.2773501
  0.2773501 0.2773501 0.2773501]]


FEATURE ENGINEERING

In [22]:
vectorizer_ngram = CountVectorizer(ngram_range=(1, 2))
X_ngram = vectorizer_ngram.fit_transform([lemmatized_text])
print("N-gram representation:\n", X_ngram.toarray())

N-gram representation:
 [[1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1]]


In [23]:
import spacy
nlp = spacy.load('en_core_web_sm')

def pos_tagging(text):
    doc = nlp(text)
    return [(token.text, token.pos_) for token in doc]

pos_tags =  pos_tagging(lemmatized_text)
print("POS Tags", pos_tags)

POS Tags [('client', 'NOUN'), ('busy', 'ADJ'), ('unable', 'ADJ'), ('discus', 'NOUN'), ('also', 'ADV'), ('still', 'ADV'), ('meeting', 'VERB'), ('client', 'NOUN'), ('onboarding', 'NOUN'), ('awaiting', 'VERB'), ('feedback', 'NOUN')]


In [24]:
def name_entity_recognition(text):
    doc = nlp(text)
    return [(ent.text, ent.label_) for ent in doc.ents]

entities = name_entity_recognition(lemmatized_text)
print("Named Entities", entities)

Named Entities []


HANDLING IMBALANCE AND NOISE

In [25]:
import nlpaug.augmenter.word as naw

def augment_text(text):
    aug = naw.SynonymAug(aug_src="wordnet")
    augmented_text = aug.augment(text)
    return augmented_text

augmented_text = augment_text(lemmatized_text)
print("Augemented Text: ", augmented_text)

Augemented Text:  ['client busy ineffectual discus besides still meet node onboarding awaiting feedback']


DEEP LEARNING FOR NLP

In [27]:
from transformers import BertTokenizer, BertForSequenceClassification, pipeline

# load pretrained model
model = BertForSequenceClassification.from_pretrained('bert-base-uncased')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# text classification
nlp_classify = pipeline('text-classification', model = model, tokenizer = tokenizer)
result = nlp_classify(augmented_text)

print(result)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[{'label': 'LABEL_1', 'score': 0.5187152028083801}]
