In [1]:
import pandas as pd
import numpy as np

import spacy
import nltk
import re
import unicodedata

from contractions import CONTRACTION_MAP

import warnings
warnings.filterwarnings('ignore')

# Text Normalisation

In [2]:
# spacy
nlp = spacy.load("en_core_web_sm") # English Model

example = """ When learning data science,   you shouldn't get discouraged!
Challenges and setbacks aren't failures, they're just part of the journey. You've got this!"""

In [3]:
def tokenize(text):
    doc = nlp(text)

    # Create list of word tokens
    token_list = [token.text for token in doc]
    
    return token_list

In [30]:
# sentencizer = nlp.create_pipe("sentencizer")
# nlp.add_pipe(sentencizer, before="parser")

def sentence_tokenize(text):
    
    doc = nlp(text)
    
    sentences = [sent for sent in doc.sents]
    
    return sentences

In [6]:
def lemmatize(text):
    doc = nlp(text)
    doc = ' '.join([word.lemma_ if word.lemma_ != '-PRON-' else word.text for word in doc])
    return doc

In [7]:
def expand_contractions(text, contraction_mapping=CONTRACTION_MAP):
    
    contractions_pattern = re.compile('({})'.format('|'.join(contraction_mapping.keys())), 
                                      flags=re.IGNORECASE|re.DOTALL)
    def expand_match(contraction):
        match = contraction.group(0)
        first_char = match[0]
        expanded_contraction = contraction_mapping.get(match)\
                                if contraction_mapping.get(match)\
                                else contraction_mapping.get(match.lower())                       
        expanded_contraction = first_char+expanded_contraction[1:]
        return expanded_contraction
        
    expanded_text = contractions_pattern.sub(expand_match, text)
    expanded_text = re.sub("'", "", expanded_text)
    return expanded_text

In [8]:
def remove_special_characters(text, remove_digits=False):
    pattern = r'[^a-zA-z0-9\s]' if not remove_digits else r'[^a-zA-z\s]'
    text = re.sub(pattern, '', text)
    return text

In [25]:
def remove_stop_words(text):
    
    spacy_stopwords = spacy.lang.en.stop_words.STOP_WORDS # stop words
    # words to keep
    spacy_stopwords.remove('not')

    doc = tokenize(text)
    
    # filtering stop words
    doc = ' '.join([word for word in doc if word.lower() not in spacy_stopwords])
    
    return doc

In [10]:
def remove_accented_chars(text):
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')
    return text

In [11]:
def remove_white_space(text):
    doc = tokenize(text)
    
    doc = [word.strip() for word in doc]
    doc = ' '.join([word for word in doc if word != ''])
    
    return doc

In [67]:
def normalize_text(corpus):
    
    normal_corpus = []
    
    for doc in corpus:
        text = \
        remove_stop_words(
            remove_white_space(
                lemmatize(
                    remove_accented_chars(
                        remove_special_characters(
                            expand_contractions(doc)
                        )
                    )
                )
            )
        )
    
        normal_corpus.append(text)
    
    return normal_corpus

In [13]:
normalize_text(example)

'learn datum science discourage challenge setback failure journey'

In [14]:
def get_entities(corpus):
    named_entities = []
    for doc in corpus:
        
        temp_entity_name = ''
        temp_named_entity = None
        sentence = nlp(doc)

        for word in sentence:
            term = word.text 
            tag = word.ent_type_
            
            if tag:
                temp_entity_name = ' '.join([temp_entity_name, term]).strip()
                temp_named_entity = (temp_entity_name, tag)
                
            else:
                if temp_named_entity:
                    named_entities.append(temp_named_entity)
                    temp_entity_name = ''
                    temp_named_entity = None
                    
    entity_frame = pd.DataFrame(named_entities, 
                                columns=['Entity Name', 'Entity Type'])
    return entity_frame

In [15]:
example_ = """New York City on Tuesday declared a public health emergency and ordered mandatory measles vaccinations amid an outbreak, becoming the latest national flash point over refusals to inoculate against dangerous diseases.

At least 285 people have contracted measles in the city since September, mostly in Brooklyn’s Williamsburg neighborhood. The order covers four Zip codes there, Mayor Bill de Blasio (D) said Tuesday.

The mandate orders all unvaccinated people in the area, including a concentration of Orthodox Jews, to receive inoculations, including for children as young as 6 months old. Anyone who resists could be fined up to $1,000."""

In [16]:
top_entities = (get_entities([example_]).groupby(by=['Entity Name', 'Entity Type'])
                           .size()
                           .sort_values(ascending=False)
                           .reset_index().rename(columns={0 : 'Frequency'}))
top_entities.T.iloc[:,:15]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
Entity Name,Tuesday,"up to $ 1,000",four,Williamsburg,September,Orthodox Jews,New York City,Brooklyn,Bill de Blasio,At least 285,6 months old
Entity Type,DATE,MONEY,CARDINAL,GPE,DATE,PERSON,GPE,GPE,PERSON,CARDINAL,DATE
Frequency,2,1,1,1,1,1,1,1,1,1,1


# Word Embeddings

## Frequency-Based Embeddings

Count Vector (Bag of Words)

TF-IDF Vector

Co-Occurrence Vector

In [68]:
test = pd.Series(example)
test[1] = example_

['learn datum science not discourage challenge setback not failure journey',
 'New York City Tuesday declare public health emergency order mandatory measle vaccination amid outbreak late national flash point refusal inoculate dangerous disease 285 people contract measle city September Brooklyns Williamsburg neighborhood order cover Zip code Mayor Bill de Blasio D Tuesday mandate order unvaccinate people area include concentration Orthodox Jews receive inoculation include child young 6 month old resist fine 1000']

In [70]:
corpus = normalize_text(test)

In [84]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

bow_vector = CountVectorizer(tokenizer = tokenize, ngram_range=(2,2))

tfidf_vector = TfidfVectorizer(tokenizer = tokenize, ngram_range=(2,2))

# X = bow_vector.fit_transform(corpus)
# bow_vector.get_feature_names()
# X.toarray()
# X.shape

# X = tfidf_vector.fit_transform(corpus)
# tfidf_vector.get_feature_names()
# X.toarray()
# X.shape

In [195]:
from scipy.sparse import csr_matrix

def cooccurrence_matrix(corpus, window_size = 1):
    vocabulary={}
    data=[]
    row=[]
    col=[]
    for sentence in corpus:
        sentence = tokenize(sentence)
        for pos, token in enumerate(sentence):
            i = vocabulary.setdefault(token, len(vocabulary))
            start = max(0, pos-window_size)
            end = min(len(sentence), pos + (window_size + 1))
            for pos2 in range(start, end):
                if pos2 == pos: 
                    continue
                j = vocabulary.setdefault(sentence[pos2],len(vocabulary))
                
                data.append(1.)
                row.append(i)
                col.append(j)
                
    cooccurrence_matrix = csr_matrix((data,(row,col)))
    return vocabulary, cooccurrence_matrix

In [196]:
corpus = [normalize_text(example)]
vocab, coo_mat = cooccurrence_matrix(corpus, window_size = 2)

# Feature Engineering
Word Count of the documents – total number of words in the documents

Character Count of the documents – total number of characters in the documents

Average Word Density of the documents – average length of the words used in the documents

Puncutation Count in the Complete Essay – total number of punctuation marks in the documents

Upper Case Count in the Complete Essay – total number of upper count words in the documents

Title Word Count in the Complete Essay – total number of proper case (title) words in the documents

In [None]:
# Word Count
data['char_count'] = data['text'].apply(len)
# Character Count
data['word_count'] = data['text'].apply(lambda x: len(tokenize(x)))
# Punctuation count
data['punctuation_count'] = data['text'].apply(lambda x: count_punct(x))
# Upper Case count
data['upper_case_word_count'] = data['text'].apply(lambda x: count_upper(x))

In [6]:
import string

def count_punct(text):
    doc = tokenize(text)
    
    punctuation = [punct for punct in doc if punct in string.punctuation]
    
    return len(punctuation)

In [9]:
def count_upper(text):
    doc = tokenize(text)
    
    upper  =[word for word in doc if word.isupper()]
    
    return len(upper)

Frequency distribution of Part of Speech Tags:
Noun Count

Verb Count

Adjective Count

Adverb Count

Pronoun Count

In [22]:
pos_type = {
    'adjective': 'ADJ',
    'adverb': 'ADV',
    'verb': 'VERB',
    'pronoun': 'PRON',
    'noun': 'NOUN',
    
}

def count_pos_type(text, pos: str):
    doc = nlp(text)
    
    if pos in pos_type.keys():
        pos_tagged = [word.pos_ for word in doc if word.pos_ == pos_type[pos]]
        
        return len(pos_tagged)
    else:
        return f'{pos} not in  accepted pos types {pos_type.keys()}'

In [23]:
count_pos_type(example, 'verb')

4

In [None]:
data['noun_count'] = data['text'].apply(lambda text: count_pos_type(text, 'noun'))
data['verb_count'] = data['text'].apply(lambda text: count_pos_type(text, 'verb'))
data['adj_count'] = data['text'].apply(lambda text: count_pos_type(text, 'adjective'))
data['adv_count'] = data['text'].apply(lambda text: count_pos_type(text, 'adverb'))
data['pron_count'] = data['text'].apply(lambda text: count_pos_type(text, 'pronoun'))