In [13]:
import pandas as pd
import numpy as np

import spacy
nlp = spacy.load("en_core_web_sm") # English Model
spacy_stopwords = spacy.lang.en.stop_words.STOP_WORDS # stop words
# words to keep
# spacy_stopwords.remove('not')

import nltk
import re
import unicodedata

import sys
sys.path.append('scripts/')
from contractions import CONTRACTION_MAP

import warnings
warnings.filterwarnings('ignore')

# Text Normalisation

In [5]:
example = """ When learning data science, 5 you shouldn't get discouraged!
Challenges and setbacks aren't failures, they're just part of the 10 journey. You've got this! """

In [3]:
from spellchecker import SpellChecker

spell = SpellChecker()
example = "latar"
# find those words that may be misspelled
misspelled = spell.unknown(example.split())

misspelled
for word in misspelled:
    # Get the one `most likely` answer
    print(spell.correction(word))

    # Get a list of `likely` options
    print(spell.candidates(word))

later
{'altar', 'tatar', 'later', 'lata', 'qatar', 'lazar', 'lamar'}


In [15]:
def format_num(text):
    pattern = "^\d+\s|\s\d+\s|\s\d+$"
    text = re.sub(pattern, " #NUM ", text)
    return text

In [16]:
def tokenize(text):
    doc = nlp(text)

    # Create list of word tokens
    token_list = [token.text for token in doc]
    
    return token_list

In [17]:
# sentencizer = nlp.create_pipe("sentencizer")
# nlp.add_pipe(sentencizer, before="parser")

def sentence_tokenize(text):
    
    doc = nlp(text)
    
    sentences = [sent for sent in doc.sents]
    
    return sentences

In [18]:
def lemmatize(text):
    doc = nlp(text)
    doc = ' '.join([word.lemma_ if word.lemma_ != '-PRON-' else word.text for word in doc])
    return doc

In [14]:
def expand_contractions(text, contraction_mapping=CONTRACTION_MAP):
    
    contractions_pattern = re.compile('({})'.format('|'.join(contraction_mapping.keys())), 
                                      flags=re.IGNORECASE|re.DOTALL)
    def expand_match(contraction):
        match = contraction.group(0)
        first_char = match[0]
        expanded_contraction = contraction_mapping.get(match)\
                                if contraction_mapping.get(match)\
                                else contraction_mapping.get(match.lower())                       
        expanded_contraction = first_char+expanded_contraction[1:]
        return expanded_contraction
        
    expanded_text = contractions_pattern.sub(expand_match, text)
    expanded_text = re.sub("'", "", expanded_text)
    return expanded_text

In [20]:
def remove_special_characters(text, remove_digits=False):
    pattern = r'[^a-zA-z0-9\s]' if not remove_digits else r'[^a-zA-z\s]'
    text = re.sub(pattern, '', text)
    return text

In [22]:
def remove_stop_words(text):
    spacy_stopwords = spacy.lang.en.stop_words.STOP_WORDS # stop words
    doc = tokenize(text)
    
    # filtering stop words
    doc = ' '.join([word for word in doc if word.lower() not in spacy_stopwords])
    
    return doc

In [23]:
def remove_accented_chars(text):
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')
    return text

In [9]:
def remove_white_space(text):
    
    doc = ' '.join([word for word in text.split()])
    return doc

In [27]:
def normalize_text(corpus):
    
    normal_corpus = []
    
    for doc in corpus:
        text = \
        remove_stop_words(
            remove_white_space(
                lemmatize(
                    remove_accented_chars(
                        remove_special_characters(
                            expand_contractions(doc)
                        )
                    )
                )
            )
        )
    
        normal_corpus.append(text)
    
    return normal_corpus

In [14]:
def get_entities(corpus):
    named_entities = []
    for doc in corpus:
        
        temp_entity_name = ''
        temp_named_entity = None
        sentence = nlp(doc)

        for word in sentence:
            term = word.text 
            tag = word.ent_type_
            
            if tag:
                temp_entity_name = ' '.join([temp_entity_name, term]).strip()
                temp_named_entity = (temp_entity_name, tag)
                
            else:
                if temp_named_entity:
                    named_entities.append(temp_named_entity)
                    temp_entity_name = ''
                    temp_named_entity = None
                    
    entity_frame = pd.DataFrame(named_entities, 
                                columns=['Entity Name', 'Entity Type'])
    return entity_frame

In [7]:
example_ = """New York City on Tuesday declared a public health emergency and ordered mandatory measles vaccinations amid an outbreak, becoming the latest national flash point over refusals to inoculate against dangerous diseases.

At least 285 people have contracted measles in the city since September, mostly in Brooklyn’s Williamsburg neighborhood. The order covers four Zip codes there, Mayor Bill de Blasio (D) said Tuesday.

The mandate orders all unvaccinated people in the area, including a concentration of Orthodox Jews, to receive inoculations, including for children as young as 6 months old. Anyone who resists could be fined up to $1,000."""

In [16]:
top_entities = (get_entities([example_]).groupby(by=['Entity Name', 'Entity Type'])
                           .size()
                           .sort_values(ascending=False)
                           .reset_index().rename(columns={0 : 'Frequency'}))
top_entities.T.iloc[:,:15]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
Entity Name,Tuesday,"up to $ 1,000",four,Williamsburg,September,Orthodox Jews,New York City,Brooklyn,Bill de Blasio,At least 285,6 months old
Entity Type,DATE,MONEY,CARDINAL,GPE,DATE,PERSON,GPE,GPE,PERSON,CARDINAL,DATE
Frequency,2,1,1,1,1,1,1,1,1,1,1


## POS Tagging
Spacy

Text: The original word text.

Lemma: The base form of the word.

POS: The simple part-of-speech tag.

Tag: The detailed part-of-speech tag.

Dep: Syntactic dependency, i.e. the relation between tokens.

Shape: The word shape – capitalization, punctuation, digits.

is alpha: Is the token an alpha character?

is stop: Is the token part of a stop list, i.e. the most common words of the language?

In [24]:
token = nlp('Apples')[0]
print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_,token.shape_, token.is_alpha, token.is_stop)

Apples apple NOUN NNS ROOT Xxxxx True False


In [None]:
from spacy import displacy

doc = nlp('my love for the food disappeared')
displacy.serve(doc, style="dep")


Using the 'dep' visualizer
Serving on http://0.0.0.0:5000 ...



In [6]:
review = "This is the game the never, ever ends. I picked this game up thinking I wouldn't like it having never played a previous TES game before. Sat in my Library untouched for about a month or two, and finally took the plunge. I was wrong. I was so very, very wrong. This is probably the best purchase I've ever made on steam. Add in the unlimited potential of modding, and it's an adventure that continues forever. I've probably restarted over a hundred times by now with a new character, and still only have beaten Alduin once. ONCE. There's still so much more do to and explore that I'm still discovering quests and areas and little hidden things. I still can't believe how much there is to do and I'm still finding more. At it stands I'll probably wake up one day having lost all touch with reality and actually start seeing the world as Skyrim with how much I've played. And I'm totally ok with that."

In [36]:
doc = nlp("I liked the movie but the food was rubbish")
chunks = list(doc.noun_chunks)
print(chunks, chunks[0].root.dep_)

[I, the movie, the food] nsubj


# Word Embeddings

## Frequency-Based Embeddings

Count Vector (Bag of Words)

TF-IDF Vector

Co-Occurrence Vector

In [8]:
test = pd.Series(example)
test[1] = example_

In [30]:
corpus = normalize_text(test)

In [84]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

bow_vector = CountVectorizer(tokenizer = tokenize, ngram_range=(2,2))

tfidf_vector = TfidfVectorizer(tokenizer = tokenize, ngram_range=(2,2))

# X = bow_vector.fit_transform(corpus)
# bow_vector.get_feature_names()
# X.toarray()
# X.shape

# X = tfidf_vector.fit_transform(corpus)
# tfidf_vector.get_feature_names()
# X.toarray()
# X.shape

In [27]:
from scipy.sparse import csr_matrix

def cooccurrence_matrix(corpus, window_size = 1):
    vocabulary={}
    data=[]
    row=[]
    col=[]
    for sentence in corpus:
        sentence = tokenize(sentence)
        for pos, token in enumerate(sentence):
            i = vocabulary.setdefault(token, len(vocabulary))
            start = max(0, pos-window_size)
            end = min(len(sentence), pos + (window_size + 1))
            for pos2 in range(start, end):
                if pos2 == pos: 
                    continue
                j = vocabulary.setdefault(sentence[pos2],len(vocabulary))
                
                data.append(1.)
                row.append(i)
                col.append(j)
                
    cooccurrence_matrix = csr_matrix((data,(row,col)))
    return vocabulary, cooccurrence_matrix

In [31]:
vocab, coo_mat = cooccurrence_matrix(corpus, window_size = 2)

In [35]:
pd.DataFrame(coo_mat.toarray(), index = vocab.keys(), columns = vocab.keys())

Unnamed: 0,learn,datum,science,not,discourage,challenge,setback,failure,journey,New,...,receive,inoculation,child,young,6,month,old,resist,fine,1000
learn,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
datum,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
science,1.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
not,0.0,1.0,1.0,0.0,1.0,2.0,1.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
discourage,0.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
month,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0
old,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0
resist,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0
fine,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0


# Feature Engineering
Word Count of the documents – total number of words in the documents

Character Count of the documents – total number of characters in the documents

Average Word Density of the documents – average length of the words used in the documents

Puncutation Count in the Complete Essay – total number of punctuation marks in the documents

Upper Case Count in the Complete Essay – total number of upper count words in the documents

Title Word Count in the Complete Essay – total number of proper case (title) words in the documents

In [None]:
# Word Count
data['char_count'] = data['text'].apply(len)
# Character Count
data['word_count'] = data['text'].apply(lambda x: len(tokenize(x)))
# Punctuation count
data['punctuation_count'] = data['text'].apply(lambda x: count_punct(x))
# Upper Case count
data['upper_case_word_count'] = data['text'].apply(lambda x: count_upper(x))

In [6]:
import string

def count_punct(text):
    doc = tokenize(text)
    
    punctuation = [punct for punct in doc if punct in string.punctuation]
    
    return len(punctuation)

In [9]:
def count_upper(text):
    doc = tokenize(text)
    
    upper  =[word for word in doc if word.isupper()]
    
    return len(upper)

Frequency distribution of Part of Speech Tags:
Noun Count

Verb Count

Adjective Count

Adverb Count

Pronoun Count

In [22]:
pos_type = {
    'adjective': 'ADJ',
    'adverb': 'ADV',
    'verb': 'VERB',
    'pronoun': 'PRON',
    'noun': 'NOUN',
    
}

def count_pos_type(text, pos: str):
    doc = nlp(text)
    
    if pos in pos_type.keys():
        pos_tagged = [word.pos_ for word in doc if word.pos_ == pos_type[pos]]
        
        return len(pos_tagged)
    else:
        return f'{pos} not in  accepted pos types {pos_type.keys()}'

In [23]:
count_pos_type(example, 'verb')

4

In [None]:
data['noun_count'] = data['text'].apply(lambda text: count_pos_type(text, 'noun'))
data['verb_count'] = data['text'].apply(lambda text: count_pos_type(text, 'verb'))
data['adj_count'] = data['text'].apply(lambda text: count_pos_type(text, 'adjective'))
data['adv_count'] = data['text'].apply(lambda text: count_pos_type(text, 'adverb'))
data['pron_count'] = data['text'].apply(lambda text: count_pos_type(text, 'pronoun'))