In [1]:
import stanfordnlp as sn
import spacy
import pandas as pd
from sklearn.naive_bayes import MultinomialNB

# Tokenization and Lemmatization

## Understanding Spacy - Creating tokens

In [4]:
# Load spaCy's small English pipeline.
nlp = spacy.load("en_core_web_sm")

# Sample sentence to tokenize.
string = "Hello There! This is a test, string"

# Calling the pipeline on a string yields a Doc that iterates token by token.
doc = nlp(string)

# Pair each token's text with its `prob` attribute.
# NOTE(review): the recorded output shows -20.0 for every token, presumably a
# default used when the model ships no probability table -- confirm.
tokens = [
    (tok.text, tok.prob)
    for tok in doc
]

In [5]:
# Display the (text, prob) pairs collected in `tokens`.
print(tokens)

[('Hello', -20.0), ('There', -20.0), ('!', -20.0), ('This', -20.0), ('is', -20.0), ('a', -20.0), ('test', -20.0), (',', -20.0), ('string', -20.0)]


## Lemmatization

Note that **`token.lemma_`** returns the lemma (base form) of each token in the input text.

In [4]:
# Load spaCy's small English pipeline.
nlp = spacy.load("en_core_web_sm")

# Sample text containing contractions ("don't", "what's").
string = "Hello There! This is a test, string. I don't know what's happening"

# Run the pipeline to obtain a Doc of tokens.
doc = nlp(string)

# Collect the lemma string of every token, in document order.
tokens1 = [
    tok.lemma_
    for tok in doc
]

In [5]:
# Display the lemma strings collected in `tokens1`.
print(tokens1)

['hello', 'there', '!', 'this', 'be', 'a', 'test', ',', 'string', '.', '-PRON-', 'do', 'not', 'know', 'what', 'be', 'happen']


In the output above, `-PRON-` is the placeholder lemma that spaCy emitted for a pronoun (here, "I").

Next, join the lemmas to form a lemmatized version of the original sentence.

In [6]:
# Rebuild a single space-separated string from the lemmas in `tokens1`.
' '.join(tokens1)

'hello there ! this be a test , string . -PRON- do not know what be happen'

## Text cleaning 

In [7]:
'dog'.isalpha()  # True: every character in the string is a letter

True

In [8]:
'3dogs'.isalpha()  # False: the digit '3' is not a letter

False

In [9]:
"!".isalpha()  # False: punctuation is not alphabetic

False

In [10]:
# Sample blog paragraph used as input for the text-cleaning cell below.
# Adjacent string literals are concatenated by the parser, so the runtime value
# is one single-line string.
blog = (
    "Twenty-first-century politics has witnessed an alarming rise of populism "
    "in the U.S. and Europe. "
    "The first warning signs came with the UK Brexit Referendum vote in 2016 "
    "swinging in the way of Leave. "
    "This was followed by a stupendous victory by billionaire Donald Trump to "
    "become the 45th President of the United States in November 2016. "
    "Since then, Europe has seen a steady rise in populist and far-right parties "
    "that have capitalized on Europe’s Immigration Crisis to raise nationalist "
    "and anti-Europe sentiments. "
    "Some instances include Alternative for Germany (AfD) winning 12.6% of all "
    "seats and entering the Bundestag, thus upsetting Germany’s political order "
    "for the first time since the Second World War, the success of the Five Star "
    "Movement in Italy and the surge in popularity of neo-nazism and neo-fascism "
    "in countries such as Hungary, Czech Republic, Poland and Austria."
)
# Import the stop-word set explicitly. A bare `import spacy` is not guaranteed
# to load the `spacy.lang.en` submodule, so reaching it through attribute access
# only works once an English pipeline has already been loaded in this kernel.
from spacy.lang.en.stop_words import STOP_WORDS as stopwords

In [11]:
# Run the small English pipeline over the blog text.
nlp = spacy.load("en_core_web_sm")
doc = nlp(blog)

# One lemma string per token, in document order.
lemmas = [tok.lemma_ for tok in doc]

# Keep a lemma only when it is not a stop word and is purely alphabetic.
a_lemmas = [
    candidate
    for candidate in lemmas
    if candidate not in stopwords and candidate.isalpha()
]

# Show the cleaned lemmas as one space-separated line.
print(*a_lemmas)



## POS Tagging 

In [12]:
pos_nlp = spacy.load("en_core_web_sm")

string = "Mary had a little lamb."
# Representative of the kind of sentence found in the QuaRTz dataset.
string1 = "A sunscreen with a higher SPF protects the skin longer."

# Running the pipeline on the sentence produces a Doc object.
doc = pos_nlp(string1)

# Print the Doc followed by the raw string, one per line.
print(doc, string1, sep="\n")

A sunscreen with a higher SPF protects the skin longer.
A sunscreen with a higher SPF protects the skin longer.


Convert the Doc object into (token text, POS tag) pairs.

In [13]:
# Pair each token's text with its coarse-grained part-of-speech tag.
pos = [
    (tok.text, tok.pos_)
    for tok in doc
]
print(pos)

[('A', 'DET'), ('sunscreen', 'NOUN'), ('with', 'ADP'), ('a', 'DET'), ('higher', 'ADJ'), ('SPF', 'PROPN'), ('protects', 'VERB'), ('the', 'DET'), ('skin', 'NOUN'), ('longer', 'ADV'), ('.', 'PUNCT')]


### Counting the number of NOUN tags

Note that the `NOUN` tag covers only common nouns; proper nouns are tagged `PROPN`.

In [14]:
nlp = spacy.load("en_core_web_sm")


def nouns(text, model=nlp):
    """Count the tokens in ``text`` whose POS tag is ``NOUN``.

    Args:
        text: Raw string to analyse.
        model: Callable that turns ``text`` into an iterable of tokens
            exposing ``pos_``; defaults to the pipeline loaded above.

    Returns:
        The number of tokens whose ``pos_`` equals ``"NOUN"``.
    """
    return sum(1 for token in model(text) if token.pos_ == "NOUN")


print(nouns("Abdul, Bill and Cathy went to the market to buy apples.", nlp))

2


## NER - Named Entity Recognition

In [15]:
# Pipeline whose entity recognizer is used in the next cell.
nlp = spacy.load("en_core_web_sm")

# Sentence containing a person name and an organisation name.
string = "John doe is a software engineer at Google"

In [16]:
doc = nlp(string)

# `doc.ents` holds the recognized entity spans; keep each span's text and label.
ne = [
    (entity.text, entity.label_)
    for entity in doc.ents
]
print(ne)

[('John doe', 'PERSON'), ('Google', 'ORG')]


## Bag of words model - Vectorization

In [19]:
from sklearn.feature_extraction.text import CountVectorizer

# A one-document corpus.
corpus = ["this is an example text. This text is written using english language"]

# Create CountVectorizer object
vectorizer = CountVectorizer()

# Generate the document-term matrix of word counts
bow_matrix = vectorizer.fit_transform(corpus)

# Look up the vocabulary terms in column order.
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2;
# use its replacement when available and fall back on older versions.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

# Convert bow_matrix into a DataFrame whose columns are the vocabulary terms
bow_df = pd.DataFrame(bow_matrix.toarray(), columns=feature_names)

# Print bow_df
print(bow_df)

   an  english  example  is  language  text  this  using  written
0   1        1        1   2         1     2     2      1        1


A corpus is a collection of documents; here it is stored as a pandas Series.
To create one, build a `pd.Series`, as shown below:

In [22]:
# Wrap each document string in its own one-item list.
# NOTE(review): every Series entry is therefore a list, not a bare string;
# CountVectorizer is fed plain strings elsewhere in this notebook -- confirm
# the nesting is intended.
corpus1 = [
    [document]
    for document in (
        "this is an example text. This text is written using english language",
        'This is a test corpus 2',
    )
]

# Build the Series from the nested list and show it.
corpus2 = pd.Series(data=corpus1)
print(corpus2)

0    [this is an example text. This text is written...
1                            [This is a test corpus 2]
dtype: object


### Building an n-gram model

Why use an n-gram model instead of a BoW model?
BoW loses context. By using an n-gram model, the order of neighbouring words is preserved, and hence context is retained to some extent.

This is done using `CountVectorizer` with its `ngram_range` parameter.

In [None]:
# Unigrams only.
vectorizer_ng1 = CountVectorizer(ngram_range=(1, 1))
# Unigrams and bigrams.
vectorizer_ng2 = CountVectorizer(ngram_range=(1, 2))
# Unigrams, bigrams and trigrams.
vectorizer_ng3 = CountVectorizer(ngram_range=(1, 3))

# Fit each vectorizer on the same corpus and keep its document-term matrix.
ng1, ng2, ng3 = (
    vec.fit_transform(corpus)
    for vec in (vectorizer_ng1, vectorizer_ng2, vectorizer_ng3)
)

# The second dimension of each matrix is its number of features.
print(
    "ng1, ng2 and ng3 have %i, %i and %i features respectively"
    % tuple(matrix.shape[1] for matrix in (ng1, ng2, ng3))
)