## NLP Lecture notebook - trying out a few NLP tools together

### Simple lexical analysis with NLTK over a text document (MS Word), converted with docx2txt

#### install tools using !pip or other method 

In [None]:
#!pip install nltk

#once installed, you need to import and run the download for popular packages which will be saved locally
import nltk 
#nltk.download()
#ensure you see the message True if downloading

In [None]:
# these nltk tools allow us to search the text for a specific word 
from nltk.tokenize import word_tokenize
from nltk.text import Text

In [None]:
#!pip install docx2txt
import docx2txt

In [None]:
# Pull the plain text out of the Word document as one long string
docx_path = "rugby_story.docx"
mytextfile = docx2txt.process(docx_path)

### EX 1 NLP on a body of text - example : news article in a word doc, scraped text from the web/api, or a url open request

In [None]:
# Split the raw text into individual tokens with NLTK's word tokenizer
tokens = word_tokenize(mytextfile)

# Preview the first 100 tokens of the text body
first_tokens = tokens[:100]
print(first_tokens)

In [None]:
# How many tokens did the tokenizer produce? (nothing has been filtered out yet)
len(tokens)

In [None]:
# Wrap the token list in an NLTK Text object, which provides the exploration helpers
# used in the following cells (collocations, findall, concordance, dispersion_plot)
textlist= Text(tokens)

In [None]:
# Confirm the data type has changed: textlist was built with Text(...), so it is no
# longer the plain token list
type(textlist)

In [None]:
# Look for collocations: words that frequently appear together in the text
textlist.collocations()

In [None]:
# Token-level regex search: each <...> stands for one whole token, and only the
# parenthesised part is reported - here, whatever single token sits between
# "safety" and "seriously"
pattern = r"<safety> (<.*>) <seriously>"
textlist.findall(pattern)

In [None]:
# Print text fragments that contain the key word, each shown with its surrounding context.
# Note: NLTK builds an index on first use, so later concordance searches are faster.
textlist.concordance("injury")

In [None]:
# Easy lexical visualisation: plot where in the text each key term occurs
key_terms = ["safety", "player", "brain", "injury", "dementia"]
textlist.dispersion_plot(key_terms)

#### We have missed the most important stage of NLP! 
Let's stop and do some basic cleaning operations on our raw text.

In [None]:
# Punctuation first: keep only the tokens made up purely of letters. This drops
# symbols such as '-' and any token that contains an apostrophe or a digit.
words = list(filter(str.isalpha, tokens))
print(words[:100])

In [None]:
# Normalise case so that e.g. "Rugby" and "rugby" are treated as the same word
lowerwords = list(map(str.lower, words))
print(lowerwords[:100])

In [None]:
# Next, load NLTK's English stop-word list (the, a, is, ...). Removing these words
# leaves us with a cleaner set of usable text. Printed here so we can see what
# will be filtered out.
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
print(stop_words)

In [None]:
# Remove the stop words from the lower-cased word list.
# stop_words is a list, so testing membership against it scans the whole list for
# every word in the text; converting it to a set once makes each test constant time
# and gives exactly the same result.
stop_word_set = set(stop_words)
clean_words = [w for w in lowerwords if w not in stop_word_set]
print(clean_words[:100])

In [None]:
# How many words are left once the stop words have been removed?
len(clean_words)

In [None]:
# Build a frequency distribution of the cleaned words and list the most frequent ones
from nltk.probability import FreqDist

top_n = 10
fdistrugby = FreqDist(clean_words)
fdistrugby.most_common(top_n)


### EX 2 - reviews and sentiment analysis - basic - with textblob 


In [None]:
# step 1 - add some sentiment ready data 
# step 2 - clean and tokenise as before (we will skip this step today)
# step 3 - calculate sentiment with text blob 

In [None]:
import pandas as pd

# Load the clothing review data set and preview the first rows
reviews_csv = 'Womens Clothing E-Commerce Reviews.csv'
clothingdf = pd.read_csv(reviews_csv)
clothingdf.head()

In [None]:
# (rows, columns) of the data set as loaded
clothingdf.shape

In [None]:
# Cleaning: the analysis relies on two key columns, so discard every row
# in which either of them is missing (the frame is modified in place)
required_columns = ['Review Text', 'Division Name']
clothingdf.dropna(subset=required_columns, inplace=True)

In [None]:
# (rows, columns) again - compare with the earlier shape to see how many rows were dropped
clothingdf.shape

In [None]:
#!pip install TextBlob 

from textblob import TextBlob

In [None]:
# TextBlob scores each statement on a scale from negative to positive:
# polarity -> very negative = -1, very positive = +1
# Example - score a short piece of input text and display its sentiment

TextBlob("I love this dress!").sentiment 

In [None]:
# Score every review and store the result in a new Review_Polarity column of the df
def review_polarity(review_text):
    """Return the TextBlob polarity score of a single review text."""
    return TextBlob(review_text).sentiment.polarity


clothingdf['Review_Polarity'] = clothingdf['Review Text'].apply(review_polarity)

In [None]:
# Do not truncate long cell contents when displaying DataFrames, so the full
# review text is readable
pd.set_option('display.max_colwidth', None)

In [None]:
# Preview the first 10 rows of the review data
clothingdf.head(10)

In [None]:
# Average review polarity for each division / department combination
group_keys = ['Division Name', 'Department Name']
polarity_by_group = clothingdf.groupby(group_keys)['Review_Polarity']
polarity_by_group.mean()

### Ex 3 - using SpaCy as a headstart with NLP

In [None]:
# One of the biggest challenges of working with text is interpreting context:
# what is the subject, what is the object, and so on.

# Install spaCy first if needed:  !pip install -U spacy
# A language model must also be installed - choose the language and the size.

import spacy 
# Load the small English model by name; nlp is the pipeline used in the cells that follow
nlp= spacy.load("en_core_web_sm")

In [None]:
# Run the loaded spaCy pipeline over a short two-sentence example text
doc=nlp("I saw a man on the hill with a telescope. He looks like Jesus")

In [None]:
# Tokenisation: looping over the processed document yields one token at a time;
# print the text of each
for word in doc:
    print(word.text)

- **Text**: The original word text.
- **Lemma**: The base form of the word.
- **POS**: The simple UPOS part-of-speech tag.
- **Tag**: The detailed part-of-speech tag.
- **Dep**: Syntactic dependency, i.e. the relation between tokens.
- **Shape**: The word shape – capitalization, punctuation, digits.
- **is alpha**: Does the token consist of alphabetic characters?
- **is stop**: Is the token part of a stop list, i.e. the most common words of the language?

In [None]:
# Show what the installed model attached to each word: text, lemma, coarse and
# detailed part-of-speech tags, dependency relation, shape, and the alpha / stop flags
for token in doc:
    attributes = (
        token.text,
        token.lemma_,
        token.pos_,
        token.tag_,
        token.dep_,
        token.shape_,
        token.is_alpha,
        token.is_stop,
    )
    print(*attributes)

In [None]:
#lets explore visually the word links in the text we have provided, to see what spacy is adding to our analysis

from spacy import displacy

In [None]:
# Draw the dependency parse of the example text to see how spaCy links the words
displacy.render(doc, style="dep")

In [None]:
# Let's see if spaCy can spot the named entity: render the text with the
# entities it detected highlighted

displacy.render(doc, style="ent")

In [None]:
# Text analysis - word similarity is computed from word vectors, which requires a
# bigger model to be loaded, so swap in the medium English model.
# NOTE: this rebinds both `nlp` and `tokens` (previously the NLTK token list).
nlp = spacy.load("en_core_web_md")
tokens = nlp("man woman hill telescope wkls")

# Per word: text, whether it has a vector, the vector norm, and the out-of-vocabulary flag
for word in tokens:
    print(word.text, word.has_vector, word.vector_norm, word.is_oov)

In [None]:
# Compare every word with every other word (and with itself) using those vectors;
# each line shows the two words followed by their similarity score
for left in tokens:
    for right in tokens:
        score = left.similarity(right)
        print(left.text, right.text, score)

#### Final Ex - using spaCy to detect and rank named entities from social data (tweets)

In [None]:
# Load the tweet data set and check its size as (rows, columns)
tweets_csv = 'ever_trump.csv'
tweets = pd.read_csv(tweets_csv)
tweets.shape

In [None]:
# Preview the first 10 tweets
tweets.head(10)

In [None]:
# Obviously this text needs cleaning operations, but for now simply run spaCy over
# all of the tweets so the result is ready for entity analysis.
#
# Join the tweet strings themselves, one per line. Calling str() on the list would
# hand spaCy the list's printed form instead - square brackets, a pair of quotes and
# a comma around every tweet, backslash escapes, and the word "nan" for each missing
# tweet - all of which pollute the tokens and the detected entities.
# NOTE: spaCy rejects texts longer than nlp.max_length; for a very large file,
# process the tweets in batches with nlp.pipe instead.
tweet_texts = tweets["text"].dropna().astype(str)
tokens = nlp("\n".join(tweet_texts))

In [None]:
from collections import Counter

# Collect the text of every entity spaCy detected, then list the 20 most frequent
items = [entity.text for entity in tokens.ents]
entity_counts = Counter(items)
entity_counts.most_common(20)

In [None]:
# Focus on named persons in the tweets: keep the text of every entity labelled PERSON
person_list = [entity.text for entity in tokens.ents if entity.label_ == 'PERSON']

In [None]:
# Put the 20 most frequently mentioned persons into a two-column table
person_counts = Counter(person_list).most_common(20)
df_person = pd.DataFrame(data=person_counts, columns=['text', 'count'])
df_person

In [None]:
# Horizontal bar chart: one bar per person, bar length = number of mentions
df_person.plot(kind='barh', x='text', y='count')