## Text Mining and NLP

## Part 2

### **Goal**: to internalize the steps, challenges, and methodology of text mining
- explore text analysis by hand
- apply text mining steps in Jupyter with the Python library NLTK
- classify documents correctly
<br/>
^ This last step will require modeling!

In [1]:
import math
import string, re
import urllib
import urllib.request

import nltk
import pandas as pd
import sklearn
import sklearn.feature_extraction.text
import sklearn.metrics.pairwise

# Raw-text locations of the two example articles (note that url_b points at D.txt).
url_a = "https://raw.githubusercontent.com/aapeebles/text_examples/master/Text%20examples%20folder/A.txt"
url_b = "https://raw.githubusercontent.com/aapeebles/text_examples/master/Text%20examples%20folder/D.txt"

# Download article A; the context manager closes the HTTP response once the body is read.
with urllib.request.urlopen(url_a) as resp_a:
    article_a = resp_a.read()
# The response body is bytes; decode it to a str for tokenizing.
article_a_st = article_a.decode("utf-8")

In [2]:
# Tokens: a run of ASCII letters, optionally followed by an apostrophe and
# lower-case letters (so a contraction such as "don't" stays one token).
pattern = "([a-zA-Z]+(?:'[a-z]+)?)"
arta_tokens_raw = nltk.regexp_tokenize(article_a_st, pattern)

# Lower-case every token so that counting is case-insensitive.
arta_tokens = [token.lower() for token in arta_tokens_raw]

# Stop words: if you get an error here, uncomment the next line and run the cell again.
# nltk.download('stopwords')
# A set gives constant-time membership checks in the filter below.
stop_words = set(nltk.corpus.stopwords.words('english'))
arta_tokens_stopped = [w for w in arta_tokens if w not in stop_words]

# Stem the remaining words with the English Snowball stemmer.
stemmer = nltk.SnowballStemmer("english")
arta_stemmed = [stemmer.stem(word) for word in arta_tokens_stopped]

In [3]:
# Repeat the same pipeline for the second article, reusing `pattern`,
# `stop_words` and `stemmer` from the previous cell.
with urllib.request.urlopen(url_b) as resp_b:  # closes the HTTP response after reading
    article_b = resp_b.read()
article_b_st = article_b.decode("utf-8")

artb_tokens_raw = nltk.regexp_tokenize(article_b_st, pattern)
artb_tokens = [token.lower() for token in artb_tokens_raw]
artb_tokens_stopped = [w for w in artb_tokens if w not in stop_words]
artb_stemmed = [stemmer.stem(word) for word in artb_tokens_stopped]

### Term Frequency (TF)

$\begin{align}
 tf_{i,j} = \dfrac{n_{i,j}}{\displaystyle \sum_k n_{k,j} }
\end{align} $

### Inverse Document Frequency (IDF)

$\begin{align}
idf(t) = \log \dfrac{N}{df_t}
\end{align} $

### TF-IDF score

$ \begin{align}
w_{i,j} = tf_{i,j} \times \log \dfrac{N}{df_i} \\
tf_{i,j} = \text{number of occurrences of } i \text{ in } j \\
df_i = \text{number of documents containing } i \\
N = \text{total number of documents}
\end{align} $


#### Counting from scratch

In [4]:
# Display the decoded text of the second article (downloaded from url_b).
article_b_st

'BNP leader Nick Griffin arrested\n\nThe leader of the British National Party has been arrested as part of a police inquiry following the screening of a BBC documentary.\n\nA party spokesman said Nick Griffin was arrested on Tuesday morning on suspicion of incitement to commit racial hatred. West Yorkshire police confirmed they had arrested a 45-year-old man from outside their area. BNP founding chairman John Tyndall was arrested on Sunday on the same charge.\n\nIn July, the BBC documentary Secret Agent featured covertly-filmed footage of BNP activists. Mr Griffin is the twelfth man to be arrested following the documentary. Nine men from West Yorkshire and another man from Leicester have been arrested and freed on bail. Seven of the men had been held variously in connection with suspected racially aggravated public order offences, conspiracy to commit criminal damage and possession of a firearm. Two men, both from Keighley, were arrested in September on suspicion of conspiracy to commi

In [5]:
# Shared vocabulary: every stem that occurs in either article.
wordSet = set(arta_stemmed) | set(artb_stemmed)

# Both tallies start at zero for the whole vocabulary, so the two
# dictionaries end up with identical keys.
wordDictA = {stem: 0 for stem in wordSet}
wordDictB = {stem: 0 for stem in wordSet}

# Count each stem's occurrences per article.
for word in arta_stemmed:
    wordDictA[word] = wordDictA[word] + 1

for word in artb_stemmed:
    wordDictB[word] = wordDictB[word] + 1

In [6]:
# Stem counts for article A over the shared vocabulary; a 0 means the stem occurs only in article B.
wordDictA

{'favour': 1,
 'sunday': 0,
 'month': 1,
 'screen': 0,
 'without': 1,
 'chairman': 0,
 'poland': 1,
 'state': 2,
 'brighton': 0,
 'bring': 1,
 'legal': 3,
 'mep': 2,
 'old': 0,
 'aggrav': 0,
 'com': 1,
 'eu': 4,
 'exampl': 1,
 'european': 2,
 'lancashir': 0,
 'vote': 1,
 'effect': 1,
 'fear': 1,
 'seven': 0,
 'direct': 4,
 'reject': 1,
 'parti': 0,
 'method': 1,
 'invent': 5,
 'abstain': 1,
 'comput': 4,
 'gain': 1,
 'oppon': 1,
 'vocal': 1,
 'west': 0,
 'let': 1,
 'leicest': 0,
 'achiev': 1,
 'largest': 1,
 'various': 0,
 'racial': 0,
 'twelfth': 0,
 'court': 1,
 'lead': 1,
 'conspiraci': 0,
 'busi': 1,
 'spokesperson': 0,
 'men': 0,
 'burnley': 0,
 'govern': 1,
 'action': 1,
 'featur': 0,
 'submit': 1,
 'open': 1,
 'intens': 1,
 'concern': 1,
 'secret': 0,
 'freed': 0,
 'speech': 0,
 'griffin': 0,
 'tyndal': 0,
 'impact': 1,
 'chanc': 1,
 'union': 1,
 'critic': 2,
 'held': 0,
 'yorkshir': 0,
 'parliament': 2,
 'put': 1,
 'first': 1,
 'morn': 0,
 'nine': 0,
 'john': 0,
 'servic': 1,
 

In [7]:
# Stem counts for article B over the shared vocabulary; a 0 means the stem occurs only in article A.
wordDictB

{'favour': 0,
 'sunday': 1,
 'month': 0,
 'screen': 1,
 'without': 0,
 'chairman': 1,
 'poland': 0,
 'state': 0,
 'brighton': 1,
 'bring': 0,
 'legal': 0,
 'mep': 0,
 'old': 2,
 'aggrav': 1,
 'com': 0,
 'eu': 0,
 'exampl': 0,
 'european': 0,
 'lancashir': 1,
 'vote': 0,
 'effect': 0,
 'fear': 0,
 'seven': 1,
 'direct': 0,
 'reject': 0,
 'parti': 2,
 'method': 0,
 'invent': 0,
 'abstain': 0,
 'comput': 0,
 'gain': 0,
 'oppon': 0,
 'vocal': 0,
 'west': 2,
 'let': 0,
 'leicest': 2,
 'achiev': 0,
 'largest': 0,
 'various': 1,
 'racial': 3,
 'twelfth': 1,
 'court': 0,
 'lead': 0,
 'conspiraci': 2,
 'busi': 0,
 'spokesperson': 1,
 'men': 3,
 'burnley': 1,
 'govern': 0,
 'action': 0,
 'featur': 1,
 'submit': 0,
 'open': 0,
 'intens': 0,
 'concern': 0,
 'secret': 1,
 'freed': 1,
 'speech': 1,
 'griffin': 3,
 'tyndal': 2,
 'impact': 0,
 'chanc': 0,
 'union': 0,
 'critic': 0,
 'held': 1,
 'yorkshir': 2,
 'parliament': 0,
 'put': 0,
 'first': 0,
 'morn': 1,
 'nine': 1,
 'john': 1,
 'servic': 0,
 

In [8]:
# One row per article (0 = A, 1 = B), one column per stem in the shared vocabulary.
df = pd.DataFrame([wordDictA, wordDictB], index=range(2))

# Show the columns alphabetically; `df` itself keeps its original column order.
df[sorted(df.columns)]

Unnamed: 0,abstain,achiev,action,activist,adopt,affair,agent,aggrav,amazon,anoth,...,various,vocal,vote,welcom,west,without,word,would,year,yorkshir
0,1,1,1,0,1,1,0,0,1,0,...,0,1,1,1,0,1,1,3,0,0
1,0,0,0,1,0,0,1,1,0,1,...,1,0,0,0,2,0,0,0,2,2


#### Now with sklearn

In [9]:
# CountVectorizer is fed the raw article text, so the columns are unstemmed
# words and stop words are kept.
counts = sklearn.feature_extraction.text.CountVectorizer()

# The documents are passed as [B, A]: row 0 is article B and row 1 is article A,
# the reverse of the from-scratch frame built from wordDictA / wordDictB.
counts_fitted = counts.fit_transform([article_b_st, article_a_st])

# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out() replaces it.
pd.DataFrame(counts_fitted.toarray(), columns=counts.get_feature_names_out())

Unnamed: 0,24,45,about,abstain,achieve,action,activists,adoption,affairs,after,...,west,when,which,who,with,without,words,would,year,yorkshire
0,1,1,0,0,0,0,1,0,0,0,...,2,0,0,0,1,0,0,0,2,2
1,0,0,1,1,1,1,0,1,1,1,...,0,1,1,1,2,1,1,3,0,0


## Tfidf from scratch

In [10]:
def computeTF(wordDict, bow):
    """Compute the term frequency of every word for one document.

    Args:
        wordDict: mapping of word -> raw count of that word in the document.
        bow: the document's bag of words (sequence of tokens). Only its
            length is used, as the normalising total.

    Returns:
        dict mapping every word in ``wordDict`` to ``count / len(bow)``.
        When ``bow`` is empty every frequency is 0.0 instead of raising
        ZeroDivisionError.
    """
    bowCount = len(bow)
    if bowCount == 0:
        # An empty document has no terms; define all frequencies as zero.
        return {word: 0.0 for word in wordDict}
    return {word: count / bowCount for word, count in wordDict.items()}

In [11]:
# Term frequencies per article: each stem count divided by the number of stemmed tokens in that article.
tfbowA = computeTF(wordDictA, arta_stemmed)
tfbowB = computeTF(wordDictB, artb_stemmed)

In [12]:
# Inspect article A's term frequencies.
tfbowA

{'favour': 0.005434782608695652,
 'sunday': 0.0,
 'month': 0.005434782608695652,
 'screen': 0.0,
 'without': 0.005434782608695652,
 'chairman': 0.0,
 'poland': 0.005434782608695652,
 'state': 0.010869565217391304,
 'brighton': 0.0,
 'bring': 0.005434782608695652,
 'legal': 0.016304347826086956,
 'mep': 0.010869565217391304,
 'old': 0.0,
 'aggrav': 0.0,
 'com': 0.005434782608695652,
 'eu': 0.021739130434782608,
 'exampl': 0.005434782608695652,
 'european': 0.010869565217391304,
 'lancashir': 0.0,
 'vote': 0.005434782608695652,
 'effect': 0.005434782608695652,
 'fear': 0.005434782608695652,
 'seven': 0.0,
 'direct': 0.021739130434782608,
 'reject': 0.005434782608695652,
 'parti': 0.0,
 'method': 0.005434782608695652,
 'invent': 0.02717391304347826,
 'abstain': 0.005434782608695652,
 'comput': 0.021739130434782608,
 'gain': 0.005434782608695652,
 'oppon': 0.005434782608695652,
 'vocal': 0.005434782608695652,
 'west': 0.0,
 'let': 0.005434782608695652,
 'leicest': 0.0,
 'achiev': 0.0054347

In [13]:
def computeIDF(docList):
    """Compute the inverse document frequency (IDF) of every word in a corpus.

    Args:
        docList: list of dicts, one per document, each mapping word -> count
            of that word in the document.

    Returns:
        dict mapping each word seen in any document to ``log(N / df)``, where
        ``N`` is ``len(docList)`` and ``df`` is the number of documents in
        which the word's count is greater than zero. A word whose count is
        zero in every document gets 0.0 (its IDF is undefined, and its term
        frequency is zero anyway). An empty ``docList`` yields ``{}``.
    """
    N = len(docList)

    # Document frequency per word. Keys are collected from every document, so
    # a word missing from the first dictionary no longer raises KeyError.
    docFreq = {}
    for doc in docList:
        for word, val in doc.items():
            docFreq[word] = docFreq.get(word, 0) + (1 if val > 0 else 0)

    # Guard df == 0 to avoid dividing by zero.
    return {
        word: math.log(N / df) if df > 0 else 0.0
        for word, df in docFreq.items()
    }

In [17]:
# IDF weights over the two-document corpus, computed from the per-article count dictionaries.
idfs = computeIDF([wordDictA, wordDictB])

In [18]:
# Inspect the IDF weights: with two documents, a stem found in only one of them scores log(2/1).
idfs

{'favour': 0.6931471805599453,
 'sunday': 0.6931471805599453,
 'month': 0.6931471805599453,
 'screen': 0.6931471805599453,
 'without': 0.6931471805599453,
 'chairman': 0.6931471805599453,
 'poland': 0.6931471805599453,
 'state': 0.6931471805599453,
 'brighton': 0.6931471805599453,
 'bring': 0.6931471805599453,
 'legal': 0.6931471805599453,
 'mep': 0.6931471805599453,
 'old': 0.6931471805599453,
 'aggrav': 0.6931471805599453,
 'com': 0.6931471805599453,
 'eu': 0.6931471805599453,
 'exampl': 0.6931471805599453,
 'european': 0.6931471805599453,
 'lancashir': 0.6931471805599453,
 'vote': 0.6931471805599453,
 'effect': 0.6931471805599453,
 'fear': 0.6931471805599453,
 'seven': 0.6931471805599453,
 'direct': 0.6931471805599453,
 'reject': 0.6931471805599453,
 'parti': 0.6931471805599453,
 'method': 0.6931471805599453,
 'invent': 0.6931471805599453,
 'abstain': 0.6931471805599453,
 'comput': 0.6931471805599453,
 'gain': 0.6931471805599453,
 'oppon': 0.6931471805599453,
 'vocal': 0.69314718055

In [19]:
def computeTFIDF(tfBow, idfs):
    """Weight one document's term frequencies by the corpus IDF values.

    Args:
        tfBow: mapping of word -> term frequency for a single document.
        idfs: mapping of word -> inverse document frequency; it must contain
            every word present in ``tfBow``.

    Returns:
        A new dict mapping each word in ``tfBow`` to ``tf * idf``.
    """
    return {term: freq * idfs[term] for term, freq in tfBow.items()}

In [20]:
# TF-IDF per article: each term frequency scaled by that term's IDF weight.
tfidfBowA = computeTFIDF(tfbowA, idfs)
tfidfBowB = computeTFIDF(tfbowB, idfs)

In [21]:
import pandas as pd
# From-scratch TF-IDF table: row 0 is article A, row 1 is article B, one column per stem.
nlpbh = pd.DataFrame([tfidfBowA, tfidfBowB])
nlpbh

Unnamed: 0,abstain,achiev,action,activist,adopt,affair,agent,aggrav,amazon,anoth,...,various,vocal,vote,welcom,west,without,word,would,year,yorkshir
0,0.003767,0.003767,0.003767,0.0,0.003767,0.003767,0.0,0.0,0.003767,0.0,...,0.0,0.003767,0.003767,0.003767,0.0,0.003767,0.003767,0.011301,0.0,0.0
1,0.0,0.0,0.0,0.005373,0.0,0.0,0.005373,0.005373,0.0,0.005373,...,0.005373,0.0,0.0,0.0,0.010746,0.0,0.0,0.0,0.010746,0.010746


Now with sklearn!

In [23]:
def tokenize(text):
    """Split ``text`` into NLTK word tokens and return the stem of each one.

    Args:
        text: the raw document string.

    Returns:
        list of stems from the English Snowball stemmer, in token order, one
        per token produced by ``nltk.word_tokenize``.
    """
    words = nltk.word_tokenize(text)
    snowball = nltk.SnowballStemmer("english")
    return [snowball.stem(token) for token in words]

# TF-IDF with scikit-learn, using the stemming `tokenize` function defined above
# together with scikit-learn's built-in English stop-word list.
tfidf = sklearn.feature_extraction.text.TfidfVectorizer(stop_words='english', tokenizer=tokenize)

# The documents are passed as [B, A]: row 0 is article B and row 1 is article A.
response = tfidf.fit_transform([article_b_st, article_a_st])

# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out() replaces it.
nlpskl = pd.DataFrame(response.toarray(), columns=tfidf.get_feature_names_out())
nlpskl

Unnamed: 0,'',(,),",",.,24-year-old,45-year-old,``,abstain,achiev,...,union,us-bas,use,various,vocal,vote,welcom,west,word,yorkshir
0,0.0,0.0,0.0,0.286874,0.394452,0.050399,0.050399,0.0,0.0,0.0,...,0.0,0.0,0.0,0.050399,0.0,0.0,0.0,0.100798,0.0,0.100798
1,0.093784,0.046892,0.046892,0.300275,0.467094,0.0,0.0,0.093784,0.046892,0.046892,...,0.046892,0.046892,0.046892,0.0,0.046892,0.046892,0.046892,0.0,0.046892,0.0


**Note that these values are different! Sklearn's formula for tf-idf is a little more sophisticated than ours. See the doc for the transformer [here](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfTransformer.html).**

### N-grams

Notice that the `TfidfVectorizer()` has a parameter called "ngram_range". Sometimes we want to search not only for individual words but for pairs or triples (etc.) of words. Using $N$ as a variable for the size of the word cluster to consider, we speak of "N-grams". Notice that our default is (1, 1).

### Cosine Similarity

In [24]:
# Pairwise cosine similarity between the rows (one per article) of the TfidfVectorizer output.
sklearn.metrics.pairwise.cosine_similarity(response)

array([[1.       , 0.2787621],
       [0.2787621, 1.       ]])

### Naive Bayes Modeling

Naive Bayes models lend themselves well to NLP problems. Consider the task of trying to predict genre from text. My subjective probability that a text belongs to a certain genre would be a function of the words in the text. So e.g. the (prior) probability that a text is science-fiction may be relatively small. But the probability that a text is science-fiction *given that it uses the word 'cyclotron'* may be quite high.

Now: What's "naive" about Naive Bayes models?

The calculation of the relevant probabilities could get very complicated. But they get much simpler with the (relatively implausible!) assumption that the different features (occurrences of particular words, in our present case of NLP) are *independent*.

In [None]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

In [None]:
# Show the target (category) names of the 20 newsgroups dataset.
fetch_20newsgroups().target_names

In [None]:
# Load the predefined train and test subsets of the dataset.
train = fetch_20newsgroups(subset='train')
test = fetch_20newsgroups(subset='test')

In [None]:
# Bag-of-words vectorizer that uses the stemming `tokenize` function and the built-in English stop-word list.
count = CountVectorizer(stop_words='english', tokenizer=tokenize)

In [None]:
# Fit the vocabulary on the training documents only, then encode the test
# documents with that same vocabulary.
X_train = count.fit_transform(train.data)
X_test = count.transform(test.data)
y_train = train.target
y_test = test.target

In [None]:
# Peek at the first ten test-set targets.
y_test[:10]

Which NB model do you want? Check out the options and their differences [here](https://scikit-learn.org/stable/modules/naive_bayes.html).

In [None]:
# Fit multinomial Naive Bayes on the training counts, then report its score on the test set.
nb = MultinomialNB()
nb.fit(X_train, y_train)
nb.score(X_test, y_test)

In [None]:
from sklearn.metrics import confusion_matrix

In [None]:
# Confusion matrix of the true test targets (first argument) against the model's predictions.
confusion_matrix(y_test, nb.predict(X_test))

## Corpus Statistics 

How many non-zero elements are there?

In [None]:
import numpy as np

# Dense array of the from-scratch term counts in `df` (one row per article).
newval = np.array(df)

# Number of non-zero entries divided by the number of rows, i.e. the average per article.
non_zero_vals = np.count_nonzero(newval) / df.shape[0]
print(f'Average Number of Non-Zero Elements in Vectorized Articles: {non_zero_vals}')

# NOTE(review): this is the fraction (0-1, not a percentage) of columns whose
# counts sum to at most 1; confirm that this is the intended sparsity measure.
percent_sparse = len([col for col in df.columns if sum(df[col]) <= 1]) / df.shape[1]
print(f'Percentage of columns containing 0: {percent_sparse}')

### Next Steps:
- Create the tf-idf for the **whole** corpus of 12 articles
- What are _on average_ the most important words in the whole corpus?
- Add a column named "Target" to the dataset
- Target will be set to 1 if the article is "Politics" and 0 if it is "Not Politics"
- Do some exploratory analysis of the dataset
 - What are the average most important words for the "Politics" articles?
 - What are the average most important words for the "Not Politics"?

## Let's talk classification
- How would you split into train and test? What would be the dataset?

In [None]:
# Sample code: hold out 20% of the rows as a test set, with a fixed seed for a repeatable split.
# NOTE(review): `X` (features) and `y` (labels) are placeholders that are not defined in the
# cells above, so this cell raises NameError if run as-is.
from sklearn.model_selection import train_test_split  
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)  

In [25]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import nltk
import requests
from random import randint

## Download Frankenstein, Metamorphosis, and The Trial

In [26]:
def _download_text(url):
    """Fetch ``url`` and return the response body decoded as UTF-8.

    Raises:
        requests.HTTPError: if the server answers with an error status, so a
            failed download is not silently treated as book text.
    """
    resp = requests.get(url, timeout=30)  # do not hang forever on a stalled connection
    resp.raise_for_status()
    resp.encoding = 'utf-8'  # force UTF-8 decoding of the body
    return resp.text


frankenstein = _download_text('https://www.gutenberg.org/files/84/84-0.txt')
metamorphosis = _download_text('https://www.gutenberg.org/ebooks/5200.txt.utf-8')
trial = _download_text('https://www.gutenberg.org/ebooks/7849.txt.utf-8')

### Remove some extraneous characters

In [27]:
def _to_words(raw_text, skip_chars=1000):
    """Flatten a book's raw text and split it into whitespace-separated words.

    Carriage returns are removed, newlines become spaces, surrounding
    whitespace is stripped, and the first ``skip_chars`` characters are then
    dropped before splitting.

    Args:
        raw_text: the full text of the book as one string.
        skip_chars: number of leading characters to discard (presumably the
            Project Gutenberg front matter; TODO confirm that 1000 is enough).

    Returns:
        list of words in their original order.
    """
    flattened = raw_text.replace('\r', '').replace('\n', ' ').strip()
    return flattened[skip_chars:].split()


frank = _to_words(frankenstein)
meta = _to_words(metamorphosis)
tri = _to_words(trial)

## Create our dataset by making our observations random subsets from each book

In [28]:
def text_sample(book, sample_length=10):
    """Return a random run of consecutive words from ``book`` as one string.

    Args:
        book: list of words to sample from.
        sample_length: number of consecutive words to take.

    Returns:
        ``sample_length`` consecutive words joined by single spaces. If the
        book has fewer words than that, the whole book is returned.
    """
    # Bound the start by the book being sampled, not by the global `meta`.
    # Otherwise a book shorter than `meta` can yield short or empty samples,
    # and a longer book is never sampled past len(meta).
    last_start = max(0, len(book) - sample_length)
    start = randint(0, last_start)

    return ' '.join(book[start:start + sample_length])

n_samples = 100

# Draw n_samples random snippets from each book and label them with the title;
# all "The Trial" rows come first, followed by all "Metamorphosis" rows.
all_text = (
    [{'text': text_sample(tri), 'label': 'The Trial'} for _ in range(n_samples)]
    + [{'text': text_sample(meta), 'label': 'Metamorphosis'} for _ in range(n_samples)]
)

data = pd.DataFrame(all_text)

In [32]:
# Show the text of one randomly chosen snippet.
data.sample(1)['text'].values

array(['neglectful with him or especially indifferent, and he decided to'],
      dtype=object)

In [33]:
# Fix the seed so the train/test partition is the same on every run.
# The snippets in `data` are themselves drawn at random, so the score can still vary between runs.
X_train, X_test, y_train, y_test = train_test_split(data['text'], data['label'], random_state=0)

In [34]:
# Learn the vocabulary from the training snippets only; the test snippets are
# encoded with that same vocabulary.
c = CountVectorizer()

X_train_vec = c.fit_transform(X_train)
X_test_vec = c.transform(X_test)

In [35]:
from sklearn.naive_bayes import MultinomialNB

# Train multinomial Naive Bayes on the word counts and score it on the held-out snippets.
nb = MultinomialNB()

nb.fit(X_train_vec, y_train)

nb.score(X_test_vec, y_test)

0.7