In [None]:
# -*- coding: utf-8 -*-
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

# Natural Language Processing (NLP)

**Preprocessing of textual data**

### Tokenization

In [None]:
import string
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# load data
filename = 'data/metamorphosis_clean.txt'
# The context manager closes the handle even if read() raises. The explicit
# encoding keeps decoding independent of the platform's default locale
# (assumes the corpus file is UTF-8 -- adjust if it was saved differently).
with open(filename, 'rt', encoding='utf-8') as file:
    text = file.read()

# split into words
tokens = word_tokenize(text)

# convert to lower case
tokens = [w.lower() for w in tokens]

# prepare regex for char filtering: a character class matching any single
# ASCII punctuation character from string.punctuation
re_punc = re.compile('[%s]' % re.escape(string.punctuation))

# remove punctuation from each word (tokens made only of punctuation become '')
stripped = [re_punc.sub('', w) for w in tokens]

# remove remaining tokens that are not alphabetic (this also drops the
# empty strings left behind by the previous step)
words = [word for word in stripped if word.isalpha()]

# filter out stop words; a set gives constant-time membership tests
stop_words = set(stopwords.words('english'))
words = [w for w in words if w not in stop_words]
print(words[:100])

### TF-IDF with TfidfVectorizer

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

# Toy corpus: each string is treated as one document.
dataset = [
    "I enjoy reading about Machine Learning and Machine Learning is my PhD subject",
    "I would enjoy a walk in the park",
    "I was reading in the library"
]

vectorizer = TfidfVectorizer(use_idf=True)
tfIdf = vectorizer.fit_transform(dataset)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer its replacement and fall back only on releases that predate it.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

# TF-IDF weights of the first document only, one row per vocabulary term.
# toarray() yields a plain ndarray rather than the np.matrix that todense() returns.
df = pd.DataFrame(tfIdf[0].T.toarray(), index=feature_names, columns=["TF-IDF"])
df = df.sort_values('TF-IDF', ascending=False)

print(df.head(25))

### TF-IDF with TfidfTransformer

In [None]:
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer

# Two-step route to TF-IDF: raw term counts first, then re-weighting.
# `dataset` is the list of documents defined in the TfidfVectorizer cell.
transformer = TfidfTransformer(use_idf=True)
countVectorizer = CountVectorizer()

wordCount = countVectorizer.fit_transform(dataset)
newTfIdf = transformer.fit_transform(wordCount)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer its replacement and fall back only on releases that predate it.
if hasattr(countVectorizer, "get_feature_names_out"):
    count_feature_names = countVectorizer.get_feature_names_out()
else:
    count_feature_names = countVectorizer.get_feature_names()

# TF-IDF weights of the first document only, one row per vocabulary term.
# toarray() yields a plain ndarray rather than the np.matrix that todense() returns.
df = pd.DataFrame(newTfIdf[0].T.toarray(), index=count_feature_names, columns=["TF-IDF"])
df = df.sort_values('TF-IDF', ascending=False)
print(df.head(25))

### Cosine similarity
URL: https://stackoverflow.com/questions/12118720/python-tf-idf-cosine-to-find-document-similarity

In [None]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

# twenty dataset
twenty = fetch_20newsgroups()
tfidf = TfidfVectorizer().fit_transform(twenty.data)

# cosine similarity of document 0 against every document in the corpus.
# TfidfVectorizer L2-normalises each row by default, so the plain dot product
# computed by linear_kernel is already the cosine similarity.
cosine_similarities = linear_kernel(tfidf[0:1], tfidf).flatten()

# top-5 related documents
# argsort() is ascending, so reverse it before taking the first TOP_K entries;
# a slice of [:-5:-1] would stop one short and yield only four indices.
# The query document itself is expected to rank first with similarity 1.0.
TOP_K = 5
related_docs_indices = cosine_similarities.argsort()[::-1][:TOP_K]
print(related_docs_indices)
print(cosine_similarities[related_docs_indices])

# print the query document and its closest match to check.
# Position 1 is the runner-up, derived from the ranking instead of a
# hard-coded document index (assumes the query itself occupies position 0).
print(twenty.data[0])
print(twenty.data[related_docs_indices[1]])

### Text classification

URL: https://towardsdatascience.com/machine-learning-nlp-text-classification-using-scikit-learn-python-and-nltk-c52b92a7c73a

In [None]:
import numpy as np
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline

# Fetch the 'train' and 'test' subsets of 20 Newsgroups with identical
# arguments; the generator is unpacked in order, so train is fetched first.
twenty_train, twenty_test = (
    fetch_20newsgroups(subset=subset_name, shuffle=True)
    for subset_name in ('train', 'test')
)

# Names of the target categories
print(twenty_train.target_names)
# Optional peek at the first three lines of the first training post:
# print("\n".join(twenty_train.data[0].split("\n")[:3]))

### Multinomial Naive Bayes

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Raw term counts: the vocabulary is fitted on the training split and then
# reused unchanged to encode the test split.
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(twenty_train.data)
X_test_counts = count_vect.transform(twenty_test.data)

# TF-IDF re-weighting, likewise fitted on the training counts only.
transformer = TfidfTransformer()
X_train_tfidf = transformer.fit_transform(X_train_counts)
X_test_tfidf = transformer.transform(X_test_counts)

# Multinomial Naive Bayes classifier trained on the TF-IDF features
clf = MultinomialNB()
clf.fit(X_train_tfidf, twenty_train.target)

# Accuracy on the test split: fraction of predictions equal to the true label
predicted = clf.predict(X_test_tfidf)
(predicted == twenty_test.target).mean()

### Pipeline

In [None]:
# Vectorizer, TF-IDF weighting and classifier chained into one estimator.
# This vectorizer additionally drops English stop words.
pipeline_steps = [
    ('vect', CountVectorizer(stop_words='english')),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
]
text_clf = Pipeline(pipeline_steps)
text_clf.fit(twenty_train.data, twenty_train.target)

# Accuracy on the test split: fraction of predictions equal to the true label
predicted = text_clf.predict(twenty_test.data)
(predicted == twenty_test.target).mean()

### GridSearchCV with Naive Bayes

In [None]:
from sklearn.model_selection import GridSearchCV

# Search grid; each key is "<pipeline step name>__<parameter name>".
parameters = dict(
    vect__ngram_range=[(1, 1), (1, 2)],
    tfidf__use_idf=(True, False),
    clf__alpha=(1e-2, 1e-3),
)

# Cross-validated search over every combination in the grid.
gs_clf = GridSearchCV(estimator=text_clf, param_grid=parameters, n_jobs=-1)
gs_clf.fit(twenty_train.data, twenty_train.target)

# Best mean cross-validation score and the settings that achieved it
print(gs_clf.best_score_)
print(gs_clf.best_params_)

### SGDClassifier

In [None]:
from sklearn.linear_model import SGDClassifier

# Final stage: linear classifier trained with SGD (hinge loss, L2 penalty).
# random_state is fixed so repeated runs give the same model.
svm_stage = SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, random_state=42)

text_clf_svm = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf-svm', svm_stage),
])
text_clf_svm.fit(twenty_train.data, twenty_train.target)

# Accuracy on the test split: fraction of predictions equal to the true label
predicted_svm = text_clf_svm.predict(twenty_test.data)
(predicted_svm == twenty_test.target).mean()

### GridSearchCV with SVM

In [None]:
from sklearn.model_selection import GridSearchCV

# Search grid; each key is "<pipeline step name>__<parameter name>".
# The step name 'clf-svm' contains a hyphen, so a dict literal is required.
parameters_svm = {
    'vect__ngram_range': [(1, 1), (1, 2)],
    'tfidf__use_idf': (True, False),
    'clf-svm__alpha': (1e-2, 1e-3),
}

# Cross-validated search over every combination in the grid.
gs_clf_svm = GridSearchCV(estimator=text_clf_svm, param_grid=parameters_svm, n_jobs=-1)
gs_clf_svm.fit(twenty_train.data, twenty_train.target)

# Best mean cross-validation score and the settings that achieved it
print(gs_clf_svm.best_score_)
print(gs_clf_svm.best_params_)

### Stemming

In [None]:
import nltk
from nltk.stem.snowball import SnowballStemmer

# Calling nltk.download() with no arguments launches NLTK's interactive
# downloader, which stalls an unattended "Run All". Only the stopwords corpus
# is fetched here: ignore_stopwords=True makes the stemmer load the English
# stop-word list when it is constructed.
nltk.download('stopwords', quiet=True)

# English Snowball stemmer that leaves stop words unstemmed.
stemmer = SnowballStemmer("english", ignore_stopwords=True)

class StemmedCountVectorizer(CountVectorizer):
    """CountVectorizer variant whose analyzer stems every token it emits.

    Stemming is delegated to the module-level ``stemmer`` object.
    """

    def build_analyzer(self):
        """Return a callable that maps a document to its list of stemmed tokens."""
        base_analyzer = super(StemmedCountVectorizer, self).build_analyzer()

        def stemmed_analyzer(doc):
            # Run the inherited analyzer first, then stem each resulting token.
            return [stemmer.stem(token) for token in base_analyzer(doc)]

        return stemmed_analyzer
    
# Stemming vectorizer that also drops English stop words
stemmed_count_vect = StemmedCountVectorizer(stop_words='english')

# Stemmed counts -> TF-IDF -> Multinomial NB (fit_prior=False, i.e. class
# priors are not learned from the training data)
stemmed_steps = [
    ('vect', stemmed_count_vect),
    ('tfidf', TfidfTransformer()),
    ('mnb', MultinomialNB(fit_prior=False)),
]
text_mnb_stemmed = Pipeline(stemmed_steps)
text_mnb_stemmed.fit(twenty_train.data, twenty_train.target)

# Accuracy on the test split: fraction of predictions equal to the true label
predicted_mnb_stemmed = text_mnb_stemmed.predict(twenty_test.data)
(predicted_mnb_stemmed == twenty_test.target).mean()