# Solution for NLP

In [None]:
%matplotlib inline

## Speeches I

In [None]:
from glob import glob

# Build the corpus from the first line of every speech file, decoded as UTF-8.
# glob() yields paths in arbitrary order, so sort them to keep the document
# order in `corpus` reproducible between runs.
files = sorted(glob("./data/speeches/*"))
corpus = []
for f in files:
    try:
        with open(f, "rb") as inf:
            # Only the first line is used; readline() avoids loading the rest.
            raw = inf.readline()
        text = raw.decode("utf8")
    except (OSError, UnicodeDecodeError):
        # Unreadable or badly encoded file: report its path and move on.
        print(f)
        continue
    if not text:
        # Empty files carry no text; report and skip them as well.
        print(f)
        continue
    corpus.append(text)

In [None]:
from string import digits, punctuation

import nltk
from sklearn.feature_extraction.text import TfidfVectorizer

_remove = digits + punctuation
# Deletion table for str.translate, built once instead of on every call.
_strip_table = str.maketrans("", "", _remove)
_stopwords = nltk.corpus.stopwords.words('english')
_stemmer = nltk.snowball.SnowballStemmer('english')


def tokenize_and_stem(text):
    """Return the stemmed, lower-cased tokens of *text*.

    Every character listed in ``_remove`` (ASCII digits and punctuation) is
    deleted first.  The remaining text is lower-cased, split with
    ``nltk.word_tokenize`` and each token is reduced to its stem by the
    English Snowball stemmer.
    """
    cleaned = text.translate(_strip_table)
    return [_stemmer.stem(token) for token in nltk.word_tokenize(cleaned.lower())]


# TF-IDF over uni-, bi- and trigrams of the stemmed tokens.
# NOTE(review): `_stopwords` holds unstemmed words while the tokenizer emits
# stems; scikit-learn may warn about this mismatch -- confirm it is intended.
vectorizer = TfidfVectorizer(
    stop_words=_stopwords,
    tokenizer=tokenize_and_stem,
    ngram_range=(1, 3),
)
tfidf_matrix = vectorizer.fit_transform(corpus)
# `get_feature_names` was removed in scikit-learn 1.2; use its replacement
# when available and fall back only on releases that predate it.
if hasattr(vectorizer, "get_feature_names_out"):
    terms = vectorizer.get_feature_names_out().tolist()
else:
    terms = vectorizer.get_feature_names()

In [None]:
from pickle import dump

from pandas import DataFrame

# Pickle the TF-IDF matrix so the later parts can reload it without refitting.
with open("./out/speech_matrix.pk", "wb") as matrix_file:
    dump(tfidf_matrix, matrix_file)

# Store the vocabulary as a single-column table named "terms".
terms = DataFrame(terms, columns=["terms"])
terms.to_csv('./out/terms.csv')

## Speeches II

In [None]:
from pickle import load

# Reload the TF-IDF matrix that the first part pickled to this path.
# NOTE: only unpickle files you trust; this one is written by this notebook.
with open("./out/speech_matrix.pk", "rb") as matrix_file:
    tfidf_matrix = load(matrix_file)

In [None]:
import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import ward, dendrogram

# Ward clustering needs dense observations.  toarray() yields a plain
# ndarray, whereas todense() would return the discouraged numpy.matrix type.
linkage_matrix = ward(tfidf_matrix.toarray())
plt.figure(figsize=(10, 5))
dendrogram(linkage_matrix)
plt.show()

## Speeches III

In [None]:
from pickle import load

import pandas as pd

# Reload the pickled TF-IDF matrix for the topic model.
# NOTE: only unpickle files you trust; this one is written by this notebook.
with open("./out/speech_matrix.pk", "rb") as matrix_file:
    tfidf_matrix = load(matrix_file)

# Keep every term as a literal string: with the default NA handling pandas
# would turn vocabulary entries such as "nan" or "null" into float NaN,
# which cannot be joined into the topic labels printed later.
terms = pd.read_csv("./out/terms.csv", index_col=0, keep_default_na=False)["terms"]

In [None]:
from sklearn.decomposition import LatentDirichletAllocation as LDA

# Fit a two-topic LDA model on the TF-IDF matrix; the fixed random_state
# keeps the result repeatable between runs.
lda = LDA(
    n_components=2,
    learning_method='online',
    random_state=0,
    verbose=0,
)
lda.fit(tfidf_matrix)

In [None]:
# Print, for every topic, the N_WORDS terms with the largest weights.
N_WORDS = 10
for idx, topic_dist in enumerate(lda.components_):
    # argsort() is ascending, so reverse it and keep the first N_WORDS
    # indices.  (A `[:-N_WORDS:-1]` slice would yield only N_WORDS - 1.)
    top_indices = topic_dist.argsort()[::-1][:N_WORDS]
    name = ", ".join(terms[i] for i in top_indices)
    print(idx, ":", name)

## Predicting newsgroups

In [None]:
from sklearn.datasets import fetch_20newsgroups

# The four science newsgroups used as target classes.
topics = [
    'sci.crypt',
    'sci.electronics',
    'sci.med',
    'sci.space',
]
# Training split restricted to those groups.
data = fetch_20newsgroups(
    subset='train',
    categories=topics,
    shuffle=True,
)

X_train, y_train = data.data, data.target

In [None]:
# Inspect which attributes the fetched dataset object exposes.
dir(data)

In [None]:
from string import digits, punctuation

import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB

_remove = digits + punctuation
# Deletion table for str.translate, built once instead of on every call.
_strip_table = str.maketrans("", "", _remove)
_stopwords = nltk.corpus.stopwords.words('english')
_stemmer = nltk.snowball.SnowballStemmer('english')

def tokenize_and_stem(text):
    """Return the stemmed, lower-cased tokens of *text*.

    Every character listed in ``_remove`` (ASCII digits and punctuation) is
    deleted first.  The remaining text is lower-cased, split with
    ``nltk.word_tokenize`` and each token is reduced to its stem by the
    English Snowball stemmer.
    """
    cleaned = text.translate(_strip_table)
    return [_stemmer.stem(token) for token in nltk.word_tokenize(cleaned.lower())]

# TF-IDF features from the stemming tokenizer feed a multinomial naive Bayes
# classifier.  The step names are what the grid-search parameter keys refer to.
pipe = Pipeline(
    steps=[
        ('tfidf', TfidfVectorizer(tokenizer=tokenize_and_stem, stop_words=_stopwords)),
        ('mnb', MultinomialNB()),
    ],
)

In [None]:
from sklearn.model_selection import GridSearchCV

# Try two values of the naive Bayes `alpha` parameter with 2-fold
# cross-validation; n_jobs=-1 runs the candidates in parallel.
params = dict(mnb__alpha=(1e-2, 1e-3))
mnb = GridSearchCV(estimator=pipe, param_grid=params, cv=2, n_jobs=-1)
mnb.fit(X_train, y_train)

In [None]:
# Test split for the same four newsgroups as the training data.
test = fetch_20newsgroups(
    subset='test',
    categories=topics,
    shuffle=True,
)

X_test, y_test = test.data, test.target

In [None]:
# Take the pipeline that the grid search selected as best and use it to
# label the documents of the test split.
best = mnb.best_estimator_
preds = best.predict(X_test)

In [None]:
from sklearn.metrics import classification_report

# Label the report rows with the dataset's own class names so they always
# line up with the integer targets, independent of the order of `topics`.
print(classification_report(y_test, preds, target_names=test.target_names))