In [None]:
import os
import re
import zipfile
from collections import defaultdict

import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

In [None]:
# Make sure the NLTK stop-word corpus is available. It is downloaded only
# when missing, so re-running the notebook does not hit the network again.
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')

In [None]:
# Fetch the BBC full-text archive and save it as /tmp/bbc-fulltext.zip.
!wget http://mlg.ucd.ie/files/datasets/bbc-fulltext.zip -O /tmp/bbc-fulltext.zip

In [None]:
# Unpack the downloaded archive under /tmp/. The context manager closes the
# zip file even when extraction raises.
with zipfile.ZipFile('/tmp/bbc-fulltext.zip', 'r') as zip_ref:
    zip_ref.extractall('/tmp/')

In [None]:
# Root directory of the corpus; the archive above is unpacked under /tmp/.
base_dir = '/tmp/bbc/'


def get_subfolders(path):
    """Return the paths of all directories located directly inside ``path``.

    The paths are returned in the order ``os.scandir`` yields them, each one
    prefixed with ``path``.
    """
    subfolders = []
    for entry in os.scandir(path):
        if entry.is_dir():
            subfolders.append(entry.path)
    return subfolders


def get_filenames(path):
    """Return the paths of all regular files located directly inside ``path``.

    Sub-directories are skipped; the order is the one ``os.scandir`` yields.
    """
    file_paths = []
    for entry in os.scandir(path):
        if entry.is_file():
            file_paths.append(entry.path)
    return file_paths

In [None]:
# List the sub-folders of the corpus root (presumably one per news category).
# NOTE: the name `labels` is assigned again further down, where it holds the
# document titles instead.
labels = get_subfolders(base_dir)

print(labels)

In [None]:
def clean_text(text):
    """Collapse every run of whitespace into one space and trim both ends.

    Args:
        text: The string to normalise.

    Returns:
        The normalised string; an empty or whitespace-only input gives ``""``.
    """
    # Raw string: "\s" in a plain literal is an invalid escape sequence and
    # triggers a warning on recent Python versions.
    return re.sub(r"\s+", " ", text).strip()


def read_document(file_path):
    """Read an ISO-8859-1 encoded text file and return it as one clean line.

    Line breaks are turned into spaces before the text is passed through
    ``clean_text``.
    """
    with open(file_path, "r", encoding="ISO-8859-1") as handle:
        raw = handle.read()
    single_line = raw.replace("\n", " ")
    return clean_text(single_line)


def iterate_over_documents(path):
    """Lazily yield every document found one level below ``path``.

    Each sub-folder of ``path`` is treated as a label, and every file in it
    as one document.

    Args:
        path: Directory whose sub-folders contain the document files.

    Yields:
        Tuples ``(label, file_name, content)`` where ``label`` is the
        sub-folder name, ``file_name`` the file's base name and ``content``
        the text returned by ``read_document``.
    """
    for subfolder in get_subfolders(path):
        # basename instead of split('/') so the platform separator is honoured.
        label = os.path.basename(subfolder)
        for file_path in get_filenames(subfolder):
            file_name = os.path.basename(file_path)
            yield label, file_name, read_document(file_path)

In [None]:
def process_article(text, stop_words, stemmer=None):
    """Count the stemmed, non-stop-word terms of an article.

    Args:
        text: Article text; it is split on whitespace.
        stop_words: Container of lowercase words to ignore (anything that
            supports ``in``; a set gives the fastest lookups).
        stemmer: Optional object with a ``stem(word)`` method. A new
            ``PorterStemmer`` is created when omitted.

    Returns:
        A plain ``dict`` mapping each stem to its number of occurrences.
    """
    if stemmer is None:
        stemmer = PorterStemmer()
    content = defaultdict(int)
    for word in text.split():
        # Strip non-letter characters from both ends of the lowercased word.
        token = re.sub("^[^a-z]*|[^a-z]*$", "", word.lower())
        if token and token not in stop_words:
            # Stem the cleaned token, not the raw word, so punctuation does
            # not end up in the counted terms.
            content[stemmer.stem(token)] += 1
    return dict(content)


def create_document_word_counts(path):
    """Build the per-document term counts for every document below ``path``.

    Args:
        path: Corpus root directory, passed to ``iterate_over_documents``.

    Returns:
        A list of ``(label, file_name, word_counts)`` tuples, where
        ``word_counts`` is the dict produced by ``process_article``.
    """
    # A set makes the per-token stop-word check a constant-time lookup
    # instead of a scan over the whole list.
    stop_words = set(stopwords.words("english"))
    return [
        (label, file_name, process_article(text, stop_words))
        for label, file_name, text in iterate_over_documents(path)
    ]

In [None]:
# Read the whole corpus and compute the term counts of every document.
documents = create_document_word_counts(base_dir)

In [None]:
# Inspect the first (label, file_name, word_counts) tuple.
documents[0]

In [None]:
# Toy example: DictVectorizer turns a list of {feature: count} dicts into a
# dense integer matrix (sparse=False) with one row per dict.
vectorizer = DictVectorizer(dtype=np.int_, sparse=False)
mydocuments = [{'foo': 1, 'bar': 2}, 
               {'foo': 3, 'baz': 1, 'foobar': 2, 'bar': 3}]
X = vectorizer.fit_transform(mydocuments)

print(X)
# Names of the learned features (presumably in the column order of X).
vectorizer.feature_names_

### Term frequency - inverse document frequency

TF-IDF is a statistical measure that evaluates how relevant a word is to a document in a collection of documents.

* TF: Term Frequency measures how frequently a term occurs in a document. Since documents differ in length, a term may appear many more times in a long document than in a short one; the term frequency is therefore often divided by the document length. Common variants are:
    * $tf(t, d) = \text{count}(t, d)$
    * $tf(t, d) = 1$ if $t$ occurs in $d$ and 0 otherwise;
    * $tf(t, d) = \text{count}(t, d) / (\text{number of words in d})$
    * $tf(t, d) = \log (1 + \text{count}(t, d))$


* IDF: Inverse Document Frequency measures how informative a term is. While computing TF, all terms are considered equally important. However, certain terms may appear many times but carry little importance (e.g. stop words), so terms that occur in many documents of the collection $D$ are weighted down.
$$
    idf(t, D) = \log \frac{|D|}{1 + |\{d\in D:\ t\in d\}|}
$$

In [None]:
def create_document_term_matrix(documents):
    """Build the TF-IDF document-term matrix of the corpus.

    Args:
        documents: List of ``(label, file_name, word_counts)`` tuples, where
            ``word_counts`` maps each term to its count in the document.

    Returns:
        A tuple ``(term_matrix, document_titles, terms)``:
        ``term_matrix`` is the sparse TF-IDF matrix with one row per
        document, ``document_titles`` is the list of ``"label/file_name"``
        strings in row order, and ``terms`` is the list of feature names.
    """
    vectorizer = DictVectorizer(dtype=np.int_, sparse=True)
    count_matrix = vectorizer.fit_transform(
        [word_counts for _, _, word_counts in documents]
    )
    # `get_feature_names()` no longer exists in recent scikit-learn releases;
    # the fitted `feature_names_` attribute holds the same list.
    terms = vectorizer.feature_names_
    # Rows are L2-normalised and term counts are scaled sub-linearly.
    transformer = TfidfTransformer(norm="l2", sublinear_tf=True)
    term_matrix = transformer.fit_transform(count_matrix)
    document_titles = [f'{label}/{file_name}' for label, file_name, _ in documents]
    return term_matrix, document_titles, terms

In [None]:
# TF-IDF matrix plus the matching row titles ("label/file_name") and terms.
term_matrix, labels, words = create_document_term_matrix(documents)

### Similarity of documents


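* cosine similarity: from $\langle v, w\rangle = \|v\|\cdot \|w\|\cdot\cos\phi$ we get $\cos\phi = \dfrac{\langle v, w\rangle}{\|v\|\cdot \|w\|}$; for vectors of unit length this is simply the dot product $\langle v, w\rangle$.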

The attribute vectors $v$ and $w$ are usually the term frequency vectors of the documents.

In [None]:
# Pairwise dot products of the document rows. `@` is always the matrix
# product and `.toarray()` replaces the `.A` shorthand, which recent SciPy
# versions no longer provide.
similarity_matrix = pd.DataFrame((term_matrix @ term_matrix.T).toarray(),
                                 columns=labels, index=labels)

In [None]:
# Display the document-by-document similarity table.
similarity_matrix

In [None]:
def find_top_k_similar_documents(similarity_matrix, document_name, k):
    """Return the ``k`` documents most similar to ``document_name``.

    The document itself is left out of the ranking. The result is a Series of
    similarity scores indexed by document name, highest score first.
    """
    other_documents = similarity_matrix.columns != document_name
    scores = similarity_matrix.loc[document_name, other_documents]
    ranked = scores.sort_values(ascending=False)
    return ranked.iloc[:k]

In [None]:
# Query document, given as "label/file_name" to match the similarity index.
doc_name = 'entertainment/385.txt'

In [None]:
# Join with os.path.join so the path is well-formed whether or not
# `base_dir` ends with a separator.
doc = read_document(os.path.join(base_dir, doc_name))

In [None]:
# Show the text of the query document.
print(doc)

In [None]:
# The five documents with the highest similarity to the query document.
similar_documents = find_top_k_similar_documents(similarity_matrix, doc_name, k=5)

In [None]:
# Display the similarity scores, indexed by document name.
similar_documents

In [None]:
# Print the text of each similar document, separated by blank lines.
for fname in similar_documents.index:
    # os.path.join works whether or not `base_dir` ends with a separator.
    doc = read_document(os.path.join(base_dir, fname))
    print(doc)
    print('\n')