# Import necessary libraries

In [2]:
import os
import re
import numpy as np
from nltk.stem.porter import PorterStemmer
from collections import defaultdict

# Extracting data

In [3]:
def gather_20newsgropups_data(path="../datasets/20news-bydate/"):
    """Preprocess the 20 Newsgroups "bydate" split into one-document-per-line files.

    Every file under ``<path>/20news-bydate-train/<newsgroup>/`` and
    ``<path>/20news-bydate-test/<newsgroup>/`` is lower-cased, split on runs of
    non-word characters, filtered against ``<path>/stop_words.txt`` and stemmed
    with the Porter stemmer. Each document becomes one line of the form
    ``label<fff>filename<fff>content`` where ``label`` is the index of the
    newsgroup in the alphabetically sorted list of training sub-directories.

    Parameters
    ----------
    path : str, optional
        Root folder of the dataset. Defaults to the location used throughout
        this notebook.

    Side effects
    ------------
    Writes ``20news-train-processed.txt``, ``20news-test-processed.txt`` and
    ``20news-full-processed.txt`` (train followed by test) into ``path``.
    """
    train_dir = os.path.join(path, "20news-bydate-train")
    test_dir = os.path.join(path, "20news-bydate-test")

    # Only sub-directories are newsgroups; a stray file (e.g. an OS metadata
    # file) would otherwise be treated as a group and crash os.listdir below.
    newsgroup_list = sorted(
        entry for entry in os.listdir(train_dir)
        if os.path.isdir(os.path.join(train_dir, entry)))

    with open(os.path.join(path, "stop_words.txt")) as f:
        # A set gives O(1) membership tests; the list made every token lookup linear.
        stop_words = set(f.read().splitlines())

    stemmer = PorterStemmer()
    # Raw string: "\W" in a plain string literal is an invalid escape sequence.
    non_word = re.compile(r"\W+")

    def collect_data_from(parent_dir, newsgroup_list):
        """Return one ``label<fff>filename<fff>content`` string per file."""
        data = []
        for group_id, newsgroup in enumerate(newsgroup_list):
            label = group_id
            dir_path = os.path.join(parent_dir, newsgroup)
            files = sorted(
                (filename, os.path.join(dir_path, filename))
                for filename in os.listdir(dir_path)
                if os.path.isfile(os.path.join(dir_path, filename)))
            for filename, filepath in files:
                # Undecodable bytes are dropped rather than aborting the run.
                with open(filepath, errors="ignore") as f:
                    text = f.read().lower()
                # `word and ...` skips the empty strings re.split yields when the
                # text starts or ends with a separator.
                words = [stemmer.stem(word)
                         for word in non_word.split(text)
                         if word and word not in stop_words]
                content = " ".join(words)
                # One document per output line; an empty document (zero lines)
                # is legitimate and must not abort the whole run.
                assert len(content.splitlines()) <= 1
                data.append(str(label) + "<fff>" +
                            filename + "<fff>" + content)
        return data

    train_data = collect_data_from(parent_dir=train_dir, newsgroup_list=newsgroup_list)
    test_data = collect_data_from(parent_dir=test_dir, newsgroup_list=newsgroup_list)
    full_data = train_data + test_data

    outputs = (
        ("20news-train-processed.txt", train_data),
        ("20news-test-processed.txt", test_data),
        ("20news-full-processed.txt", full_data),
    )
    for out_name, rows in outputs:
        with open(os.path.join(path, out_name), "w") as f:
            f.write('\n'.join(rows))
    

In [4]:
# Run the preprocessing; this writes the train/test/full "-processed.txt" files.
gather_20newsgropups_data()

# Basic knowledge of TF-IDF

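With the normalization used in this notebook (term frequency divided by the document's maximum term frequency, then each document vector scaled to unit L2 norm), every TF-IDF weight lies between 0 and 1. For a given term frequency, the higher the weight, the rarer the term is across the corpus; the lower the weight, the more common the term.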

- **TF (term frequency) example**

The TF (term frequency) of a word is the frequency of a word (i.e., number of times it appears) in a document. When you know TF, you’re able to see if you’re using a term too much or too little. When a 100-word document contains the term “cat” 12 times, the TF for the word ‘cat’ is

$\mathrm{TF}_{\text{cat}} = 12 / 100 = 0.12$

- **IDF (inverse document frequency) example**

The IDF (inverse document frequency) of a word is the measure of how significant that term is in the whole corpus (a body of documents).

Let’s say the corpus contains 10,000,000 (10 million) documents. If we assume that 0.3 million (300,000) of them contain the term “cat”, then the IDF, i.e. $\log_{10}(N / \mathrm{DF})$, is the base-10 logarithm of the total number of documents (10,000,000) divided by the number of documents containing the term “cat” (300,000).

$\mathrm{IDF}(\text{cat}) = \log_{10}(10{,}000{,}000 / 300{,}000) \approx 1.52$



In [5]:
# The input is assumed to be the combined (train + test) processed corpus, one document per line.
def generate_vocabulary(data_path,
                        output_path='../datasets/20news-bydate/words_idfs.txt',
                        min_doc_freq=10):
    """Build the vocabulary of a processed corpus and store each word's IDF.

    Parameters
    ----------
    data_path : str
        Processed corpus file: one document per line, fields separated by
        ``<fff>``, with the document text in the last field.
    output_path : str, optional
        Destination file. One ``word<fff>idf`` line is written per kept word,
        ordered by decreasing IDF (ties broken alphabetically). Defaults to
        the location used throughout this notebook.
    min_doc_freq : int, optional
        A word is kept only if it occurs in strictly more than this many
        documents. Defaults to 10.

    Notes
    -----
    Purely numeric tokens are dropped. The vocabulary size is printed and
    nothing is returned.
    """

    def compute_idf(df, corpus_size):
        """Return log10(corpus_size / df), where df is the document frequency."""
        assert df > 0
        return np.log10(corpus_size * 1. / df)

    with open(data_path) as f:
        lines = f.read().splitlines()

    # doc_count maps each distinct word to the number of documents containing it.
    doc_count = defaultdict(int)
    corpus_size = len(lines)

    for line in lines:
        text = line.split('<fff>')[-1]
        # set(): a word counts once per document, however often it repeats.
        for word in set(text.split()):
            doc_count[word] += 1

    # (word, idf) pairs for the words that are frequent enough and not numbers.
    words_idfs = [(word, compute_idf(document_freq, corpus_size))
                  for word, document_freq in doc_count.items()
                  if document_freq > min_doc_freq and not word.isdigit()]
    # Tie-break on the word itself: without it, words sharing an IDF were ordered
    # by set/dict iteration order, which depends on string hash randomization, so
    # the word IDs derived from this file changed between kernel restarts.
    words_idfs.sort(key=lambda pair: (-pair[1], pair[0]))
    print("Vocabulary size {}". format(len(words_idfs)))
    with open(output_path, "w") as f:
        f.write("\n".join([word + "<fff>" + str(idf) for word, idf in words_idfs]))

In [6]:
# Build the vocabulary (word -> IDF) from the full processed corpus.
generate_vocabulary("../datasets/20news-bydate/20news-full-processed.txt")

Vocabulary size 14230


In [7]:
def get_tf_idf(data_path,
               idf_path="../datasets/20news-bydate/words_idfs.txt",
               output_path='../datasets/20news-bydate/data_tf_idf.txt'):
    """Encode every document of a processed corpus as a sparse TF-IDF vector.

    For each in-vocabulary word of a document the weight is
    ``(term_freq / max_term_freq_in_document) * idf``; the document vector is
    then scaled to unit L2 norm.

    Parameters
    ----------
    data_path : str
        Processed corpus file with one ``label<fff>doc_id<fff>text`` line per
        document; ``label`` and ``doc_id`` must parse as integers.
    idf_path : str, optional
        Vocabulary file with one ``word<fff>idf`` line per word. A word's ID is
        its zero-based line index in this file. Defaults to the location used
        throughout this notebook.
    output_path : str, optional
        Destination file. One ``label<fff>doc_id<fff>id:value id:value ...``
        line is written per document, with features sorted by word ID.
        Defaults to the location used throughout this notebook.

    Notes
    -----
    A document without any in-vocabulary word is written with an empty
    feature list. Progress and a final summary are printed; nothing is
    returned.
    """
    with open(idf_path) as f:
        word_idfs = []
        for line in f.read().splitlines():
            fields = line.split("<fff>")
            word_idfs.append((fields[0], float(fields[1])))

    idfs = dict(word_idfs)
    word_IDs = {word: index for index, (word, _) in enumerate(word_idfs)}

    with open(data_path) as f:
        documents = []
        for line in f.read().splitlines():
            fields = line.split("<fff>")
            documents.append((int(fields[0]), int(fields[1]), fields[2]))
    total_doc_num = len(documents)

    data_tf_idf = []
    for i, (label, doc_id, text) in enumerate(documents):
        if i % 1000 == 0:
            print("Processing {i}-th/{total_doc_num} documents".format(i = i, total_doc_num = total_doc_num))

        # Single counting pass; calling list.count() per distinct word was quadratic.
        term_freqs = defaultdict(int)
        for word in text.split():
            if word in idfs:
                term_freqs[word] += 1

        if term_freqs:
            max_term_freq = max(term_freqs.values())
            # Sorting by word ID makes the row independent of hash ordering.
            words_tf_idf = sorted(
                (word_IDs[word], term_freq * 1. / max_term_freq * idfs[word])
                for word, term_freq in term_freqs.items())
            norm = np.sqrt(sum(value ** 2 for _, value in words_tf_idf))
        else:
            # No in-vocabulary word: max() of an empty sequence would raise.
            words_tf_idf = []
            norm = 0.0

        if norm > 0:
            words_tfidfs_normalized = [str(index) + ":" + str(tf_idf_value / norm)
                                       for index, tf_idf_value in words_tf_idf]
        else:
            # Every weight is 0 (all IDFs are 0); dividing by the zero norm
            # would write NaN, so the zeros are emitted unchanged.
            words_tfidfs_normalized = [str(index) + ":" + str(tf_idf_value)
                                       for index, tf_idf_value in words_tf_idf]

        spare_rep = " ".join(words_tfidfs_normalized)
        data_tf_idf.append((label, doc_id, spare_rep))

    with open(output_path, "w") as f:
        f.write("\n".join([str(label) + "<fff>" + str(doc_id) + "<fff>" + spare_rep
                           for label, doc_id, spare_rep in data_tf_idf]))
    print("Already calculated tf-idf values of {} documents in the corpus. File was written and saved".format(total_doc_num))

In [8]:
# Encode every document of the full processed corpus as a normalized TF-IDF vector.
get_tf_idf("../datasets/20news-bydate/20news-full-processed.txt")

Processing 0-th/18846 documents
Processing 1000-th/18846 documents
Processing 2000-th/18846 documents
Processing 3000-th/18846 documents
Processing 4000-th/18846 documents
Processing 5000-th/18846 documents
Processing 6000-th/18846 documents
Processing 7000-th/18846 documents
Processing 8000-th/18846 documents
Processing 9000-th/18846 documents
Processing 10000-th/18846 documents
Processing 11000-th/18846 documents
Processing 12000-th/18846 documents
Processing 13000-th/18846 documents
Processing 14000-th/18846 documents
Processing 15000-th/18846 documents
Processing 16000-th/18846 documents
Processing 17000-th/18846 documents
Processing 18000-th/18846 documents
Already calculated tf-idf values of 18846 documents in the corpus. File was written and saved
