# Import necessary libraries

In [5]:
import os
import re
import numpy as np
from nltk.stem.porter import PorterStemmer
from collections import defaultdict

# Extracting data

In [6]:
def gather_20newsgropups_data(path="../datasets/20news-bydate/"):
    """Return the sorted newsgroup names found in the training split.

    Args:
        path: Root directory of the dataset; it must contain a
            ``20news-bydate-train`` sub-directory. A trailing slash is
            optional.

    Returns:
        Alphabetically sorted list of the sub-directory names of the training
        directory (one per newsgroup).

    Raises:
        FileNotFoundError: If the training directory does not exist.
    """
    train_dir = os.path.join(path, "20news-bydate-train")
    # Only directories are newsgroups; stray files (e.g. ".DS_Store") would
    # otherwise be treated as a class and break the per-group listing later.
    return sorted(
        entry for entry in os.listdir(train_dir)
        if os.path.isdir(os.path.join(train_dir, entry))
    )
# Discover the newsgroup names once and show them; the sorted position of
# each name is what collect_data_from later uses as the integer label.
newsgroup_list = gather_20newsgropups_data()
print(newsgroup_list)

['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']


In [7]:
# Load the stop-word list: one entry per line of stop_words.txt.
with open ("../datasets/20news-bydate/stop_words.txt") as f:
    stop_words = (f.read().splitlines())
    # print(stop_words)
# Single Porter stemmer instance, read as a global by collect_data_from.
stemmer = PorterStemmer()
def collect_data_from(parent_dir, newsgroup_list):
    """Read, clean and stem every document under ``parent_dir``.

    For each newsgroup directory (in the order given), every regular file is
    lower-cased, split on non-word characters, filtered against the
    module-level ``stop_words`` and stemmed with the module-level ``stemmer``.

    Args:
        parent_dir: Directory that contains one sub-directory per newsgroup.
        newsgroup_list: Newsgroup directory names; the position of a name in
            this list is used as the integer label of its documents.

    Returns:
        A list of strings of the form ``"<label><fff><filename><fff><content>"``,
        one per document, where ``content`` is the space-joined stemmed tokens.
    """
    # stop_words is a list; a set gives O(1) membership tests per token.
    stop_word_set = set(stop_words)
    data = []
    for group_id, newsgroup in enumerate(newsgroup_list):
        label = group_id
        dir_path = os.path.join(parent_dir, newsgroup)
        files = sorted(
            (filename, os.path.join(dir_path, filename))
            for filename in os.listdir(dir_path)
            if os.path.isfile(os.path.join(dir_path, filename))
        )
        for filename, filepath in files:
            # Undecodable bytes are dropped rather than aborting the run.
            with open(filepath, errors="ignore") as f:
                text = f.read().lower()
            # Raw string: "\W" in a plain literal is an invalid escape.
            # "\W+" plus the truthiness check drops the empty tokens that
            # re.split produces between consecutive separators.
            words = [stemmer.stem(word)
                     for word in re.split(r"\W+", text)
                     if word and word not in stop_word_set]

            content = " ".join(words)
            # One record per line in the output file; an empty document
            # yields an empty (but still valid) content field.
            assert len(content.splitlines()) <= 1
            data.append(str(label) + "<fff>" +
                        filename + "<fff>" + content)
    return data

# Dataset root followed by the directories of the train and test splits.
path = "../datasets/20news-bydate/"
train_dir, test_dir = (
    path + split_name
    for split_name in ("20news-bydate-train", "20news-bydate-test")
)

    

In [60]:
# Preprocess both splits, then persist train, test and their concatenation,
# one "<label><fff><filename><fff><content>" record per line.
train_data = collect_data_from(parent_dir=train_dir, newsgroup_list=newsgroup_list)
test_data = collect_data_from(parent_dir=test_dir, newsgroup_list=newsgroup_list)
full_data = train_data + test_data

processed_outputs = (
    ("../datasets/20news-bydate/20news-train-processed.txt", train_data),
    ("../datasets/20news-bydate/20news-test-processed.txt", test_data),
    ("../datasets/20news-bydate/20news-full-processed.txt", full_data),
)
for output_file, records in processed_outputs:
    with open(output_file, "w") as f:
        f.write('\n'.join(records))

# Basic knowledge of TF-IDF

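A TF-IDF weight combines how often a term occurs in a document (TF) with how rare the term is across the corpus (IDF); after normalization the weights lie between 0 and 1. The higher the weight, the rarer the term across the corpus; the smaller the weight, the more common the term.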

- **TF (term frequency) example**

The TF (term frequency) of a word is the frequency of a word (i.e., number of times it appears) in a document. When you know TF, you’re able to see if you’re using a term too much or too little. When a 100-word document contains the term “cat” 12 times, the TF for the word ‘cat’ is

$\mathrm{TF}(\text{cat}) = 12 / 100 = 0.12$

- **IDF (inverse document frequency) example**

The IDF (inverse document frequency) of a word is the measure of how significant that term is in the whole corpus (a body of documents).

Let’s say the corpus contains 10,000,000 documents. If we assume 300,000 (0.3 million) of them contain the term “cat”, then the IDF is the base-10 logarithm of the total number of documents (10,000,000) divided by the number of documents containing the term “cat” (300,000), i.e. $\log_{10}(N / \mathrm{DF})$.

$\mathrm{IDF}(\text{cat}) = \log_{10}(10{,}000{,}000 / 300{,}000) \approx 1.52$



In [8]:
# we suppose the data in the train is combined by total files
def generate_vocabulary(data_path,
                        output_path='../datasets/20news-bydate/words_idfs.txt',
                        df_threshold=10):
    """Build the vocabulary of a processed corpus and write each word's IDF.

    Every line of ``data_path`` is one document whose fields are separated by
    ``<fff>``; only the last field (the whitespace-separated text) is used.
    A word is kept when it appears in more than ``df_threshold`` documents and
    is not made of digits only. Its IDF is ``log10(corpus_size / df)``.

    Args:
        data_path: Path to the processed corpus file.
        output_path: File that receives one ``word<fff>idf`` line per word.
        df_threshold: A word must occur in strictly more documents than this
            to enter the vocabulary.

    Returns:
        The list of ``(word, idf)`` pairs that was written, sorted by
        decreasing IDF and then alphabetically.
    """
    def compute_idf(df, corpus_size):
        # df stands for document frequency
        assert df > 0
        return np.log10(corpus_size * 1. / df)

    with open(data_path) as f:
        lines = f.read().splitlines()

    # doc_count maps each distinct word to the number of documents
    # in the corpus that contain it.
    doc_count = defaultdict(int)
    corpus_size = len(lines)

    for line in lines:
        text = line.split('<fff>')[-1]
        # set(): a word counts once per document however often it repeats.
        for word in set(text.split()):
            doc_count[word] += 1

    words_idfs = [(word, compute_idf(document_freq, corpus_size))
                  for word, document_freq in doc_count.items()
                  if document_freq > df_threshold and not word.isdigit()]
    # Tie-break on the word itself: set iteration order changes between runs
    # (string hash randomization), and the line order of the output file must
    # be reproducible because word indices are derived from it.
    words_idfs.sort(key=lambda pair: (-pair[1], pair[0]))
    print("Vocabulary size {}". format(len(words_idfs)))
    with open(output_path, "w") as f:
        f.write("\n".join([word + "<fff>" + str(idf) for word, idf in words_idfs]))
    return words_idfs

In [9]:
def get_tf_idf(data_path, words_idfs_path='../datasets/20news-bydate/words_idfs.txt'):
    """Load the vocabulary IDFs and the processed documents.

    NOTE(review): the TF-IDF weighting itself is not implemented yet; this
    function currently only parses and returns its inputs.

    Args:
        data_path: Processed corpus file with one
            ``label<fff>doc_id<fff>text`` record per line; ``label`` and
            ``doc_id`` must be integers.
        words_idfs_path: Vocabulary file with one ``word<fff>idf`` record per
            line. The line order defines the word indices.

    Returns:
        A tuple ``(idfs, word_IDS, documents)`` where ``idfs`` maps word to
        float IDF, ``word_IDS`` maps word to its line index in the vocabulary
        file, and ``documents`` is a list of ``(label, doc_id, text)`` tuples.
    """
    # The vocabulary and the documents live in two different files; reading
    # both from data_path cannot work because their field counts differ.
    with open(words_idfs_path) as f:
        word_idfs = []
        for line in f.read().splitlines():
            fields = line.split("<fff>")
            word_idfs.append((fields[0], float(fields[1])))

    idfs = dict(word_idfs)
    word_IDS = {word: index for index, (word, _) in enumerate(word_idfs)}

    with open(data_path) as f:
        documents = []
        for line in f.read().splitlines():
            fields = line.split("<fff>")
            documents.append((int(fields[0]), int(fields[1]), fields[2]))

    return idfs, word_IDS, documents

    

In [10]:
# Build the vocabulary from the processed training split only; this prints
# the vocabulary size and writes the word/IDF pairs to words_idfs.txt.
generate_vocabulary("../datasets/20news-bydate/20news-train-processed.txt")

Vocabulary size 10309


### Another, more naive way to implement TF-IDF (simpler, but with a higher computational cost and time complexity)

In [21]:
# Toy corpus of four one-sentence documents for the naive implementation.
corpus = [
    "this is the first document",
    "this document is the second document",
    "and this is the third one",
    "is this the first document",
]

def IDF(corpus, unique_words):
    """Compute a smoothed inverse document frequency for each word.

    For a corpus of N documents and a word found in df of them, the value is
    ``ln((1 + N) / (1 + df)) + 1``. A document contains a word when the word
    is one of its whitespace-separated tokens.

    Args:
        corpus: Sequence of document strings.
        unique_words: Iterable of words to score.

    Returns:
        Dict mapping each word to its IDF, in the iteration order of
        ``unique_words``. The corpus length is printed as a side effect.
    """
    n_docs = len(corpus)
    print("len of the corpus:", n_docs)
    idf_by_word = {}
    for word in unique_words:
        # Number of documents whose token list contains the word.
        doc_freq = sum(1 for sentence in corpus if word in sentence.split())
        idf_by_word[word] = (np.log((1 + n_docs) / (doc_freq + 1))) + 1
    return idf_by_word

In [22]:
def fit(whole_data):
    """Build the vocabulary of a corpus and the IDF of each of its words.

    Args:
        whole_data: List (or tuple) of document strings. Tokens are obtained
            by whitespace splitting; tokens shorter than two characters are
            ignored.

    Returns:
        A tuple ``(vocab, idf_values)`` where ``vocab`` maps each word to its
        index in the alphabetically sorted vocabulary and ``idf_values`` is
        the dict returned by ``IDF`` for those words.

    Raises:
        TypeError: If ``whole_data`` is not a list or tuple. Without this
            check the function would reach ``return`` with unbound locals.
    """
    if not isinstance(whole_data, (list, tuple)):
        raise TypeError(
            "whole_data must be a list or tuple of strings, got "
            + type(whole_data).__name__
        )
    unique_words = sorted({
        token
        for document in whole_data
        for token in document.split()
        if len(token) >= 2
    })
    vocab = {word: index for index, word in enumerate(unique_words)}
    Idf_values_of_all_unique_words = IDF(whole_data, unique_words)
    return vocab, Idf_values_of_all_unique_words
# Fit on the toy corpus: Vocabulary maps word -> index, idf_of_vocabulary
# maps word -> IDF. Both are printed for inspection.
Vocabulary, idf_of_vocabulary=fit(corpus)

print(Vocabulary)
print(idf_of_vocabulary)
# 'this is the first document',
# 'this document is the second document',
# 'and this is the third one',
# 'is this the first document',

len of the corpus: 4
{'and': 0, 'document': 1, 'first': 2, 'is': 3, 'one': 4, 'second': 5, 'the': 6, 'third': 7, 'this': 8}
{'and': 1.916290731874155, 'document': 1.2231435513142097, 'first': 1.5108256237659907, 'is': 1.0, 'one': 1.916290731874155, 'second': 1.916290731874155, 'the': 1.0, 'third': 1.916290731874155, 'this': 1.0}
