In [1]:
from collections import defaultdict
from os import listdir
from os.path import isfile
from nltk.stem.porter import PorterStemmer
import re
import numpy as np

In [2]:
with open("/content/drive/MyDrive/DS_TrainingLab_phase_1/Session 1/Data/20news-full-processed.txt", encoding="utf8", errors='ignore') as f:
  lines = f.read().splitlines()
len(lines) 
# Have 18846 docs
# docs: (label, doc_id, text): 0<fff>53068<fff>decay cbnewsj cb att com dean kaflowitz subject

18846

In [3]:
# Compute idf
def compute_idf(df, corpus_size):
    assert df > 0
    return np.log10(corpus_size * 1. / df)

In [5]:
# Generate vocab (word, idf): proselyt<fff>3.2338265016239234
def generate_vocabulary(data_path):
    with open(data_path, encoding="utf8", errors='ignore') as f:
        lines = f.read().splitlines()
    doc_count = defaultdict(int)
    corpus_size = len(lines)

    for line in lines:
        features = line.split('<fff>')
        text = features[-1]
        words = list(set(text.split()))
        for word in words:
            doc_count[word] += 1

    words_idfs = [(word, compute_idf(document_freq, corpus_size))
                   for word, document_freq in zip(doc_count.keys(), doc_count.values())
                   if document_freq > 10 and not word.isdigit()]
    words_idfs.sort(key=lambda word_idf: -word_idf[1])
    print('Vocabulary size: {}'.format(len(words_idfs)))
    with open("/content/drive/MyDrive/DS_TrainingLab_phase_1/Session 1/Data/words_idfs.txt", 'w') as f:
        f.write('\n'.join([word + '<fff>' + str(idf) for word, idf in words_idfs]))

In [6]:
generate_vocabulary("/content/drive/MyDrive/DS_TrainingLab_phase_1/Session 1/Data/20news-full-processed.txt")
# Vocabulary size: 14079

Vocabulary size: 14079


In [7]:
def get_tf_idf(data_path):
  with open("/content/drive/MyDrive/DS_TrainingLab_phase_1/Session 1/Data/words_idfs.txt", encoding="utf8", errors='ignore') as f:
      words_idfs = [(line.split('<fff>')[0], float(line.split('<fff>')[1]))
                    for line in f.read().splitlines()]
      words_IDs = dict([(word, index)
                        for index, (word, idf) in enumerate(words_idfs)])
      idfs = dict(words_idfs)

  with open(data_path, encoding="utf8", errors='ignore') as f:
      documents = [(int(line.split('<fff>')[0]),
                    int(line.split('<fff>')[1]),
                    line.split('<fff>')[2])
                    for line in f.read().splitlines()]

      data_tf_idf = []
      for document in documents:
          label, doc_id, text = document
          words = [word for word in text.split() if word in idfs]
          word_set = list(set(words))
          max_term_freq = max([words.count(word) for word in word_set])
          words_tfidfs = []
          sum_squares = 0.0

          for word in word_set:
              term_freq = words.count(word)
              tf_idf_value = term_freq * (1. / max_term_freq) * idfs[word]
              words_tfidfs.append((words_IDs[word], tf_idf_value))
              sum_squares += tf_idf_value ** 2

          words_tfidfs_normalize = [str(index) + ':'
                                    + str(tf_idf_value / np.sqrt(sum_squares))
                                    for index, tf_idf_value in words_tfidfs]
          sparse_rep = ' '.join(words_tfidfs_normalize)
          data_tf_idf.append((label, doc_id, sparse_rep))
  return data_tf_idf

In [8]:
data_tf_idf = get_tf_idf("/content/drive/MyDrive/DS_TrainingLab_phase_1/Session 1/Data/20news-full-processed.txt")


In [9]:
data_tf_idf
# (label, doc_id, sparse_rep): (2, 10131, '14078:0.0 10864:0.1092775151817038 13253:0.078910118528671 ...')

[(0,
  53068,
  '13525:0.071589085211584 6641:0.5292071925623247 13248:0.07835021340249498 13991:0.1034425292102589 13606:0.06914767596321644 14077:5.3674759779789784e-05 14041:0.042471094355120004 12537:0.18095241493914133 9580:0.11829728532190209 11764:0.19956865274336005 13707:0.13215092298663653 13827:0.06145445662389579 14074:0.016863054769357146 1551:0.1470688256485503 11993:0.09738798734512896 13186:0.07963345357073419 10296:0.11305405853588868 9581:0.11829728532190209 12997:0.0832509872242124 13851:0.06059737824857234 14075:0.014801112026277234 13823:0.06156701536177525 14078:0.0 8486:0.12436067089106805 13478:0.07285106574789471 14061:0.03263143203306505 13981:0.05236943392395905 13960:0.053839211297197286 13503:0.07222934773426529 14064:0.029040515781645442 13135:0.08047894267312507 14068:0.017436320416582874 14020:0.09690174183230167 8487:0.2487213417821361 13241:0.07845409147807397 14046:0.041196224021790434 13820:0.06172565856423261 9582:0.23659457064380418 14071:0.0150518

In [10]:
len(data_tf_idf)
# docs

18846