# TF-IDF

In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import os
from sklearn.feature_extraction.text import CountVectorizer

tqdm.pandas()

In [2]:
# read preprocessed queries in pandas dataframe
queries = pd.read_csv('data/preprocessed_query_data.csv')
queries['keywords'] = queries['keywords'].str.strip('][').str.replace("'", "").str.split(', ')
queries = queries.set_index('id')
queries.index.name = 'query_id'

queries.head()

Unnamed: 0_level_0,keywords,title,rel_docs
query_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
84,"[cultiv, agricultur, maiz, corn, fruit, wheat,...",Agriculture,"[572, 627, 678, 903, 1193, 1542, 1634, 3751, 3..."
111,"[reptil, lizard, salamand, fossil, frog, prehi...",Amphibians and Reptiles,"[621, 809, 1380, 6641, 8311, 8937, 13134, 1446..."
265,"[astronom, astronomi, astrophysicist, mathemat...",Astronomy,"[39, 308, 580, 664, 736, 748, 791, 798, 799, 1..."
323,"[aviat, airfield, airport, aerospac, aircraft,...",Aviation,"[849, 852, 1293, 1902, 1942, 2039, 2075, 2082,..."
396,"[actor, cast, screenwrit, filmmak, film, actre...",Biography/WikiProject Actors and Filmmakers,"[344, 676, 808, 872, 1247, 1806, 1828, 2083, 2..."


## 1) TF vectors

In [3]:
query_terms = (list(set([a for b in queries['keywords'].tolist() for a in b])))

# initiate count vectorizer (using only words that appear in the query for performance reasons)
count_vectorizer = CountVectorizer(vocabulary=query_terms)

# read logfile to look for last successfully processed chunk
logfile_path = 'data/tf-idf_chunks_log.txt'
try:
    f = open(logfile_path, "r")
    last_chunk_idx = int(f.read())
    f.close()
except FileNotFoundError:
    last_chunk_idx = -1

# iterate over chunks to get term frequencies of every document
corpus_iterator = pd.read_csv('data/preprocessed_corpus.csv', index_col='id', chunksize=100000)
for i, corpus_df in enumerate(corpus_iterator):
    # skip chunk if preprocessing is already done for this chunk
    if i <= last_chunk_idx:
        continue
        
    print(f'chunk {i+1} / 65: ', end='')
    
    # drop empty documents
    corpus_df = corpus_df.dropna()
    corpus_df.index.name = 'page_id'
    
    # get count of words per document
    print('count words... ', end='')
    total_word_counts = corpus_df['plain'].str.split(' ').str.len().rename('count')
    
    # get term counts (using only words that appear in the query for performance reasons)
    print('get term counts... ', end='')
    term_counts = count_vectorizer.fit_transform(corpus_df['plain'])
    terms = count_vectorizer.get_feature_names_out()
    term_counts_df = pd.DataFrame(term_counts.toarray(), columns=terms, index=corpus_df.index)
    
    # calculate term frequencies
    print('calculate term frequencies...  ', end='')
    term_frequencies = term_counts_df.div(total_word_counts, axis=0)
    
    # normalize and apply logarithmic decay
    print('normalize and apply logarithmic decay... ', end='')
    #max_freqs = term_frequencies.max(axis=1).copy()
    #max_freqs = np.log10(max_freqs)
    #max_freqs = max_freqs + 1
    term_frequencies = np.log10(term_frequencies)
    term_frequencies = term_frequencies + 1
    term_frequencies = term_frequencies.replace([np.inf, -np.inf], -999999)
    #term_frequencies = term_frequencies.div(max_freqs, axis=0)
    
    # writing file
    print('write to file.')
    # Set writing mode to append after first chunk
    mode = 'w' if i == 0 else 'a'
    # Add header if it is the first chunk
    header = i == 0
    
    term_frequencies.to_csv(
        "data/term_frequencies.csv",
        header=header,
        mode=mode)
    
    # write chunk index to log file
    f = open(logfile_path, "w")
    f.write(str(i))
    f.close()

chunk 1 / 65: count words... get term counts... calculate term frequencies...  normalize and apply logarithmic decay... 

  result = func(self.values, **kwargs)


write to file.
chunk 2 / 65: count words... get term counts... calculate term frequencies...  normalize and apply logarithmic decay... 

  result = func(self.values, **kwargs)


write to file.
chunk 3 / 65: count words... get term counts... calculate term frequencies...  normalize and apply logarithmic decay... 

  result = func(self.values, **kwargs)


write to file.
chunk 4 / 65: count words... get term counts... calculate term frequencies...  normalize and apply logarithmic decay... 

  result = func(self.values, **kwargs)


write to file.
chunk 5 / 65: count words... get term counts... calculate term frequencies...  normalize and apply logarithmic decay... 

  result = func(self.values, **kwargs)


write to file.
chunk 6 / 65: count words... get term counts... calculate term frequencies...  normalize and apply logarithmic decay... 

  result = func(self.values, **kwargs)


write to file.
chunk 7 / 65: count words... get term counts... calculate term frequencies...  normalize and apply logarithmic decay... 

  result = func(self.values, **kwargs)


write to file.
chunk 8 / 65: count words... get term counts... calculate term frequencies...  normalize and apply logarithmic decay... 

  result = func(self.values, **kwargs)


write to file.
chunk 9 / 65: count words... get term counts... calculate term frequencies...  normalize and apply logarithmic decay... 

  result = func(self.values, **kwargs)


chunk 39 / 65: count words... get term counts... calculate term frequencies...  normalize and apply logarithmic decay... 

  result = func(self.values, **kwargs)


write to file.
chunk 40 / 65: count words... get term counts... calculate term frequencies...  normalize and apply logarithmic decay... 

  result = func(self.values, **kwargs)


write to file.
chunk 41 / 65: count words... get term counts... calculate term frequencies...  normalize and apply logarithmic decay... 

  result = func(self.values, **kwargs)


write to file.
chunk 42 / 65: count words... get term counts... calculate term frequencies...  normalize and apply logarithmic decay... 

  result = func(self.values, **kwargs)


write to file.
chunk 43 / 65: count words... get term counts... calculate term frequencies...  normalize and apply logarithmic decay... 

  result = func(self.values, **kwargs)


write to file.
chunk 44 / 65: count words... get term counts... calculate term frequencies...  normalize and apply logarithmic decay... 

  result = func(self.values, **kwargs)


write to file.
chunk 45 / 65: count words... get term counts... calculate term frequencies...  normalize and apply logarithmic decay... 

  result = func(self.values, **kwargs)


write to file.
chunk 46 / 65: count words... get term counts... calculate term frequencies...  normalize and apply logarithmic decay... 

  result = func(self.values, **kwargs)


write to file.
chunk 47 / 65: count words... get term counts... calculate term frequencies...  normalize and apply logarithmic decay... 

  result = func(self.values, **kwargs)


write to file.
chunk 48 / 65: count words... get term counts... calculate term frequencies...  normalize and apply logarithmic decay... 

  result = func(self.values, **kwargs)


write to file.
chunk 49 / 65: count words... get term counts... calculate term frequencies...  normalize and apply logarithmic decay... 

  result = func(self.values, **kwargs)


write to file.
chunk 50 / 65: count words... get term counts... calculate term frequencies...  normalize and apply logarithmic decay... 

  result = func(self.values, **kwargs)


write to file.
chunk 51 / 65: count words... get term counts... calculate term frequencies...  normalize and apply logarithmic decay... 

  result = func(self.values, **kwargs)


write to file.
chunk 52 / 65: count words... get term counts... calculate term frequencies...  normalize and apply logarithmic decay... 

  result = func(self.values, **kwargs)


write to file.
chunk 53 / 65: count words... get term counts... calculate term frequencies...  normalize and apply logarithmic decay... 

  result = func(self.values, **kwargs)


write to file.
chunk 54 / 65: count words... get term counts... calculate term frequencies...  normalize and apply logarithmic decay... 

  result = func(self.values, **kwargs)


write to file.
chunk 55 / 65: count words... get term counts... calculate term frequencies...  normalize and apply logarithmic decay... 

  result = func(self.values, **kwargs)


write to file.
chunk 56 / 65: count words... get term counts... calculate term frequencies...  normalize and apply logarithmic decay... 

  result = func(self.values, **kwargs)


write to file.
chunk 57 / 65: count words... get term counts... calculate term frequencies...  normalize and apply logarithmic decay... 

  result = func(self.values, **kwargs)


write to file.
chunk 58 / 65: count words... get term counts... calculate term frequencies...  normalize and apply logarithmic decay... 

  result = func(self.values, **kwargs)


write to file.
chunk 59 / 65: count words... get term counts... calculate term frequencies...  normalize and apply logarithmic decay... 

  result = func(self.values, **kwargs)


write to file.
chunk 60 / 65: count words... get term counts... calculate term frequencies...  normalize and apply logarithmic decay... 

  result = func(self.values, **kwargs)


write to file.
chunk 61 / 65: count words... get term counts... calculate term frequencies...  normalize and apply logarithmic decay... 

  result = func(self.values, **kwargs)


write to file.
chunk 62 / 65: count words... get term counts... calculate term frequencies...  normalize and apply logarithmic decay... 

  result = func(self.values, **kwargs)


write to file.
chunk 63 / 65: count words... get term counts... calculate term frequencies...  normalize and apply logarithmic decay... 

  result = func(self.values, **kwargs)


write to file.
chunk 64 / 65: count words... get term counts... calculate term frequencies...  normalize and apply logarithmic decay... 

  result = func(self.values, **kwargs)


write to file.
chunk 65 / 65: count words... get term counts... calculate term frequencies...  normalize and apply logarithmic decay... 

  result = func(self.values, **kwargs)


write to file.


## 2) IDF vectors 

In [4]:
# read logfile to look for last successfully processed chunk
logfile_path2 = 'data/tf-idf_chunks_log2.txt'
try:
    f = open(logfile_path2, "r")
    last_chunk_idx = int(f.read())
    f.close()
except FileNotFoundError:
    last_chunk_idx = -1
    
# iterate over tf vectors in chunks and generate document counts
tf_iterator = pd.read_csv('data/term_frequencies.csv', index_col='page_id', chunksize=100000)
for i, tf in enumerate(tf_iterator):
    
    if i <= last_chunk_idx:
        continue
    
    print(f'chunk {i+1} / 65')
    
    doc_counts_chunk = tf.copy()
    doc_counts_chunk = doc_counts_chunk.replace([-999999], 0)
    doc_counts_chunk = doc_counts_chunk.astype(bool).astype(int)
    doc_counts_chunk = doc_counts_chunk.sum()
    doc_counts_chunk.name = 'count'
    doc_counts_chunk.index.name = 'term'
    if i == 0:
        doc_counts_chunk.to_csv('data/doc_counts.csv')
    else:
        doc_counts = pd.read_csv('data/doc_counts.csv', index_col='term')
        doc_counts = doc_counts.squeeze()
        doc_counts = doc_counts.add(doc_counts_chunk, axis=0)
        doc_counts.to_csv('data/doc_counts.csv')
        
    # write chunk index to log file
    f = open(logfile_path2, "w")
    f.write(str(i))
    f.close()

chunk 1 / 65
chunk 2 / 65
chunk 3 / 65
chunk 4 / 65
chunk 5 / 65
chunk 6 / 65
chunk 7 / 65
chunk 8 / 65
chunk 9 / 65
chunk 10 / 65
chunk 11 / 65
chunk 12 / 65
chunk 13 / 65
chunk 14 / 65
chunk 15 / 65
chunk 16 / 65
chunk 17 / 65
chunk 18 / 65
chunk 19 / 65
chunk 20 / 65
chunk 21 / 65
chunk 22 / 65
chunk 23 / 65
chunk 24 / 65
chunk 25 / 65
chunk 26 / 65
chunk 27 / 65
chunk 28 / 65
chunk 29 / 65
chunk 30 / 65
chunk 31 / 65
chunk 32 / 65
chunk 33 / 65
chunk 34 / 65
chunk 35 / 65
chunk 36 / 65
chunk 37 / 65
chunk 38 / 65
chunk 39 / 65
chunk 40 / 65
chunk 41 / 65
chunk 42 / 65
chunk 43 / 65
chunk 44 / 65
chunk 45 / 65
chunk 46 / 65
chunk 47 / 65
chunk 48 / 65
chunk 49 / 65
chunk 50 / 65
chunk 51 / 65
chunk 52 / 65
chunk 53 / 65
chunk 54 / 65
chunk 55 / 65
chunk 56 / 65
chunk 57 / 65
chunk 58 / 65
chunk 59 / 65
chunk 60 / 65
chunk 61 / 65
chunk 62 / 65
chunk 63 / 65
chunk 64 / 65
chunk 65 / 65


In [4]:
# calculat idf from document counts
doc_counts = pd.read_csv('data/doc_counts.csv', index_col='term')
doc_counts = doc_counts.squeeze()
N = 6475473
idf = N / doc_counts
idf = np.log10(idf)

## TF-IDF vectors

In [5]:
# read logfile to look for last successfully processed chunk
logfile_path3 = 'data/tf-idf_chunks_log3.txt'
try:
    f = open(logfile_path3, "r")
    last_chunk_idx = int(f.read())
    f.close()
except FileNotFoundError:
    last_chunk_idx = -1
    
# iterate over tf vectors in chunks and generate tf-idf vectors
tf_iterator = pd.read_csv('data/term_frequencies.csv', index_col='page_id', chunksize=100000)
for i, tf_chunk in enumerate(tf_iterator):
    
    if i <= last_chunk_idx:
        continue
    
    print(f'chunk {i+1} / 65')
    
    # in case a document appears twice per chunk, drop duplications
    tf_chunk = tf_chunk[~tf_chunk.index.duplicated(keep='first')]
    
    tfidf_chunk = tf_chunk.multiply(idf, axis=1)
                       
    # Set writing mode to append after first chunk
    mode = 'w' if i == 0 else 'a'
    # Add header if it is the first chunk
    header = i == 0
    
    tfidf_chunk.to_csv(
        "data/tfidf_doc.csv",
        header=header,
        mode=mode)
    
    # write chunk index to log file
    f = open(logfile_path3, "w")
    f.write(str(i))
    f.close()

chunk 52 / 65
chunk 53 / 65
chunk 54 / 65
chunk 55 / 65
chunk 56 / 65
chunk 57 / 65
chunk 58 / 65
chunk 59 / 65
chunk 60 / 65
chunk 61 / 65
chunk 62 / 65
chunk 63 / 65
chunk 64 / 65
chunk 65 / 65


## TF-IDF vectors of queries

In [6]:
query_terms = (list(set([a for b in queries['keywords'].tolist() for a in b])))

# initiate count vectorizer (using only words that appear in the query for performance reasons)
count_vectorizer = CountVectorizer(vocabulary=query_terms)

query_term_counts = count_vectorizer.fit_transform(queries['keywords'].str.join(' '))

In [7]:
query_terms = count_vectorizer.get_feature_names_out()
query_term_counts_df = pd.DataFrame(query_term_counts.toarray(), columns=query_terms, index=queries.index)

In [8]:
# term counts

query_terms = (list(set([a for b in queries['keywords'].tolist() for a in b])))

# initiate count vectorizer (using only words that appear in the query for performance reasons)
count_vectorizer = CountVectorizer(vocabulary=query_terms)

query_total_word_counts = queries['keywords'].str.len().rename('count')
query_term_counts = count_vectorizer.fit_transform(queries['keywords'].str.join(' '))
query_terms = count_vectorizer.get_feature_names_out()
query_term_counts_df = pd.DataFrame(query_term_counts.toarray(), columns=query_terms, index=queries.index)

# frequencies
query_term_frequencies = query_term_counts_df.div(query_total_word_counts, axis=0)

In [19]:
# normalize and apply logarithmic decay
#max_query_freqs = query_term_frequencies.max(axis=1).copy()
#max_query_freqs = np.log10(max_query_freqs)
#max_query_freqs = max_query_freqs + 1
query_term_frequencies = np.log10(query_term_frequencies)
query_term_frequencies = query_term_frequencies + 1
query_term_frequencies = query_term_frequencies.replace([np.inf, -np.inf], -999999)
#query_term_frequencies = query_term_frequencies.div(max_query_freqs, axis=0)

  result = func(self.values, **kwargs)
  result = func(self.values, **kwargs)


In [10]:
#query_counts = query_term_frequencies.copy()
#query_counts = query_counts.astype(bool).astype(int)
#query_counts = query_counts.sum()
#query_counts.name = 'count'
#query_counts.index.name = 'term'
#query_counts = query_counts[query_counts != 0]

In [18]:
#N = len(queries)
#query_idf = N / query_counts
#query_idf = np.log10(query_idf)

In [12]:
#query_tfidf = query_term_frequencies.multiply(query_idf, axis=1)
query_tfidf = query_term_frequencies.multiply(idf, axis=1)

In [17]:
query_tfidf.to_csv('data/tfidf_queries.csv', index=True)