# Instructor Do: Terms Relevance (Understanding TF-IDF)

In [15]:
# Initial imports
import nltk
from nltk.corpus import reuters
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer



## Loading Text from the Reuters Dataset

To demonstrate how TF-IDF works, we will use the _Reuters_ dataset that is bundled in NLTK.

In [16]:
# Download the Reuters corpus into the local NLTK data directory, or confirm it is
# already up to date; the call's return value is displayed as the cell output.
nltk.download("reuters")



[nltk_data] Downloading package reuters to
[nltk_data]     C:\Users\mshel\AppData\Roaming\nltk_data...
[nltk_data]   Package reuters is already up-to-date!


True

In [17]:
# Fetch every file ID in the Reuters corpus and report how many there are
doc_ids = reuters.fileids()
total_docs = len(doc_ids)
print(f"Total number of docs in the corpus: {total_docs}")


Total number of docs in the corpus: 10788


## Getting Bag of Words from a Single Document

We select a single document from the corpus to get its "Bag of Words". The same can be done for multiple documents by passing a list of document texts (retrieved by their document IDs in this example) to the `fit_transform()` method of the `CountVectorizer()` object.

In [18]:
# Pick one document by its corpus file ID, load its raw text, and show it
doc_id = "test/15045"
doc_text = reuters.raw(fileids=doc_id)
print(doc_text)



DUTCH ADJUSTED UNEMPLOYMENT RISES IN MARCH
  Dutch seasonally adjusted unemployment
  rose in the month to end-March to a total 693,000 from 690,600
  at end-February, but was well down from 730,100 at end-March
  1986, Social Affairs Ministry figures show.
      The figure for male jobless rose by 2,000 in the month to
  436,500 compared with 470,700 a year earlier. The figure for
  women was 256,500 at end-March against 256,100 a month earlier
  and 259,400 at end-March 1986.
      On an unadjusted basis total unemployment fell by 16,500 in
  the month to end-March to 692,200. In March 1986 the figure was
  725,000.
      A ministry spokesman said the unadjusted figures showed a
  smaller than usual seasonal decrease for the time of year,
  because of particularly cold weather delaying work in the
  building industry. He said this explained the increase in the
  adjusted statistics.
      Total vacancies available rose by 1,900 to 26,300 at
  end-March. A year earlier the figure was 

In [19]:
# Build a CountVectorizer that ignores English stop words
vectorizer = CountVectorizer(stop_words="english")

# Tokenize the document and count term occurrences.
# `fit_transform` expects an iterable of documents, hence the one-element list.
X = vectorizer.fit_transform([doc_text])

# Retrieve the unique words (the learned vocabulary) as a plain Python list.
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2;
# its replacement `get_feature_names_out()` returns a NumPy array, so `.tolist()`
# keeps `words` a list of plain strings and the printed output unchanged.
if hasattr(vectorizer, "get_feature_names_out"):
    words = vectorizer.get_feature_names_out().tolist()
else:  # scikit-learn < 1.0
    words = vectorizer.get_feature_names()
print(words)



['000', '100', '16', '1986', '200', '256', '259', '26', '28', '300', '400', '436', '470', '500', '600', '690', '692', '693', '700', '725', '730', '763', '900', 'adjusted', 'affairs', 'available', 'basis', 'building', 'cold', 'compared', 'decrease', 'delaying', 'dutch', 'earlier', 'end', 'explained', 'february', 'fell', 'figure', 'figures', 'increase', 'industry', 'jobless', 'male', 'march', 'ministry', 'month', 'particularly', 'rises', 'rose', 'said', 'seasonal', 'seasonally', 'showed', 'smaller', 'social', 'spokesman', 'statistics', 'time', 'total', 'unadjusted', 'unemployment', 'usual', 'vacancies', 'weather', 'women', 'work', 'year']


In [None]:
# X is the matrix returned by `vectorizer.fit_transform`; printing it shows the
# occurrence count stored for each term of the document. A unique ID is assigned
# to each term (presumably its column index in the matrix).
print(X)



In [21]:
# Build the bag of words as a two-column DataFrame: each vocabulary term next to
# its total count (the column sums of the term-count matrix X)
word_counts = np.asarray(X.sum(axis=0)).ravel()
words_df = pd.DataFrame({"Word": words, "Word_Count": word_counts})
words_df


Unnamed: 0,Word,Word_Count
0,000,3
1,100,2
2,16,1
3,1986,3
4,200,1
5,256,2
6,259,1
7,26,1
8,28,1
9,300,1


## Calculating the TF-IDF from a Corpus

In [None]:
# Build the corpus from the raw text of the first 1000 Reuters file IDs
all_docs_id = reuters.fileids()
corpus_id = all_docs_id[:1000]
corpus = list(map(reuters.raw, corpus_id))

# Show one document from the corpus as a sample
print(corpus[50])



In [None]:
# Getting the TF-IDF: create a TfidfVectorizer that ignores English stop words,
# then fit it on the corpus and transform the corpus in a single step.
# NOTE: this rebinds `vectorizer`, replacing the CountVectorizer instance used
# for the single-document example above.
vectorizer = TfidfVectorizer(stop_words="english")
X_corpus = vectorizer.fit_transform(corpus)



In [None]:
# Getting matrix info
print(f"Matrix shape: {X_corpus.shape}")
print(f"Total number of documents: {X_corpus.shape[0]}")
print(f"Total number of unique words (tokens): {X_corpus.shape[1]}")



In [None]:
# Retrieve the vocabulary learned from the corpus as a plain Python list.
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2;
# its replacement `get_feature_names_out()` returns a NumPy array, so `.tolist()`
# keeps `words_corpus` a list of plain strings and the printed output unchanged.
if hasattr(vectorizer, "get_feature_names_out"):
    words_corpus = vectorizer.get_feature_names_out().tolist()
else:  # scikit-learn < 1.0
    words_corpus = vectorizer.get_feature_names()
print(words_corpus)



In [None]:
# Pair every vocabulary term with its mean TF-IDF weight across the corpus
# documents, ranked from the highest weight to the lowest
mean_tfidf = np.asarray(X_corpus.mean(axis=0)).ravel()
words_corpus_df = pd.DataFrame(
    {"Word": words_corpus, "TF-IDF": mean_tfidf}
).sort_values(by=["TF-IDF"], ascending=False)



In [None]:
# Highest 10 TF-IDF scores: `words_corpus_df` is sorted in descending order of
# "TF-IDF", so its first ten rows hold the largest values.
words_corpus_df.head(10)



In [None]:
# Lowest 10 TF-IDF scores: `words_corpus_df` is sorted in descending order of
# "TF-IDF", so its last ten rows hold the smallest values.
words_corpus_df.tail(10)
