In [1]:
import numpy as np
import pandas as pd
import textwrap
from pprint import pprint
from sklearn.feature_extraction.text import TfidfVectorizer
import re

from transformers import pipeline

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
#load the csv file and show it
df = pd.read_csv("bbc_text_cls.csv")
df.head()

Unnamed: 0,text,labels
0,Ad sales boost Time Warner profit\n\nQuarterly...,business
1,Dollar gains on Greenspan speech\n\nThe dollar...,business
2,Yukos unit buyer faces loan claim\n\nThe owner...,business
3,High fuel prices hit BA's profits\n\nBritish A...,business
4,Pernod takeover talk lifts Domecq\n\nShares in...,business


In [3]:
#check the unique labels in the labels column
df['labels'].unique()

array(['business', 'entertainment', 'politics', 'sport', 'tech'],
      dtype=object)

In [4]:
#create a function which will split the document to create a list of unique words
                  
def split_doc(doc):
    
    return re.findall(r'\b\w+\b', doc)

#'\b\w+\b'
#The first \b asserts that we're at a word boundary.
#The \w+ then matches one or more word characters.
#The final \b asserts that we are again at a word boundary.

In [5]:
#Create a df and apply the split_doc function on it to create a list of words in each document
df['cleaned_text'] = df['text'].apply(split_doc)
#select the business label
label = 'business'
#create a df texts contaning only the cleaned_text values for the business label
texts = df[df['labels'] == label][['cleaned_text']]
#show the df
texts.head()

Unnamed: 0,cleaned_text
0,"[Ad, sales, boost, Time, Warner, profit, Quart..."
1,"[Dollar, gains, on, Greenspan, speech, The, do..."
2,"[Yukos, unit, buyer, faces, loan, claim, The, ..."
3,"[High, fuel, prices, hit, BA, s, profits, Brit..."
4,"[Pernod, takeover, talk, lifts, Domecq, Shares..."


In [6]:
#check if the created df is a df object
isinstance(texts, pd.DataFrame)

True

In [34]:
# Initialize the TF-IDF Vectorizer
vectorizer = TfidfVectorizer()

# Fit and transform the text data from the DataFrame
tfidf_matrix = vectorizer.fit_transform(df['text'])

# Get feature names
feature_names = vectorizer.get_feature_names_out()

# Create a dictionary to store each word and its TF-IDF score
word_tfidf_dict = {}

# Loop through each document (each row in tfidf_matrix)
for i in range(tfidf_matrix.shape[0]):
    # Get the TF-IDF vector for the i-th document in sparse format
    feature_index = tfidf_matrix[i,:].nonzero()[1]
    
    # Get the corresponding TF-IDF score
    tfidf_scores = zip(feature_index, [tfidf_matrix[i, x] for x in feature_index])
    
    # Update the word_tfidf_dict
    for w, s in [(feature_names[i], s) for (i, s) in tfidf_scores]:
        word_tfidf_dict[w] = s  # Update the dictionary

print(word_tfidf_dict)



In [23]:
#pick a random value based on the len of texts df
i = np.random.choice(len(texts))
#print the value
print(i)

279


In [31]:
#locate the random value created in the last step and take a look at how a specific row in the column looks like
doc = texts.iloc[1].tolist()
#print it
print(type(doc))
print(doc)
flat_list = [item for sublist in doc for item in sublist]
print(flat_list)

<class 'list'>
[['Dollar', 'gains', 'on', 'Greenspan', 'speech', 'The', 'dollar', 'has', 'hit', 'its', 'highest', 'level', 'against', 'the', 'euro', 'in', 'almost', 'three', 'months', 'after', 'the', 'Federal', 'Reserve', 'head', 'said', 'the', 'US', 'trade', 'deficit', 'is', 'set', 'to', 'stabilise', 'And', 'Alan', 'Greenspan', 'highlighted', 'the', 'US', 'government', 's', 'willingness', 'to', 'curb', 'spending', 'and', 'rising', 'household', 'savings', 'as', 'factors', 'which', 'may', 'help', 'to', 'reduce', 'it', 'In', 'late', 'trading', 'in', 'New', 'York', 'the', 'dollar', 'reached', '1', '2871', 'against', 'the', 'euro', 'from', '1', '2974', 'on', 'Thursday', 'Market', 'concerns', 'about', 'the', 'deficit', 'has', 'hit', 'the', 'greenback', 'in', 'recent', 'months', 'On', 'Friday', 'Federal', 'Reserve', 'chairman', 'Mr', 'Greenspan', 's', 'speech', 'in', 'London', 'ahead', 'of', 'the', 'meeting', 'of', 'G7', 'finance', 'ministers', 'sent', 'the', 'dollar', 'higher', 'after', 'it

In [32]:
#for each value in doc list assign it's corresponding tfidf score to create a dictionary
def doc_freq_assigner(flat_list, word_tfidf_dict):

    doc_assigner = {}
    
    for word in flat_list:
         # If the word exists in the dict, assign its value
        if word in word_tfidf_dict:
            doc_assigner[word] = word_tfidf_dict[word]
        else:
            doc_assigner[word] = 0  # If the word does not exist in tfidf_dict, you can assign it a score of 0 or any other value you choose
    
    return doc_assigner

#for each key in dictionary assign a <mask> if the score is x% below average TFIDF
#merge the newly created dictionary into a list and remove the values to leave only the keys
#merge the list into a doc

In [33]:
freq_doc = doc_freq_assigner(flat_list, word_tfidf_dict)
print(freq_doc)
print(type(freq_doc))

{'Dollar': 0, 'gains': 0.04782194724481373, 'on': 0.04898320821299338, 'Greenspan': 0, 'speech': 0.05149605542936226, 'The': 0, 'dollar': 0.0369904896180221, 'has': 0.016018068400987077, 'hit': 0.0066549700902375315, 'its': 0.018049397060893684, 'highest': 0.09072058992086972, 'level': 0.007987585429768327, 'against': 0.005170284480196565, 'the': 0.250026233163237, 'euro': 0.0364284027155771, 'in': 0.11943747226690583, 'almost': 0.015026698411026744, 'three': 0.005147960523344205, 'months': 0.006293257561207275, 'after': 0.015977268439964762, 'Federal': 0, 'Reserve': 0, 'head': 0.017727630541088075, 'said': 0.005197682243710186, 'US': 0, 'trade': 0.03652258641413257, 'deficit': 0.03064430949986817, 'is': 0.12016478065486096, 'set': 0.015164209848520441, 'to': 0.20854583411681518, 'stabilise': 0.04660757610047918, 'And': 0, 'Alan': 0, 'highlighted': 0.03699507125444677, 'government': 0.005699489602089406, 's': 0, 'willingness': 0.039211780718705604, 'curb': 0.04287563729070538, 'spendin

In [41]:
#Calculate the average of all scores
def total_score_calc(word_tfidf_dict):
    sum_of_scores = sum(word_tfidf_dict.values())
    num_words = len(word_tfidf_dict)
    if num_words == 0:
        return 0
    return sum_of_scores / num_words

In [44]:
print('The average tfidf score is: ' + str(total_score_calc(word_tfidf_dict)))

The average tfidf score is: 0.07514476694250771


In [None]:
#if value in freq dock is higher than the tfidf score, replace the key as <mask>.