In [29]:
import numpy as np
import pandas as pd
import textwrap
from pprint import pprint
from sklearn.feature_extraction.text import TfidfVectorizer

from transformers import pipeline

In [2]:
df = pd.read_csv("bbc_text_cls.csv")
df.head()

Unnamed: 0,text,labels
0,Ad sales boost Time Warner profit\n\nQuarterly...,business
1,Dollar gains on Greenspan speech\n\nThe dollar...,business
2,Yukos unit buyer faces loan claim\n\nThe owner...,business
3,High fuel prices hit BA's profits\n\nBritish A...,business
4,Pernod takeover talk lifts Domecq\n\nShares in...,business


In [3]:
df['labels'].unique()

array(['business', 'entertainment', 'politics', 'sport', 'tech'],
      dtype=object)

In [4]:
label = 'business'
texts = df[df['labels'] == label]['text']
texts.head()

0    Ad sales boost Time Warner profit\n\nQuarterly...
1    Dollar gains on Greenspan speech\n\nThe dollar...
2    Yukos unit buyer faces loan claim\n\nThe owner...
3    High fuel prices hit BA's profits\n\nBritish A...
4    Pernod takeover talk lifts Domecq\n\nShares in...
Name: text, dtype: object

In [25]:
i = np.random.choice(len(texts))
print(i)

117


In [26]:
doc = texts.iloc[i]
pprint(doc)

('Ericsson sees earnings improve\n'
 '\n'
 'Telecoms equipment supplier Ericsson has posted a rise in fourth quarter '
 'profits thanks to clients like Deutsche Telekom upgrade their networks.\n'
 '\n'
 'Operating profit in the three months to 31 December was 9.5bn kronor (£722m; '
 '$1.3bn) against 6.3bn kronor last year. Shares tumbled, however, as the '
 'company reported a profit margin of 45.6%, less than the 47.3% forecast by '
 'analysts and down from 47.1% in the third quarter. Ericsson shares dropped '
 '5.9% to 20.7 kronor in early trading on Thursday. However, the company '
 'remained optimistic about its earnings outlook after sales in the fourth '
 'quarter rose 9% to 39.4bn kronor. "Long-term growth drivers of the industry '
 'remain solid," Ericsson said in a statement.\n'
 '\n'
 'Chief executive Carl-Henric Svanberg explained that about "27% of the '
 'world\'s population now has access to mobile communications". "This is '
 'exciting for a company with a vision of an a

In [69]:
#split doc function

def split_doc(doc):
    splitter = doc.split(" ")
    
    cleaned_splitter = [word.replace('.','').replace('\n\n',' ') for word in splitter]
    
    return cleaned_splitter

In [70]:
word_list = split_doc(doc)
print(word_list)

['Ericsson', 'sees', 'earnings', 'improve Telecoms', 'equipment', 'supplier', 'Ericsson', 'has', 'posted', 'a', 'rise', 'in', 'fourth', 'quarter', 'profits', 'thanks', 'to', 'clients', 'like', 'Deutsche', 'Telekom', 'upgrade', 'their', 'networks Operating', 'profit', 'in', 'the', 'three', 'months', 'to', '31', 'December', 'was', '95bn', 'kronor', '(£722m;', '$13bn)', 'against', '63bn', 'kronor', 'last', 'year', 'Shares', 'tumbled,', 'however,', 'as', 'the', 'company', 'reported', 'a', 'profit', 'margin', 'of', '456%,', 'less', 'than', 'the', '473%', 'forecast', 'by', 'analysts', 'and', 'down', 'from', '471%', 'in', 'the', 'third', 'quarter', 'Ericsson', 'shares', 'dropped', '59%', 'to', '207', 'kronor', 'in', 'early', 'trading', 'on', 'Thursday', 'However,', 'the', 'company', 'remained', 'optimistic', 'about', 'its', 'earnings', 'outlook', 'after', 'sales', 'in', 'the', 'fourth', 'quarter', 'rose', '9%', 'to', '394bn', 'kronor', '"Long-term', 'growth', 'drivers', 'of', 'the', 'industry

In [39]:
# Initialize the TF-IDF Vectorizer
vectorizer = TfidfVectorizer()

# Fit and transform the text data from the DataFrame
tfidf_matrix = vectorizer.fit_transform(df['text'])

# Get feature names
feature_names = vectorizer.get_feature_names_out()

# Create a dictionary to store each word and its TF-IDF score
word_tfidf_dict = {}

# Loop through each document (each row in tfidf_matrix)
for i in range(tfidf_matrix.shape[0]):
    # Get the TF-IDF vector for the i-th document in sparse format
    feature_index = tfidf_matrix[i,:].nonzero()[1]
    
    # Get the corresponding TF-IDF score
    tfidf_scores = zip(feature_index, [tfidf_matrix[i, x] for x in feature_index])
    
    # Update the word_tfidf_dict
    for w, s in [(feature_names[i], s) for (i, s) in tfidf_scores]:
        word_tfidf_dict[w] = s

print(word_tfidf_dict)



In [43]:
def freq_assigner(doc):

    assigner = {}
    for word in word_list:
        frequency = word_tfidf_dict.get(word,0)

        assigner[word] = frequency

    print(assigner)


In [44]:
dict = freq_assigner(doc)

{'Ericsson': 0, 'sees': 0.023898385730437736, 'earnings': 0.043500932738018826, 'improve\n\nTelecoms': 0, 'equipment': 0.03177440260470953, 'supplier': 0.21651184985336672, 'has': 0.016018068400987077, 'posted': 0.05656924240924514, 'a': 0, 'rise': 0.025006581071069237, 'in': 0.11943747226690583, 'fourth': 0.029762066606211345, 'quarter': 0.033521088782140065, 'profits': 0.056173035440752216, 'thanks': 0.03868227795947055, 'to': 0.20854583411681518, 'clients': 0.024957328445679182, 'like': 0.04322065927676663, 'Deutsche': 0, 'Telekom': 0, 'upgrade': 0.03338936205912448, 'their': 0.052525730392061265, 'networks.\n\nOperating': 0, 'profit': 0.07270256542228919, 'the': 0.250026233163237, 'three': 0.005147960523344205, 'months': 0.006293257561207275, '31': 0.03905537749225283, 'December': 0, 'was': 0.05810523369999857, '9.5bn': 0, 'kronor': 0.6807221012622213, '(£722m;': 0, '$1.3bn)': 0, 'against': 0.005170284480196565, '6.3bn': 0, 'last': 0.004214605522893685, 'year.': 0, 'Shares': 0, 'tu