In [13]:
## Note: Online resources were consulted to understand TF-IDF generation

In [14]:
#Importing all the packages
import pandas as pd
import string
import nltk
import json
import numpy as np
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.porter import *
from collections import Counter

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [15]:
#Reading input files
# Training articles; later cells read the 'Text' and 'Category' columns.
news_corpus = pd.read_csv('news-train-1.csv')

# The dictionary file holds one allowed word per line.
with open('dictionary.txt', 'r') as file:
    raw_dictionary = file.read()
dictionary_words = raw_dictionary.splitlines()

# Keep both the DataFrame view and the plain-list view of the dictionary.
dict_df = pd.DataFrame({'dict_words': dictionary_words})
words_dictionary_list = dict_df['dict_words'].to_list()

# Data Preprocessing

In [16]:
# Helpers shared by every call to get_processed_tokens; built once per cell run.
ps = PorterStemmer()
remove_punctuation = dict((ord(char), None) for char in string.punctuation)
# Load the stopword list a single time and keep it as a set. Calling
# stopwords.words('english') inside the per-token filter would re-evaluate it
# for every token and scan the result linearly on each membership test.
_ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))


def get_processed_tokens(text):
    """Convert one raw document into a list of stemmed, stopword-free tokens.

    Processing order: lowercase the text, delete every character found in
    ``string.punctuation``, tokenize with ``nltk.word_tokenize``, drop English
    stopwords, then Porter-stem each remaining token.

    Args:
        text: The document text as a ``str``.

    Returns:
        list: Stemmed tokens in document order; repeated tokens are kept.
    """
    no_punctuation = text.lower().translate(remove_punctuation)
    tokens = nltk.word_tokenize(no_punctuation)
    # Stopwords are removed before stemming, so the comparison is made on the
    # unstemmed (but already punctuation-free) token.
    return [ps.stem(token) for token in tokens if token not in _ENGLISH_STOPWORDS]


news_corpus['cleaned_text'] = news_corpus['Text'].apply(get_processed_tokens)

In [17]:
# Set view of the dictionary for constant-time membership tests. Testing each
# token against the plain list costs a linear scan per token.
# Re-run this cell if words_dictionary_list changes, so the set stays in sync.
_DICTIONARY_WORD_SET = frozenset(words_dictionary_list)


def filter_words_from_dictionary(words_list):
    """Keep only the tokens that appear in the dictionary word list.

    Args:
        words_list: Iterable of token strings for one document.

    Returns:
        list: The tokens of ``words_list`` that are present in
        ``words_dictionary_list``, in their original order with duplicates kept.
    """
    return [word for word in words_list if word in _DICTIONARY_WORD_SET]


news_corpus['cleaned_filtered_text'] = news_corpus['cleaned_text'].apply(filter_words_from_dictionary)
final_corpus = news_corpus['cleaned_filtered_text'].to_list()

#TF-IDF

In [18]:
# Container for the per-document term-frequency rows; it is populated by the
# TF step that follows.
term_freq_matrix = []

# Vocabulary: every distinct token across all documents, in sorted order so the
# column layout of the matrices is deterministic.
lexicons = sorted({token for document_tokens in final_corpus for token in document_tokens})

# Column position of each vocabulary word.
word_to_index = dict(zip(lexicons, range(len(lexicons))))

In [19]:
##Creating the Tf matrix while maintaining document order

# Start from an empty matrix every time this cell runs. Without the reset,
# re-running the cell would append a second copy of every row to the list
# and misalign the rows with the documents.
term_freq_matrix = []

for word_list in final_corpus:
    # Raw counts for this document, one slot per vocabulary word.
    vect = [0] * len(lexicons)
    for word in word_list:
        column = word_to_index.get(word)
        if column is not None:
            vect[column] += 1

    # default=0 keeps an empty vocabulary from raising ValueError in max().
    max_tf = max(vect, default=0)
    if max_tf > 0:
        # Normalize by the count of the most frequent term in the document.
        tf_vector_normalized = [tf / max_tf for tf in vect]
    else:
        # No vocabulary word occurs in this document: the row stays all zeros.
        tf_vector_normalized = [0] * len(vect)
    term_freq_matrix.append(tf_vector_normalized)

In [20]:
#creating IDF vector
num_documents = len(final_corpus)

# Document frequency: in how many documents each vocabulary word appears.
df = [0] * len(lexicons)
for document_tokens in final_corpus:
    # A word counts once per document no matter how often it repeats.
    for token in set(document_tokens):
        column = word_to_index.get(token)
        if column is not None:
            df[column] += 1

# idf = ln(N / df); words that never occur get 0 instead of dividing by zero.
idf_vector = [np.log(num_documents / count) if count != 0 else 0 for count in df]

In [21]:
# Calculate TF-IDF matrix
# Scale each document's TF row element-wise by the shared IDF vector.
tfidf_matrix = [
    [tf * idf for tf, idf in zip(tf_row, idf_vector)]
    for tf_row in term_freq_matrix
]

# DataFrame view of the same matrix, with one column per vocabulary word.
columns = lexicons
tfidf_df = pd.DataFrame(tfidf_matrix, columns=columns)

# Persist the matrix as plain text: one document per line, comma-separated
# values, no header row.
with open('matrix.txt', 'w', encoding='utf-8') as f:
    f.writelines(','.join(str(value) for value in row) + '\n' for row in tfidf_matrix)

print("Output saved to matrix.txt")

Output saved to matrix.txt


# Top 3 Word Occurrences

In [22]:
# Group the data by category

# Order in which the categories appear in the JSON output.
category_order = ['sport', 'business', 'politics', 'entertainment', 'tech']

# Summing a column of lists concatenates them, giving one long token list
# per category.
category_data = news_corpus.groupby('Category')['cleaned_filtered_text'].sum()

# Start every expected category with an empty counter so it appears in the
# output even when the data has no rows for it, then fill in the real counts.
word_frequencies = {category: Counter() for category in category_order}
word_frequencies.update(
    {category: Counter(tokens) for category, tokens in category_data.items()}
)

# Keep only the three most frequent words of each category as {word: count}.
top_words_per_category = {
    category: dict(counts.most_common(3))
    for category, counts in word_frequencies.items()
}

# Write the result to a JSON file.
with open('frequency.json', 'w') as json_file:
    json.dump(top_words_per_category, json_file, indent=4)

print("Output saved to frequency.json")


Output saved to frequency.json


# Top 3 words by highest average TF-IDF score

In [23]:
# Attach each document's category so the TF-IDF rows can be grouped by it.
tfidf_df['Category'] = news_corpus['Category']

# sort=False keeps the groups in order of first appearance in the data.
grouped = tfidf_df.groupby('Category', sort=False)

# Pre-seed every expected category so it appears in the output even if the
# data contains no rows for it.
average_scores_per_category = {category: Counter() for category in category_order}

for category, group in grouped:
    # Mean TF-IDF of every word over the documents of this category,
    # ranked from highest to lowest.
    word_scores = group.drop(columns=['Category'])
    avg_scores = word_scores.mean().sort_values(ascending=False)
    top_words = avg_scores.head(3)
    average_scores_per_category[category] = dict(top_words.items())

# Write the result to a JSON file.
with open('scores.json', 'w') as json_file:
    json.dump(average_scores_per_category, json_file, indent=4)

print("Output saved to scores.json")

Output saved to scores.json
