In [2]:
from collections import Counter #count word occurrences 
import pandas as pd

In [6]:
file_path = "Processed_Reviews (3).csv" 
df = pd.read_csv(file_path)

In [8]:
tokenized_reviews = df['tokenized'].dropna().apply(eval) 


In [10]:
all_words = [word for review in tokenized_reviews for word in review] 
unique_words = list(set(all_words))

In [16]:
word_freq = Counter(all_words) 
sorted_word_freq = dict(sorted(word_freq.items(), reverse=True))

In [20]:
document_vectors = [] 
for review in tokenized_reviews: 
    document_vector = [1 if word in review else 0 for word in sorted_word_freq.keys()] 
    document_vectors.append(document_vector)

In [22]:
doc_vectors_df = pd.DataFrame(document_vectors, columns=sorted_word_freq.keys()) 
doc_vectors_df.to_csv("document_vectors.csv", index=False)

In [28]:
word_freq_df = pd.DataFrame(list(sorted_word_freq.items()), columns=["Word","Frequency"]) 

print("Word Frequency Table:") 
print(word_freq_df)

Word Frequency Table:
           Word  Frequency
0           wow          1
1         worth          1
2          work          2
3           use          1
4       totally          1
5          time          1
6         short          1
7          shoe          1
8       quality          3
9      purchase          1
10      product          7
11        phone          1
12    perfectly          2
13    packaging          1
14           oh          1
15          not          1
16       nicely          1
17         much          1
18         love          2
19         life          2
20       laptop          1
21          jog          1
22           hz          1
23       honest          1
24        happy          1
25        great          2
26         good          1
27          god          1
28         full          1
29          fit          1
30         fine          1
31         fast          1
32    expensive          1
33       expect          2
34    excellent          1
35    

In [34]:
import math

file_path = "Processed_Reviews (3).csv" 
df = pd.read_csv(file_path) 

In [36]:
tokenized_reviews = df['tokenized'].dropna().apply(eval) 

def compute_tf(document): 
    word_count = Counter(document) 
    tf = {word: count / len(document) for word, count in word_count.items()} 
    return tf

In [38]:
def compute_idf(documents): 
    N = len(documents)  # Total number of documents 
    idf = {} 
    all_words = set(word for doc in documents for word in doc)  # Unique words 
    for word in all_words: 
        count = sum(1 for doc in documents if word in doc) 
        idf[word] = math.log(N / count) 
    return idf

In [40]:
def compute_tfidf(document, idf): 
    tfidf = {} 
    tf = compute_tf(document)  # Get TF values for the document 
    for word, tf_value in tf.items(): 
        tfidf[word] = tf_value * idf[word]  # Multiply TF and IDF 
    return tfidf

In [42]:
documents = tokenized_reviews.tolist() 

tf_data = [compute_tf(doc) for doc in documents] 
tf_df = pd.DataFrame(tf_data).fillna(0) 
tf_df.to_csv("tf_scores.csv", index=False)

In [44]:
idf = compute_idf(documents) 
idf_df = pd.DataFrame([idf]).fillna(0) 
idf_df.to_csv("idf_scores.csv", index=False) 

tfidf_data = [compute_tfidf(doc, idf) for doc in documents] 
tfidf_df = pd.DataFrame(tfidf_data).fillna(0) 
tfidf_df.to_csv("tfidf_scores.csv", index=False) 