In [None]:
import re

In [None]:
import pandas as pd
input_file = "posts_first_targil.xlsx"

# sheet_name=None makes read_excel return a dict: {sheet name -> DataFrame}
df = pd.read_excel(input_file, sheet_name=None)

# List the column names of every sheet so differences between sheets are visible
for sheet_name, data in df.items():
    column_names = list(data.columns)
    print(f"Sheet name: {sheet_name} Headlines:, {column_names}")

Sheet name: A-J Headlines:, ['sub_title', 'date', 'Newspaper', 'Body Text', 'title']
Sheet name: BBC Headlines:, ['date', 'Newspaper', 'Body Text', 'title']
Sheet name: J-P Headlines:, ['date', 'Newspaper', 'Body', 'title']
Sheet name: NY-T Headlines:, ['date', 'Newspaper', 'Body Text', 'title']


In [None]:
# The J-P sheet calls its article column "Body"; rename it so every sheet
# exposes the article text under "Body Text".
jp_sheet = df.get("J-P")
if jp_sheet is not None:
    jp_sheet.rename(columns={"Body": "Body Text"}, inplace=True)

# Print the columns again to confirm the rename
for sheet_name, data in df.items():
    column_names = list(data.columns)
    print(f"Sheet name: {sheet_name} Headlines:, {column_names}")

Sheet name: A-J Headlines:, ['sub_title', 'date', 'Newspaper', 'Body Text', 'title']
Sheet name: BBC Headlines:, ['date', 'Newspaper', 'Body Text', 'title']
Sheet name: J-P Headlines:, ['date', 'Newspaper', 'Body Text', 'title']
Sheet name: NY-T Headlines:, ['date', 'Newspaper', 'Body Text', 'title']


**Function to clean the data text**

In [None]:
def clean_text(text):
    """Put spaces around punctuation and collapse runs of whitespace.

    Args:
        text: value of a worksheet cell. Anything that is not a ``str``
            (NaN, numbers, dates) is treated as missing.

    Returns:
        The cleaned string, or ``""`` for non-string input.
    """
    if not isinstance(text, str):
        return ""

    # A non-word, non-space character with no word character on at least one
    # side (so the apostrophe in "don't" and the dot in "3.14" are kept).
    edge_punctuation = r"((?<!\w)[^\s\w]|[^\s\w](?!\w))"
    # Two alphabetic runs of 2+ letters joined by a dot, e.g. "end.Start"
    dot_between_words = r"(?<!\w)([a-zA-Z]{2,})\.([a-zA-Z]{2,})(?!\w)"

    spaced = re.sub(edge_punctuation, r" \1 ", text)
    spaced = re.sub(dot_between_words, r"\1 . \2", spaced)

    # The substitutions above leave doubled / trailing blanks behind
    squeezed = re.sub(r"\s+", " ", spaced)
    return squeezed.strip()


**Part 2: Functions for processing the data by lemmatizing the text**

In [None]:
# Load spaCy's language model
import spacy
nlp = spacy.load("en_core_web_sm")

In [None]:
def lemmatize_text(text):
    """Replace every token of `text` with its lemma.

    Args:
        text: value of a worksheet cell; non-string values are treated as missing.

    Returns:
        The lemmas produced by the module-level `nlp` pipeline joined with
        single spaces, or ``""`` for non-string input.
    """
    if isinstance(text, str):
        return " ".join(token.lemma_ for token in nlp(text))
    return ""

**Creating the processed files:**

*A.* clean_file.xlsx

*B.* lemma_file.xlsx

**Using the corresponding function:**

*A.* clean_text()

*B.* lemmatize_text()

In [None]:
import os


def _apply_to_text_columns(frame, transform):
    """Return a copy of `frame` with `transform` applied to each cell of its text columns.

    Numeric, boolean and date/time columns are copied unchanged: both
    clean_text and lemmatize_text return "" for non-string values, so mapping
    them over such columns would erase their content.
    """
    processed = frame.copy()
    non_text_dtypes = ["number", "bool", "datetime", "datetimetz", "timedelta"]
    for column in processed.select_dtypes(exclude=non_text_dtypes).columns:
        processed[column] = processed[column].map(transform)
    return processed


def _save_workbook(sheets, path):
    """Write every DataFrame in `sheets` to its own worksheet of the Excel file at `path`."""
    # The target folder is not guaranteed to exist on a fresh checkout
    os.makedirs(os.path.dirname(path) or ".", exist_ok=True)
    with pd.ExcelWriter(path) as writer:
        for sheet_name, processed_df in sheets.items():
            processed_df.to_excel(writer, sheet_name=sheet_name, index=False)


clean_sheets = {}
lemma_sheets = {}
for sheet_name, data in df.items():
    # Both variants are derived from the raw sheet
    clean_sheets[sheet_name] = _apply_to_text_columns(data, clean_text)
    lemma_sheets[sheet_name] = _apply_to_text_columns(data, lemmatize_text)

# Save each processed variant to its own Excel file (one worksheet per source sheet)
output_clean_file = "output_files/clean_file.xlsx"
_save_workbook(clean_sheets, output_clean_file)

output_lemma_file = "output_files/lemma_file.xlsx"
_save_workbook(lemma_sheets, output_lemma_file)

print(f"Processed clean Excel file saved as: {output_clean_file}")
print(f"Processed lemma Excel file saved as: {output_lemma_file}")


Processed clean Excel file saved as: output_files/clean_file.xlsx
Processed lemma Excel file saved as: output_files/lemma_file.xlsx


***Part 3: using TF-IDF BM25/Okapi (words_file or lemma_files)***

In [None]:
!pip install rank_bm25
import os
import pandas as pd
import numpy as np
from rank_bm25 import BM25Okapi
from nltk.corpus import stopwords
import nltk



In [None]:
# Download NLTK stopwords
nltk.download('stopwords')

# English stop words, held in a set for fast membership tests
english_stopwords = stopwords.words('english')
stop_words = set(english_stopwords)

# Define the function to filter out stopwords from text
def filter_stopwords(text):
    """Split `text` on whitespace, lower-case the tokens and drop stop words.

    Args:
        text: string to tokenize.

    Returns:
        List of lower-cased tokens that are not in the module-level `stop_words`.
    """
    lowered_tokens = (token.lower() for token in text.split())
    return [token for token in lowered_tokens if token not in stop_words]

# File paths
input_file = "output_files/clean_file.xlsx"
words_file = "output_files/words_file.xlsx"  # File containing the words to use in BM25
output_folder = "bm25/clean"
os.makedirs(output_folder, exist_ok=True)

# Load the specific words from the file (first column, no header row).
# Blank cells are dropped before lower-casing so no NaN ends up in the vocabulary.
words_df = pd.read_excel(words_file, header=None)
words_to_include = set(words_df[0].dropna().astype(str).str.lower())
# Sorted copy: gives the output columns a stable order (set iteration order is arbitrary)
ordered_words = sorted(words_to_include)

# Read the input Excel file
clean_df = pd.read_excel(input_file, sheet_name=None)

# Process each sheet
for sheet_name, data in clean_df.items():
    print(f"Processing sheet: {sheet_name}")

    # Only the A-J sheet contributes a sub_title
    if sheet_name == "A-J":
        text_columns = ["title", "sub_title", "Body Text"]
    else:
        text_columns = ["title", "Body Text"]

    # Build the corpus: join the non-empty text fields (a missing cell would
    # otherwise be formatted as the literal word "nan"), drop stop words, then
    # keep only the words of the vocabulary.
    filtered_documents = []
    for _, record in data.iterrows():
        text = " ".join(str(record[col]) for col in text_columns if pd.notna(record[col]))
        filtered_documents.append(
            [word for word in filter_stopwords(text) if word in words_to_include]
        )

    if not filtered_documents:
        # BM25Okapi averages document length over the corpus size, which is zero here
        print(f"Sheet '{sheet_name}' has no rows; skipping")
        continue

    # Create BM25 model
    bm25_model = BM25Okapi(filtered_documents)

    # get_scores returns one score per document, so a single call per word
    # fills a whole column (instead of one call per document AND word).
    document_count = len(filtered_documents)
    bm25_df = pd.DataFrame(
        {word: bm25_model.get_scores([word]) for word in ordered_words},
        index=range(document_count),
    ).fillna(0)  # Fill NaN with 0 for words with no score in a document
    bm25_df.insert(0, "DocumentIndex", range(document_count))  # Document indices as the first column

    # Save the BM25 scores to an Excel file
    output_file = os.path.join(output_folder, f"bm25_{sheet_name}.xlsx")
    bm25_df.to_excel(output_file, index=False)
    print(f"BM25 matrix for sheet '{sheet_name}' saved to {output_file}")

PART 4 - Word2Vec (glove or w2v) without IDF

In [None]:
import pandas as pd
import numpy as np
import string
import re
import gensim.downloader as api
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.data import find
import csv
import nltk

nltk.download('punkt')
nltk.download('stopwords')

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.data import find
nltk.download('punkt')
nltk.download('stopwords')
# File paths. The lemmatised workbook is read from "output_files/", the folder
# the lemmatisation cell writes it to, and the result gets a .csv extension
# because it is written with csv.writer further down.
input_file = "output_files/lemma_file.xlsx"
output_file = "output_files/glove_lemma_withoutIdf_withStopWords.csv"

# sheet_name=None loads every worksheet into a dict keyed by sheet name
df = pd.read_excel(input_file, sheet_name=None)

if "J-P" in df:
    df["J-P"].rename(columns={"Body": "Body Text"}, inplace=True)

# Load GloVe vectors via gensim downloader
try:
    glove_model = api.load("glove-wiki-gigaword-300")  # 300-dimensional GloVe vectors
except Exception as e:
    print(f"Error loading model: {e}")
    # Without the model nothing below can run; stop here rather than fail
    # later with a NameError on glove_model.
    raise

# Preprocessing function
def preprocess_text(text):
    """Normalise `text` and split it into tokens (stop words are kept).

    Steps: lower-case, delete digit runs, delete ASCII punctuation, tokenize
    with `word_tokenize`.

    Args:
        text: string to process.

    Returns:
        List of tokens.
    """
    without_digits = re.sub(r"\d+", "", text.lower())
    punctuation_remover = str.maketrans("", "", string.punctuation)
    return word_tokenize(without_digits.translate(punctuation_remover))


# Process each sheet
results = []

for sheet_name, data in df.items():
    # Only the A-J sheet contributes a sub_title
    if sheet_name == 'A-J':
        text_columns = ['title', 'sub_title', 'Body Text']
    else:
        text_columns = ['title', 'Body Text']

    for index, row in data.iterrows():
        # Join the fields that are present; missing cells are left out
        present_parts = [str(row[col]) for col in text_columns if pd.notna(row[col])]
        combined_text = " ".join(present_parts)

        tokens = preprocess_text(combined_text)
        # Keep the vector of every token the GloVe vocabulary knows
        vectors = [glove_model[word] for word in tokens if word in glove_model]

        # A document without any known word produces no output row
        if not vectors:
            continue
        avg_vector = np.mean(vectors, axis=0)
        results.append([sheet_name, index] + avg_vector.tolist())

# Save results to a CSV file: Sheet, RowIndex, then one column per dimension
header = ["Sheet", "RowIndex"]
header.extend(f"Dim{i}" for i in range(glove_model.vector_size))
with open(output_file, mode="w", newline="", encoding="utf-8") as file:
    writer = csv.writer(file)
    writer.writerows([header, *results])

print(f"Word vectors saved to {output_file}")


Word2Vec with IDF

In [None]:
import pandas as pd
import string
import re
import gensim.downloader as api
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import defaultdict
import csv
import nltk

nltk.download('punkt')
nltk.download('stopwords')

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
nltk.download('punkt')
nltk.download('stopwords')
# The lemmatised workbook is read from "output_files/", the folder the
# lemmatisation cell writes it to; the vectors are stored next to it.
input_file = "output_files/lemma_file.xlsx"
output_file = "output_files/glove_lemma_withIDF_withoutStopWords.csv"

# sheet_name=None loads every worksheet into a dict keyed by sheet name
df = pd.read_excel(input_file, sheet_name=None)

if "J-P" in df:
    df["J-P"].rename(columns={"Body": "Body Text"}, inplace=True)

try:
    glove_model = api.load("glove-wiki-gigaword-300")  # 300-dimensional GloVe vectors
except Exception as e:
    print(f"Error loading model: {e}")
    # Without the model nothing below can run; stop here rather than fail
    # later with a NameError on glove_model.
    raise

# Preprocessing function
def preprocess_text(text):
    """Normalise `text`, tokenize it and drop English stop words.

    Steps: lower-case, delete digit runs, delete ASCII punctuation, tokenize
    with `word_tokenize`, remove the NLTK English stop words.

    Args:
        text: string to process.

    Returns:
        List of remaining tokens.
    """
    lowered = text.lower()
    digit_free = re.sub(r"\d+", "", lowered)
    bare = digit_free.translate(str.maketrans("", "", string.punctuation))
    tokens = word_tokenize(bare)
    stop_words = set(stopwords.words("english"))
    kept = []
    for word in tokens:
        if word not in stop_words:
            kept.append(word)
    return kept

def calculate_idf(corpus):
    """Fit a TF-IDF vectorizer on `corpus` and return its IDF weights.

    Args:
        corpus: iterable of document strings.

    Returns:
        defaultdict mapping each vocabulary word of the fitted vectorizer
        (built with stop_words="english") to its IDF value. Subscripting a
        word that is not in the vocabulary yields 0.
    """
    fitted = TfidfVectorizer(use_idf=True, stop_words="english").fit(corpus)
    idf_dict = defaultdict(lambda: 0)
    idf_dict.update(zip(fitted.get_feature_names_out(), fitted.idf_))
    return idf_dict

# Collect every document once as (sheet name, row index, combined text)
documents = []
for sheet_name, data in df.items():
    # Only the A-J sheet contributes a sub_title
    if sheet_name == 'A-J':
        text_columns = ['title', 'sub_title', 'Body Text']
    else:
        text_columns = ['title', 'Body Text']
    for index, row in data.iterrows():
        combined_text = " ".join(str(row[col]) for col in text_columns if pd.notna(row[col]))
        documents.append((sheet_name, index, combined_text))

# IDF weights are fitted on the raw combined texts of all sheets together
corpus = [document_text for _, _, document_text in documents]
idf_dict = calculate_idf(corpus)


results = []
for sheet_name, index, combined_text in documents:
    tokens = preprocess_text(combined_text)

    # Scale the GloVe vector of every known token by the token's IDF.
    # NOTE(review): tokens missing from the TF-IDF vocabulary get weight 0 but
    # still count in the mean below - confirm this is the intended weighting.
    vectors = [glove_model[word] * idf_dict[word] for word in tokens if word in glove_model]

    # A document without any known word produces no output row
    if vectors:
        avg_vector = np.mean(vectors, axis=0)
        results.append([sheet_name, index] + avg_vector.tolist())

# Save results to a CSV file: Sheet, RowIndex, then one column per dimension
header = ["Sheet", "RowIndex"]
header.extend(f"Dim{i}" for i in range(glove_model.vector_size))
with open(output_file, mode="w", newline="", encoding="utf-8") as file:
    writer = csv.writer(file)
    writer.writerows([header, *results])

print(f"Word vectors saved to {output_file}")

Part 5: doc2vec

In [None]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

In [None]:
source_file = "posts_first_targil.xlsx"

# Load source Excel file; sheet_name=None returns {sheet name -> DataFrame}
df = pd.read_excel(source_file, sheet_name=None)

# Align the J-P sheet's "Body" column with the "Body Text" name used below
jp_sheet = df.get("J-P")
if jp_sheet is not None:
    jp_sheet.rename(columns={"Body": "Body Text"}, inplace=True)

# Preprocessing function
def preprocess_text(text):
    """Lower-case `text`, strip digits and punctuation, and tokenize it.

    Args:
        text: string to process.

    Returns:
        List of tokens produced by `word_tokenize`.
    """
    normalised = text.lower()
    # First digit runs, then everything that is neither a word character nor whitespace
    for pattern in (r"\d+", r"[^\w\s]"):
        normalised = re.sub(pattern, "", normalised)
    return word_tokenize(normalised)

# Prepare TaggedDocuments
tagged_documents = []
# (sheet name, row index) of each tagged document, in the same order. Kept
# separately so the CSV does not have to parse them back out of the tag, which
# breaks as soon as a sheet name contains "_".
document_keys = []
for sheet_name, data in df.items():
    # Only the A-J sheet contributes a sub_title
    if sheet_name == "A-J":
        text_columns = ['title', 'sub_title', 'Body Text']
    else:
        text_columns = ['title', 'Body Text']

    for index, row in data.iterrows():
        combined_text = " ".join(str(row[col]) for col in text_columns if pd.notna(row[col]))
        tokens = preprocess_text(combined_text)
        tagged_documents.append(TaggedDocument(words=tokens, tags=[f"{sheet_name}_{index}"]))
        document_keys.append((sheet_name, index))

# Train Doc2Vec model
model = Doc2Vec(vector_size=300, min_count=2, epochs=40, workers=4)
model.build_vocab(tagged_documents)
model.train(tagged_documents, total_examples=model.corpus_count, epochs=model.epochs)

# Save document vectors to CSV: Sheet, RowIndex, then one column per dimension
output_file = "output_files/doc2vec_vectors.csv"
header = "Sheet,RowIndex," + ",".join([f"Dim{i}" for i in range(model.vector_size)])
with open(output_file, "w", encoding="utf-8") as file:
    file.write(header + "\n")
    for (sheet, row_index), doc in zip(document_keys, tagged_documents):
        # The trained vector of a document is looked up by its tag
        vector = model.dv[doc.tags[0]].tolist()
        file.write(f"{sheet},{row_index}," + ",".join(map(str, vector)) + "\n")

print(f"Document vectors with RowIndex saved to {output_file}")

In [None]:
import pandas as pd
output_file = "output_files/doc2vec_vectors.csv"  # Replace with your file path

try:
    # The whole CSV is loaded; only its first rows are shown below
    data = pd.read_csv(output_file)
except FileNotFoundError:
    print(f"File '{output_file}' not found. Please check the file path.")
else:
    print(data.head())

  Sheet  RowIndex      Dim0      Dim1      Dim2      Dim3      Dim4      Dim5  \
0   A-J         0 -0.120381  0.379345  0.134694 -0.177616 -0.337673  0.041946   
1   A-J         1 -0.067182  0.138951  0.090573  0.097399  0.135496 -0.669350   
2   A-J         2  0.025135  0.409332  0.405838  0.012679 -0.301956 -0.096406   
3   A-J         3  0.036234  0.189337  0.372502  0.030741  0.225192 -0.578521   
4   A-J         4 -0.228523  0.438873  0.277208  0.343170 -0.130110 -0.245843   

       Dim6      Dim7  ...    Dim290    Dim291    Dim292    Dim293    Dim294  \
0  0.782558  0.720419  ...  0.071085  0.264771  0.607232  0.047377  0.579811   
1  0.080852  0.947449  ... -0.025642  0.406675  0.322913 -0.005030  0.661448   
2  0.210589  0.551659  ...  0.046759  0.205108  0.021800  0.378580  0.208742   
3  0.198960  0.444680  ... -0.048156  0.490379  0.326162 -0.107866  0.724978   
4  0.018259  0.585959  ...  0.031302  0.473778  0.751093 -0.053052  0.747913   

     Dim295    Dim296    Dim297 

Part 6: Sentence-BERT


In [None]:
import nltk
from nltk.tokenize import sent_tokenize
import pandas as pd
from sentence_transformers import SentenceTransformer




In [None]:
# Download NLTK resources if needed
nltk.download("punkt")

# Input workbook and destination of the SBERT document vectors
source_file = "posts_first_targil.xlsx"
output_file = "output_files/sbert_vectors.csv"

# Load source documents; sheet_name=None returns {sheet name -> DataFrame}
df = pd.read_excel(source_file, sheet_name=None)

# Align the J-P sheet's "Body" column with the "Body Text" name used below
jp_sheet = df.get("J-P")
if jp_sheet is not None:
    jp_sheet.rename(columns={"Body": "Body Text"}, inplace=True)

# Load pre-trained SBERT model
model = SentenceTransformer("all-MiniLM-L6-v2")

# One entry per document: [sheet name, row index, vector components...]
results = []

# Function to process each document and generate a document vector
def get_document_vector(text):
    """Embed a document as the mean of its SBERT sentence embeddings.

    Args:
        text: document text.

    Returns:
        The element-wise mean of the vectors that `model.encode` returns for
        the sentences of `text`.
    """
    # Text in which no sentence is detected (e.g. an empty string) is encoded
    # as a single unit, so the mean below never runs over zero rows.
    sentences = sent_tokenize(text) or [text]
    # Generate sentence vectors using SBERT
    sentence_vectors = model.encode(sentences)
    # The mean already is the per-sentence average. Dividing by the sentence
    # count a second time would shrink the vectors of long documents.
    return sentence_vectors.mean(axis=0)

for sheet_name, data in df.items():
    # Only the A-J sheet contributes a sub_title
    if sheet_name == "A-J":
        text_columns = ['title', 'sub_title', 'Body Text']
    else:
        text_columns = ['title', 'Body Text']

    for index, row in data.iterrows():
        combined_text = " ".join(str(row[col]) for col in text_columns if pd.notna(row[col]))
        document_vector = get_document_vector(combined_text)
        results.append([sheet_name, index, *document_vector.tolist()])

# Save results to CSV; the first two entries of a row are not vector components
vector_size = len(results[0]) - 2
header = ["Sheet", "RowIndex", *(f"Dim{i}" for i in range(vector_size))]

with open(output_file, "w", encoding="utf-8") as file:
    csv_lines = [",".join(header)]
    csv_lines.extend(",".join(map(str, row)) for row in results)
    file.write("\n".join(csv_lines) + "\n")

print(f"SBERT vectors saved to {output_file}")


Part 7: BERT with IDF

In [None]:
import pandas as pd
import torch
from transformers import BertTokenizer, BertModel
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import defaultdict
import nltk
nltk.download("punkt")


In [None]:
source_file = "posts_first_targil.xlsx"
output_file = "output_files/new_bert_vectors.csv"

# sheet_name=None returns {sheet name -> DataFrame}
df = pd.read_excel(source_file, sheet_name=None)

# Align the J-P sheet's "Body" column with the "Body Text" name used below
jp_sheet = df.get("J-P")
if jp_sheet is not None:
    jp_sheet.rename(columns={"Body": "Body Text"}, inplace=True)

# Load pre-trained BERT model and tokenizer (same checkpoint for both)
model_name = "bert-base-uncased"
tokenizer, model = BertTokenizer.from_pretrained(model_name), BertModel.from_pretrained(model_name)

def calculate_idf(corpus):
    """Fit a TF-IDF vectorizer on `corpus` and return its IDF weights.

    Args:
        corpus: iterable of document strings.

    Returns:
        defaultdict mapping each vocabulary word of the fitted vectorizer
        (built with stop_words="english") to its IDF value. Subscripting an
        unknown word yields 0; `.get(word, default)` returns the given default.
    """
    vectorizer = TfidfVectorizer(use_idf=True, stop_words="english")
    vectorizer.fit(corpus)
    vocabulary = vectorizer.get_feature_names_out()
    return defaultdict(lambda: 0, zip(vocabulary, vectorizer.idf_))

corpus = []
for sheet_name, data in df.items():
    # Only the A-J sheet contributes a sub_title
    if sheet_name == 'A-J':
        text_columns = ['title', 'sub_title', 'Body Text']
    else:
        text_columns = ['title', 'Body Text']
    # One combined string per row, built from the fields that are present
    corpus.extend(
        " ".join(str(row[col]) for col in text_columns if pd.notna(row[col]))
        for _, row in data.iterrows()
    )

idf_dict = calculate_idf(corpus)


def get_bert_vectors(text_chunk):
    """Run one text chunk through BERT and return its per-token outputs.

    Args:
        text_chunk: text to encode; it is truncated / padded to 512 positions.

    Returns:
        Tuple ``(tokens, token_embeddings, attention_mask)``: the token
        strings, the last hidden state and the attention mask, the latter two
        with the batch dimension squeezed away.
    """
    encoded = tokenizer(
        text_chunk,
        return_tensors="pt",
        max_length=512,
        truncation=True,
        padding="max_length",
    )
    # Inference only: no gradients needed
    with torch.no_grad():
        last_hidden_state = model(**encoded).last_hidden_state

    # squeeze(0) drops the leading batch dimension
    input_ids = encoded["input_ids"].squeeze(0)
    return (
        tokenizer.convert_ids_to_tokens(input_ids),
        last_hidden_state.squeeze(0),
        encoded["attention_mask"].squeeze(0),
    )

# Function to process subwords into full word embeddings
def process_tokens(tokens, token_embeddings, attention_mask, idf_dict):
    """Merge sub-word embeddings into IDF-weighted whole-word embeddings.

    Args:
        tokens: token strings, aligned with `token_embeddings`.
        token_embeddings: one embedding per token.
        attention_mask: one flag per token; positions equal to 0 are skipped.
        idf_dict: mapping word -> IDF weight, queried with ``.get(word, 1.0)``.

    Returns:
        List with one tensor per word: the mean of the embeddings of its
        pieces multiplied by the word's IDF weight.
    """
    special_tokens = ("[CLS]", "[SEP]", "[PAD]")
    word_embeddings = []
    word = ""      # text of the word being assembled
    pieces = []    # embeddings of the pieces of that word

    def flush():
        # Emit the word collected so far, if there is one
        if word:
            merged = torch.mean(torch.stack(pieces), dim=0)
            word_embeddings.append(merged * idf_dict.get(word, 1.0))

    for token, embedding, mask in zip(tokens, token_embeddings, attention_mask):
        if mask == 0 or token in special_tokens:
            continue
        if not token.startswith("##"):
            # A token without the "##" prefix opens a new word
            flush()
            word, pieces = token, [embedding]
        else:
            # "##" marks a continuation of the current word
            word += token[2:]
            pieces.append(embedding)

    flush()  # the last word has no successor to trigger its emission
    return word_embeddings
# Function to process an entire document
def process_document(text, idf_dict):
    """Embed a whole document as the mean of its IDF-weighted word embeddings.

    The document is tokenized once, cut into chunks that fit BERT's 512
    position limit, and every chunk is encoded separately.

    Args:
        text: document text.
        idf_dict: mapping word -> IDF weight, forwarded to `process_tokens`.

    Returns:
        1-D tensor: the mean over the word embeddings of all chunks. A
        document that yields no word embedding (e.g. empty text) gives a zero
        vector of the model's hidden size.
    """
    # get_bert_vectors truncates at 512 positions, two of which are taken by
    # [CLS] and [SEP]; larger chunks would silently lose their last tokens.
    max_tokens = 512 - 2
    document_tokens = tokenizer.tokenize(text)
    all_word_embeddings = []

    for start in range(0, len(document_tokens), max_tokens):
        chunk_text = tokenizer.convert_tokens_to_string(document_tokens[start:start + max_tokens])
        # Use names distinct from the document token list: rebinding it here
        # would leave nothing to slice for the following chunks.
        chunk_tokens, embeddings, attention_mask = get_bert_vectors(chunk_text)
        all_word_embeddings.extend(process_tokens(chunk_tokens, embeddings, attention_mask, idf_dict))

    if not all_word_embeddings:
        # torch.stack raises on an empty list
        return torch.zeros(model.config.hidden_size)

    # Aggregate all word embeddings for the document by their mean
    return torch.mean(torch.stack(all_word_embeddings), dim=0)


results = []
for sheet_name, data in df.items():
    # Only the A-J sheet contributes a sub_title
    if sheet_name == "A-J":
        text_columns = ['title', 'sub_title', 'Body Text']
    else:
        text_columns = ['title', 'Body Text']

    for index, row in data.iterrows():
        combined_text = " ".join(str(row[col]) for col in text_columns if pd.notna(row[col]))

        # Generate the BERT vector for the document. The vector is no longer
        # printed: one line of several hundred floats per document floods the
        # notebook output.
        bert_vector = process_document(combined_text, idf_dict)
        results.append([sheet_name, index] + bert_vector.tolist())

# Save vectors to CSV. The dimension count is read from the collected rows
# (two leading entries are Sheet and RowIndex) so an empty workbook still
# yields a header-only file instead of a NameError.
vector_size = len(results[0]) - 2 if results else 0
header = ["Sheet", "RowIndex"] + [f"Dim{i}" for i in range(vector_size)]
with open(output_file, "w", encoding="utf-8") as file:
    file.write(",".join(header) + "\n")
    for row in results:
        file.write(",".join(map(str, row)) + "\n")

print(f"BERT vectors with RowIndex saved to {output_file}")

In [None]:
output_file = "output_files/new_bert_vectors.csv"

try:
    # The whole CSV is loaded; only its first rows are shown below
    data = pd.read_csv(output_file)
except FileNotFoundError:
    print(f"File '{output_file}' not found. Please check the file path.")
else:
    print(data.head())

  Sheet  RowIndex      Dim0      Dim1      Dim2      Dim3      Dim4      Dim5  \
0   A-J         0 -0.788050  0.056614  1.110174 -1.022818  0.147041 -0.474798   
1   A-J         1 -0.003478  0.293304 -1.017949 -0.549554  0.197699 -0.491028   
2   A-J         2 -0.168909  0.331912  0.046262 -0.966383 -0.039441 -1.237246   
3   A-J         3 -0.188200 -0.066150  0.061154 -0.881887  0.318995 -0.174752   
4   A-J         4  0.367581  0.150649 -0.167966 -0.829529  0.578362 -0.619802   

       Dim6      Dim7  ...    Dim758    Dim759    Dim760    Dim761    Dim762  \
0  0.684777  1.736403  ...  0.441260 -0.295122  1.369656 -0.888012  1.068354   
1 -0.230678  0.669581  ...  0.126593  0.276906  0.418110 -0.073043  0.532075   
2  0.228654  1.053331  ...  0.301428  0.401214  0.752306 -0.367575  0.183579   
3 -0.718530  1.247533  ...  0.775314  0.303543  0.816760 -0.345624 -0.053506   
4 -0.128312  0.784007  ...  0.429937  0.090282  0.748448 -0.224150  0.255825   

     Dim763    Dim764    Dim765 

BERT without IDF

In [None]:
import pandas as pd
import torch
from transformers import BertTokenizer, BertModel
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import defaultdict
import nltk
nltk.download("punkt")

# Same locations as the other parts of the notebook: the source workbook sits
# next to the notebook and results go to "output_files/".
source_file = "posts_first_targil.xlsx"
output_file = "output_files/bert_withoutIDF.csv"

# sheet_name=None returns {sheet name -> DataFrame}
df = pd.read_excel(source_file, sheet_name=None)
if "J-P" in df:
    df["J-P"].rename(columns={"Body": "Body Text"}, inplace=True)

# Load pre-trained BERT model and tokenizer
model_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name)


def get_bert_vectors(text_chunk):
    """Encode one text chunk with BERT and return its per-token outputs.

    Args:
        text_chunk: text to encode; it is truncated / padded to 512 positions.

    Returns:
        Tuple ``(tokens, token_embeddings, attention_mask)``: the token
        strings, the last hidden state and the attention mask, the latter two
        with the batch dimension squeezed away.
    """
    tokenizer_options = dict(return_tensors="pt", max_length=512, truncation=True, padding="max_length")
    inputs = tokenizer(text_chunk, **tokenizer_options)

    # Inference only: no gradients needed
    with torch.no_grad():
        outputs = model(**inputs)

    # squeeze(0) drops the leading batch dimension of each tensor
    ids, mask = (inputs[key].squeeze(0) for key in ("input_ids", "attention_mask"))
    embeddings = outputs.last_hidden_state.squeeze(0)
    return tokenizer.convert_ids_to_tokens(ids), embeddings, mask

# Function to process subwords into full word embeddings
def process_tokens(tokens, token_embeddings, attention_mask):
    """Merge sub-word embeddings into one embedding per whole word.

    Args:
        tokens: token strings, aligned with `token_embeddings`.
        token_embeddings: one embedding per token.
        attention_mask: one flag per token; positions equal to 0 are skipped.

    Returns:
        List with one tensor per word: the mean of the embeddings of its pieces.
    """
    # Each group is [word text, [piece embeddings]]. The initial empty group
    # receives continuation pieces that arrive before any word start.
    groups = [["", []]]

    for token, embedding, mask in zip(tokens, token_embeddings, attention_mask):
        if mask == 0 or token in ["[CLS]", "[SEP]", "[PAD]"]:
            continue

        if token.startswith("##"):
            # "##" marks a continuation of the current word
            groups[-1][0] += token[2:]
            groups[-1][1].append(embedding)
        else:
            groups.append([token, [embedding]])

    # Groups whose text is still empty never formed a word and are dropped
    return [torch.mean(torch.stack(pieces), dim=0) for word, pieces in groups if word]
# Function to process an entire document
def process_document(text):
    """Embed a whole document as the mean of its word embeddings.

    The document is tokenized once, cut into chunks that fit BERT's 512
    position limit, and every chunk is encoded separately.

    Args:
        text: document text.

    Returns:
        1-D tensor: the mean over the word embeddings of all chunks. A
        document that yields no word embedding (e.g. empty text) gives a zero
        vector of the model's hidden size.
    """
    # get_bert_vectors truncates at 512 positions, two of which are taken by
    # [CLS] and [SEP]; larger chunks would silently lose their last tokens.
    max_tokens = 512 - 2
    document_tokens = tokenizer.tokenize(text)
    all_word_embeddings = []

    for start in range(0, len(document_tokens), max_tokens):
        chunk_text = tokenizer.convert_tokens_to_string(document_tokens[start:start + max_tokens])
        # Use names distinct from the document token list: rebinding it here
        # would leave nothing to slice for the following chunks.
        chunk_tokens, embeddings, attention_mask = get_bert_vectors(chunk_text)
        all_word_embeddings.extend(process_tokens(chunk_tokens, embeddings, attention_mask))

    if not all_word_embeddings:
        # torch.stack raises on an empty list
        return torch.zeros(model.config.hidden_size)

    # Aggregate all word embeddings for the document by their mean
    return torch.mean(torch.stack(all_word_embeddings), dim=0)


results = []
for sheet_name, data in df.items():
    # Only the A-J sheet contributes a sub_title
    if sheet_name == "A-J":
        text_columns = ['title', 'sub_title', 'Body Text']
    else:
        text_columns = ['title', 'Body Text']

    for index, row in data.iterrows():
        combined_text = " ".join(str(row[col]) for col in text_columns if pd.notna(row[col]))

        # Generate the BERT vector for the document. The vector is no longer
        # printed: one line of several hundred floats per document floods the
        # notebook output.
        bert_vector = process_document(combined_text)
        results.append([sheet_name, index] + bert_vector.tolist())

# Save vectors to CSV. The dimension count is read from the collected rows
# (two leading entries are Sheet and RowIndex) so an empty workbook still
# yields a header-only file instead of a NameError.
vector_size = len(results[0]) - 2 if results else 0
header = ["Sheet", "RowIndex"] + [f"Dim{i}" for i in range(vector_size)]
with open(output_file, "w", encoding="utf-8") as file:
    file.write(",".join(header) + "\n")
    for row in results:
        file.write(",".join(map(str, row)) + "\n")

print(f"BERT vectors with RowIndex saved to {output_file}")


# Preview the file this cell has just written. `output_file` is deliberately
# not reassigned here: pointing it at "output_files/new_bert_vectors.csv"
# would display the vectors of the with-IDF run instead.
try:
    data = pd.read_csv(output_file)
    print(data.head())
except FileNotFoundError:
    print(f"File '{output_file}' not found. Please check the file path.")