In [2]:
import pandas as pd

In [3]:
df=pd.read_csv(r'c:\Users\ACER\Desktop\Chuncks-5\chunk_1.csv')


In [4]:
df.columns

Index(['title_token_id', 'title_attention_mask', 'text_token_id',
       'text_attention_mask'],
      dtype='object')

In [5]:
import ast

# Function to ensure the attention mask is in the correct format
def convert_to_list(mask):
    if isinstance(mask, str):
        return ast.literal_eval(mask)  # Convert string representation of list to an actual list
    return mask

# Convert attention masks to lists if they are in string format
df['title_attention_mask'] = df['title_attention_mask'].apply(convert_to_list)
df['text_attention_mask'] = df['text_attention_mask'].apply(convert_to_list)

# Function to truncate tokens and attention masks to 512 tokens
def truncate_tokens_and_masks(tokens, attention_mask, max_length=512):
    truncated_tokens = tokens[:max_length]
    truncated_attention_mask = attention_mask[:max_length] + [0] * (max_length - len(attention_mask))
    return truncated_tokens, truncated_attention_mask

# Apply truncation to both title and text
df['title_token_id'], df['title_attention_mask'] = zip(*df.apply(
    lambda row: truncate_tokens_and_masks(row['title_token_id'], row['title_attention_mask']), axis=1))

df['text_token_id'], df['text_attention_mask'] = zip(*df.apply(
    lambda row: truncate_tokens_and_masks(row['text_token_id'], row['text_attention_mask']), axis=1))

# Recheck the validity after truncation
print("Max title token length (after truncation):", df['title_token_id'].apply(len).max())
print("Max text token length (after truncation):", df['text_token_id'].apply(len).max())

# Recheck if attention masks match the token length
invalid_title_attention_mask = sum(df['title_attention_mask'].apply(len) != df['title_token_id'].apply(len))
invalid_text_attention_mask = sum(df['text_attention_mask'].apply(len) != df['text_token_id'].apply(len))

print(f"Invalid title attention masks (mismatch with token length) after truncation: {invalid_title_attention_mask}")
print(f"Invalid text attention masks (mismatch with token length) after truncation: {invalid_text_attention_mask}")

# Recheck if there are any rows where the token lengths exceed 512
invalid_title_token_length = sum(df['title_token_id'].apply(len) > 512)
invalid_text_token_length = sum(df['text_token_id'].apply(len) > 512)

print(f"Invalid title token lengths (> 512 tokens) after truncation: {invalid_title_token_length}")
print(f"Invalid text token lengths (> 512 tokens) after truncation: {invalid_text_token_length}")

# Ensure there are no empty rows
empty_titles = df['title_token_id'].apply(len).eq(0).sum()
empty_texts = df['text_token_id'].apply(len).eq(0).sum()

print(f"Empty title rows after truncation: {empty_titles}")
print(f"Empty text rows after truncation: {empty_texts}")


Max title token length (after truncation): 512
Max text token length (after truncation): 512
Invalid title attention masks (mismatch with token length) after truncation: 0
Invalid text attention masks (mismatch with token length) after truncation: 0
Invalid title token lengths (> 512 tokens) after truncation: 0
Invalid text token lengths (> 512 tokens) after truncation: 0
Empty title rows after truncation: 0
Empty text rows after truncation: 0


In [6]:
# Check if all columns are ready for embedding generation
def final_check(df):
    # Check for missing values
    missing_values = df[['title_token_id', 'title_attention_mask', 'text_token_id', 'text_attention_mask']].isnull().sum()

    # Check token length compliance
    title_token_lengths = df['title_token_id'].apply(len)
    text_token_lengths = df['text_token_id'].apply(len)

    max_title_length = title_token_lengths.max()
    max_text_length = text_token_lengths.max()

    # Check if any rows have more than 512 tokens
    invalid_title_lengths = (title_token_lengths > 512).sum()
    invalid_text_lengths = (text_token_lengths > 512).sum()

    # Check attention masks match token lengths
    invalid_title_attention_mask = (df['title_attention_mask'].apply(len) != title_token_lengths).sum()
    invalid_text_attention_mask = (df['text_attention_mask'].apply(len) != text_token_lengths).sum()

    # Check for empty rows
    empty_title_rows = df['title_token_id'].apply(lambda x: len(x) == 0).sum()
    empty_text_rows = df['text_token_id'].apply(lambda x: len(x) == 0).sum()

    print("Missing values in columns:")
    print(missing_values)
    print("\nMax token lengths:")
    print(f"Max title token length: {max_title_length}")
    print(f"Max text token length: {max_text_length}")
    print("\nInvalid token lengths (> 512 tokens):")
    print(f"Invalid title token lengths (> 512 tokens): {invalid_title_lengths}")
    print(f"Invalid text token lengths (> 512 tokens): {invalid_text_lengths}")
    print("\nInvalid attention masks (mismatch with token length):")
    print(f"Invalid title attention masks: {invalid_title_attention_mask}")
    print(f"Invalid text attention masks: {invalid_text_attention_mask}")
    print("\nEmpty rows:")
    print(f"Empty title rows: {empty_title_rows}")
    print(f"Empty text rows: {empty_text_rows}")
    
# Run final check
final_check(df)


Missing values in columns:
title_token_id          0
title_attention_mask    0
text_token_id           0
text_attention_mask     0
dtype: int64

Max token lengths:
Max title token length: 512
Max text token length: 512

Invalid token lengths (> 512 tokens):
Invalid title token lengths (> 512 tokens): 0
Invalid text token lengths (> 512 tokens): 0

Invalid attention masks (mismatch with token length):
Invalid title attention masks: 0
Invalid text attention masks: 0

Empty rows:
Empty title rows: 0
Empty text rows: 0


In [7]:
df.head()

Unnamed: 0,title_token_id,title_attention_mask,text_token_id,text_attention_mask
0,"[101, 889, 31222, 11714, 79632, 11267, 852, 10...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[101, 11081, 35247, 12347, 20691, 78530, 28462...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
1,"[101, 885, 18187, 898, 45753, 31277, 14835, 28...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[101, 11081, 35247, 12347, 20691, 78530, 28462...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
2,"[101, 852, 35247, 107144, 13718, 24667, 25595,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[101, 11081, 35247, 12347, 20691, 78530, 28462...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
3,"[101, 887, 17279, 14835, 16373, 11081, 49545, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, ...","[101, 11081, 35247, 12347, 20691, 78530, 28462...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
4,"[101, 882, 76549, 44388, 12512, 11186, 110091,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, ...","[101, 11081, 35247, 12347, 20691, 78530, 28462...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."


In [8]:
import pandas as pd

# Assuming your DataFrame is named df
# Check for missing values (NaN) in the columns
missing_values = df[['title_token_id', 'title_attention_mask', 'text_token_id', 'text_attention_mask']].isnull().sum()

# Check the data types of the columns to ensure they are correct (likely lists or arrays for token IDs and attention masks)
data_types = df[['title_token_id', 'title_attention_mask', 'text_token_id', 'text_attention_mask']].dtypes

print("Missing values in columns:")
print(missing_values)

print("\nData types of columns:")
print(data_types)

# Optionally, check if all lists or arrays have the same length (if these are lists or arrays of token ids)
column_lengths = {
    'title_token_id': df['title_token_id'].apply(len).unique(),
    'title_attention_mask': df['title_attention_mask'].apply(len).unique(),
    'text_token_id': df['text_token_id'].apply(len).unique(),
    'text_attention_mask': df['text_attention_mask'].apply(len).unique()
}

print("\nUnique lengths of token_id and attention_mask columns:")
print(column_lengths)


Missing values in columns:
title_token_id          0
title_attention_mask    0
text_token_id           0
text_attention_mask     0
dtype: int64

Data types of columns:
title_token_id          object
title_attention_mask    object
text_token_id           object
text_attention_mask     object
dtype: object

Unique lengths of token_id and attention_mask columns:
{'title_token_id': array([512]), 'title_attention_mask': array([512]), 'text_token_id': array([512]), 'text_attention_mask': array([512])}


In [9]:
# Check the type of elements inside the columns
element_types = {
    'title_token_id': type(df['title_token_id'][0]),
    'title_attention_mask': type(df['title_attention_mask'][0]),
    'text_token_id': type(df['text_token_id'][0]),
    'text_attention_mask': type(df['text_attention_mask'][0])
}

print("\nTypes of elements in the columns:")
print(element_types)



Types of elements in the columns:
{'title_token_id': <class 'str'>, 'title_attention_mask': <class 'list'>, 'text_token_id': <class 'str'>, 'text_attention_mask': <class 'list'>}


In [10]:
# Function to ensure each string is closed properly
def fix_incomplete_token_string(token_str):
    if token_str.endswith(","):  # If the string ends with a comma but lacks the closing bracket
        token_str = token_str[:-1]  # Remove the last comma
    if not token_str.endswith("]"):  # Check if the string does not end with a closing bracket
        token_str = token_str + "]"  # Add the closing bracket
    return token_str

# Apply this fix to both 'title_token_id' and 'text_token_id'
df['title_token_id'] = df['title_token_id'].apply(fix_incomplete_token_string)
df['text_token_id'] = df['text_token_id'].apply(fix_incomplete_token_string)

# Now safely convert the string to lists using ast.literal_eval
df['title_token_id'] = df['title_token_id'].apply(ast.literal_eval)
df['text_token_id'] = df['text_token_id'].apply(ast.literal_eval)

# Check the result
print("\nTypes of elements after conversion:")
print(df[['title_token_id', 'text_token_id']].dtypes)



Types of elements after conversion:
title_token_id    object
text_token_id     object
dtype: object


In [11]:
# Ensure that the strings are correctly converted to lists
df['title_token_id'] = df['title_token_id'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)
df['text_token_id'] = df['text_token_id'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)

# Check if they are now actual lists
print("\nTypes of elements after conversion to lists:")
print(df[['title_token_id', 'text_token_id']].applymap(type).head())



Types of elements after conversion to lists:
   title_token_id   text_token_id
0  <class 'list'>  <class 'list'>
1  <class 'list'>  <class 'list'>
2  <class 'list'>  <class 'list'>
3  <class 'list'>  <class 'list'>
4  <class 'list'>  <class 'list'>


  print(df[['title_token_id', 'text_token_id']].applymap(type).head())


In [12]:
# Check the types of elements in the columns without using applymap
print("\nTypes of elements after conversion to lists:")
print(df[['title_token_id', 'text_token_id']].apply(lambda x: type(x[0])).head())



Types of elements after conversion to lists:
title_token_id    <class 'list'>
text_token_id     <class 'list'>
dtype: object


In [16]:
import numpy as np
import torch
from transformers import BertTokenizer, BertModel
import ast

# Function to pad or truncate sequences to length 512
def pad_or_truncate(tokens, max_length=512):
    return tokens[:max_length] + [0] * (max_length - len(tokens)) if len(tokens) < max_length else tokens[:max_length]

# Function to generate embeddings for batches of tokenized sequences
def generate_embeddings_in_batches(df, batch_size, device):
    model_name = 'bert-base-multilingual-cased'
    tokenizer = BertTokenizer.from_pretrained(model_name)
    model = BertModel.from_pretrained(model_name).to(device)

    title_embeddings = []
    text_embeddings = []

    # Process the data in batches
    for start_idx in range(0, len(df), batch_size):
        end_idx = min(start_idx + batch_size, len(df))

        # Extract batch of tokenized data
        title_tokens = df['title_token_id'][start_idx:end_idx].tolist()
        text_tokens = df['text_token_id'][start_idx:end_idx].tolist()

        # Ensure that all sequences are properly padded/truncated to 512 tokens
        title_tokens_padded = [pad_or_truncate(tokens) for tokens in title_tokens]
        text_tokens_padded = [pad_or_truncate(tokens) for tokens in text_tokens]

        # Generate attention masks (1 for non-padding, 0 for padding)
        title_attention_mask = [[1 if token != 0 else 0 for token in tokens] for tokens in title_tokens_padded]
        text_attention_mask = [[1 if token != 0 else 0 for token in tokens] for tokens in text_tokens_padded]

        # Convert to tensors
        input_ids_title = torch.tensor(title_tokens_padded).to(device)
        input_ids_text = torch.tensor(text_tokens_padded).to(device)
        attention_mask_title = torch.tensor(title_attention_mask).to(device)
        attention_mask_text = torch.tensor(text_attention_mask).to(device)

        # Get the embeddings for title and text (token-level embeddings)
        with torch.no_grad():
            output_title = model(input_ids_title, attention_mask=attention_mask_title)[0]
            output_text = model(input_ids_text, attention_mask=attention_mask_text)[0]

            # Append the token-level embeddings (not averaged)
            title_embeddings.append(output_title.cpu().numpy())
            text_embeddings.append(output_text.cpu().numpy())

    # Concatenate all batch embeddings into one array
    title_embeddings = np.concatenate(title_embeddings, axis=0)
    text_embeddings = np.concatenate(text_embeddings, axis=0)

    return title_embeddings, text_embeddings

# Example usage:
# Assuming df is your DataFrame containing the tokenized columns
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Ensure tokenized columns are lists, not strings
df['title_token_id'] = df['title_token_id'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)
df['text_token_id'] = df['text_token_id'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)

# Generate embeddings with batch size of 16
title_embeddings, text_embeddings = generate_embeddings_in_batches(df, batch_size=16, device=device)

# Save embeddings to numpy files
np.save('title_embeddings.npy', title_embeddings)
np.save('text_embeddings.npy', text_embeddings)

print("Embeddings saved successfully!")


Embeddings saved successfully!


In [17]:
embeddings = np.load('text_embeddings.npy')

# Print the shape
print(embeddings.shape)


(9683, 512, 768)


In [18]:
import numpy as np
print(np.isnan(embeddings).any())  # Check for NaN
print(np.isinf(embeddings).any())  # Check for Inf

False
False


In [19]:
from sklearn.metrics.pairwise import cosine_similarity

# Example with two embeddings
sim = cosine_similarity([embeddings[0, 0]], [embeddings[1, 0]])
print(sim)

[[0.96371275]]


In [None]:
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

tsne = TSNE(n_components=2, random_state=42)
reduced_embeddings = tsne.fit_transform(embeddings.reshape(-1, embeddings.shape[-1]))
plt.scatter(reduced_embeddings[:, 0], reduced_embeddings[:, 1])
plt.show()

[WinError 2] The system cannot find the file specified
  File "c:\Users\ACER\AppData\Local\Programs\Python\Python311\Lib\site-packages\joblib\externals\loky\backend\context.py", line 257, in _count_physical_cores
    cpu_info = subprocess.run(
               ^^^^^^^^^^^^^^^
  File "c:\Users\ACER\AppData\Local\Programs\Python\Python311\Lib\subprocess.py", line 546, in run
    with Popen(*popenargs, **kwargs) as process:
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\ACER\AppData\Local\Programs\Python\Python311\Lib\subprocess.py", line 1022, in __init__
    self._execute_child(args, executable, preexec_fn, close_fds,
  File "c:\Users\ACER\AppData\Local\Programs\Python\Python311\Lib\subprocess.py", line 1491, in _execute_child
    hp, ht, pid, tid = _winapi.CreateProcess(executable, args,
                       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^


In [15]:
import numpy as np

# Load the embeddings from the .npy files
title_embeddings = np.load('title_embeddings.npy')
text_embeddings = np.load('text_embeddings.npy')

# Check the shape of the embeddings
print("Title embeddings shape:", title_embeddings.shape)
print("Text embeddings shape:", text_embeddings.shape)

# Optionally, check a sample of the embeddings
print("\nSample of title embeddings:", title_embeddings[:5])
print("\nSample of text embeddings:", text_embeddings[:5])


Title embeddings shape: (9683, 768)
Text embeddings shape: (9683, 768)

Sample of title embeddings: [[ 0.20855236  0.10874665  0.53104085 ...  0.4342361  -0.11074243
  -0.06717227]
 [ 0.11317145  0.25548178  0.35806325 ...  0.06417514  0.00992666
   0.3277155 ]
 [ 0.29713917  0.21178836  0.67768514 ...  0.25230223 -0.01435768
   0.18460104]
 [ 0.4270823  -0.20197648  0.7608882  ...  0.399489   -0.08765424
  -0.05823053]
 [ 0.34615248  0.0669473   0.4245155  ... -0.03246038 -0.2579427
  -0.22427583]]

Sample of text embeddings: [[ 0.1267626   0.00684246  0.4304107  ...  0.06968568  0.01075805
  -0.22324109]
 [ 0.10639322 -0.04703356  0.41828835 ...  0.33613008 -0.0265827
  -0.32796347]
 [ 0.35151932  0.07523967  0.42336535 ...  0.12713951  0.01377472
  -0.17955732]
 [ 0.11898284 -0.17859691  0.5000466  ...  0.00448671  0.03648679
  -0.24647889]
 [ 0.05929607 -0.12212496  0.30757695 ...  0.26474008 -0.00805373
  -0.17712441]]
