In [1]:
from transformers import AutoTokenizer, AutoModel
import torch
import boto3
import faiss
import numpy as np

In [2]:
# MinIO configuration
minio_url = "http://localhost:9000"
access_key = "admin"
secret_key = "admin123"
bucket_name = "processed-reports"

# Set up MinIO client
s3_client = boto3.client(
    's3',
    endpoint_url=minio_url,
    aws_access_key_id=access_key,
    aws_secret_access_key=secret_key
)

In [3]:
# List all files in the 'processed-reports' bucket
def list_processed_files(bucket_name):
    response = s3_client.list_objects_v2(Bucket=bucket_name)
    files = [content['Key'] for content in response.get('Contents', [])]
    return files

In [4]:
# Download a file from MinIO
def download_file(file_name, local_path):
    s3_client.download_file(bucket_name, file_name, local_path)
    print(f"Downloaded {file_name} to {local_path}")

In [5]:
# List all files in the processed-reports bucket
files = list_processed_files(bucket_name)
print(f"Files in processed-reports bucket: {files}")

# Example: Download the first file to local storage
if files:
    download_file(files[0], f"Data_dump/{files[0]}")

Files in processed-reports bucket: ['processed_A-Study-on-Consumer-Brand-Awareness-of-Fast-Moving-Consumer-Goods.pdf.txt']
Downloaded processed_A-Study-on-Consumer-Brand-Awareness-of-Fast-Moving-Consumer-Goods.pdf.txt to Data_dump/processed_A-Study-on-Consumer-Brand-Awareness-of-Fast-Moving-Consumer-Goods.pdf.txt


In [6]:
# Load a pre-trained Hugging Face model and tokenizer
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

def generate_embedding(text):
    inputs = tokenizer(text, return_tensors='pt', max_length=512, truncation=True, padding=True)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).squeeze().numpy()

# Generate embeddings for each downloaded file
def generate_embeddings_from_files(file_list):
    embeddings = []
    for file_path in file_list:
        with open(file_path, "r") as f:
            text = f.read()
            embedding = generate_embedding(text)
            embeddings.append(embedding)
    return embeddings

# Example: Generate embeddings for downloaded files
downloaded_files = [f"Data_dump/{file}" for file in files]
document_embeddings = generate_embeddings_from_files(downloaded_files)



In [7]:
# Define the dimensionality of the embeddings
dimension = 768  # This should match the embedding size
index = faiss.IndexFlatL2(dimension)  # L2 similarity

# Add document embeddings to FAISS
def add_to_faiss_index(embeddings):
    embeddings_np = np.array(embeddings)  # Convert to NumPy array
    index.add(embeddings_np)  # Add embeddings to the index
    print(f"Total documents indexed: {index.ntotal}")

# Index the newly generated embeddings
add_to_faiss_index(document_embeddings)

# Save the FAISS index to disk for later use
faiss.write_index(index, "faiss_index.bin")

Total documents indexed: 1


In [8]:
# Load the FAISS index from disk
index = faiss.read_index("faiss_index.bin")

In [9]:
def retrieve_similar_documents(query, k=5):
    query_embedding = generate_embedding(query).reshape(1, -1)
    distances, indices = index.search(query_embedding, k)
    return distances, indices

# Example: Query and retrieve similar documents
query = "What is the effectiveness of the recent marketing campaign?"
distances, indices = retrieve_similar_documents(query)
print(f"Top {len(indices[0])} similar documents: {indices}")
print(f"Distances: {distances}")

Top 5 similar documents: [[ 0 -1 -1 -1 -1]]
Distances: [[6.0458282e+01 3.4028235e+38 3.4028235e+38 3.4028235e+38 3.4028235e+38]]


In [10]:
document_mapping = {0: "processed_report1.txt", 1: "processed_report2.txt"}

# Loop through the indices and check for valid ones
for i in indices[0]:
    if i == -1:
        print("No more similar documents found.")
    else:
        print(f"Similar Document: {document_mapping.get(i, 'Document not found')}")

Similar Document: processed_report1.txt
No more similar documents found.
No more similar documents found.
No more similar documents found.
No more similar documents found.


#### Augment the Dataset with FAISS Embeddings:

In [11]:
# Retrieve similar documents and augment the dataset
def augment_training_data_with_embeddings(queries, k=5):
    augmented_data = []
    for query in queries:
        retrieved_docs = retrieve_similar_documents(query, k)
        augmented_data.append({
            'query': query, 
            'retrieved_docs': retrieved_docs
        })
    return augmented_data

# Example queries for training
queries = ["What is the effectiveness of the recent campaign?", "How did the campaign perform in Q2?"]

# Augment the dataset
augmented_dataset = augment_training_data_with_embeddings(queries)

In [12]:
from datasets import Dataset

# Extract query and retrieved document embeddings from augmented_dataset
queries = [item['query'] for item in augmented_dataset]

# Extract embeddings from the first part of 'retrieved_docs' tuple (ignore indices)
retrieved_embeddings = [item['retrieved_docs'][0][0] for item in augmented_dataset]  # Only embeddings

# Create a dictionary suitable for Dataset.from_dict
dataset_dict = {
    'query': queries,
    'retrieved_docs': retrieved_embeddings
}

# Now create the Dataset from the dict
dataset = Dataset.from_dict(dataset_dict)

# Print to confirm
print(dataset)

Dataset({
    features: ['query', 'retrieved_docs'],
    num_rows: 2
})


In [13]:
import torch
import os

# Check if MPS (Apple GPU) is available and use it, otherwise fall back to CPU
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")

# Print the device being used (MPS or CPU)
if device.type == "mps":
    print("Using Apple M1 GPU (MPS)")
else:
    print("Using CPU")
    
# Disable tokenizer parallelism
os.environ["TOKENIZERS_PARALLELISM"] = "true"

# Tokenize the queries (BERT input)
def tokenize_query(examples):
    return tokenizer(examples['query'], padding='max_length', truncation=True)

# Tokenize queries (MLM task, no need for FAISS embeddings)
tokenized_query_dataset = dataset.map(tokenize_query, batched=True)

# Ensure the dataset contains 'input_ids' and 'attention_mask'
print(tokenized_query_dataset)

# Set format to torch to ensure the correct format for PyTorch models
tokenized_query_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask'])

Using Apple M1 GPU (MPS)


Map:   0%|          | 0/2 [00:00<?, ? examples/s]

Dataset({
    features: ['query', 'retrieved_docs', 'input_ids', 'attention_mask'],
    num_rows: 2
})


### Fine-tune the Model Using Augmented Data

In [14]:
import os
import torch
import mlflow
import mlflow.pytorch
from transformers import DistilBertForMaskedLM, AutoTokenizer, DataCollatorForLanguageModeling, Trainer, TrainingArguments

# Disable tokenizer parallelism
os.environ["TOKENIZERS_PARALLELISM"] = "true"

# Check if MPS (Apple GPU) is available and use it, otherwise fall back to CPU
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")

# Print the device being used (MPS or CPU)
if device.type == "mps":
    print("Using Apple M1 GPU (MPS)")
else:
    print("Using CPU")

# Load a pre-trained model with Masked Language Modeling head
model = DistilBertForMaskedLM.from_pretrained('distilbert-base-uncased').to(device)

# Load the tokenizer for tokenizing queries
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')

# Tokenize the dataset
def tokenize_query(examples):
    return tokenizer(examples['query'], padding='max_length', truncation=True)

tokenized_query_dataset = dataset.map(tokenize_query, batched=True)

# Data collator for dynamic masking during training
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.15  # Mask 15% of the tokens
)

# Define training arguments
training_args = TrainingArguments(
    output_dir="./mlm_results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    dataloader_pin_memory=False,  # Set False for MPS (pin_memory is for CUDA)
)

# Start MLflow logging
mlflow.set_experiment("MLM Fine-tuning with DistilBERT")

with mlflow.start_run():
    # Log hyperparameters
    mlflow.log_param("learning_rate", training_args.learning_rate)
    mlflow.log_param("batch_size", training_args.per_device_train_batch_size)
    mlflow.log_param("num_epochs", training_args.num_train_epochs)
    
    # Initialize the Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_query_dataset,  # Tokenized dataset
        eval_dataset=tokenized_query_dataset,   # Optionally use a validation set
        data_collator=data_collator,            # MLM data collator
    )
    
    # Train the model with MLM
    trainer.train()
    
    # Safely log metrics (only log if the value is not None)
    metrics = trainer.state.log_history[-1]  # Get the last set of logged metrics
    
    # Check if 'loss' and 'epoch' are present and valid before logging
    if 'loss' in metrics and metrics['loss'] is not None:
        mlflow.log_metric("train_loss", float(metrics['loss']))
    if 'epoch' in metrics and metrics['epoch'] is not None:
        mlflow.log_metric("epoch", float(metrics['epoch']))
    
    # Log the model to MLflow
    mlflow.pytorch.log_model(model, "masked_language_model")
    
    # Log tokenizer
    tokenizer.save_pretrained("./fine_tuned_mlm_model")
    mlflow.log_artifact("./fine_tuned_mlm_model/tokenizer_config.json", "tokenizer")

# Save locally
model.save_pretrained("./fine_tuned_mlm_model")
tokenizer.save_pretrained("./fine_tuned_mlm_model")  # Save tokenizer

Using Apple M1 GPU (MPS)




Map:   0%|          | 0/2 [00:00<?, ? examples/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 0.02111385017633438, 'eval_runtime': 0.1114, 'eval_samples_per_second': 17.957, 'eval_steps_per_second': 8.978, 'epoch': 1.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 0.00785028375685215, 'eval_runtime': 0.0825, 'eval_samples_per_second': 24.235, 'eval_steps_per_second': 12.118, 'epoch': 2.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 0.007601247169077396, 'eval_runtime': 0.0887, 'eval_samples_per_second': 22.546, 'eval_steps_per_second': 11.273, 'epoch': 3.0}
{'train_runtime': 7.6672, 'train_samples_per_second': 0.783, 'train_steps_per_second': 0.391, 'train_loss': 0.003227462371190389, 'epoch': 3.0}




('./fine_tuned_mlm_model/tokenizer_config.json',
 './fine_tuned_mlm_model/special_tokens_map.json',
 './fine_tuned_mlm_model/vocab.txt',
 './fine_tuned_mlm_model/added_tokens.json',
 './fine_tuned_mlm_model/tokenizer.json')

In [15]:
## Witout ML Flow 

# import os
# import torch
# from transformers import DistilBertForMaskedLM, AutoTokenizer, DataCollatorForLanguageModeling, Trainer, TrainingArguments

# # Disable tokenizer parallelism
# os.environ["TOKENIZERS_PARALLELISM"] = "true"

# # Check if MPS (Apple GPU) is available and use it, otherwise fall back to CPU
# device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")

# # Print the device being used
# if device.type == "mps":
#     print("Using Apple M1 GPU (MPS)")
# else:
#     print("Using CPU")

# # Load a pre-trained model with Masked Language Modeling head
# model = DistilBertForMaskedLM.from_pretrained('distilbert-base-uncased').to(device)

# # Load the tokenizer for tokenizing queries
# tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')

# # Tokenize the dataset
# def tokenize_query(examples):
#     return tokenizer(examples['query'], padding='max_length', truncation=True)

# # Assuming you have a dataset to tokenize
# tokenized_dataset = dataset.map(tokenize_query, batched=True)

# # Data collator for dynamic masking during training
# data_collator = DataCollatorForLanguageModeling(
#     tokenizer=tokenizer,
#     mlm=True,
#     mlm_probability=0.15  # Mask 15% of the tokens
# )

# # Define training arguments
# training_args = TrainingArguments(
#     output_dir="./mlm_results",
#     evaluation_strategy="epoch",
#     learning_rate=2e-5,
#     per_device_train_batch_size=8,
#     per_device_eval_batch_size=8,
#     num_train_epochs=3,
#     weight_decay=0.01,
#     logging_dir="./logs",
#     dataloader_pin_memory=False,  # Set False for MPS (pin_memory is for CUDA)
# )

# # Initialize the Trainer for MLM
# trainer = Trainer(
#     model=model,
#     args=training_args,
#     train_dataset=tokenized_dataset,  # Tokenized dataset
#     eval_dataset=tokenized_dataset,   # Optionally use a validation set
#     data_collator=data_collator,      # MLM data collator
# )

# # Train the model with MLM
# trainer.train()

# # After training, you can save the fine-tuned model
# model.save_pretrained("./fine_tuned_mlm_model")
# tokenizer.save_pretrained("./fine_tuned_mlm_model")

In [16]:
# Load the fine-tuned MLM model
fine_tuned_model = DistilBertForMaskedLM.from_pretrained("./fine_tuned_mlm_model").to(device)
fine_tuned_tokenizer = AutoTokenizer.from_pretrained("./fine_tuned_mlm_model")

# Now we can move the inputs to the device (MPS or CPU)
def generate_embeddings(dataset, model):
    model.eval()
    embeddings = []
    for batch in dataset:
        # Ensure conversion to tensor before moving to device
        input_ids = torch.tensor(batch['input_ids']).to(device)
        attention_mask = torch.tensor(batch['attention_mask']).to(device)
        
        with torch.no_grad():
            # Set output_hidden_states=True to get the hidden states
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, output_hidden_states=True)
            hidden_states = outputs.hidden_states  # This gives you all the hidden states
            last_hidden_state = hidden_states[-1]  # Take the last hidden state

            cls_embedding = last_hidden_state[:, 0, :]  # CLS token embedding
            embeddings.append(cls_embedding.cpu().numpy())  # Move back to CPU for storage
    return embeddings

# Generate embeddings from the tokenized dataset
embeddings = generate_embeddings(tokenized_query_dataset, model)

# Print the number of embeddings generated
print(f"Generated {len(embeddings)} embeddings")

Generated 2 embeddings


In [17]:
import numpy as np
import boto3

# MinIO configuration
minio_url = "http://localhost:9000"
access_key = "admin"
secret_key = "admin123"
bucket_name = "embeddings"

# Set up MinIO client
s3_client = boto3.client(
    's3',
    endpoint_url=minio_url,
    aws_access_key_id=access_key,
    aws_secret_access_key=secret_key
)

# Function to save embeddings to MinIO
def save_embeddings_to_minio(embeddings, file_name):
    local_processed_file = f'{file_name}.npy'
    # Save the embeddings locally as a .npy file
    np.save(file_name, embeddings)
    
    # Upload the .npy file to MinIO
    s3_client.upload_file(f"{file_name}.npy", bucket_name, f"{file_name}.npy")
    print(f"Embeddings saved to MinIO as {file_name}.npy")
    os.remove(local_processed_file)

# Example: Saving embeddings
save_embeddings_to_minio(embeddings, "report_embeddings")

Embeddings saved to MinIO as report_embeddings.npy
