In [54]:
import os
import boto3
import torch
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from transformers import BartForConditionalGeneration, BartTokenizer, AutoModelForMaskedLM, AutoTokenizer

In [55]:
# MinIO configuration.
# Connection settings are read from the environment so that real credentials
# never have to be committed with the notebook. The fallbacks are the local
# development values this notebook was originally written against, so the
# behaviour is unchanged when no variables are set.
minio_url = os.environ.get("MINIO_ENDPOINT_URL", "http://localhost:9000")
access_key = os.environ.get("MINIO_ACCESS_KEY", "admin")
secret_key = os.environ.get("MINIO_SECRET_KEY", "admin123")
processed_bucket_name = "processed-reports"
embedding_bucket_name = "embeddings"

# Set up MinIO client: a boto3 S3 client pointed at the MinIO endpoint.
s3_client = boto3.client(
    's3',
    endpoint_url=minio_url,
    aws_access_key_id=access_key,
    aws_secret_access_key=secret_key
)

In [56]:
# Function to retrieve embeddings from the embeddings bucket
def load_embeddings_from_minio(file_name):
    """Download ``<file_name>.npy`` from the embeddings bucket and load it.

    The object is saved under the same name relative to the current working
    directory (the local copy is left in place) and then read with
    ``np.load``.

    Args:
        file_name: Object name without the ``.npy`` extension.

    Returns:
        The value produced by ``np.load`` for the downloaded file.
    """
    npy_name = f"{file_name}.npy"
    # The object key and the local destination path are the same string.
    s3_client.download_file(embedding_bucket_name, npy_name, npy_name)
    loaded = np.load(npy_name)
    print(f"Embeddings {file_name} loaded from MinIO")
    return loaded

# Example: load the stored report embeddings (object "report_embeddings.npy"
# in the embeddings bucket); the chatbot function later searches this array.
loaded_embeddings = load_embeddings_from_minio("report_embeddings")

Embeddings report_embeddings loaded from MinIO


In [57]:
# Disable tokenizer parallelism. The variable has to be set to "false" to turn
# the tokenizers library's internal parallelism off; "true" would enable it.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# MPS (Apple GPU) auto-detection is kept here for reference but switched off:
# device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
# Force all computations to use the CPU instead of MPS
device = torch.device("cpu")

# Report which device was selected (MPS or CPU)
if device.type == "mps":
    print("Using Apple M1 GPU (MPS)")
else:
    print("Using CPU")

Using CPU


In [58]:
# Load the fine-tuned MLM tokenizer and model from the local checkpoint directory
fine_tuned_tokenizer = AutoTokenizer.from_pretrained("./fine_tuned_mlm_model")
fine_tuned_model = AutoModelForMaskedLM.from_pretrained("./fine_tuned_mlm_model")
# Move the model onto the device selected earlier in the notebook.
fine_tuned_model = fine_tuned_model.to(device)
print("Model and tokenizer loaded from the local directory.")

# Function to generate query embedding using the fine-tuned model
def generate_query_embedding(query, model, tokenizer, device):
    """Embed a query as the final-layer hidden state of its first token.

    Args:
        query: Text to embed.
        model: Callable model that accepts ``input_ids``, ``attention_mask``
            and ``output_hidden_states`` and returns an object exposing
            ``hidden_states``.
        tokenizer: Callable tokenizer returning a mapping with ``input_ids``
            and ``attention_mask`` tensors.
        device: Device the input tensors are moved to before the forward pass.

    Returns:
        NumPy array reshaped to ``(1, -1)`` holding the position-0 (CLS)
        vector taken from the last entry of ``hidden_states``.
    """
    encoded = tokenizer(query, return_tensors="pt", padding='max_length', truncation=True)
    model_inputs = {
        field: encoded[field].to(device)
        for field in ("input_ids", "attention_mask")
    }

    # Inference only: no gradients are needed for an embedding lookup.
    with torch.no_grad():
        result = model(output_hidden_states=True, **model_inputs)
        # Last layer, first token position (CLS) for every batch row.
        cls_vector = result.hidden_states[-1][:, 0, :]

    as_numpy = cls_vector.cpu().numpy()
    return as_numpy.reshape(1, -1)

Model and tokenizer loaded from the local directory.


In [59]:
# Function to perform similarity search with stored embeddings
def find_similar_embeddings(query_embedding, stored_embeddings, top_k=5):
    """Rank stored embeddings by cosine similarity to the query.

    Args:
        query_embedding: 2-D array; only the similarities of its first row
            are used for ranking.
        stored_embeddings: Array of candidate embeddings. A 3-D array is
            flattened to 2-D by collapsing every axis after the first.
        top_k: Maximum number of indices to return.

    Returns:
        Array of indices into ``stored_embeddings`` ordered from most to
        least similar, truncated to at most ``top_k`` entries.
    """
    candidates = stored_embeddings
    if candidates.ndim == 3:
        # Keep one row per stored item and merge the remaining axes.
        candidates = candidates.reshape(candidates.shape[0], -1)

    scores = cosine_similarity(query_embedding, candidates)[0]
    # argsort is ascending, so reverse it to put the best match first.
    ranked = np.argsort(scores)[::-1]
    return ranked[:top_k]

In [60]:
# List the files in the MinIO bucket
def list_files_in_bucket(bucket_name):
    """Print the key of every object stored in ``bucket_name``.

    A single ``list_objects_v2`` call returns only one page of results, so
    the listing is driven through a paginator to make sure large buckets are
    not silently truncated.

    Args:
        bucket_name: Name of the bucket to inspect.
    """
    paginator = s3_client.get_paginator("list_objects_v2")
    files = [
        content['Key']
        for page in paginator.paginate(Bucket=bucket_name)
        # Pages of an empty bucket carry no 'Contents' entry at all.
        for content in page.get('Contents', [])
    ]
    if files:
        print(f"Files in {bucket_name} bucket: {files}")
    else:
        print(f"No files found in bucket {bucket_name}.")

# Print the object keys currently stored in the processed-reports bucket.
list_files_in_bucket(processed_bucket_name)

Files in processed-reports bucket: ['processed_A-Study-on-Consumer-Brand-Awareness-of-Fast-Moving-Consumer-Goods.pdf.txt']


In [61]:
# Map embedding indices to report files: each file's position in this list is
# the index it is registered under.
report_mapping = dict(enumerate([
    "processed_A-Study-on-Consumer-Brand-Awareness-of-Fast-Moving-Consumer-Goods.pdf.txt",
]))

# Function to retrieve reports from MinIO based on the embedding indices
def retrieve_reports_from_indices(indices):
    """Fetch the text of each mapped report from the processed-reports bucket.

    Every index is looked up in ``report_mapping``. Mapped reports are
    downloaded into the current directory and read as text. Indices without a
    mapping contribute a "not found" placeholder string instead, so the
    result always has one entry per index.

    Args:
        indices: Iterable of embedding indices.

    Returns:
        List of strings in the same order as ``indices``.
    """
    texts = []
    for idx in indices:
        object_key = report_mapping.get(idx)
        if not object_key:
            # Unmapped index: keep the slot with an explanatory placeholder.
            texts.append(f"Report with index {idx} not found.")
            continue

        download_path = f"./{object_key}"
        s3_client.download_file(processed_bucket_name, object_key, download_path)
        with open(download_path, "r") as handle:
            texts.append(handle.read())
    return texts

# Load the BART tokenizer and model used for summarization
bart_tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')
bart_model = BartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn')
# Move the summarizer onto the device selected earlier in the notebook.
bart_model = bart_model.to(device)



In [51]:
# Function to generate query-based summary
def generate_query_based_summary(query, report_texts, model, tokenizer, device=None):
    """Summarize report text with the user's query prepended as context.

    The prompt is ``"Query: <query>\\n\\nReport: <report_texts>"``, truncated
    by the tokenizer to 1024 tokens, and the summary is produced with beam
    search (4 beams, 40-150 tokens).

    Args:
        query: The user's question.
        report_texts: Report text to summarize; interpolated into the prompt
            as-is.
        model: Model exposing ``generate`` (and ``device`` when the
            ``device`` argument is omitted).
        tokenizer: Callable tokenizer that also provides ``decode``.
        device: Optional device for the input tensors. Defaults to
            ``model.device`` so the inputs always land where the model lives
            instead of depending on a notebook-global variable.

    Returns:
        The decoded summary string with special tokens removed.
    """
    target_device = model.device if device is None else device
    input_text = f"Query: {query}\n\nReport: {report_texts}"
    inputs = tokenizer(input_text, return_tensors="pt", max_length=1024, truncation=True).to(target_device)
    summary_ids = model.generate(
        inputs['input_ids'],
        max_length=150,
        min_length=40,
        length_penalty=2.0,
        num_beams=4,
        early_stopping=True,
    )
    # Only the first (best) generated sequence is decoded.
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)



In [62]:
# Chatbot function to handle user queries related to campaign performance
def chatbot_query_with_advanced_summary(user_input):
    """Answer a user query with a summary of the most similar stored reports.

    Pipeline: embed the query with the fine-tuned MLM model, rank the loaded
    embeddings against it, fetch the matching report texts, join them with
    single spaces, and summarize the result with BART.

    Args:
        user_input: The user's question.

    Returns:
        The generated summary string.
    """
    embedded_query = generate_query_embedding(
        user_input, fine_tuned_model, fine_tuned_tokenizer, device
    )
    top_indices = find_similar_embeddings(embedded_query, loaded_embeddings)
    report_bodies = retrieve_reports_from_indices(top_indices)
    return generate_query_based_summary(
        user_input, " ".join(report_bodies), bart_model, bart_tokenizer
    )

# Example chatbot interaction: run one sample query through the full pipeline
# and print the generated summary.
user_input = "How did the recent marketing campaigns perform?"
chatbot_response = chatbot_query_with_advanced_summary(user_input)
print(f"Chatbot response:\n{chatbot_response}")

Chatbot response:
Fast Moving Consumer Goods product sell quickly relatively low cost satisfy elemental day to day household need grocery range package. FMCG sector worth r 1,300 billion expect around whopping value r 4,000 r 6,000 billion 2020 henceforth fMCG close companion retail sector likely create job.
