In [1]:
# Required Libraries

import google.generativeai as genai
from PyPDF2 import PdfReader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.chains.question_answering import load_qa_chain
from langchain_community.llms import OpenAI

In [2]:
import torch.nn.functional as F
from transformers import AutoModel, AutoTokenizer
import torch
import faiss
import numpy as np
import os

In [3]:
# NOTE(review): presumably a workaround for the "duplicate OpenMP runtime" abort that can
# happen when several native libraries (here torch and faiss) each load their own copy —
# confirm it is still needed in this environment.
os.environ["KMP_DUPLICATE_LIB_OK"]="TRUE"

In [None]:
from config import *

In [4]:
# Required Secret Keys
#
# Keys are read from environment variables so that real secrets never have to
# be pasted into (and saved with) the notebook. When a variable is not set the
# value falls back to "", which is what this cell previously hard-coded.

# Get your key: https://ai.google.dev/
gemini_key = os.environ.get("GEMINI_API_KEY", "")
# Get your key: https://platform.openai.com/signup
open_ai_key = os.environ.get("OPENAI_API_KEY", "")

In [5]:
# Input
#
# Raw strings (r"...") keep the Windows backslashes literal. In a normal string
# "\T", "\G" and "\W" are invalid escape sequences: they happen to be kept as-is
# today but trigger a warning on recent Python versions.

gdp_img = r"E:\Torchcode\Test_data\GPG_test\GenderPaygapExample_Graph.png"
pdf_loc = [
    r"E:\Torchcode\Test_data\GPG_test\WEF_GGGR_2014.pdf",
    r"E:\Torchcode\Test_data\GPG_test\WEF_GGGR_2015.pdf",
    r"E:\Torchcode\Test_data\GPG_test\WEF_GGGR_2016.pdf",
    r"E:\Torchcode\Test_data\GPG_test\WEF_GGGR_2017.pdf",
]
# This reassignment overrides the full list above, so only the 2015 report is processed.
pdf_loc = [r"E:\Torchcode\Test_data\GPG_test\WEF_GGGR_2015.pdf"]

In [6]:
# Display the list of PDF paths that the following cells work from.
pdf_loc


['E:\\Torchcode\\Test_data\\GPG_test\\WEF_GGGR_2015.pdf']

In [7]:
# Code to get image information

def input_image_setup(file_loc):
    """Read an image file and wrap it as an inline-data part for a Gemini prompt.

    Args:
        file_loc: Path (str or path-like) of the image file to read.

    Returns:
        A one-element list containing a dict with two keys:
        "mime_type" (guessed from the file extension) and
        "data" (the raw bytes of the file).

    Raises:
        FileNotFoundError: If ``file_loc`` does not exist.
    """
    import mimetypes
    from pathlib import Path

    if not (img := Path(file_loc)).exists():
        raise FileNotFoundError(f"Could not find image: {img}")

    # Derive the MIME type from the extension instead of always claiming JPEG
    # (the notebook's own input is a .png). Unknown or non-image extensions
    # fall back to "image/jpeg", the value that used to be hard-coded.
    mime_type, _ = mimetypes.guess_type(img.name)
    if mime_type is None or not mime_type.startswith("image/"):
        mime_type = "image/jpeg"

    return [{"mime_type": mime_type, "data": img.read_bytes()}]

def get_image_info(image_loc):
    """Ask the Gemini vision model for a prose summary of a chart image.

    Args:
        image_loc: Location of the image; forwarded to ``input_image_setup``.

    Returns:
        The text of the model's response, as a ``str``.
    """
    # The prompt is sent as three parts: instruction, image, question.
    system_instruction = """ You are an expert in understanding graphs/visualisation.
    You will receive images with visualisation and stats you will have to provide summary of graph in paragraph."""
    follow_up_question = "Give a brief context"

    # Uses the notebook-level `gemini_key`.
    genai.configure(api_key=gemini_key)
    vision_model = genai.GenerativeModel(
        model_name="gemini-pro-vision",
        generation_config=dict(temperature=0.4, top_p=1, top_k=32, max_output_tokens=4096),
    )

    # input_image_setup returns a list; only its first element is sent.
    image_part = input_image_setup(image_loc)[0]
    reply = vision_model.generate_content([system_instruction, image_part, follow_up_question])
    return str(reply.text)

# Summarise the chart image; later cells reuse this summary text as their input.
gemini_img_output = get_image_info(gdp_img)
print(gemini_img_output)

 The graph shows the Global Gender Gap Index evolution from 2006 to 2014 by region. The index measures the gap between men and women in terms of economic participation, educational attainment, health and survival, and political empowerment. Scores range from 0 to 1, with 1 representing full equality.

In 2006, the highest-scoring region was North America, with a score of 0.709. The lowest-scoring region was Sub-Saharan Africa, with a score of 0.549. In 2014, the highest-scoring region was Europe and Central Asia, with a score of 0.761. The lowest-scoring region was the Middle East and North Africa, with a score of 0.560.

Overall, the graph shows that there has been progress in closing the gender gap in all regions of the world. However, there is still a long way to go, as no region has achieved full equality.


In [61]:
# Load a pre-trained tokenizer and model from a local Hugging Face checkout.
# Raw strings keep the Windows backslashes literal ("\G" and "\g" are invalid
# escape sequences in a normal string and trigger a warning on recent Python versions).
# NOTE: trust_remote_code=True allows Python code shipped inside the model
# directory to be executed — only point this at a checkout you trust.
tokenizer = AutoTokenizer.from_pretrained(r"E:\Gitcloned\gte-base-en-v1.5")
model = AutoModel.from_pretrained(r"E:\Gitcloned\gte-base-en-v1.5", trust_remote_code=True)

In [62]:


# Function to extract text from a PDF
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of the PDF at ``pdf_path``, joined in page order.

    Pages are concatenated directly, with no separator between them.
    """
    page_texts = [page.extract_text() for page in PdfReader(pdf_path).pages]
    return "".join(page_texts)

# Function to split text into chunks
def split_text_into_chunks(text, chunk_size=1000, chunk_overlap=200):
    """Split ``text`` into overlapping chunks measured in whitespace-separated words.

    Consecutive chunks start ``chunk_size - chunk_overlap`` words apart, so each
    chunk repeats the last ``chunk_overlap`` words of the previous one.

    Args:
        text: The text to split.
        chunk_size: Maximum number of words per chunk (must be positive).
        chunk_overlap: Number of words shared between neighbouring chunks
            (must satisfy ``0 <= chunk_overlap < chunk_size``).

    Returns:
        A list of chunk strings, each made of words joined by single spaces.
        An empty or whitespace-only ``text`` yields an empty list.

    Raises:
        ValueError: If ``chunk_size`` or ``chunk_overlap`` is out of range.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    # Without this check an overlap larger than the chunk size gives a negative
    # step (silently producing no chunks) and an equal one gives a zero step
    # (an opaque error from range()).
    if not 0 <= chunk_overlap < chunk_size:
        raise ValueError(
            f"chunk_overlap must satisfy 0 <= chunk_overlap < chunk_size, "
            f"got chunk_overlap={chunk_overlap}, chunk_size={chunk_size}"
        )

    words = text.split()
    step = chunk_size - chunk_overlap
    return [' '.join(words[start:start + chunk_size]) for start in range(0, len(words), step)]

# Function to convert text chunks to embeddings
def text_to_embeddings(chunks, tokenizer, model):
    """Embed each text chunk using the first-token ([CLS]) hidden state of ``model``.

    Args:
        chunks: Iterable of strings; each one is tokenized and embedded on its own.
        tokenizer: Callable invoked as ``tokenizer(chunk, return_tensors="pt",
            padding=True, truncation=True, max_length=512)``; its result is
            unpacked as keyword arguments for ``model``.
        model: Callable whose result exposes ``last_hidden_state``.

    Returns:
        A C-contiguous ``float32`` numpy array with the per-chunk embeddings
        stacked row-wise (float32 is the dtype FAISS indexes accept).

    Raises:
        ValueError: If ``chunks`` is empty, since there is nothing to stack.
    """
    embeddings_list = []
    for chunk in chunks:
        inputs = tokenizer(chunk, return_tensors="pt", padding=True, truncation=True, max_length=512)
        with torch.no_grad():
            outputs = model(**inputs)
        cls_state = outputs.last_hidden_state[:, 0, :]  # Using the [CLS] token representation
        # Cast to float32 and move to the CPU before converting: .numpy() fails
        # on tensors that live on another device or use bfloat16.
        embeddings_list.append(cls_state.detach().to(torch.float32).cpu().numpy())

    if not embeddings_list:
        raise ValueError("text_to_embeddings received no chunks to embed")

    # Combine all embeddings into a single array
    return np.ascontiguousarray(np.vstack(embeddings_list), dtype=np.float32)


# # Now you can perform searches on the GPU index
# def search_index(query_text, tokenizer, model, gpu_index_flat, k=5):
#     # Convert query text to embeddings
#     inputs = tokenizer(query_text, return_tensors="pt", padding=True, truncation=True, max_length=512)
#     with torch.no_grad():
#         outputs = model(**inputs)
#     query_embedding = outputs.last_hidden_state[:, 0, :].numpy()
    
#     # Perform the search
#     distances, indices = gpu_index_flat.search(query_embedding, k)
    
#     return distances, indices


In [69]:
# Function to perform search on the FAISS index
def search_index(query_text, tokenizer, model, gpu_index_flat, chunks, k=10):
    """Return the text chunks whose embeddings are closest to ``query_text``.

    Args:
        query_text: The query string to embed.
        tokenizer: Callable invoked as ``tokenizer(query_text, return_tensors="pt",
            padding=True, truncation=True, max_length=512)``.
        model: Callable whose result exposes ``last_hidden_state``.
        gpu_index_flat: Index object exposing ``search(query_embedding, k)`` and
            returning ``(distances, indices)``.
        chunks: Sequence of text chunks; position ``i`` is expected to be the
            text behind index entry ``i``.
        k: Number of neighbours to request from the index.

    Returns:
        A list of at most ``k`` chunks, in the order the index returned them.
    """
    # Convert query text to embeddings
    inputs = tokenizer(query_text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    cls_state = outputs.last_hidden_state[:, 0, :]
    # FAISS expects a contiguous float32 matrix.
    query_embedding = np.ascontiguousarray(
        cls_state.detach().to(torch.float32).cpu().numpy(), dtype=np.float32
    )

    # Perform the search
    _distances, indices = gpu_index_flat.search(query_embedding, k)

    # Retrieve the corresponding documents. FAISS fills missing neighbours with
    # -1 when the index holds fewer than k vectors; without the bounds check
    # chunks[-1] would silently return the last chunk as if it were a match.
    return [chunks[i] for i in indices[0] if 0 <= i < len(chunks)]


In [70]:
# Only the first entry of pdf_loc is processed by the cells below.
pdf_path = pdf_loc[0]

In [72]:
# # Path to your PDF
# pdf_path = 'E:\\Torchcode\\Test_data\\GPG_test\\WEF_GGGR_2015.pdf'

# Build the retrieval index for one PDF: extract text -> chunk -> embed -> add to FAISS.
# The statements below depend on each other in this exact order.

# Extract text from the PDF
text = extract_text_from_pdf(pdf_path)
print("Text extraction done.")

# Split the text into chunks (function defaults: 1000 words per chunk, 200 words overlap)
chunks = split_text_into_chunks(text)
print("Text split into chunks done.")

# Load a pre-trained tokenizer and model from Hugging Face
# NOTE(review): the same local path is already loaded into `tokenizer`/`model` in an
# earlier cell, so this reload looks redundant — confirm before removing.
tokenizer = AutoTokenizer.from_pretrained("E:\\Gitcloned\\gte-base-en-v1.5")
model = AutoModel.from_pretrained("E:\\Gitcloned\\gte-base-en-v1.5", trust_remote_code=True)

# Convert the text chunks to embeddings
embeddings = text_to_embeddings(chunks, tokenizer, model)
print("Text to embeddings conversion done.")

# Define the dimensionality of the vectors (embeddings)
d = embeddings.shape[1]

# Initialize GPU resources
# NOTE(review): presumably requires a GPU-enabled faiss build — confirm for the target machine.
res = faiss.StandardGpuResources()

# Create a CPU index
index_flat = faiss.IndexFlatL2(d)

# Transfer the index to GPU (the 0 is presumably the GPU device id — confirm)
gpu_index_flat = faiss.index_cpu_to_gpu(res, 0, index_flat)

# Add embeddings to the GPU index; chunks[i] is the text behind vector i
gpu_index_flat.add(embeddings)
print("Number of vectors in the GPU index:", gpu_index_flat.ntotal)

# Example query text. The search call below is commented out, and query_text is
# reassigned in a later cell, so this value is not used by the visible code.
query_text = "Gender Pay gap in 2015 was"
# distances, indices = search_index(query_text, tokenizer, model, gpu_index_flat)
# print("Indices of the nearest neighbors:", indices)
# print("Distances to the nearest neighbors:", distances)


Text extraction done.
Text split into chunks done.
Text to embeddings conversion done.
Number of vectors in the GPU index: 289


In [73]:
# Use the Gemini image summary as the query text.
# NOTE(review): no visible cell reads query_text after this point — confirm it is still needed.
query_text = gemini_img_output

In [75]:
# Function to generate response using the Gemini model
def gemini_model_with_knowledge(vector_db, chunks, user_input, gemini_key):
    """Run a two-pass Gemini pipeline around a similarity search.

    Pass 1 asks Gemini to write a paragraph from ``user_input``; that paragraph
    is used as the similarity-search query. Pass 2 sends the retrieved chunks
    together with ``user_input`` and asks for a tabular reorganisation.

    The search relies on the notebook-level ``tokenizer`` and ``model``. The
    pass-1 text and the retrieved chunks are printed along the way.

    Args:
        vector_db: Index object handed to ``search_index``.
        chunks: Text chunks handed to ``search_index``.
        user_input: The text describing the data to be updated.
        gemini_key: API key used to configure the Gemini client.

    Returns:
        The text of the second Gemini response, as a ``str``.
    """
    rewrite_instructions = """You are an expert in data updation. You have got context and data to be updated.
    your task is to write a paragraph to update the given data till 2015, which will be used to retreive data from vector database using similarity search."""
    tabulate_instructions = """You are an expert in data updation.
    your task is to reorganize the given data in a meaningful manner.Make sure you are not changing the given data points, if any datapoint is missing leave represent it with NA.The output should be in tabular format."""

    # Configure the client and build the text model.
    genai.configure(api_key=gemini_key)
    gen_model = genai.GenerativeModel(
        model_name="gemini-pro",
        generation_config=dict(temperature=0.4, top_p=1, top_k=32, max_output_tokens=4096),
    )

    # Pass 1: turn the user input into a paragraph to search with.
    draft_prompt = "\n\n".join(["", rewrite_instructions, user_input])
    search_query = str(gen_model.generate_content(draft_prompt).text)
    print(search_query)

    # Retrieval: look up the chunks closest to the generated paragraph.
    retrieved_docs = search_index(search_query, tokenizer, model, vector_db, chunks)
    print(retrieved_docs)

    # Pass 2: retrieved chunks, then the user input, then the tabulation instructions.
    final_prompt = "\n\n".join(["\n".join(retrieved_docs), user_input, tabulate_instructions])
    return str(gen_model.generate_content(final_prompt).text)

In [76]:
# The image summary becomes the user input passed to gemini_model_with_knowledge in the next cell.
user_input = str(gemini_img_output)
print(user_input)

 The graph shows the Global Gender Gap Index evolution from 2006 to 2014 by region. The index measures the gap between men and women in terms of economic participation, educational attainment, health and survival, and political empowerment. Scores range from 0 to 1, with 1 representing full equality.

In 2006, the highest-scoring region was North America, with a score of 0.709. The lowest-scoring region was Sub-Saharan Africa, with a score of 0.549. In 2014, the highest-scoring region was Europe and Central Asia, with a score of 0.761. The lowest-scoring region was the Middle East and North Africa, with a score of 0.560.

Overall, the graph shows that there has been progress in closing the gender gap in all regions of the world. However, there is still a long way to go, as no region has achieved full equality.


In [77]:


# Ensure the FAISS index and chunks are properly initialized.
# The index object's own truthiness does not say whether it holds any vectors,
# so the number of stored vectors (ntotal) is checked explicitly.
if gpu_index_flat is not None and gpu_index_flat.ntotal > 0 and chunks:
    gpt_output_with_knowledge = gemini_model_with_knowledge(gpu_index_flat, chunks, user_input, gemini_key)
    print(gpt_output_with_knowledge)
else:
    print("Error: vector_store or chunks not properly initialized.")

In 2015, the Global Gender Gap Index continued to show progress in closing the gender gap in all regions of the world. Europe and Central Asia remained the highest-scoring region, with a score of 0.765. The Middle East and North Africa remained the lowest-scoring region, but its score improved to 0.565. North America's score also improved to 0.715, while Sub-Saharan Africa's score remained relatively unchanged at 0.551. These updates indicate that while there has been progress in closing the gender gap, significant disparities persist between regions and further efforts are needed to achieve full equality.
["134 0.605 Oman 135 0.604 Egypt 136 0.599 Lebanon 138 0.598 Morocco 139 0.593 Jordan 140 0.593 Syria 143 0.568 Yemen 145 0.484ASIA AND THE PACIFIC LATIN AMERICA AND THE CARIBBEAN MIDDLE EAST AND NORTH AFRICA (Continued on next page)Part 1: Measuring the Global Gender Gap 16 | The Global Gender Gap Report 2015The region’s score has improved compared to 2014 on all subindexes except E

In [78]:
# import faiss
# import numpy as np

# from transformers import AutoTokenizer, AutoModel

# # Function to extract text from a PDF
# def extract_text_from_pdf(pdf_path):
#     # doc = fitz.open(pdf_path)
#     # text = ""
#     # for page in doc:
#     #     text += page.get_text()
#     # return text


#     # Combine text from all PDFs
#     text = ""
#     for pdf_path in pdf_paths:
#         pdf_reader = PdfReader(pdf_path)
#         for page in pdf_reader.pages:
#             text += page.extract_text()

#     return text

# # Function to convert text to embeddings
# def text_to_embeddings(text, tokenizer, model):
#     # inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
#     # with torch.no_grad():
#     #     outputs = model(**inputs)
#     # # Use the embeddings from the last hidden state
#     # embeddings = outputs.last_hidden_state.mean(dim=1).numpy()
#     # return embeddings

#     # Split text into chunks
#     text_splitter = CharacterTextSplitter(separator="\n", chunk_size=1000, chunk_overlap=200, length_function=len)
#     chunks = text_splitter.split_text(text)



#     # Tokenize and generate embeddings for each chunk
#     embeddings_list = []
#     for chunk in chunks:
#         inputs = tokenizer(chunk, return_tensors="pt", padding=True, truncation=True, max_length=512)
#         with torch.no_grad():
#             outputs = model(**inputs)
#         embeddings = outputs.last_hidden_state[:, 0, :].numpy()  # Using the [CLS] token representation
#         embeddings_list.append(embeddings)

#     # Combine all embeddings into a single array
#     embeddings_array = np.vstack(embeddings_list)

#     # Create and populate the FAISS index
#     #dimension = embeddings_array.shape[1]
#    # index = faiss.IndexFlatL2(dimension)  # Using L2 distance
#     #index.add(embeddings_array)

#     return embeddings_array,chunks



# # Path to your PDF
# pdf_path = 'E:\Torchcode\Test_data\GPG_test\WEF_GGGR_2014.pdf'

# # Extract text from the PDF
# text = extract_text_from_pdf(pdf_path)
# print("Text extraction Done")

# # Load a pre-trained tokenizer and model from Hugging Face
# tokenizer = AutoTokenizer.from_pretrained("E:\Gitcloned\gte-base-en-v1.5")
# model = AutoModel.from_pretrained("E:\Gitcloned\gte-base-en-v1.5", trust_remote_code=True)

# # Convert the text to embeddings
# embeddings,chunks = text_to_embeddings(text, tokenizer, model)

# print("Extracted text to embeddings Done")

# # Define the dimensionality of the vectors (embeddings)
# d = embeddings.shape[1]

# # Initialize GPU resources
# res = faiss.StandardGpuResources()

# # Create a CPU index
# index_flat = faiss.IndexFlatL2(d)

# # Transfer the index to GPU
# gpu_index_flat = faiss.index_cpu_to_gpu(res, 0, index_flat)

# print("Added to FAISS Done")

# # Add embeddings to the GPU index
# vector_store = gpu_index_flat.add(embeddings)
# print("Number of vectors in the GPU index:", gpu_index_flat.ntotal)



# # Now you can perform searches on the GPU index
# # Example: Find the 5 nearest neighbors for a query embedding
# # query = text_to_embeddings("Gender Pay gap in 2015 was", tokenizer, model)
# # distances, indices = gpu_index_flat.search(query)

# # # Print the results
# # print("Indices of the nearest neighbors:")
# # print(indices)
# # print("Distances to the nearest neighbors:")
# # print(distances)


In [79]:
# # Function to perform search and generate response using Gemini model
# def gemini_model_with_knowledge(vector_db, chunks, user_input, gemini_key):
#     # Load tokenizer and model for user input
#     tokenizer = AutoTokenizer.from_pretrained("E:\Gitcloned\gte-base-en-v1.5")
#     model = AutoModel.from_pretrained("E:\Gitcloned\gte-base-en-v1.5", trust_remote_code=True)

#     # Tokenize and generate embeddings for the user input query
#     inputs = tokenizer(user_input, return_tensors="pt", padding=True, truncation=True, max_length=512)
#     with torch.no_grad():
#         outputs = model(**inputs)
#     query_embedding = outputs.last_hidden_state[:, 0, :].numpy()  # Using the [CLS] token representation

#     # Perform search on the vector database
#     D, I = vector_db.search(query_embedding)  # Retrieve top 5 nearest neighbors
#     retrieved_docs = [chunks[i] for i in I[0]]


#     genai.configure(api_key=gemini_key)

#     # Set up the model
#     generation_config = {
#         "temperature":0.4,
#         "top_p":1,
#         "top_k":32,
#         "max_output_tokens":4096,
#     }
    
#     # Configure and use the Gemini model
#     genai.configure(api_key=gemini_key)
#     model = genai.GenerativeModel(model_name="gemini-pro", generation_config= generation_config)

#     # Create the prompt with retrieved documents and user input
#     template = ("""You are an expert in data updation. You have got context and data to be updated.
#         Task is to understand the context of the data and update the data till 2015 with your own knowledge
#         and return JSON output""")

#     combined_input = "\n".join(retrieved_docs) + "\n\n" + template + "\n\n" + user_input

#     # Generate response
#     response = model.generate_content(combined_input)

#     return response.text

# # Example usage
# #pdf_paths = [r"E:\\Torchcode\\Test_data\\test_pdf.pdf"]  # List of PDF file paths
# #vector_store, chunks = convert_pdf_to_vector_db(pdf_paths)
# #gemini_key = "your_gemini_api_key"
# user_input = "Update the Data till 2015"

# gpt_output_with_knowledge = gemini_model_with_knowledge(vector_store, chunks, user_input, gemini_key)
# print(gpt_output_with_knowledge)

In [None]:
# # Function to generate response using the Gemini model
# def gemini_model_with_knowledge(vector_db, chunks, user_input, gemini_key):
#     # Search for relevant documents in the vector database
#     retrieved_docs = search_index(user_input, tokenizer, model, vector_db, chunks)
    
#     # Configure the Gemini API
#     genai.configure(api_key=gemini_key)
    
#     # Set up the model configuration
#     generation_config = {
#         "temperature": 0.4,
#         "top_p": 1,
#         "top_k": 32,
#         "max_output_tokens": 4096,
#     }
    
#     # Initialize the Gemini model
#     gen_model = genai.GenerativeModel(model_name="gemini-pro", generation_config=generation_config)
    
#     # Create the prompt with retrieved documents and user input
#     template = """You are an expert in data updation. You have got context and data to be updated.
#     Task is to understand the context of the data and update the data till 2015 with your own knowledge
#     and return JSON output."""
    
#     combined_input = "\n".join(retrieved_docs) + "\n\n" + template + "\n\n" + user_input
    
#     # Generate response
#     response = gen_model.generate_content(combined_input)

    
#     return str(response.text)

In [88]:
def print_all_data_from_faiss(vector_db, chunks):
    """Print the text chunk behind every stored vector, then all of them concatenated.

    Args:
        vector_db: Index object exposing ``ntotal``, the number of stored vectors.
        chunks: Sequence of text chunks; position ``i`` is expected to be the
            text behind vector ``i``.

    Returns:
        None. Each chunk is printed preceded by a blank line, followed by one
        final line holding all printed chunks joined with no separator.
    """
    # Only the number of stored vectors is needed, so the vectors themselves
    # are not reconstructed (copied back out of the index). Slicing also keeps
    # this from failing if the index reports more vectors than there are chunks.
    stored_chunks = chunks[:vector_db.ntotal]

    for chunk in stored_chunks:
        print(f"\n{chunk}")

    print("".join(stored_chunks))
# Print all chunk text associated with the FAISS index built earlier.
print_all_data_from_faiss(gpu_index_flat, chunks)


The Global Gender Gap Report 2015Insight Report 10th Anniversary EditionInsight Report The Global Gender Gap Report 2015 10th Anniversary EditionThe Global Gender Gap Report 2015 is published by the World Economic Forum. AT THE WORLD ECONOMIC FORUM Professor Klaus Schwab Founder and Executive Chairman Richard Samans Head of the Centre for the Global Agenda, Member of the Managing Board Saadia Zahidi Head of Employment and Gender Initiatives, Member of the Executive Committee Yasmina Bekhouche Project Lead, Gender Parity Initiative Paulina Padilla Ugarte Specialist, Employment and Gender Initiatives Vesselina Ratcheva Data Analyst, Employment and Gender Parity Initiatives AT HARVARD UNIVERSITY Professor Ricardo Hausmann Director of the Center for International Development (CID) and Professor of the Practice of Economic Development at the Harvard Kennedy School. AT THE UNIVERSITY OF CALIFORNIA, BERKELEY Professor Laura D’Andrea Tyson Director of the Institute for Business and Social Imp