# Using OpenAI

## 1. Setup

### 1.1 Installing Dependencies

In [None]:
!pip install -qq langchain==0.0.137
!pip install -qq openai==0.28
!pip install -qq tiktoken
!pip install -qq faiss-cpu
!pip install -qq pypdf

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/76.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━[0m [32m41.0/76.5 kB[0m [31m1.1 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.5/76.5 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
llmx 0.0.15a0 requires cohere, which is not installed.[0m[31m
[0m

### 1.2 Import Libraries

In [None]:
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain import OpenAI
from langchain.chains import RetrievalQA
from langchain.document_loaders import PyPDFLoader
from langchain.document_loaders import TextLoader
import openai
import getpass
import os

In [None]:
# Read the key interactively so it is not stored in the notebook source.
openAPI_key = getpass.getpass("Enter your API Key...")
# Also set the key on the openai module; later cells read it back via openai.api_key.
openai.api_key = openAPI_key

Enter your API Key...··········


### 1.3 Function for Text Wrapping

In [None]:
# Remove any earlier copy, then download the helper module that wrap_text is imported from.
# NOTE(review): this fetches code from the repository's main branch at run time and the
# notebook then imports it — consider pinning the URL to a specific commit.
!rm -f my_lib.py
!wget -qq https://github.com/senseiji/python_tools/raw/main/my_lib.py

In [None]:
from my_lib import wrap_text

## 2. Preparing Data

### 2.1 Getting the Data

In [None]:
# Download the text file and save it in the working directory as CS.txt.
!wget -qq -O CS.txt https://raw.githubusercontent.com/Meghana-Chebolu/CS/main/CS.txt

In [None]:
# Download mondee_final.txt; note that it is saved locally under the name mondee_Final.txt.
!wget -qq -O mondee_Final.txt https://raw.githubusercontent.com/Meghana-Chebolu/CS/main/mondee_final.txt

In [None]:
!wget -qq -O Jennifer.txt https://www.dropbox.com/scl/fi/p5pfm5vbd53s1pqc5n3bt/Jennifer-Johnson-Crowdstrike.txt?rlkey=vfqsbn02kx2vqd2gdqr0cquo7&dl=0

### 2.2 Converting Text File to PDF (If Required)

In [None]:
# Install the system packages needed for the text-to-PDF conversion below:
# enscript (text -> PostScript) and ghostscript (used for the ps2pdf step).
!apt-get install -y -qq enscript
!apt-get install -y -qq ghostscript

In [None]:
def text2Pdf(text_File):
  output_File = f"{text_File.split('.')[0]}.pdf"
  !enscript -p output.ps $text_File
  !ps2pdf output.ps $output_File
  !rm output.ps
  return output_File

# Convert each downloaded text file to PDF; the PDF paths returned by text2Pdf are not kept.
text_Files = ['CS.txt', 'mondee_Final.txt', 'Jennifer.txt']
for text_File in text_Files:
  text2Pdf(text_File)

[ 4 pages * 1 copy ] left in output.ps
49 lines were wrapped
6 non-printable characters
[ 12 pages * 1 copy ] left in output.ps
165 lines were wrapped
7 non-printable characters
[ 3 pages * 1 copy ] left in output.ps
13 lines were wrapped
6 non-printable characters


### 2.3 PDF processing using Poppler Utils (If Required)

In [None]:
### Install Poppler

# !apt-get install -y -qq poppler-utils

In [None]:
### Getting Text from PDF

# !pdftotext input.pdf output.txt

In [None]:
### Merging Two PDFs

# !pdfunite input1.pdf input2.pdf output.pdf

### 2.4 Loading the data

In [None]:
def doc_Loader(file_Path):
  """Load a document, choosing the loader from the file extension.

  Args:
    file_Path: Path to a ``.pdf`` or ``.txt`` file (the extension is matched case-insensitively).

  Returns:
    ``PyPDFLoader(file_Path).load_and_split()`` for PDF files,
    ``TextLoader(file_Path).load()`` for text files.

  Raises:
    ValueError: If the extension is neither ``.pdf`` nor ``.txt``.
  """
  lowered_Path = file_Path.lower()
  if lowered_Path.endswith('.pdf'):
    return PyPDFLoader(file_Path).load_and_split()
  if lowered_Path.endswith('.txt'):
    return TextLoader(file_Path).load()
  # Fail here instead of returning None, which would only surface later as an
  # unrelated error in the cell that indexes or iterates the result.
  raise ValueError(f"Unsupported file type for {file_Path!r}: expected a .pdf or .txt file")

In [None]:
# NOTE(review): no visible cell downloads or creates 'Aditya_verma.pdf' or 'modules_Basic_SAP.txt'
# (the cells above fetch CS.txt, mondee_Final.txt and Jennifer.txt) — presumably these files
# must be uploaded to the working directory before this cell is run.
resume_Data = doc_Loader('Aditya_verma.pdf')
modules_Data = doc_Loader('modules_Basic_SAP.txt')

In [None]:
# One splitter instance is shared by both documents.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 10000,
    chunk_overlap = 100,
)

# SAP-module reference: only the first loaded document is split.
source_Data = text_splitter.split_text(modules_Data[0].page_content)

# Resume: join the text of all pages (blank line between pages) and split the combined
# text, instead of calling text_splitter.split_documents(resume_Data).
context = "\n\n".join([str(page.page_content) for page in resume_Data])
target_Data = text_splitter.split_text(context)

### 2.5 Storing the Data in Vector Store

In [None]:
# Embedding client, configured with the key stored on the openai module earlier.
embeddings = OpenAIEmbeddings(openai_api_key = openai.api_key)

In [None]:
# Build the FAISS index from the resume chunks (target_Data) only; source_Data is not indexed.
vector_Store_Index = FAISS.from_texts(target_Data, embeddings)

## 3. Model

In [None]:
# LangChain OpenAI LLM wrapper, created with the key entered in the setup section.
llm = OpenAI(openai_api_key = openAPI_key)

In [None]:
# Retrieval QA chain: documents come from the FAISS index's retriever and are passed to
# the LLM using the "stuff" chain type.
qa = RetrievalQA.from_chain_type(llm = llm,
                                 chain_type = "stuff",
                                 retriever = vector_Store_Index.as_retriever()
                                 )

## 4. Q&A

In [None]:
def run_qa(q):
    """Ask the retrieval QA chain a question and show the wrapped answer.

    Fixed formatting instructions are appended to ``q`` before it is sent to ``qa``.
    The answer is passed to ``wrap_text`` (130 characters per line) and is also
    returned unmodified.
    """
    formatting_instructions = " Give bulleted points wherever necessary. Go through all the data provided before answering. Do not repeat the statements"
    answer = qa.run(q + formatting_instructions)
    wrap_text(answer, numchars = 130)
    return answer

In [None]:
# The chain retrieves from vector_Store_Index, which was built from the resume chunks only,
# so the answer can only cover what the resume mentions.
run_qa("list all the sap modules")

 Aditya Verma has experience with: 
- SAP ECC 6.0
- SAP PP
- SAP SD


' Aditya Verma has experience with: \n- SAP ECC 6.0\n- SAP PP\n- SAP SD'

In [None]:
# Question asking for a per-project summary of the document.
# The backslash continuations join the three source lines into one string; the leading
# indentation of the continued lines is part of that string.
prompt = "tell me about all the projects in the document seperately in the following format : \
          Project Title, Role, Project Duration, Project Location, Client, Company at which project is done, and get all the SAP modules mentioned in the project.\
          Do not give contributions or description or bulleted points of work done, just go through those sections and get the SAP modules used"

In [None]:
# Ask the retrieval chain the per-project question defined in `prompt`.
run_qa(prompt)

 
1. Shell SAP Support & Maintenance: Role - ABAP Developer, Duration - Nov 2021 - Current, Project Location - Offshore, TCS Noida,
Client - Shell, Company - TCS, SAP Modules used - Classic Reports, ALV Reports, Smart Forms, Transactions, BAPI, BADI,
Enhancement, Data Migrations, RFC, SD & MM. 
2. Shell MECM Administrator: Role - MECM Administrator, Duration - Mar 2020 - Nov 2021, Project Location - Offshore, TCS Noida,
Client - Shell, Company - TCS, SAP Modules used - Windows OS, WSUS, MS SQL Server, C# .net, Asp.net and .net Framework.


' \n1. Shell SAP Support & Maintenance: Role - ABAP Developer, Duration - Nov 2021 - Current, Project Location - Offshore, TCS Noida, Client - Shell, Company - TCS, SAP Modules used - Classic Reports, ALV Reports, Smart Forms, Transactions, BAPI, BADI, Enhancement, Data Migrations, RFC, SD & MM. \n2. Shell MECM Administrator: Role - MECM Administrator, Duration - Mar 2020 - Nov 2021, Project Location - Offshore, TCS Noida, Client - Shell, Company - TCS, SAP Modules used - Windows OS, WSUS, MS SQL Server, C# .net, Asp.net and .net Framework.'

In [None]:
# run_qa("What is the most important service of crowdstrike?")

In [None]:
# run_qa("Give me the education and experience details of jennifer johnson")

## 5. Data Extraction

In [None]:
# openai is already imported in the import cell; this repeated import is harmless.
import openai

# Assuming you've obtained information and answers from the PDFs
# This rebinds `prompt`, replacing the question string used in the Q&A section.
# NOTE(review): source_Data and target_Data are the values returned by split_text
# (presumably lists of strings), so the f-string inserts their repr rather than plain text;
# source_Data is interpolated twice and target_Data three times — confirm this is intended.
prompt = f'Use the {source_Data} as the source file for recognizing SAP Modules. \
        In {target_Data} concentrate on the projects data present. You should get the following details from each project in {target_Data} : \
        [Project Title, Role, Project Start Date, Project End Date, Project Location, Client, Company where project is done (call it "Company")]\
        Also find all the SAP Modules that are used and mentioned in the given project. Remember that {source_Data} contains the list of SAP Modules. \
        Extract information about each of the projects mentioned in {target_Data} and list them seperately in the above specified list format including \
        the SAP modules used. Return the data in a json format'


def extract_SAP_Data(prompt, model = "gpt-3.5-turbo-1106", max_tokens = 500):
  """Send `prompt` to the OpenAI chat completion API and return the reply text.

  Args:
    prompt: Full instruction text; it is sent as a single "system" message.
    model: Chat model name.
    max_tokens: Upper limit on the length of the generated reply. Raise it if the
      returned JSON is cut off.

  Returns:
    The unwrapped content of the first choice's message. A wrapped copy is shown via
    wrap_text for reading only, because wrapping can insert line breaks inside JSON
    string values.
  """
  response = openai.ChatCompletion.create(
      model = model, temperature = 0,
      messages = [
          {
              "role" : "system",
              "content" : prompt
          }
      ],
      max_tokens = max_tokens
  )

  data_Extracted = response['choices'][0]['message']['content']
  # Display and return are kept separate, as in run_qa. Returning wrap_text's result
  # discarded the extracted text (the recorded run shows no return value, so wrap_text
  # presumably returns None).
  wrap_text(data_Extracted)
  return data_Extracted

# Run the extraction with the prompt that embeds source_Data and target_Data.
extract_SAP_Data(prompt)

```json
[
    {
        "Project Title": "SHELL SAP SUPPORT & MAINTENANCE",
        "Role": "ABAP Developer",
        "Project Start Date": "Nov 2021",
        "Project End Date": "Current",
        "Project Location": "Offshore, TCS Noida",
        "Client": "Shell",
        "Company": "TCS",
        "SAP Modules Used": ["SAP ECC 6.0", "SAP SD", "SAP MM", "SAP Smart Forms", "SAP BAPI", "SAP
BADI", "SAP Data Migration"]
    },
    {
        "Project Title": "SHELL MECM ADMINISTRATOR",
        "Role": "MECM Administrator",
        "Project Start Date": "Mar 2020",
        "Project End Date": "Nov 2021",
        "Project Location": "Offshore, TCS Noida",
        "Client": "Shell",
        "Company": "TCS",
        "SAP Modules Used": []
    }
]
```


# Using Gemini

## 1. Setup

### 1.1 Install Dependencies

In [None]:
!pip install -q --upgrade google-generativeai langchain-google-genai

In [None]:
!pip install -U -qq google.generativeai
!pip install -qq chromadb
!pip install -qq langchain==0.0.137
!pip install -qq langchain-google-genai
!pip install -qq tiktoken
!pip install -qq faiss-cpu
!pip install -qq pypdf

### 1.2 Import Libraries

In [None]:
import google.generativeai as genai
# import google.ai.generativelanguage as glm
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_google_genai import GoogleGenerativeAIEmbeddings

from langchain_core.prompt_values import ChatPromptValue, PromptValue
from langchain.document_loaders import PyPDFLoader
from langchain.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.vectorstores import Chroma

from langchain.chains import RetrievalQA
from langchain import PromptTemplate
from langchain.chains.question_answering import load_qa_chain

import getpass
import os
import numpy as np
import pandas as pd

### 1.3 Custom Display and API Key

In [None]:
# Remove any earlier copy, then download the helper module that wrap_text is imported from.
# NOTE(review): this fetches code from the repository's main branch at run time and the
# notebook then imports it — consider pinning the URL to a specific commit.
!rm -f my_lib.py
!wget -qq https://github.com/senseiji/python_tools/raw/main/my_lib.py

In [None]:
from my_lib import wrap_text

In [None]:
# Read the Gemini key interactively so it is not stored in the notebook source,
# then configure the google.generativeai client with it.
gemini_Api_Key = getpass.getpass("Enter Gemini API Key : ")
genai.configure(api_key = gemini_Api_Key)

Enter Gemini API Key : ··········


In [None]:
# Quick check of the configured client: ask gemini-pro one question directly
# (no retrieval involved) and show the wrapped answer text.
demo_Basic = genai.GenerativeModel(model_name = 'gemini-pro')
wrap_text(demo_Basic.generate_content("what are all the available types of SAP modules").text)

## 2. Preparing Data

### 2.1 Loading Data

In [None]:
def doc_Loader(file_Path):
  """Load and split a document, choosing the loader from the file extension.

  Args:
    file_Path: Path to a ``.pdf`` or ``.txt`` file (the extension is matched case-insensitively).

  Returns:
    ``PyPDFLoader(file_Path).load_and_split()`` for PDF files,
    ``TextLoader(file_Path).load_and_split()`` for text files.

  Raises:
    ValueError: If the extension is neither ``.pdf`` nor ``.txt``.
  """
  lowered_Path = file_Path.lower()
  if lowered_Path.endswith('.pdf'):
    return PyPDFLoader(file_Path).load_and_split()
  if lowered_Path.endswith('.txt'):
    # Alternative kept from the original: PyPDFLoader(text2Pdf(file_Path)).load_and_split()
    return TextLoader(file_Path).load_and_split()
  # Fail here instead of returning None, which would only surface later as an
  # unrelated error in the cell that iterates the result.
  raise ValueError(f"Unsupported file type for {file_Path!r}: expected a .pdf or .txt file")

In [None]:
# NOTE(review): no visible cell downloads or creates 'modules_Basic_SAP.txt' or
# 'Aditya_verma.pdf' — presumably they must already be in the working directory.
source_Pages = doc_Loader('modules_Basic_SAP.txt')
target_Pages = doc_Loader('Aditya_verma.pdf')

In [None]:
# Splitter shared by the source and target documents.
text_splitter = RecursiveCharacterTextSplitter(chunk_size = 10000, chunk_overlap = 100)
def text_Chunker(data_Pages, text_splitter):
  """Merge the text of all pages and split the merged text into chunks.

  Args:
    data_Pages: Iterable of objects exposing a ``page_content`` attribute.
    text_splitter: Object exposing ``split_text(text)``.

  Returns:
    Whatever ``text_splitter.split_text`` returns for the merged text.
  """
  page_texts = [str(single_page.page_content) for single_page in data_Pages]
  # Pages are separated by a blank line in the merged text.
  merged_text = "\n\n".join(page_texts)
  return text_splitter.split_text(merged_text)
# Chunk the SAP-module reference and the resume with the same splitter.
source_Chunks = text_Chunker(source_Pages, text_splitter)
target_Chunks = text_Chunker(target_Pages, text_splitter)

### 2.2 Creating Vectors

In [None]:
# Rebinds `embeddings` to the Google embedding client (model "models/embedding-001"),
# authenticated with the Gemini key entered above.
embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001", google_api_key = gemini_Api_Key)

In [None]:
# Inspection only: shows which embeddings class is in use.
type(embeddings)

langchain_google_genai.embeddings.GoogleGenerativeAIEmbeddings

In [None]:
# Index the SAP-module chunks (source_Chunks) in FAISS and keep only its retriever,
# created with search_kwargs k = 3. Despite its name, vector_Index is a retriever.
vector_Index = FAISS.from_texts(source_Chunks, embeddings).as_retriever(search_kwargs = {"k" : 3})

## 3. Gemini Model and Retrieval Chain

In [None]:
# This import sits outside the import cell; GoogleGenerativeAI is only used here.
from langchain_google_genai import GoogleGenerativeAI
# Rebinds `llm` to the gemini-pro LLM wrapper.
llm = GoogleGenerativeAI(model = "gemini-pro", google_api_key = gemini_Api_Key)

In [None]:
# Inspection only: display the configured LLM wrapper.
llm

GoogleGenerativeAI(client= genai.GenerativeModel(
   model_name='models/gemini-pro',
   generation_config={}.
   safety_settings={}
), model='gemini-pro', google_api_key=SecretStr('**********'))

In [None]:
# Direct call to the LLM wrapper (no retrieval); the answer is stored but not displayed.
result = llm.invoke('give me list of all the available SAP modules without any description')

In [None]:
# Chat-model wrapper for gemini-pro with a low temperature (0.1); this is the model
# handed to the QA chain below.
model = ChatGoogleGenerativeAI(model = "gemini-pro",
                               google_api_key = gemini_Api_Key,
                               temperature = 0.1,
                               convert_system_message_to_human = True,
                               )

In [None]:
# Direct call to the chat model (no retrieval). This rebinds `result`, replacing the
# previous LLM answer; the reply is stored but not displayed.
result = model.invoke("Hey tell me about SAP")

In [None]:
# load_qa_chain() only builds a document-combining chain and has no `retriever` or
# `return_source_documents` parameters, so the original call failed with a ValidationError.
# RetrievalQA.from_chain_type() accepts both and is the constructor already used for the
# OpenAI chain earlier in this notebook.
# NOTE(review): `model` comes from langchain-google-genai while langchain is pinned to
# 0.0.137 — confirm that this langchain version accepts the model, or update the pin.
qa_chain = RetrievalQA.from_chain_type(llm = model,
                                       chain_type = "stuff",
                                       retriever = vector_Index,
                                       return_source_documents = True
                                       )

ValidationError: ignored