# Install required libraries
!pip install -q langchain sentence-transformers faiss-cpu chromadb transformers bitsandbytes accelerate

# Import necessary libraries
import pandas as pd
import random
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from langchain.vectorstores import Chroma
from langchain.document_loaders import PyPDFLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.llms import HuggingFacePipeline
from langchain.docstore.document import Document
from transformers import AutoTokenizer, pipeline, AutoConfig, AutoModelForCausalLM, BitsAndBytesConfig
from IPython.display import display, Markdown
import torch
from torch import cuda, bfloat16

# Prepare dataset
fraud_statements = ["The company reported inflated revenues by including sales that never occurred.", "Revenue was recognized prematurely before the actual sales occurred."]
non_fraud_statements = ["The company reported stable revenues consistent with historical trends.", "Revenue was recognized in accordance with standard accounting practices."]
data = [{"text": statement, "fraud_status": "fraud"} for statement in fraud_statements] + \
       [{"text": statement, "fraud_status": "non-fraud"} for statement in non_fraud_statements]
df = pd.DataFrame(data)

# Clean dataset
nltk.download('punkt')
nltk.download('stopwords')
def clean_text(text):
    text = re.sub(r'[^\w\s]', '', text.lower())
    tokens = word_tokenize(text)
    return ' '.join([word for word in tokens if word not in set(stopwords.words('english'))])
df['Clean_Text'] = df['text'].apply(clean_text)

# Create documents for vector store
documents = [Document(page_content=f"id:{i}\\Fillings: {row['Clean_Text']}\\Fraud_Status: {row['fraud_status']}") for i, row in df.iterrows()]

# Create embeddings and vector store
hg_embeddings = HuggingFaceEmbeddings()
persist_directory = "docs/chroma_rag/"
vectorstore = Chroma.from_documents(documents, collection_name="finance_data", embedding=hg_embeddings, persist_directory=persist_directory)

# Load and configure model
model_id = 'HuggingFaceH4/zephyr-7b-beta'
device = f'cuda:{cuda.current_device()}' if cuda.is_available() else 'cpu'
bnb_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_quant_type='nf4', bnb_4bit_use_double_quant=True, bnb_4bit_compute_dtype=bfloat16)
model_config = AutoConfig.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(model_id, trust_remote_code=True, config=model_config, quantization_config=bnb_config, device_map="auto")
tokenizer = AutoTokenizer.from_pretrained(model_id)
query_pipeline = pipeline("text-generation", model=model, tokenizer=tokenizer, torch_dtype=torch.float16, max_length=1024, device_map="auto")

# Query the model
def colorize_text(text):
    for word, color in zip(["Reasoning", "Question", "Answer"], ["blue", "red", "green"]):
        text = text.replace(f"{word}:", f"\n\n**<font color='{color}'>{word}:</font>**")
    return text

question = "Explain the concept of financial fraud."
llm = HuggingFacePipeline(pipeline=query_pipeline)
response = llm(prompt=question)
full_response = f"Question: {question}\nAnswer: {response}"
display(Markdown(colorize_text(full_response)))


In [7]:
# Install required libraries
!pip install -q langchain sentence-transformers faiss-cpu chromadb transformers bitsandbytes accelerate

# Import necessary libraries
import pandas as pd
import random
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from langchain.vectorstores import Chroma
from langchain.document_loaders import PyPDFLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.llms import HuggingFacePipeline
from langchain.docstore.document import Document
from transformers import AutoTokenizer, pipeline, AutoConfig, AutoModelForCausalLM, BitsAndBytesConfig
from IPython.display import display, Markdown
import torch
from torch import cuda, bfloat16

# Prepare dataset
fraud_statements = ["The company reported inflated revenues by including sales that never occurred.", "Revenue was recognized prematurely before the actual sales occurred."]
non_fraud_statements = ["The company reported stable revenues consistent with historical trends.", "Revenue was recognized in accordance with standard accounting practices."]
data = [{"text": statement, "fraud_status": "fraud"} for statement in fraud_statements] + \
       [{"text": statement, "fraud_status": "non-fraud"} for statement in non_fraud_statements]
df = pd.DataFrame(data)

# Clean dataset
nltk.download('punkt')
nltk.download('stopwords')
def clean_text(text):
    text = re.sub(r'[^\w\s]', '', text.lower())
    tokens = word_tokenize(text)
    return ' '.join([word for word in tokens if word not in set(stopwords.words('english'))])
df['Clean_Text'] = df['text'].apply(clean_text)

# Create documents for vector store
documents = [Document(page_content=f"id:{i}\\Fillings: {row['Clean_Text']}\\Fraud_Status: {row['fraud_status']}") for i, row in df.iterrows()]

# Create embeddings and vector store
hg_embeddings = HuggingFaceEmbeddings()
persist_directory = "docs/chroma_rag/"
vectorstore = Chroma.from_documents(documents, collection_name="finance_data", embedding=hg_embeddings, persist_directory=persist_directory)

# Load and configure model
model_id = 'HuggingFaceH4/zephyr-7b-beta'
device = f'cuda:{cuda.current_device()}' if cuda.is_available() else 'cpu'
bnb_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_quant_type='nf4', bnb_4bit_use_double_quant=True, bnb_4bit_compute_dtype=bfloat16)
model_config = AutoConfig.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(model_id, trust_remote_code=True, config=model_config, quantization_config=bnb_config, device_map="auto")
tokenizer = AutoTokenizer.from_pretrained(model_id)
query_pipeline = pipeline("text-generation", model=model, tokenizer=tokenizer, torch_dtype=torch.float16, max_length=1024, device_map="auto")

# Query the model
def colorize_text(text):
    for word, color in zip(["Reasoning", "Question", "Answer"], ["blue", "red", "green"]):
        text = text.replace(f"{word}:", f"\n\n**<font color='{color}'>{word}:</font>**")
    return text

question = "Explain the concept of financial fraud."
llm = HuggingFacePipeline(pipeline=query_pipeline)
response = llm(prompt=question)
full_response = f"Question: {question}\nAnswer: {response}"
display(Markdown(colorize_text(full_response)))


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
  hg_embeddings = HuggingFaceEmbeddings()
CUDA is required but not available for bitsandbytes. Please consider installing the multi-platform enabled version of bitsandbytes, which is currently a work in progress. Please check currently supported platforms and installation instructions at https://huggingface.co/docs/bitsandbytes/main/en/installation#multi-backend


RuntimeError: CUDA is required but not available for bitsandbytes. Please consider installing the multi-platform enabled version of bitsandbytes, which is currently a work in progress. Please check currently supported platforms and installation instructions at https://huggingface.co/docs/bitsandbytes/main/en/installation#multi-backend