# Data Loading

In [1]:
import os
import pandas as pd

In [5]:
def read_text_file(filepath, encoding='utf-8'):
    """Read a text file and return its entire contents as a single string.

    Args:
        filepath: Path of the file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8 so
            the result does not depend on the platform's locale encoding
            (which differs between Windows, Linux and macOS).

    Returns:
        The full contents of the file as a ``str``.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file's bytes are not valid in ``encoding``.
    """
    # Explicit encoding: without it, open() falls back to the locale default.
    with open(filepath, 'r', encoding=encoding) as file:
        return file.read()

# Data Preprocessing

In [6]:
import re
import nltk
import spacy
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

In [7]:
# Fetch the NLTK data packages by name; each call is a no-op when the package
# is already up to date. 'stopwords' backs stopwords.words('english') below.
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

# !python -m spacy download en_core_web_sm

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Akash\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Akash\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Akash\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [8]:
# spaCy pipeline that preprocess_text calls for tokenization and lemmas.
nlp = spacy.load("en_core_web_sm")

In [9]:
# English stop words from NLTK, stored as a set for the membership test in
# preprocess_text (despite the name, this is a set, not a list).
stopword_list = set(stopwords.words('english'))

In [10]:
# NOTE(review): not referenced in the cells visible here; preprocess_text takes
# its lemmas from spaCy (token.lemma_). Confirm whether this is still needed.
lemmatizer = WordNetLemmatizer()

In [11]:
def preprocess_text(text):
    """Normalize raw text into a space-separated string of cleaned lemmas.

    Steps, in order:
      1. Tokenize ``text`` with the module-level spaCy pipeline ``nlp``.
      2. Drop tokens whose lowercase form is in ``stopword_list``.
      3. Re-join the surviving tokens with spaces, parse that string again
         and take each token's lemma.
      4. Strip ``#`` and ``*`` characters and runs of two or more hyphens
         from every lemma, then discard lemmas that are empty or whitespace.

    Args:
        text: The raw input string.

    Returns:
        The remaining lemmas joined by single spaces.
    """
    # Pass 1: tokenize and filter out stop words (case-insensitive match).
    kept_words = []
    for tok in nlp(text):
        if tok.text.lower() not in stopword_list:
            kept_words.append(tok.text)

    # Pass 2: lemmatize the stop-word-free text.
    filtered_doc = nlp(' '.join(kept_words))

    pieces = []
    for tok in filtered_doc:
        lemma = re.sub(r'[#*]', '', tok.lemma_)
        lemma = re.sub(r'[-]{2,}', '', lemma)
        # Skip anything that became empty once the markup characters were removed.
        if lemma.strip():
            pieces.append(lemma)

    return ' '.join(pieces)

In [12]:
# Folder holding the project text files. Set the CENTRACOMM_DATA_DIR
# environment variable to run the notebook on another machine; the fallback
# keeps the original hard-coded location so existing runs are unaffected.
SAMPLE_DATA_DIR = os.environ.get(
    'CENTRACOMM_DATA_DIR',
    'C:\\Users\\Akash\\Desktop\\GenAI\\CentraComm\\CentraComm\\data\\project',
)
sample_text = read_text_file(os.path.join(SAMPLE_DATA_DIR, 'alex_johnson.txt'))

In [13]:
# Stop-word removal, lemmatization and markup cleanup of the loaded file.
preprocessed_sample_text = preprocess_text(sample_text)

In [14]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [15]:
def recursive_character_splitter(text, chunk_size=512, chunk_overlap=64):
    """Split ``text`` into overlapping chunks with RecursiveCharacterTextSplitter.

    Args:
        text: The string to split.
        chunk_size: Maximum chunk length, measured with ``len`` (characters).
        chunk_overlap: Overlap between consecutive chunks, in the same unit.

    Returns:
        The list produced by ``create_documents`` for the single input text.
    """
    splitter_options = {
        'chunk_size': chunk_size,
        'chunk_overlap': chunk_overlap,
        'length_function': len,
        # Separators are treated as literal strings, not regular expressions.
        'is_separator_regex': False,
    }
    return RecursiveCharacterTextSplitter(**splitter_options).create_documents([text])

In [36]:
# Chunk the preprocessed text with the default size (512) and overlap (64).
docs = recursive_character_splitter(preprocessed_sample_text)

In [18]:
from sentence_transformers import SentenceTransformer

In [19]:
# NOTE(review): this model is not used in the cells visible here; the vector
# store below is built with OpenAIEmbeddings. Confirm whether it is still needed.
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')



In [25]:
from langchain_openai import OpenAI, OpenAIEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_core.vectorstores import VectorStoreRetriever
from langchain.chains import RetrievalQA
from getpass import getpass

In [26]:
# Prompt for the key at run time so it is never stored in the notebook source.
OPENAI_API_KEY = getpass('Enter OpenAI API Key:')

Enter OpenAI API Key: ········


In [27]:
# Embedding client passed to FAISS.from_documents to build the index.
embeddings = OpenAIEmbeddings(api_key=OPENAI_API_KEY)

In [37]:
# Build the FAISS vector store from the text chunks and the embedding client.
library = FAISS.from_documents(docs, embeddings)

In [40]:
# Query used for the direct similarity searches against the vector store.
query_1 = 'What is the document about?'

In [41]:
# Retrieve the stored chunks matching query_1 (indexed as a list below).
query_answer = library.similarity_search(query_1)

In [45]:
# Show the text of the element at index 1 of the results.
# NOTE(review): index 1 is the second entry, not the first; if the intent is
# to inspect the best match, this presumably should be [0] -- confirm.
print(query_answer[1].page_content)

Project Report title : implement Machine Learning Financial Fraud Detection prepared : Alex Johnson , Senior Data Scientist Date : May 25 , 2024 1 . introduction Financial fraud detection critical maintain integrity financial institution protect customer fraudulent activity . traditional rule - base system often fail adapt evolve fraud tactic , necessitating implementation machine learning ( ML ) technique . project report outline step take implement ML - base fraud detection , methodology use , outcome


In [46]:
# Same search as above, but each hit is a (Document, score) pair.
docs_and_scores = library.similarity_search_with_score(query_1)

In [48]:
# Display the (Document, score) pair at index 1.
# NOTE(review): index 1 is the second entry; confirm [0] was not intended.
docs_and_scores[1]

(Document(page_content='Project Report title : implement Machine Learning Financial Fraud Detection prepared : Alex Johnson , Senior Data Scientist Date : May 25 , 2024 1 . introduction Financial fraud detection critical maintain integrity financial institution protect customer fraudulent activity . traditional rule - base system often fail adapt evolve fraud tactic , necessitating implementation machine learning ( ML ) technique . project report outline step take implement ML - base fraud detection , methodology use , outcome'),
 0.54368895)

In [49]:
# Wrap the vector store as a retriever so it can be plugged into RetrievalQA.
retriever = library.as_retriever()

In [51]:
# Question-answering chain: an OpenAI LLM fed by the FAISS retriever, built
# with the 'stuff' chain type.
qa = RetrievalQA.from_chain_type(llm=OpenAI(api_key=OPENAI_API_KEY), chain_type='stuff', retriever=retriever)

In [52]:
# Question put to the retrieval QA chain.
query_2 = 'What are the technologies used in this project?'

In [53]:
# Run the chain; the printed output below shows a dict with 'query' and 'result'.
results = qa.invoke(query_2)

In [54]:
# Print the whole response dict (both the question and the generated answer).
print(results)

{'query': 'What are the technologies used in this project?', 'result': ' Machine learning and traditional rule-based systems were used in this project to develop a fraud detection model. Specific technologies mentioned include logistic regression, decision trees, random forests, gradient boosting machines, and neural networks. Additionally, data collection and preprocessing techniques, as well as feature engineering, were utilized in the development of the model.'}
