<a href="https://colab.research.google.com/github/21062872/fraud-detection-rag/blob/main/RAG_Test.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Environment Setup

In [3]:
!wget https://raw.githubusercontent.com/21062872/fraud-detection-rag/main/code_files/requirements.txt

--2024-08-13 20:09:05--  https://raw.githubusercontent.com/21062872/fraud-detection-rag/main/code_files/requirements.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 124 [text/plain]
Saving to: ‘requirements.txt.1’


2024-08-13 20:09:05 (3.93 MB/s) - ‘requirements.txt.1’ saved [124/124]



In [4]:
!pip install -r requirements.txt



In [5]:
import openai
print(openai.__version__)


0.28.0


In [6]:
import pandas as pd

class DataLoader:
    def __init__(self, url):
        self.url = url
        self.dataframe = None

    def load_data(self):
        """Load data from the URL into a Pandas DataFrame."""
        self.dataframe = pd.read_csv(self.url)
        return self.dataframe

dataset_url = 'https://raw.githubusercontent.com/21062872/fraud-detection-rag/main/data_files/dataset.csv'
data_loader = DataLoader(dataset_url)
df_raw = data_loader.load_data()

In [7]:
import numpy as np
class DataPreprocessor:
    @staticmethod
    def preprocess(dataframe):
        """Preprocess the DataFrame by stripping single quotes from string values."""
        return dataframe.map(lambda x: x.strip("'") if isinstance(x, str) else x)

    @staticmethod
    def rename_columns(dataframe):
        """Rename columns of the DataFrame according to specified mappings."""
        renamed_columns = {
            'step': 'timeStep',
            'customer': 'customerId',
            'age': 'customerAge',
            'gender': 'customerGender',
            'zipcodeOri': 'originZipCode',
            'merchant': 'merchantName',
            'zipMerchant': 'merchantZipCode',
            'category': 'merchantCategory',
            'amount': 'amount',
            'fraud': 'isFraud'
        }
        return dataframe.rename(columns=renamed_columns)

    @staticmethod
    def preprocess_gender(dataframe):
        """
        Preprocess the 'customerGender' column by standardizing values and handling unknowns.

        Parameters:
        dataframe (pd.DataFrame): The DataFrame with the 'customerGender' column.

        Returns:
        pd.DataFrame: The DataFrame with the preprocessed 'customerGender' column.
        """
        dataframe['customerGender'] = dataframe['customerGender'].replace({'E': np.nan, 'U': np.nan})
        return dataframe

    @staticmethod
    def encode_gender(dataframe):
        """
        Encode the 'customerGender' column using one-hot encoding.

        Parameters:
        dataframe (pd.DataFrame): The DataFrame with the 'customerGender' column.

        Returns:
        pd.DataFrame: The DataFrame with the one-hot encoded 'customerGender' column.
        """
        return pd.get_dummies(dataframe, columns=['customerGender'], prefix='Gender', dummy_na=True)

    @staticmethod
    def encode_merchant_category(dataframe):
        """
        Encode the 'merchantCategory' column using one-hot encoding.

        Parameters:
        dataframe (pd.DataFrame): The DataFrame with the 'merchantCategory' column.

        Returns:
        pd.DataFrame: The DataFrame with the one-hot encoded 'merchantCategory' column.
        """
        return pd.get_dummies(dataframe, columns=['merchantCategory'], prefix='Category')

    @staticmethod
    def convert_to_category_codes(df, column_name):
        """
        Convert a categorical column in the DataFrame to category codes.

        Parameters:
        df (pd.DataFrame): The DataFrame containing the data.
        column_name (str): The name of the column to convert.

        Returns:
        pd.DataFrame: The DataFrame with the categorical column converted to category codes.
        """
        df[column_name] = df[column_name].astype('category').cat.codes
        return df

    @staticmethod
    def filter_records_with_unidentified_data(df, column_name):
        """
        Filters out rows where 'customerAge' is 'U'.

        Parameters:
        df (pd.DataFrame): The DataFrame to be filtered.

        Returns:
        pd.DataFrame: A DataFrame with rows where 'customerAge' is not 'U'.
        """
        filtered_df = df[df[column_name] != 'U']

        return filtered_df

data_preprocessor = DataPreprocessor()
df_processed = data_preprocessor.preprocess(df_raw)
df_processed = data_preprocessor.rename_columns(df_processed)
df_processed = data_preprocessor.preprocess_gender(df_processed)
df_processed = data_preprocessor.filter_records_with_unidentified_data(df_processed, 'customerAge')

In [8]:
# Creating a duplicate of dataframe to process RAG
df_processed_rag = df_processed.copy()

In [9]:
def process_transactions(df):
    """
    Adds 'IslargeTransaction' and 'countForCustomerSameTime' columns to the DataFrame.

    Parameters:
    df (pd.DataFrame): DataFrame containing the transaction data with 'amount', 'timeStep', and 'customerId' columns.

    Returns:
    pd.DataFrame: DataFrame with the new columns added.
    """
    # Add 'IslargeTransaction' column based on the condition
    df['IslargeTransaction'] = df['amount'].apply(lambda x: 1 if x >= 5000 else 0)

    # Calculate the count of transactions for each customer at each time step
    df['countForCustomerSameTime'] = df.groupby(['timeStep', 'customerId'])['customerId'].transform('count')

    return df

def add_avg_transaction_amount(df, customer_id_col='customerId', amount_col='amount'):
    """
    Adds a new column to the DataFrame that contains the average transaction amount
    for each customerId.

    Parameters:
    df (pd.DataFrame): The DataFrame containing the transaction data.
    customer_id_col (str): The column name representing customer IDs. Default is 'customerId'.
    amount_col (str): The column name representing transaction amounts. Default is 'amount'.

    Returns:
    pd.DataFrame: The DataFrame with an added column 'avgTransactionAmount'.
    """
    # Calculate the average transaction amount for each customerId
    avg_transaction_amount = df.groupby(customer_id_col)[amount_col].mean()

    # Map the average transaction amount back to the original DataFrame
    df['avgTransactionAmount'] = df[customer_id_col].map(avg_transaction_amount)

    return df

In [10]:
# Apply the function to process the DataFrame
df_processed = process_transactions(df_processed)
df_processed = add_avg_transaction_amount(df_processed)

In [11]:
df_processed = df_processed.drop(columns=['originZipCode', 'merchantZipCode'])
# Removing the first character from the customerId column
df_processed['customerId'] = df_processed['customerId'].str[1:]

# Removing the first character from the merchantName column
df_processed['merchantName'] = df_processed['merchantName'].str[1:]

# Removing the first three characters from the merchantCategory column
df_processed['merchantCategory'] = df_processed['merchantCategory'].str[3:]

In [12]:
def categorize_amount(amount):
    if amount < 1:
        return 'low amount'
    elif 1 <= amount < 100:
        return 'intermediate amount'
    elif 100 <= amount < 1000:
        return 'considerable amount'
    elif 1000 <= amount < 5000:
        return 'large amount'
    else:
        return 'hefty amount'

df_processed['bhv_amount'] = df_processed['amount'].apply(categorize_amount)

df_processed['bhv_frequent'] = df_processed['countForCustomerSameTime'].apply(
    lambda x: 'uncommon transaction' if x == 1 else 'recurring transaction'
)

df_processed['bhv_isFraud'] = df_processed['isFraud'].apply(
    lambda x: 'fraud transaction' if x == 1 else 'genuine transaction'
)

df_processed['bhv_gender'] = df_processed['customerGender'].apply(
    lambda x: 'male' if x == 'M' else 'female'
)


df_processed['customerAge'] = pd.to_numeric(df_processed['customerAge'], errors='coerce')

def categorize_customerAge(customerAge):
    if pd.isna(customerAge):  # Handle NaN values
        return 'Unknown'
    elif customerAge < 2:
        return 'Child'
    elif 2 <= customerAge < 4:
        return 'Teen'
    else:
        return 'Adult'

df_processed['bhv_customerAge'] = df_processed['customerAge'].apply(categorize_customerAge)

In [13]:
df_processed['transaction_behavior'] = df_processed.apply(
    lambda row: f"A {row['bhv_frequent']} originated from {row['bhv_customerAge']} {row['bhv_gender']} customer {row['customerId']} "
                f"from merchant {row['merchantName']} to category {row['merchantCategory']} "
                f"with {row['bhv_amount']} is categorized as {row['bhv_isFraud']}",
    axis=1
)

In [14]:
import pandas as pd

# Assuming df_processed is already defined

# Step 1: Select 500 records where isFraud=1
df_fraud = df_processed[df_processed['isFraud'] == 1].sample(n=500, random_state=42)

# Step 2: Select 500 records where isFraud=0
df_non_fraud = df_processed[df_processed['isFraud'] == 0].sample(n=500, random_state=42)

# Step 3: Concatenate the two subsets
df_subset = pd.concat([df_fraud, df_non_fraud])

# Step 4: Optionally, shuffle the df_subset
df_subset = df_subset.sample(frac=1, random_state=42).reset_index(drop=True)

# df_subset now contains 1000 records with 50% isFraud=1 and 50% isFraud=0


In [22]:
df_subset.to_csv('df_subset.csv', index=False)

In [15]:
df_subset.head(3)

Unnamed: 0,timeStep,customerId,customerAge,customerGender,merchantName,merchantCategory,amount,isFraud,IslargeTransaction,countForCustomerSameTime,avgTransactionAmount,bhv_amount,bhv_frequent,bhv_isFraud,bhv_gender,bhv_customerAge,transaction_behavior
0,42,1560893361,3,M,348934600,transportation,5.49,0,0,1,29.924235,intermediate amount,uncommon transaction,genuine transaction,male,Teen,A uncommon transaction originated from Teen ma...
1,142,2123823248,4,F,1823072687,transportation,5.28,0,0,1,28.757716,intermediate amount,uncommon transaction,genuine transaction,female,Adult,A uncommon transaction originated from Adult f...
2,119,1877622224,2,M,1823072687,transportation,8.37,0,0,1,41.693211,intermediate amount,uncommon transaction,genuine transaction,male,Teen,A uncommon transaction originated from Teen ma...


In [16]:
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
import faiss


def load_model(model_name: str = 'paraphrase-MiniLM-L6-v2') -> SentenceTransformer:
    """
    Load and return the SentenceTransformer model.

    Parameters:
    model_name (str): The name of the pre-trained model.

    Returns:
    SentenceTransformer: The loaded SentenceTransformer model.
    """
    return SentenceTransformer(model_name)


def generate_embeddings(df: pd.DataFrame, column: str, model: SentenceTransformer) -> pd.DataFrame:
    """
    Generate embeddings for a specific column in the DataFrame.

    Parameters:
    df (pd.DataFrame): The input DataFrame.
    column (str): The name of the column to generate embeddings for.
    model (SentenceTransformer): The SentenceTransformer model.

    Returns:
    pd.DataFrame: The DataFrame with an additional column for embeddings.
    """
    df['embeddings'] = df[column].apply(lambda x: model.encode(x))
    return df


def initialize_faiss_index(embeddings: np.ndarray) -> faiss.IndexFlatL2:
    """
    Initialize and return a FAISS index.

    Parameters:
    embeddings (np.ndarray): The embeddings to be added to the index.

    Returns:
    faiss.IndexFlatL2: The initialized FAISS index.
    """
    dimension = embeddings.shape[1]
    index = faiss.IndexFlatL2(dimension)
    index.add(embeddings)
    return index


def save_faiss_index(index: faiss.IndexFlatL2, filepath: str):
    """
    Save the FAISS index to disk.

    Parameters:
    index (faiss.IndexFlatL2): The FAISS index to be saved.
    filepath (str): The path where the index will be saved.
    """
    faiss.write_index(index, filepath)


def search_similar_transactions(query: str, model: SentenceTransformer, index: faiss.IndexFlatL2,
                                df: pd.DataFrame, k: int = 3) -> dict:
    """
    Search for similar transactions based on the query and return results.

    Parameters:
    query (str): The query string to find similar transactions.
    model (SentenceTransformer): The SentenceTransformer model.
    index (faiss.IndexFlatL2): The FAISS index.
    df (pd.DataFrame): The DataFrame containing the transaction behaviors.
    k (int): The number of similar items to retrieve.

    Returns:
    dict: Contains 'top_k_results' with transaction behaviors and similarity scores,
          and 'prediction' which is the last 2 words of the highest similarity transaction.
    """
    query_embedding = model.encode(query)
    D, I = index.search(np.array([query_embedding]), k)

    # Collect results
    top_k_results = []
    for idx, dist in zip(I[0], D[0]):
        result = {
            'transaction_behavior': df.iloc[idx]['transaction_behavior'],
            'similarity_score': 1 / (1 + dist)  # Convert distance to similarity score
        }
        top_k_results.append(result)

    # Extract the last 2 words of the highest similarity transaction
    highest_similarity_transaction = top_k_results[0]['transaction_behavior']
    prediction = ' '.join(highest_similarity_transaction.split()[-2:])

    return {
        'top_k_results': top_k_results,
        'prediction': prediction
    }


def display_results(results: dict):
    """
    Display the search results and prediction.

    Parameters:
    results (dict): Contains 'top_k_results' with transaction behaviors and similarity scores,
                    and 'prediction'.
    """
    print("Top K Results:")
    for i, res in enumerate(results['top_k_results']):
        print(f"Result {i+1}:")
        print(f"Transaction Behavior: {res['transaction_behavior']}")
        print(f"Similarity Score: {res['similarity_score']:.4f}")
        print("-" * 80)

    print("Prediction :")
    print("This transaction is a ", results['prediction'])



  from tqdm.autonotebook import tqdm, trange


In [17]:
if __name__ == "__main__":
    # Load data and model
    model = load_model()

    # Generate embeddings and add to DataFrame
    df_subset = generate_embeddings(df_subset, 'transaction_behavior', model)

    # Prepare embeddings and initialize FAISS index
    embeddings = np.vstack(df_subset['embeddings'].values)
    index = initialize_faiss_index(embeddings)

    # Optionally save the FAISS index
    save_faiss_index(index, "faiss_index.index")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [18]:
# Perform search with a query
query = "A uncommon transaction originated from Adult male customer 1038277619 from merchant 980657600 to category sportsandtoy"
results = search_similar_transactions(query, model, index, df_subset)

# Display the search results
display_results(results)

Top K Results:
Result 1:
Transaction Behavior: A uncommon transaction originated from Adult male customer 1038277619 from merchant 980657600 to category sportsandtoys with considerable amount is categorized as fraud transaction
Similarity Score: 0.1505
--------------------------------------------------------------------------------
Result 2:
Transaction Behavior: A uncommon transaction originated from Adult male customer 253956131 from merchant 980657600 to category sportsandtoys with considerable amount is categorized as fraud transaction
Similarity Score: 0.1380
--------------------------------------------------------------------------------
Result 3:
Transaction Behavior: A uncommon transaction originated from Teen male customer 825744832 from merchant 1649169323 to category sportsandtoys with intermediate amount is categorized as genuine transaction
Similarity Score: 0.1364
--------------------------------------------------------------------------------
Prediction :
This transactio

In [19]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

def evaluate_model(df: pd.DataFrame, model: SentenceTransformer, index: faiss.IndexFlatL2, queries: list, true_labels: list) -> dict:
    """
    Evaluate the model's performance on a set of queries.

    Parameters:
    df (pd.DataFrame): The DataFrame containing the transaction behaviors and actual labels.
    model (SentenceTransformer): The SentenceTransformer model.
    index (faiss.IndexFlatL2): The FAISS index.
    queries (list): A list of query strings.
    true_labels (list): The actual labels corresponding to the queries.

    Returns:
    dict: Evaluation metrics including accuracy, precision, recall, and F1 score,
          and 'predictions_df' DataFrame with predictions and actual labels.
    """
    predictions = []
    actual_labels = []

    for query, true_label in zip(queries, true_labels):
        results = search_similar_transactions(query, model, index, df)
        predicted_label = results['prediction']

        # Collect actual and predicted labels
        actual_labels.append(true_label)
        predictions.append(predicted_label)

    # Calculate metrics
    accuracy = accuracy_score(actual_labels, predictions)
    precision = precision_score(actual_labels, predictions, average='binary', pos_label='fraud transaction')  # Adjust pos_label as necessary
    recall = recall_score(actual_labels, predictions, average='binary', pos_label='fraud transaction')
    f1 = f1_score(actual_labels, predictions, average='binary', pos_label='fraud transaction')

    # Create DataFrame with predictions and actual labels
    predictions_df = pd.DataFrame({
        'transaction_behavior': queries,
        'actual': actual_labels,
        'prediction': predictions
    })

    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1_score': f1,
        'classification_report': classification_report(actual_labels, predictions),
        'predictions_df': predictions_df
    }

In [20]:
if __name__ == "__main__":
    model = load_model()

    queries = df_subset['transaction_behavior'].tolist()
    true_labels = df_subset['bhv_isFraud'].tolist()

    # Evaluate the model
    metrics = evaluate_model(df_subset, model, index, queries, true_labels)

    # Print evaluation metrics
    print("Evaluation Metrics:")
    print(f"Accuracy: {metrics['accuracy']:.4f}")
    print(f"Precision: {metrics['precision']:.4f}")
    print(f"Recall: {metrics['recall']:.4f}")
    print(f"F1 Score: {metrics['f1_score']:.4f}")
    print("Classification Report:")
    print(metrics['classification_report'])

    print(metrics['predictions_df']['prediction'])

    # Append predictions to the sample DataFrame
    metrics['predictions_df']['prediction']
    metrics['predictions_df']['actual']


Evaluation Metrics:
Accuracy: 1.0000
Precision: 1.0000
Recall: 1.0000
F1 Score: 1.0000
Classification Report:
                     precision    recall  f1-score   support

  fraud transaction       1.00      1.00      1.00       500
genuine transaction       1.00      1.00      1.00       500

           accuracy                           1.00      1000
          macro avg       1.00      1.00      1.00      1000
       weighted avg       1.00      1.00      1.00      1000

0      genuine transaction
1      genuine transaction
2      genuine transaction
3      genuine transaction
4        fraud transaction
              ...         
995      fraud transaction
996      fraud transaction
997    genuine transaction
998      fraud transaction
999      fraud transaction
Name: prediction, Length: 1000, dtype: object


In [21]:
metrics['predictions_df']

Unnamed: 0,transaction_behavior,actual,prediction
0,A uncommon transaction originated from Teen ma...,genuine transaction,genuine transaction
1,A uncommon transaction originated from Adult f...,genuine transaction,genuine transaction
2,A uncommon transaction originated from Teen ma...,genuine transaction,genuine transaction
3,A uncommon transaction originated from Teen ma...,genuine transaction,genuine transaction
4,A uncommon transaction originated from Teen ma...,fraud transaction,fraud transaction
...,...,...,...
995,A uncommon transaction originated from Adult f...,fraud transaction,fraud transaction
996,A recurring transaction originated from Teen f...,fraud transaction,fraud transaction
997,A uncommon transaction originated from Teen fe...,genuine transaction,genuine transaction
998,A recurring transaction originated from Adult ...,fraud transaction,fraud transaction


In [22]:
query = "A recurring transaction originated from Teen male..to category health"
results = search_similar_transactions(query, model, index, df_subset)

# Display the search results
display_results(results)

Top K Results:
Result 1:
Transaction Behavior: A recurring transaction originated from Teen male customer 1249070059 from merchant 692898500 to category health with intermediate amount is categorized as genuine transaction
Similarity Score: 0.0505
--------------------------------------------------------------------------------
Result 2:
Transaction Behavior: A recurring transaction originated from Teen male customer 1935661715 from merchant 480139044 to category health with considerable amount is categorized as fraud transaction
Similarity Score: 0.0500
--------------------------------------------------------------------------------
Result 3:
Transaction Behavior: A recurring transaction originated from Teen female customer 196806031 from merchant 1053599405 to category health with considerable amount is categorized as genuine transaction
Similarity Score: 0.0500
--------------------------------------------------------------------------------
Prediction :
This transaction is a  genuine

In [23]:
query = "A recurring transaction originated from Teen female customer 1728122351 "
results = search_similar_transactions(query, model, index, df_subset)

# Display the search results
display_results(results)

Top K Results:
Result 1:
Transaction Behavior: A recurring transaction originated from Teen female customer 196806031 from merchant 1053599405 to category health with considerable amount is categorized as genuine transaction
Similarity Score: 0.0800
--------------------------------------------------------------------------------
Result 2:
Transaction Behavior: A recurring transaction originated from Teen female customer 1032319289 from merchant 480139044 to category health with considerable amount is categorized as fraud transaction
Similarity Score: 0.0796
--------------------------------------------------------------------------------
Result 3:
Transaction Behavior: A recurring transaction originated from Teen female customer 761166755 from merchant 1600850729 to category fashion with intermediate amount is categorized as genuine transaction
Similarity Score: 0.0795
--------------------------------------------------------------------------------
Prediction :
This transaction is a  ge

In [24]:
import numpy as np
from sentence_transformers import SentenceTransformer
import faiss

def load_model(model_name: str = 'paraphrase-MiniLM-L6-v2') -> SentenceTransformer:
    """
    Load and return the SentenceTransformer model.
    """
    return SentenceTransformer(model_name)

def generate_embeddings(df: pd.DataFrame, column: str, model: SentenceTransformer) -> pd.DataFrame:
    """
    Generate embeddings for a specific column in the DataFrame.
    """
    df['embeddings'] = df[column].apply(lambda x: model.encode(x))
    return df

def initialize_faiss_index(embeddings: np.ndarray) -> faiss.IndexFlatL2:
    """
    Initialize and return a FAISS index.
    """
    dimension = embeddings.shape[1]
    index = faiss.IndexFlatL2(dimension)
    index.add(embeddings)
    return index

def search_similar_transactions(query: str, model: SentenceTransformer, index: faiss.IndexFlatL2,
                                df: pd.DataFrame, k: int = 3, high_priority_terms: list = None) -> dict:
    """
    Search for similar transactions based on the query and adjust scores based on high-priority terms.
    """
    if high_priority_terms is None:
        high_priority_terms = []

    query_embedding = model.encode(query)
    D, I = index.search(np.array([query_embedding]), k)

    # Adjust the similarity scores based on high-priority terms
    results = []
    for idx, dist in zip(I[0], D[0]):
        transaction_behavior = df.iloc[idx]['transaction_behavior']
        similarity_score = 1 / (1 + dist)  # Convert distance to similarity score

        # Check for high-priority terms in the result
        priority_score = sum(term in transaction_behavior for term in high_priority_terms)
        adjusted_score = similarity_score + priority_score  # Adjust score

        result = {
            'transaction_behavior': transaction_behavior,
            'similarity_score': adjusted_score
        }
        results.append(result)

    # Sort results based on the adjusted score
    results = sorted(results, key=lambda x: x['similarity_score'], reverse=True)

    # Extract the highest similarity transaction
    highest_similarity_transaction = results[0]['transaction_behavior']
    prediction = ' '.join(highest_similarity_transaction.split()[-2:])

    return {
        'top_k_results': results,
        'prediction': prediction
    }

# Example usage
if __name__ == "__main__":
    model = load_model()

    # Assume df_subset is already defined and contains relevant data
    df_subset = generate_embeddings(df_subset, 'transaction_behavior', model)
    embeddings = np.vstack(df_subset['embeddings'].values)
    index = initialize_faiss_index(embeddings)

    query = "A uncommon transaction originated from Adult male customer 1038277619 from merchant 980657600 to category sportsandtoys"
    high_priority_terms = ["1038277619", "980657600"]

    # Perform search with a query and priority terms
    results = search_similar_transactions(query, model, index, df_subset, k=3, high_priority_terms=high_priority_terms)

    # Display the search results
    print("Top K Results:")
    for i, res in enumerate(results['top_k_results']):
        print(f"Result {i+1}:")
        print(f"Transaction Behavior: {res['transaction_behavior']}")
        print(f"Adjusted Similarity Score: {res['similarity_score']:.4f}")
        print("-" * 80)

    print("Prediction :")
    print(f"This transaction is a {results['prediction']}")


Top K Results:
Result 1:
Transaction Behavior: A uncommon transaction originated from Adult male customer 1038277619 from merchant 980657600 to category sportsandtoys with considerable amount is categorized as fraud transaction
Adjusted Similarity Score: 2.1528
--------------------------------------------------------------------------------
Result 2:
Transaction Behavior: A uncommon transaction originated from Adult male customer 253956131 from merchant 980657600 to category sportsandtoys with considerable amount is categorized as fraud transaction
Adjusted Similarity Score: 1.1400
--------------------------------------------------------------------------------
Result 3:
Transaction Behavior: A uncommon transaction originated from Teen male customer 825744832 from merchant 1649169323 to category sportsandtoys with intermediate amount is categorized as genuine transaction
Adjusted Similarity Score: 0.1359
--------------------------------------------------------------------------------
P

In [25]:
import numpy as np
import pandas as pd
from sklearn.metrics import precision_score, recall_score, average_precision_score, ndcg_score

def evaluate_similarity_search_advanced(df: pd.DataFrame, model: SentenceTransformer, index: faiss.IndexFlatL2, queries: list, true_labels: list, k: int = 3) -> dict:
    """
    Evaluate the similarity search model's performance on a set of queries.

    Parameters:
    df (pd.DataFrame): The DataFrame containing the transaction behaviors and actual labels.
    model (SentenceTransformer): The SentenceTransformer model.
    index (faiss.IndexFlatL2): The FAISS index.
    queries (list): A list of query strings.
    true_labels (list): The actual labels corresponding to the queries.
    k (int): The number of top results to consider for evaluation.

    Returns:
    dict: Evaluation metrics including Precision@K, Recall@K, Average Precision, and NDCG.
    """
    all_relevant = []
    all_scores = []
    predictions = []

    for query, true_label in zip(queries, true_labels):
        results = search_similar_transactions(query, model, index, df, k)

        # Extract results
        top_k_results = results['top_k_results']
        predicted_label = results['prediction']

        # Collect the predicted label
        predictions.append(predicted_label)

        # Collect relevant items and their scores
        relevant_items = [1 if true_label in result['transaction_behavior'] else 0 for result in top_k_results]
        all_scores.extend([result['similarity_score'] for result in top_k_results])
        all_relevant.extend(relevant_items)

    # Precision at K
    precision_at_k = np.mean([sum([1 if item == 1 else 0 for item in relevant_items]) / k for relevant_items in [all_relevant]])

    # Recall at K
    recall_at_k = np.mean([sum(all_relevant) / len(true_labels)])

    # Mean Average Precision (MAP)
    average_precision = average_precision_score(all_relevant, all_scores)

    # NDCG
    if len(all_relevant) > 0:
        ndcg = ndcg_score([all_relevant], [all_scores], k=k)
    else:
        ndcg = 0.0

    return {
        'precision_at_k': precision_at_k,
        'recall_at_k': recall_at_k,
        'average_precision': average_precision,
        'ndcg': ndcg
    }



if __name__ == "__main__":
    model = load_model()

    # Prepare the sample
    sample_df = df_subset.sample(n=10, random_state=42)
    queries = sample_df['transaction_behavior'].str.slice(0, 120).tolist()
    true_labels = sample_df['bhv_isFraud'].tolist()

    # Initialize FAISS index and generate embeddings
    embeddings = np.vstack(df_subset['embeddings'].values)
    index = initialize_faiss_index(embeddings)

    # Evaluate the model
    metrics = evaluate_similarity_search_advanced(df_subset, model, index, queries, true_labels, k=3)

    # Print evaluation metrics
    print("Evaluation Metrics:")
    print(f"Average Precision: {metrics['average_precision']:.4f}")
    print(f"NDCG: {metrics['ndcg']:.4f}")


Evaluation Metrics:
Average Precision: 0.7916
NDCG: 0.7039


In [26]:
df_subset.head(3)

Unnamed: 0,timeStep,customerId,customerAge,customerGender,merchantName,merchantCategory,amount,isFraud,IslargeTransaction,countForCustomerSameTime,avgTransactionAmount,bhv_amount,bhv_frequent,bhv_isFraud,bhv_gender,bhv_customerAge,transaction_behavior,embeddings
0,42,1560893361,3,M,348934600,transportation,5.49,0,0,1,29.924235,intermediate amount,uncommon transaction,genuine transaction,male,Teen,A uncommon transaction originated from Teen ma...,"[-0.30932543, -0.01592932, -0.19726196, -0.359..."
1,142,2123823248,4,F,1823072687,transportation,5.28,0,0,1,28.757716,intermediate amount,uncommon transaction,genuine transaction,female,Adult,A uncommon transaction originated from Adult f...,"[-0.25198793, -0.23569219, -0.12751329, -0.098..."
2,119,1877622224,2,M,1823072687,transportation,8.37,0,0,1,41.693211,intermediate amount,uncommon transaction,genuine transaction,male,Teen,A uncommon transaction originated from Teen ma...,"[-0.3359077, -0.028759424, -0.15090181, -0.327..."


In [155]:
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
import faiss

def load_model(model_name: str = 'paraphrase-MiniLM-L6-v2') -> SentenceTransformer:
    """
    Load and return the SentenceTransformer model.
    """
    return SentenceTransformer(model_name)

def generate_embeddings(df: pd.DataFrame, column: str, model: SentenceTransformer) -> pd.DataFrame:
    """
    Generate embeddings for a specific column in the DataFrame.
    """
    df['embeddings'] = df[column].apply(lambda x: model.encode(x))
    return df

def initialize_faiss_index(embeddings: np.ndarray) -> faiss.Index:
    dimension = embeddings.shape[1]
    quantizer = faiss.IndexFlatL2(dimension)
    index = faiss.IndexIVFFlat(quantizer, dimension, 100)  # Using IVFFlat index
    index.train(embeddings)  # Train the index
    index.add(embeddings)
    return index

def advanced_scoring(query_embedding, results, df, high_priority_terms):
    """
    Adjust the similarity scores based on high-priority terms.
    """
    adjusted_results = []
    for idx, dist in zip(results['I'][0], results['D'][0]):
        transaction_behavior = df.iloc[idx]['transaction_behavior']
        similarity_score = 1 / (1 + dist)  # Convert distance to similarity score

        # Enhance scoring based on term presence
        priority_score = sum(10 for term in high_priority_terms if term in transaction_behavior)  # Increased weight
        adjusted_score = similarity_score + priority_score  # Add priority score directly

        adjusted_results.append({
            'transaction_behavior': transaction_behavior,
            'similarity_score': adjusted_score
        })
    return sorted(adjusted_results, key=lambda x: x['similarity_score'], reverse=True)

def search_similar_transactions(query: str, model: SentenceTransformer, index: faiss.Index,
                                df: pd.DataFrame, k: int = 3, high_priority_terms: list = None) -> dict:
    """
    Search for similar transactions based on the query and adjust scores based on high-priority terms.
    """
    if high_priority_terms is None:
        high_priority_terms = []

    query_embedding = model.encode(query)
    D, I = index.search(np.array([query_embedding]), k)

    # Get initial results
    initial_results = []
    for idx, dist in zip(I[0], D[0]):
        transaction_behavior = df.iloc[idx]['transaction_behavior']
        similarity_score = 1 / (1 + dist)  # Convert distance to similarity score

        initial_results.append({
            'transaction_behavior': transaction_behavior,
            'similarity_score': similarity_score
        })

    # Adjust scores using advanced scoring
    results = advanced_scoring(query_embedding, {'D': D, 'I': I}, df, high_priority_terms)

    # Extract the highest similarity transaction
    highest_similarity_transaction = results[0]['transaction_behavior']
    prediction = ' '.join(highest_similarity_transaction.split()[-2:])

    return {
        'top_k_results': results,
        'prediction': prediction
    }

# Example usage
if __name__ == "__main__":
    model = load_model()

    # Assume df_subset is already defined and contains relevant data
    df_subset = generate_embeddings(df_subset, 'transaction_behavior', model)
    embeddings = np.vstack(df_subset['embeddings'].values)
    index = initialize_faiss_index(embeddings)

    query = "customer 1728122351 from merchant 732195782"
    high_priority_terms = ["1728122351", "732195782"]

    # Perform search with a query and priority terms
    results = search_similar_transactions(query, model, index, df_subset, k=3, high_priority_terms=high_priority_terms)

    # Display the search results
    print("Top K Results:")
    for i, res in enumerate(results['top_k_results']):
        print(f"Result {i+1}:")
        print(f"Transaction Behavior: {res['transaction_behavior']}")
        print(f"Adjusted Similarity Score: {res['similarity_score']:.4f}")
        print("-" * 80)

    print("Prediction :")
    print(f"This transaction is a {results['prediction']}")


Top K Results:
Result 1:
Transaction Behavior: A uncommon transaction originated from Adult male customer 1640022626 from merchant 209847108 to category wellnessandbeauty with considerable amount is categorized as genuine transaction
Adjusted Similarity Score: 0.0335
--------------------------------------------------------------------------------
Result 2:
Transaction Behavior: A uncommon transaction originated from Teen female customer 1940107136 from merchant 1198415165 to category wellnessandbeauty with intermediate amount is categorized as genuine transaction
Adjusted Similarity Score: 0.0325
--------------------------------------------------------------------------------
Result 3:
Transaction Behavior: A uncommon transaction originated from Teen male customer 459595150 from merchant 1535107174 to category wellnessandbeauty with intermediate amount is categorized as genuine transaction
Adjusted Similarity Score: 0.0322
---------------------------------------------------------------

## generative part

In [27]:
import numpy as np
from sentence_transformers import SentenceTransformer
import faiss

def load_model(model_name: str = 'paraphrase-MiniLM-L6-v2') -> SentenceTransformer:
    """
    Load and return the SentenceTransformer model.
    """
    return SentenceTransformer(model_name)

def generate_embeddings(df: pd.DataFrame, column: str, model: SentenceTransformer) -> pd.DataFrame:
    """
    Generate embeddings for a specific column in the DataFrame.
    """
    df['embeddings'] = df[column].apply(lambda x: model.encode(x))
    return df

def initialize_faiss_index(embeddings: np.ndarray) -> faiss.IndexFlatL2:
    """
    Initialize and return a FAISS index.
    """
    dimension = embeddings.shape[1]
    index = faiss.IndexFlatL2(dimension)
    index.add(embeddings)
    return index

def search_and_generate_response(query: str, model: SentenceTransformer, index: faiss.IndexFlatL2,
                                 df: pd.DataFrame, api_key: str, k: int = 3, high_priority_terms: list = None) -> dict:
    """
    Search for similar transactions and generate a response based on the query and high-priority terms.
    """
    if high_priority_terms is None:
        high_priority_terms = []

    query_embedding = model.encode(query)
    D, I = index.search(np.array([query_embedding]), k)

    # Adjust the similarity scores based on high-priority terms
    results = []
    for idx, dist in zip(I[0], D[0]):
        transaction_behavior = df.iloc[idx]['transaction_behavior']
        similarity_score = 1 / (1 + dist)  # Convert distance to similarity score

        # Check for high-priority terms in the result
        priority_score = sum(term in transaction_behavior for term in high_priority_terms)
        adjusted_score = similarity_score + priority_score  # Adjust score

        result = {
            'transaction_behavior': transaction_behavior,
            'similarity_score': adjusted_score
        }
        results.append(result)

    # Sort results based on the adjusted score
    results = sorted(results, key=lambda x: x['similarity_score'], reverse=True)

    # Generate a response based on the top results
    response = generate_response(results, query, api_key)

    # Extract the highest similarity transaction
    highest_similarity_transaction = results[0]['transaction_behavior']
    prediction = ' '.join(highest_similarity_transaction.split()[-2:])

    return {
        'top_k_results': results,
        'prediction': prediction,
        'response': response
    }

In [167]:
import openai

In [28]:
def generate_response(retrieved_docs: list, query: str, api_key: str) -> str:
    """
    Generate a response using a generative model based on the retrieved documents.
    """
    openai.api_key = api_key

    # Format the prompt for a chat-based model
    messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": f"Query: {query}"},
        {"role": "user", "content": "Documents:"},
        {"role": "user", "content": "\n".join(doc['transaction_behavior'] for doc in retrieved_docs)}
    ]

    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",  # Or "gpt-4" depending on your access
        messages=messages,
        max_tokens=150,  # Adjust based on your needs
        temperature=0.7
    )

    return response.choices[0].message['content'].strip()


In [29]:
if __name__ == "__main__":
    model = load_model()

    # Assume df_subset is already defined and contains relevant data
    df_subset = generate_embeddings(df_subset, 'transaction_behavior', model)
    embeddings = np.vstack(df_subset['embeddings'].values)
    index = initialize_faiss_index(embeddings)

    query = "A uncommon transaction originated from Adult male customer 1038277619 from merchant 980657600 to category sportsandtoys"
    high_priority_terms = ["1038277619", "980657600"]
    api_key = #key here

    # Perform search with a query and priority terms
    results = search_and_generate_response(query, model, index, df_subset, api_key, k=3, high_priority_terms=high_priority_terms)

    # Display the search results
    print("Top K Results:")
    for i, res in enumerate(results['top_k_results']):
        print(f"Result {i+1}:")
        print(f"Transaction Behavior: {res['transaction_behavior']}")
        print(f"Adjusted Similarity Score: {res['similarity_score']:.4f}")
        print("-" * 80)

    print("Prediction:")
    print(f"This transaction is a {results['prediction']}")

    print("Generated Response:")
    print(results['response'])


Top K Results:
Result 1:
Transaction Behavior: A uncommon transaction originated from Adult male customer 1038277619 from merchant 980657600 to category sportsandtoys with considerable amount is categorized as fraud transaction
Adjusted Similarity Score: 2.1528
--------------------------------------------------------------------------------
Result 2:
Transaction Behavior: A uncommon transaction originated from Adult male customer 253956131 from merchant 980657600 to category sportsandtoys with considerable amount is categorized as fraud transaction
Adjusted Similarity Score: 1.1400
--------------------------------------------------------------------------------
Result 3:
Transaction Behavior: A uncommon transaction originated from Teen male customer 825744832 from merchant 1649169323 to category sportsandtoys with intermediate amount is categorized as genuine transaction
Adjusted Similarity Score: 0.1359
--------------------------------------------------------------------------------
P

In [23]:
pip install sentence-transformers

Collecting sentence-transformers
  Downloading sentence_transformers-3.0.1-py3-none-any.whl.metadata (10 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl.met