In [2]:
from mistralai import Mistral
import numpy as np
import faiss
import os
from dotenv import load_dotenv
from getpass import getpass

# Pull variables from a local .env file (if any) into the process environment.
load_dotenv()

# Use MISTRAL_API_KEY from the environment when it is set; otherwise prompt for
# it without echoing, so the client is never built with api_key=None and the
# key never has to be typed into a cell. The key is passed straight through
# rather than stored in a notebook variable.
client = Mistral(
    api_key=os.getenv("MISTRAL_API_KEY") or getpass("MISTRAL_API_KEY: ")
)

In [3]:
# Load documents
#
# "research-papers" is a relative path, so it is looked up from the kernel's
# current working directory. Whatever loader.load() returns is handed to the
# text splitter in the next cell as `documents`.
# NOTE(review): presumably yields one Document per PDF page - confirm against
# the langchain_community loader documentation.

from langchain_community.document_loaders import PyPDFDirectoryLoader

loader = PyPDFDirectoryLoader("research-papers")
document = loader.load()

In [4]:
# Split into chunks
#
# The loaded documents are cut into overlapping pieces so that each piece can
# be embedded and retrieved on its own. The overlap (307) is roughly 15% of the
# chunk size (2048), which keeps some shared text between neighbouring chunks.
# NOTE(review): sizes are presumably counted in characters (the splitter's
# default length function) - confirm if token-based limits matter here.

from langchain_text_splitters import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=2048,
    chunk_overlap=307
)
docs = text_splitter.split_documents(documents=document)

In [5]:
# Keep only the raw text of every split document, in the same order as `docs`,
# so that position i in `text_chunk` lines up with row i of the embeddings.
text_chunk = [doc.page_content for doc in docs]

In [6]:
import time

def get_text_embedding(input):
    """Return the ``mistral-embed`` embedding for a single piece of text.

    Args:
        input: The text to embed. (The name shadows the ``input`` builtin; it
            is kept because it is part of the function's signature.)

    Returns:
        The ``embedding`` attribute of the first item in the response data.
    """
    # The endpoint is called with a one-element batch, so the only result of
    # interest is the first entry of the response.
    response = client.embeddings.create(model="mistral-embed", inputs=[input])
    first_item = response.data[0]
    return first_item.embedding

def get_embeddings_with_rate_limit(chunks, max_retries=5, retry_wait=5, request_delay=0.1):
    """Embed every chunk one at a time, waiting and retrying on rate limits.

    Each chunk is embedded with ``get_text_embedding``. When a call fails with
    a rate-limit error (HTTP 429) the function waits ``retry_wait`` seconds and
    tries the same chunk again, up to ``max_retries`` times. Previously only a
    single, unguarded retry was made, so a second 429 aborted the whole run.

    Args:
        chunks: Iterable of texts to embed, in order.
        max_retries: Maximum number of retries per chunk after the first
            attempt; must be >= 0.
        retry_wait: Seconds to wait after a rate-limit error before retrying.
        request_delay: Seconds to pause after each chunk to pace requests.

    Returns:
        ``np.array`` built from the list of embeddings, one row per chunk
        (an empty 1-D array when ``chunks`` is empty).

    Raises:
        ValueError: If ``max_retries`` is negative.
        Exception: Any non-rate-limit error from the embedding call, or the
            rate-limit error itself once the retries are exhausted.
    """
    if max_retries < 0:
        raise ValueError("max_retries must be >= 0")

    embeddings = []
    for chunk in chunks:
        for attempt in range(max_retries + 1):
            try:
                embeddings.append(get_text_embedding(chunk))
                break
            except Exception as e:
                # Prefer a structured status code when the error carries one;
                # fall back to the message text otherwise.
                rate_limited = getattr(e, "status_code", None) == 429 or "429" in str(e)
                if not rate_limited or attempt == max_retries:
                    raise  # bare raise keeps the original traceback
                print(f"Rate limited, waiting {retry_wait} seconds...")
                time.sleep(retry_wait)
        # Small delay between requests, applied after retried chunks as well.
        time.sleep(request_delay)
    return np.array(embeddings)

text_embeddings = get_embeddings_with_rate_limit(text_chunk)

Rate limited, waiting 5 seconds...
Rate limited, waiting 5 seconds...
Rate limited, waiting 5 seconds...
Rate limited, waiting 5 seconds...
Rate limited, waiting 5 seconds...
Rate limited, waiting 5 seconds...
Rate limited, waiting 5 seconds...
Rate limited, waiting 5 seconds...
Rate limited, waiting 5 seconds...
Rate limited, waiting 5 seconds...
Rate limited, waiting 5 seconds...
Rate limited, waiting 5 seconds...


In [7]:
# Store in FAISS

import faiss

# np.array() over Python floats produces float64, whereas FAISS works on
# C-contiguous float32 matrices. Convert explicitly instead of depending on
# the installed FAISS wrapper to do it implicitly.
embedding_matrix = np.ascontiguousarray(text_embeddings, dtype=np.float32)

d = embedding_matrix.shape[1]  # embedding dimensionality (columns per vector)
index = faiss.IndexFlatL2(d)   # flat index compared with L2 distance
index.add(embedding_matrix)    # row i corresponds to text_chunk[i]

In [8]:
# NOTE(review): "selction" looks like a typo for "selection". It is left as-is
# because this text is embedded and sent to the model verbatim, so changing it
# would change the recorded results.
question = "What are the necessary and most accurate feature selction techniques to identify spam?"

# Wrap the single embedding in a list to get a one-row query batch, and request
# float32 explicitly because FAISS searches operate on float32 matrices.
question_embeddings = np.array([get_text_embedding(question)], dtype=np.float32)

In [9]:
# Never ask for more neighbours than the index holds: FAISS pads missing
# results with label -1, and text_chunk[-1] would silently return the LAST
# chunk (possibly many times) instead of failing.
k = max(1, min(50, index.ntotal))
D, I = index.search(question_embeddings, k=k)  # distances, indices

# Only one query was searched, so row 0 holds its neighbours; drop any -1
# padding defensively before looking the chunks up.
retrieved_chunk = [text_chunk[i] for i in I[0].tolist() if i >= 0]

In [10]:
# Join the retrieved chunks into plain text. Interpolating the list itself
# would insert its Python repr into the prompt: surrounding brackets, quote
# characters, and newlines shown as the two characters "\n".
context_text = "\n\n".join(retrieved_chunk)

prompt = f"""
Context information is below.
---------------------
{context_text}
---------------------
Given the context information and not prior knowledge, answer the query.
Query: {question}
Answer:
"""

In [13]:
import time
import random

def run_mistral(user_message, model="mistral-large-latest", max_retries=3):
    """Send one user message to a Mistral chat model and return the reply text.

    Rate-limit failures (HTTP 429) are retried with exponential backoff plus
    random jitter. Any other error, or a 429 on the final attempt, is re-raised.

    Args:
        user_message: Text sent as the single ``user`` message.
        model: Name of the chat model to call.
        max_retries: Total number of attempts (the first call included);
            must be at least 1.

    Returns:
        The ``content`` of the first choice in the chat response.

    Raises:
        ValueError: If ``max_retries`` is less than 1. Previously this case
            skipped the loop and silently returned ``None``.
        Exception: Whatever the client raised, once it is not retryable.
    """
    if max_retries < 1:
        raise ValueError("max_retries must be at least 1")

    # The payload does not change between attempts, so build it once.
    messages = [{"role": "user", "content": user_message}]

    for attempt in range(max_retries):
        try:
            chat_response = client.chat.complete(
                model=model,
                messages=messages
            )
            return chat_response.choices[0].message.content
        except Exception as e:
            # Prefer a structured status code when the error carries one;
            # fall back to the message text otherwise.
            rate_limited = getattr(e, "status_code", None) == 429 or "429" in str(e)
            if not rate_limited or attempt == max_retries - 1:
                raise  # bare raise keeps the original traceback
            # Backoff of 1s, 2s, 4s, ... plus up to 1s of jitter.
            wait_time = (2 ** attempt) + random.uniform(0, 1)
            print(f"Rate limited. Retrying in {wait_time:.2f} seconds...")
            time.sleep(wait_time)

# Ask the chat model to answer the assembled prompt (context + question).
# print() is used rather than a bare expression so the reply is shown as text
# with real line breaks instead of as a quoted, escaped string repr.
result = run_mistral(prompt)
print(result)

Based on the provided context, the most necessary and accurate **feature selection techniques** to identify spam reviews involve a combination of **feature engineering approaches** and **selection methods** that address the unique challenges of review spam detection (e.g., high dimensionality, class imbalance, and noisy data). Here’s a summary of the key techniques and insights:

---

### **1. Feature Engineering Approaches**
The accuracy of spam detection heavily depends on the **types of features extracted**. The most effective features include:
- **Linguistic Features**:
  - **N-grams**: Unigrams, bigrams, or trigrams (e.g., word pairs like "amazing product").
  - **Lexical Features**: Part-of-speech (POS) tags, punctuation patterns, or function words (e.g., excessive use of adjectives like "incredible").
  - **Stylometric Features**: Writing style metrics (e.g., average sentence length, vocabulary richness).
  - **Semantic Features**: Sentiment analysis (e.g., overly positive/negat