In [1]:
from mistralai import Mistral
import numpy as np
import os
from dotenv import load_dotenv

# Pull variables from a local .env file (if one exists) into the process environment.
load_dotenv()

# os.getenv returns None when the variable is absent; fail here with a clear
# message rather than constructing a client with api_key=None.
if not os.getenv("MISTRAL_API_KEY"):
    raise RuntimeError(
        "MISTRAL_API_KEY is not set; define it in the environment or in a .env file."
    )

client = Mistral(api_key=os.environ["MISTRAL_API_KEY"])

In [2]:
# Load the source documents: point the directory loader at "research-papers"
# (a path relative to the notebook's working directory).

from langchain_community.document_loaders import PyPDFDirectoryLoader

loader = PyPDFDirectoryLoader("research-papers")
# `document` holds whatever loader.load() returns; it is handed to the text
# splitter below as its `documents` argument.
document = loader.load()

In [3]:
# Cut the loaded documents into overlapping chunks for embedding

from langchain_text_splitters import RecursiveCharacterTextSplitter

CHUNK_SIZE = 2048     # chunk_size handed to the splitter
CHUNK_OVERLAP = 307   # chunk_overlap handed to the splitter (~15% of CHUNK_SIZE)

text_splitter = RecursiveCharacterTextSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)
docs = text_splitter.split_documents(document)

In [4]:
# Keep only the text of each chunk; position k in this list corresponds to docs[k].
text_chunk = [doc.page_content for doc in docs]

In [6]:
import time
import random
import numpy as np

def get_text_embedding(input, max_retries=3):
    """Embed one text with the ``mistral-embed`` model, retrying on rate limits.

    Args:
        input: The text to embed. (The name shadows the builtin but is kept
            so existing keyword callers keep working.)
        max_retries: Total number of attempts, including the first one.
            Must be at least 1.

    Returns:
        The ``embedding`` field of the first item in the API response.

    Raises:
        ValueError: If ``max_retries`` is less than 1.
        Exception: Whatever the client raised, re-raised unchanged when the
            error is not a rate limit or when the final attempt has failed.
    """
    if max_retries < 1:
        # Without this guard the loop body never runs and the function would
        # silently return None instead of an embedding.
        raise ValueError("max_retries must be at least 1")

    for attempt in range(max_retries):
        try:
            embeddings_batch_response = client.embeddings.create(
                model="mistral-embed",
                inputs=[input]
            )
            return embeddings_batch_response.data[0].embedding
        except Exception as e:
            # Use a structured status code when the exception carries one;
            # otherwise fall back to matching "429" in the message text.
            rate_limited = getattr(e, "status_code", None) == 429 or "429" in str(e)
            if rate_limited and attempt < max_retries - 1:
                # Exponential backoff (1s, 2s, 4s, ...) plus up to 2s of jitter
                wait_time = (2 ** attempt) + random.uniform(0, 2)
                print(f"Rate limited on attempt {attempt + 1}. Waiting {wait_time:.2f} seconds...")
                time.sleep(wait_time)
            else:
                # Last attempt, or not a rate-limit error: propagate with the
                # original traceback intact.
                raise

def get_embeddings_with_rate_limit(chunks, base_delay=0.5):
    """Embed chunks one at a time, pausing between requests.

    The pause after chunk ``k`` (0-based) is ``base_delay + 0.1 * k`` seconds,
    capped at 2 seconds; nothing is slept after the final chunk.

    A chunk whose embedding call raises is reported and skipped, so the
    returned array can have fewer rows than ``chunks`` and row ``k`` is then
    not guaranteed to belong to ``chunks[k]``.

    Args:
        chunks: Sequence of texts to embed.
        base_delay: Starting pause, in seconds, between consecutive requests.

    Returns:
        A NumPy array built from the collected embeddings, or an empty
        ``np.array([])`` when nothing was embedded.
    """
    collected = []
    total = len(chunks)
    final_position = total - 1

    for position, text in enumerate(chunks):
        try:
            print(f"Processing chunk {position+1}/{total}...")
            collected.append(get_text_embedding(text))

            # Throttle before the next request; the pause grows with position.
            if position != final_position:
                pause = base_delay + (position * 0.1)
                time.sleep(min(pause, 2.0))

        except Exception as err:
            # Best effort by design: report the failure and move on.
            print(f"Failed to get embedding for chunk {position+1}: {str(err)}")
            print("Skipping this chunk and continuing...")

    if not collected:
        return np.array([])
    return np.array(collected)

# Alternative version with batch processing (if your chunks are small)
def get_embeddings_batch_safe(chunks, batch_size=5, max_retries=3):
    """Embed chunks in groups, with pauses and group-level retry on rate limits.

    Each chunk is still embedded with its own ``get_text_embedding`` call;
    the grouping only controls pacing (0.3s after every chunk, 2s between
    groups) and the retry scope. When a group is retried, chunks that were
    already embedded in that group are kept rather than requested again.

    Args:
        chunks: Sequence of texts to embed.
        batch_size: Number of chunks per group. Must be at least 1.
        max_retries: Attempts allowed per group. Must be at least 1.

    Returns:
        A NumPy array with one row per chunk, in input order (an empty
        array when ``chunks`` is empty).

    Raises:
        ValueError: If ``batch_size`` or ``max_retries`` is less than 1.
        Exception: The underlying error, re-raised unchanged, when a group
            fails with a non-rate-limit error or exhausts its attempts.
    """
    if batch_size < 1:
        # A negative step would make range() yield nothing and silently
        # return an empty array.
        raise ValueError("batch_size must be at least 1")
    if max_retries < 1:
        # With no attempts every group would be skipped without any error.
        raise ValueError("max_retries must be at least 1")

    all_embeddings = []

    for i in range(0, len(chunks), batch_size):
        batch = chunks[i:i + batch_size]
        print(f"Processing batch {i//batch_size + 1} ({len(batch)} chunks)...")

        # Lives outside the retry loop so a retry resumes after the chunks
        # that already succeeded instead of re-requesting them.
        batch_embeddings = []

        for attempt in range(max_retries):
            try:
                for chunk in batch[len(batch_embeddings):]:
                    embedding = get_text_embedding(chunk, max_retries=2)
                    batch_embeddings.append(embedding)
                    time.sleep(0.3)  # Small delay within batch
                break  # Success, exit retry loop

            except Exception as e:
                rate_limited = getattr(e, "status_code", None) == 429 or "429" in str(e)
                if rate_limited and attempt < max_retries - 1:
                    wait_time = 10 + (attempt * 5)  # 10s, then 15s, then 20s, ...
                    print(f"Batch rate limited. Waiting {wait_time} seconds before retrying...")
                    time.sleep(wait_time)
                else:
                    # Report the attempt that actually failed; a non-rate-limit
                    # error aborts immediately, not "after max_retries attempts".
                    print(f"Failed to process batch on attempt {attempt + 1} of {max_retries}: {e}")
                    raise

        all_embeddings.extend(batch_embeddings)

        # Longer delay between batches
        if i + batch_size < len(chunks):
            print("Waiting between batches...")
            time.sleep(2)

    return np.array(all_embeddings)

# Usage examples:
# Option 1: Simple with better retry logic
# text_embeddings = get_embeddings_with_rate_limit(text_chunk)

# Option 2: Batch processing (recommended for large datasets)
text_embeddings = get_embeddings_batch_safe(text_chunk, batch_size=3)

# Option 3: Very conservative approach
# text_embeddings = get_embeddings_with_rate_limit(text_chunk, base_delay=1.0)

# Retrieval maps index row ids straight back into `text_chunk`, so there must
# be exactly one embedding row per chunk. Options 1 and 3 skip chunks that
# fail, which would shift rows out of alignment without any error.
assert len(text_embeddings) == len(text_chunk), (
    f"expected {len(text_chunk)} embeddings, got {len(text_embeddings)}"
)

Processing batch 1 (3 chunks)...
Waiting between batches...
Processing batch 2 (3 chunks)...
Waiting between batches...
Processing batch 3 (3 chunks)...
Waiting between batches...
Processing batch 4 (3 chunks)...
Waiting between batches...
Processing batch 5 (3 chunks)...
Waiting between batches...
Processing batch 6 (3 chunks)...
Waiting between batches...
Processing batch 7 (3 chunks)...
Waiting between batches...
Processing batch 8 (3 chunks)...
Waiting between batches...
Processing batch 9 (3 chunks)...
Waiting between batches...
Processing batch 10 (3 chunks)...
Waiting between batches...
Processing batch 11 (3 chunks)...
Waiting between batches...
Processing batch 12 (3 chunks)...
Waiting between batches...
Processing batch 13 (3 chunks)...
Waiting between batches...
Processing batch 14 (3 chunks)...
Waiting between batches...
Processing batch 15 (3 chunks)...
Waiting between batches...
Processing batch 16 (3 chunks)...
Waiting between batches...
Processing batch 17 (3 chunks)...

In [7]:
# Store in FAISS

import faiss

# FAISS works on contiguous 2-D float32 matrices, while np.array over Python
# floats yields float64, so convert explicitly rather than relying on the binding.
embedding_matrix = np.ascontiguousarray(text_embeddings, dtype=np.float32)
if embedding_matrix.ndim != 2 or embedding_matrix.shape[0] == 0:
    raise ValueError(
        f"expected a non-empty 2-D embedding matrix, got shape {embedding_matrix.shape}"
    )

d = embedding_matrix.shape[1]  # length of one embedding vector
index = faiss.IndexFlatL2(d)
index.add(embedding_matrix)

In [8]:
question = "The papers disuss unsupervised learning strategies and NLP techniques to extract features of reviews. What are the necessary and most accurate feature extraction and selection techniques to identify spam or not spam, based on the paper?"
# Wrap the single vector in a list so the result is a 2-D (1, dim) query matrix,
# and build it as float32, the dtype FAISS searches with.
question_embeddings = np.array([get_text_embedding(question)], dtype=np.float32)

In [9]:
TOP_K = 50  # number of nearest chunks requested from the index

D, I = index.search(question_embeddings, k=TOP_K) # distance, index
# When the index holds fewer than TOP_K vectors FAISS pads the id row with -1;
# text_chunk[-1] would silently return the last chunk, so drop those ids.
retrieved_chunk = [text_chunk[i] for i in I.tolist()[0] if i != -1]

In [None]:
import time
import random

# Join the retrieved chunks into plain text. Interpolating the list itself
# would put its Python repr (brackets, quotes, escaped "\n") into the prompt.
context = "\n\n".join(retrieved_chunk)

prompt = f"""
Context information is below.
---------------------
{context}
---------------------
Given the context information and not prior knowledge, answer the query.
Query: {question}
Answer:
"""

def run_mistral(user_message, model="mistral-large-latest", max_retries=3):
    """Send one user message to the chat model, retrying on rate limits.

    Args:
        user_message: Text sent as the single ``user`` message.
        model: Name of the chat model to call.
        max_retries: Total number of attempts, including the first one.
            Must be at least 1.

    Returns:
        The ``content`` of the first choice's message in the chat response.

    Raises:
        ValueError: If ``max_retries`` is less than 1.
        Exception: Whatever the client raised, re-raised unchanged when the
            error is not a rate limit or when the final attempt has failed.
    """
    if max_retries < 1:
        # Without this guard the loop body never runs and the function would
        # silently return None instead of an answer.
        raise ValueError("max_retries must be at least 1")

    messages = [{"role": "user", "content": user_message}]

    for attempt in range(max_retries):
        try:
            chat_response = client.chat.complete(
                model=model,
                messages=messages
            )
            return chat_response.choices[0].message.content
        except Exception as e:
            # Use a structured status code when the exception carries one;
            # otherwise fall back to matching "429" in the message text.
            rate_limited = getattr(e, "status_code", None) == 429 or "429" in str(e)
            if rate_limited and attempt < max_retries - 1:
                # Exponential backoff (1s, 2s, 4s, ...) plus up to 1s of jitter
                wait_time = (2 ** attempt) + random.uniform(0, 1)
                print(f"Rate limited. Retrying in {wait_time:.2f} seconds...")
                time.sleep(wait_time)
            else:
                # Propagate with the original traceback intact.
                raise

# Send the assembled prompt (retrieved context + question) to the chat model
# with run_mistral's default model and retry settings, then print the answer text.
result = run_mistral(prompt)
print(result)

Rate limited. Retrying in 1.21 seconds...
Based on the provided context, the most **necessary and accurate feature extraction and selection techniques** for identifying spam or non-spam reviews, combining **unsupervised learning strategies** and **NLP techniques**, are summarized below:

---

### **1. Feature Extraction Techniques**
#### **A. Linguistic (NLP-Based) Features**
These are the most dominant and effective techniques for spam detection, focusing on the **textual content** of reviews. Key steps include:

1. **Preprocessing**:
   - **Stopword Removal**: Eliminate irrelevant words (e.g., "the," "is," "and").
   - **Punctuation Removal**: Clean text by removing symbols.
   - **Stemming/Lemmatization**: Reduce words to their root form (e.g., "working" → "work").
   - **Part-of-Speech (POS) Tagging**: Label words as nouns, verbs, adjectives, etc., to capture syntactic patterns.

2. **Tokenization**:
   - **N-grams**:
     - **Unigrams** (single words, e.g., "good").
     - **Bigra