**## Setup**

In [16]:
# Install required libraries
!pip install -q langchain langchain_openai pandas matplotlib openai faiss-cpu tiktoken tabulate langchain-community

# Import libraries
import pandas as pd
import numpy as np
import os
import time
from typing import Dict, List, Any, Optional

# Set your OpenAI API key.
# setdefault keeps a key that is already exported in the environment, so
# re-running this cell never overwrites a real key with the placeholder.
os.environ.setdefault("OPENAI_API_KEY", "your-api-key-here")  # Replace with your actual key

# Import LangChain components
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain.prompts import PromptTemplate
from langchain.vectorstores import FAISS
from langchain_core.documents import Document
from langchain_core.output_parsers import StrOutputParser

**## 11.5.1 Architectural Adaptations for CSV Data**

In [10]:
class CSVQuestionAnsweringSystem:
    """Answer natural-language questions about one or more loaded CSV files.

    Each CSV is read into a pandas DataFrame and summarised as a short text
    description (column names, row count and the first three rows). That
    summary -- not the full table -- is what is sent to the chat model, so
    answers that depend on values outside the sample rows are not grounded
    in the data.

    Attributes:
        llm: Chat model used to answer questions (temperature 0).
        dataframes: Mapping of dataset name -> loaded DataFrame.
        df_info: Mapping of dataset name -> text summary built when the
            CSV was added.
    """

    def __init__(self, model_name="gpt-3.5-turbo"):
        """Create the chat model client and the empty dataset registries.

        Args:
            model_name: Name of the OpenAI chat model to use.
        """
        self.llm = ChatOpenAI(model_name=model_name, temperature=0)
        self.dataframes = {}
        self.df_info = {}

    def add_csv(self, file_path: str, name: Optional[str] = None):
        """Load a CSV file and register it under ``name``.

        Args:
            file_path: Path of the CSV file to read.
            name: Key to store the dataset under. Defaults to the file's
                base name up to its first dot (``"titanic.csv"`` ->
                ``"titanic"``). An existing dataset with the same name is
                replaced.

        Returns:
            True once the file has been loaded and summarised.
        """
        if name is None:
            name = os.path.basename(file_path).split('.')[0]

        df = pd.read_csv(file_path)
        self.dataframes[name] = df

        # The summary is computed once here; it is not refreshed if the
        # entry in self.dataframes is replaced later.
        info = self._create_dataframe_info(df)
        self.df_info[name] = info

        print(f"Added CSV '{name}' with {len(df)} rows and {len(df.columns)} columns")
        return True

    def _create_dataframe_info(self, df: pd.DataFrame) -> str:
        """Build the text summary of ``df`` that is used as prompt context.

        Args:
            df: DataFrame to describe.

        Returns:
            A string listing the column names, the row count and the first
            three rows rendered with ``DataFrame.to_string``.
        """
        # Column labels are not guaranteed to be strings (e.g. integer
        # headers), so convert each one before joining.
        column_names = ', '.join(str(col) for col in df.columns)
        info = f"Columns: {column_names}\n"
        info += f"Rows: {len(df)}\n"
        info += "Sample data:\n"
        info += df.head(3).to_string()
        return info

    def generate_answer(self, question: str, csv_names: Optional[List[str]] = None) -> str:
        """Ask the chat model ``question`` with the dataset summaries as context.

        Args:
            question: Natural-language question about the data.
            csv_names: Names of the datasets to include. Defaults to every
                dataset that has been added.

        Returns:
            The model's answer as plain text.
        """
        if csv_names is None:
            csv_names = list(self.dataframes.keys())

        context = self._prepare_context(csv_names)

        prompt = PromptTemplate.from_template(
            """You are a data analyst. Answer this question using the CSV data provided.

            DATA:
            {context}

            QUESTION:
            {question}

            ANSWER:"""
        )

        # prompt -> chat model -> plain string
        chain = prompt | self.llm | StrOutputParser()
        return chain.invoke({"context": context, "question": question})

    def _prepare_context(self, csv_names: List[str]) -> str:
        """Concatenate the stored summaries of the requested datasets.

        Args:
            csv_names: Dataset names to include, in output order.

        Returns:
            One ``=== <name> Dataset ===`` section per known dataset. Names
            that have not been added are skipped silently, so the result may
            be an empty string.
        """
        sections = [
            f"=== {name} Dataset ===\n{self.df_info[name]}\n\n"
            for name in csv_names
            if name in self.dataframes
        ]
        return "".join(sections)

**## 11.5.2 Loading and Processing CSV Files Efficiently**

In [11]:
def _downcast_numeric_columns(frame: pd.DataFrame) -> pd.DataFrame:
    """Shrink the int64/float64 columns of ``frame`` in place using its actual values."""
    int32_limits = np.iinfo(np.int32)
    for col in frame.columns:
        column = frame[col]
        if column.dtype == 'int64':
            # Narrow only when every value fits; an empty column trivially does.
            if column.empty or (column.min() >= int32_limits.min
                                and column.max() <= int32_limits.max):
                frame[col] = column.astype('int32')
        elif column.dtype == 'float64':
            # Let pandas decide whether the values survive a smaller float type.
            frame[col] = pd.to_numeric(column, downcast='float')
    return frame


def load_csv_efficiently(file_path: str,
                       chunk_size: Optional[int] = None,
                       optimize_dtypes: bool = True) -> pd.DataFrame:
    """Load a CSV file with optional dtype optimisation and chunked reading.

    When ``optimize_dtypes`` is true, the first 1000 rows are sampled to find
    text columns with fewer than 50% unique values; those are parsed as
    ``category``. Numeric columns are downcast *after* parsing, based on the
    values actually loaded, rather than forcing a dtype guessed from the
    sample onto the whole file. Forcing the guessed dtype breaks or corrupts
    the load when later rows hold larger integers, missing values or
    non-integer numbers.

    Args:
        file_path: Path of the CSV file.
        chunk_size: If given, read the file in chunks of this many rows and
            concatenate them; otherwise read it in one call.
        optimize_dtypes: Whether to apply the category / numeric downcasting
            described above.

    Returns:
        The loaded DataFrame.

    Raises:
        Whatever ``pandas.read_csv`` raises for the main read (e.g. a missing
        file). Failures while sampling are ignored and only disable the
        category optimisation.
    """
    start_time = time.time()

    # Sample the file to find low-cardinality text columns
    category_dtypes = None
    if optimize_dtypes:
        try:
            sample = pd.read_csv(file_path, nrows=1000)
            if len(sample) > 0:  # avoid dividing by zero on a header-only file
                category_dtypes = {
                    col: 'category'
                    for col in sample.columns
                    if (pd.api.types.is_object_dtype(sample[col])
                        or pd.api.types.is_string_dtype(sample[col]))
                    and sample[col].nunique() / len(sample) < 0.5
                }
        except Exception:
            # Sampling is best-effort; the main read below reports real errors.
            category_dtypes = None
    category_dtypes = category_dtypes or None

    # Load the data
    if chunk_size is not None:
        # Load in chunks to reduce memory pressure
        chunks = []
        for chunk in pd.read_csv(file_path, chunksize=chunk_size, dtype=category_dtypes):
            chunks.append(_downcast_numeric_columns(chunk) if optimize_dtypes else chunk)

        if chunks:
            df = pd.concat(chunks)
            # Chunks whose category sets differ stop being categorical when
            # concatenated, so restore the dtype on the combined frame.
            for col in category_dtypes or ():
                if col in df.columns and not isinstance(df[col].dtype, pd.CategoricalDtype):
                    df[col] = df[col].astype('category')
        else:
            # The reader produced no chunks; pd.concat rejects an empty list.
            df = pd.read_csv(file_path, dtype=category_dtypes)
    else:
        # Load all at once
        df = pd.read_csv(file_path, dtype=category_dtypes)
        if optimize_dtypes:
            df = _downcast_numeric_columns(df)

    print(f"Loaded CSV in {time.time() - start_time:.2f} seconds")
    return df

def optimize_dataframe_memory(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with smaller dtypes and report the memory saved.

    Integer and float columns are downcast with ``pd.to_numeric``; text
    columns in which fewer than 50% of the values are unique are converted
    to ``category``. The input frame is not modified.

    Args:
        df: DataFrame to optimise. May be empty.

    Returns:
        The optimised copy.
    """
    # Start with a copy of the dataframe
    result = df.copy()

    # Optimize integers
    for col in result.select_dtypes(include=['int']).columns:
        result[col] = pd.to_numeric(result[col], downcast='integer')

    # Optimize floats
    for col in result.select_dtypes(include=['float']).columns:
        result[col] = pd.to_numeric(result[col], downcast='float')

    # Convert text columns with low cardinality to category.
    # Skipped for an empty frame, where the unique ratio is undefined.
    num_total = len(result)
    if num_total > 0:
        for col in result.columns:
            column = result[col]
            is_text = (pd.api.types.is_object_dtype(column)
                       or pd.api.types.is_string_dtype(column))
            if is_text and column.nunique() / num_total < 0.5:  # If < 50% are unique values
                result[col] = column.astype('category')

    # Calculate memory savings
    original_mem = df.memory_usage(deep=True).sum() / 1024**2  # MB
    optimized_mem = result.memory_usage(deep=True).sum() / 1024**2  # MB
    # A frame that occupies no memory has nothing to save; avoid dividing by zero.
    saved_pct = (1 - optimized_mem / original_mem) * 100 if original_mem > 0 else 0.0

    print(f"Memory reduced from {original_mem:.2f} MB to {optimized_mem:.2f} MB")
    print(f"Saved {original_mem - optimized_mem:.2f} MB ({saved_pct:.1f}%)")

    return result

**## 11.5.3 Implementing Search Mechanisms**

In [12]:
class CSVSearchSystem:
    """Search system for CSV data supporting both keyword and semantic search.

    Semantic search embeds fixed-size groups of rows into a FAISS vector
    store and returns every row of the best-matching groups. Keyword search
    scans the dataframe directly and needs no embeddings.

    Attributes:
        embeddings: OpenAI embedding client used to build and query the index.
        vector_store: FAISS store built by ``index_dataframe`` (None until then).
        df: The dataframe most recently passed to ``index_dataframe``.
    """

    def __init__(self, embedding_model="text-embedding-3-small"):
        """Initialize the search system.

        Args:
            embedding_model: Name of the OpenAI embedding model to use.
        """
        self.embeddings = OpenAIEmbeddings(model=embedding_model)
        self.vector_store = None
        self.df = None

    def index_dataframe(self, df: pd.DataFrame, chunk_size: int = 20) -> bool:
        """Index a dataframe for searching.

        Args:
            df: Dataframe to index. It is stored by reference, not copied.
            chunk_size: Number of consecutive rows embedded as one document.

        Returns:
            True when a vector store was built; False when ``df`` has no
            rows (keyword search still works, semantic search reports that
            nothing is indexed).

        Raises:
            ValueError: If ``chunk_size`` is smaller than 1.
        """
        if chunk_size < 1:
            raise ValueError("chunk_size must be a positive integer")

        self.df = df

        if len(df) == 0:
            # There is nothing to embed; also drop any store built for a
            # previous dataframe so it cannot be queried against this one.
            self.vector_store = None
            print("Dataframe is empty; nothing to index")
            return False

        # Create text representations of chunks of the dataframe
        documents = []

        for i in range(0, len(df), chunk_size):
            end_idx = min(i + chunk_size, len(df))
            chunk = df.iloc[i:end_idx]

            # Create text representation of this chunk
            text = f"Rows {i} to {end_idx-1}:\n"
            for _, row in chunk.iterrows():
                text += str(dict(row)) + "\n"

            # Metadata records the positional (iloc) row range, inclusive
            doc = Document(
                page_content=text,
                metadata={"start_idx": i, "end_idx": end_idx-1}
            )
            documents.append(doc)

        # Create vector store
        self.vector_store = FAISS.from_documents(documents, self.embeddings)
        print(f"Indexed {len(documents)} chunks from dataframe")
        return True

    def semantic_search(self, query: str, k: int = 3) -> pd.DataFrame:
        """Perform a semantic search on the dataframe.

        Args:
            query: Free-text query.
            k: Number of row chunks (not rows) to retrieve.

        Returns:
            A copy of all rows belonging to the ``k`` retrieved chunks, or a
            one-row frame with an ``error`` column if nothing is indexed.
        """
        if self.vector_store is None or self.df is None:
            return pd.DataFrame({"error": ["No dataframe has been indexed"]})

        # Perform the search
        results = self.vector_store.similarity_search(query, k=k)

        # Gather all row positions from the results
        all_indices = []
        for doc in results:
            start_idx = doc.metadata["start_idx"]
            end_idx = doc.metadata["end_idx"]
            all_indices.extend(range(start_idx, end_idx + 1))

        # Return the relevant rows
        return self.df.iloc[all_indices].copy()

    def keyword_search(self, query: str, k: int = 10) -> pd.DataFrame:
        """Perform a keyword-based search on the dataframe.

        The query is split on whitespace. A row scores one point for every
        (keyword, column) pair in which the keyword occurs as a
        case-insensitive literal substring of the cell's string form.

        Args:
            query: Whitespace-separated keywords.
            k: Maximum number of rows to return.

        Returns:
            A copy of up to ``k`` highest-scoring rows that matched at least
            one keyword (possibly empty), or a one-row frame with an
            ``error`` column if no dataframe has been set.
        """
        if self.df is None:
            return pd.DataFrame({"error": ["No dataframe has been indexed"]})

        # Split the query into keywords
        keywords = query.lower().split()

        # Scores are tracked by row position so duplicate index labels or
        # duplicate column names cannot multiply rows in the result.
        scores = np.zeros(len(self.df), dtype=int)

        # Search for each keyword in each column
        for col_pos in range(self.df.shape[1]):
            # Convert column to string for searching
            col_str = self.df.iloc[:, col_pos].astype(str).str.lower()

            for keyword in keywords:
                # regex=False: keywords are user text, not patterns, so
                # characters like "+" or "(" must match literally.
                matches = col_str.str.contains(keyword, na=False, regex=False)
                scores += matches.to_numpy(dtype=int)

        # Keep only rows that matched something, then take the top k
        ranked = pd.Series(scores)
        top_positions = ranked[ranked > 0].nlargest(k).index.to_numpy()
        return self.df.iloc[top_positions].copy()

**## 11.5.4 Handling Large CSV Datasets**

In [13]:
def convert_csv_to_parquet(csv_path: str, parquet_path: str) -> Dict:
    """Rewrite a CSV file as Parquet and report size and timing metrics.

    The CSV is read in full with ``pyarrow.csv.read_csv`` and written with
    ``pyarrow.parquet.write_table``.

    Args:
        csv_path: Path of the existing CSV file.
        parquet_path: Destination path for the Parquet file.

    Returns:
        A dict with ``csv_size_mb``, ``parquet_size_mb``,
        ``compression_ratio`` (CSV size divided by Parquet size) and
        ``conversion_time_seconds`` (elapsed time covering the read, the
        write and the size lookups).
    """
    import pyarrow.csv as csv
    import pyarrow.parquet as pq

    started_at = time.time()

    # CSV -> Arrow table -> Parquet file
    pq.write_table(csv.read_csv(csv_path), parquet_path)

    # File sizes on disk, in megabytes
    bytes_per_mb = 1024 * 1024
    csv_size_mb = os.path.getsize(csv_path) / bytes_per_mb
    parquet_size_mb = os.path.getsize(parquet_path) / bytes_per_mb

    metrics = {
        "csv_size_mb": csv_size_mb,
        "parquet_size_mb": parquet_size_mb,
        "compression_ratio": csv_size_mb / parquet_size_mb,
    }
    metrics["conversion_time_seconds"] = time.time() - started_at
    return metrics

def process_large_csv_in_batches(file_path: str,
                               batch_size: int = 10000,
                               process_func = None) -> List[Any]:
    """Stream a CSV file in fixed-size batches and collect one result per batch.

    Args:
        file_path: Path of the CSV file.
        batch_size: Number of rows read per batch.
        process_func: Callable applied to each batch DataFrame. When omitted
            (or falsy), the batch's row count is recorded instead.

    Returns:
        A list with one entry per batch, in file order.
    """
    # Without a callback, fall back to counting the rows of each batch
    handle_batch = process_func if process_func else len

    batch_reader = pd.read_csv(file_path, chunksize=batch_size)
    return [handle_batch(batch) for batch in batch_reader]

**## 11.5.5 Memory and Performance Considerations**

In [14]:
def profile_operation(func, *args, **kwargs):
    """Run ``func(*args, **kwargs)`` and print its wall time and RSS growth.

    Args:
        func: Callable to execute.
        *args: Positional arguments forwarded to ``func``.
        **kwargs: Keyword arguments forwarded to ``func``.

    Returns:
        Whatever ``func`` returns. The timing and the change in this
        process's resident memory (in MB) are printed, not returned.
    """
    import time
    import psutil

    # Snapshot before the call
    t_begin = time.time()
    this_process = psutil.Process(os.getpid())
    rss_before_mb = this_process.memory_info().rss / 1024 / 1024

    outcome = func(*args, **kwargs)

    # Snapshot after the call
    t_end = time.time()
    rss_after_mb = this_process.memory_info().rss / 1024 / 1024

    print(f"Execution time: {t_end - t_begin:.2f} seconds")
    print(f"Memory increase: {rss_after_mb - rss_before_mb:.2f} MB")

    return outcome

from functools import lru_cache

@lru_cache(maxsize=100)
def cached_embedding_generation(text):
    """Return the query embedding for ``text``, memoised per distinct text.

    Results for up to 100 distinct inputs are kept by ``lru_cache``; a
    repeated ``text`` is served from the cache without another embedding
    call. A default-configured ``OpenAIEmbeddings`` client is created on
    each cache miss. ``text`` must be hashable.
    """
    return OpenAIEmbeddings().embed_query(text)

def batch_process_with_memory_monitoring(df, batch_size=1000, func=None):
    """Apply ``func`` to consecutive row slices of ``df``, collecting garbage between slices.

    Args:
        df: DataFrame to process.
        batch_size: Number of rows per slice.
        func: Callable applied to each slice. When omitted (or falsy), the
            slice's row count is recorded instead.

    Returns:
        A list with one result per slice, in row order.
    """
    import gc

    # Without a callback, each batch contributes its row count
    apply_to_batch = func if func else len
    total_rows = len(df)
    collected = []

    for start in range(0, total_rows, batch_size):
        window = df.iloc[start:min(start + batch_size, total_rows)]
        collected.append(apply_to_batch(window))

        # Drop the slice and force a collection before moving on
        del window
        gc.collect()

    return collected

**Complete Example: Titanic Dataset Q&A System**

In [None]:
# Download the Titanic dataset
!wget -q https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv -O titanic.csv

# Let's build a Q&A system for the Titanic dataset
import matplotlib.pyplot as plt
import seaborn as sns

# 1. Create our CSV QA system
# add_csv derives the dataset name from the file name, so "titanic.csv"
# is registered under the key "titanic".
qa_system = CSVQuestionAnsweringSystem()
qa_system.add_csv("titanic.csv")

# 2. Optimize the dataframe for memory efficiency
# Only the stored dataframe is swapped here; the text summary that add_csv
# placed in qa_system.df_info is not regenerated.
original_df = qa_system.dataframes["titanic"]
optimized_df = optimize_dataframe_memory(original_df)
qa_system.dataframes["titanic"] = optimized_df

# 3. Create a search system
# index_dataframe builds a FAISS index over groups of rows using OpenAI
# embeddings (default chunk_size of 20 rows per document).
search_system = CSVSearchSystem()
search_system.index_dataframe(optimized_df)

# 4. Let's try some searches
print("\nKeyword search for 'first class female':")
keyword_results = search_system.keyword_search("first class female")
print(f"Found {len(keyword_results)} matches")
print(keyword_results.head(3))

# semantic_search returns every row of each retrieved chunk, so the count
# printed below is the number of rows in those chunks rather than the number
# of individually matching passengers.
print("\nSemantic search for 'wealthy women passengers':")
semantic_results = search_system.semantic_search("wealthy women passengers")
print(f"Found {len(semantic_results)} matches")
print(semantic_results.head(3))

# 5. Let's visualize some of the search results
# Bar chart of the 'Survived' values among the rows returned by the semantic search.
plt.figure(figsize=(10, 6))
sns.countplot(data=semantic_results, x='Survived')
plt.title('Survival Counts for "wealthy women passengers" search')
plt.show()

# 6. Ask questions using the QA system
# generate_answer sends the dataset summary (column names, row count and the
# first three rows) to the chat model as context, not the full table.
questions = [
    "What was the survival rate for women compared to men?",
    "How many passengers were in each class?",
    "Were children more likely to survive than adults?"
]

for question in questions:
    print(f"\nQuestion: {question}")
    answer = qa_system.generate_answer(question)
    print(f"Answer: {answer}")

print("\nCSV Q&A System demonstration complete!")