# Building Sequential Tasks with LangChain and Google GenAI
Made by: Wilfredo Aaron Sosa Ramos (AI Lab Manager at RealityAI Labs)

In [1]:
!pip install -q langchain langchain_community langchain_core langchain_google_genai

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/2.5 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.4/2.5 MB[0m [31m14.4 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━[0m [32m2.3/2.5 MB[0m [31m30.3 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m2.5/2.5 MB[0m [31m29.8 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m20.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.3/41.3 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m49.3/49.3 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[?25h

In [4]:
!pip install -q rich

In [2]:
import os
from google.colab import userdata

# Only consult Colab's secret store when the key is missing or blank in the
# environment, so a value that is already exported is left untouched.
if os.environ.get("GOOGLE_API_KEY", "") == "":
    os.environ["GOOGLE_API_KEY"] = userdata.get('GOOGLE_API_KEY')

## 1. The Pipe Operator (|)


In [3]:
# Setup
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate

# Gemini chat model; the chains built below all reuse this one instance
model = ChatGoogleGenerativeAI(model="gemini-1.5-flash")

# Prompt template with a single {task} placeholder describing the code to write
code_prompt = ChatPromptTemplate.from_template(
    "Write Python code for the following task: {task}"
)

# prompt -> model -> plain string: `|` feeds each stage's output into the next
code_chain = (
    code_prompt
    | model
    | StrOutputParser()
)

# Run the chain once and show the text it returns
result = code_chain.invoke(
    {"task": "read a CSV file and calculate the mean of a column"}
)
print("Generated Code:\n", result)

Generated Code:
 Several methods exist for calculating the mean of a column in a CSV file using Python. Here are two common approaches, one using the `csv` module and another using the more efficient `pandas` library:


**Method 1: Using the `csv` module (for smaller files)**

This method is suitable for smaller CSV files where loading the entire file into memory isn't a concern.

```python
import csv

def calculate_column_mean(filepath, column_name):
    """Calculates the mean of a specified column in a CSV file.

    Args:
        filepath: Path to the CSV file.
        column_name: The name of the column to calculate the mean for.

    Returns:
        The mean of the column, or None if the column is not found or contains non-numeric values.  
    """
    try:
        with open(filepath, 'r', newline='') as csvfile:
            reader = csv.DictReader(csvfile)  # Use DictReader for easier column access
            column_data = []
            for row in reader:
                try:


In [5]:
from rich.console import Console
from rich.markdown import Markdown

# One rich console instance, reused by print_md for every render
console = Console()


def print_md(result):
    """Render ``result`` as Markdown and print it through the rich console.

    Args:
        result: Markdown-formatted text, e.g. the string returned by a chain.
    """
    console.print(Markdown(result))

In [6]:
# Show the generated code answer as rendered Markdown rather than raw text
print_md(result)

## 2. Coercion for Code Analysis


In [7]:
# Prompt that asks the model to review a piece of Python code passed in as {code}
analysis_prompt = ChatPromptTemplate.from_template(
    "Analyze the following Python code and provide suggestions for optimization: {code}"
)

# The leading dict is coerced into a runnable: it runs code_chain on the input
# and hands its output to the analysis prompt under the "code" key.
composed_chain = (
    {"code": code_chain}
    | analysis_prompt
    | model
    | StrOutputParser()
)

# One invoke call generates the code and then analyzes it
result_analysis = composed_chain.invoke(
    {"task": "read a CSV file and calculate the mean of a column"}
)
print("Code Analysis:\n", result_analysis)

# Same pipeline, but the re-keying step is an explicit function: a plain
# callable placed in a chain is coerced into a runnable step.
composed_chain_with_lambda = (
    code_chain
    | (lambda generated_code: {"code": generated_code})  # wrap the string for the {code} slot
    | analysis_prompt
    | model
    | StrOutputParser()
)

result_lambda = composed_chain_with_lambda.invoke(
    {"task": "read a CSV file and calculate the mean of a column"}
)
print("Code Analysis with Lambda:\n", result_lambda)

Code Analysis:
 The analysis shows that the Pandas method is significantly more efficient, especially for larger files.  The `csv` module method is inherently slower because it iterates row by row and performs manual type conversion and error handling. Pandas leverages optimized vectorized operations.  However, both methods can be improved.


**Improvements for the `csv` module method:**

1. **Error Handling:** The `try-except` block inside the loop is inefficient.  It's better to filter for numeric values *after* reading the entire column. This avoids repeated exception handling.

2. **Type hinting:** Adding type hints improves code readability and maintainability.

3. **More informative error messages:**  The error messages could provide more context (e.g., the row number).


```python
import csv
from typing import List, Union

def calculate_column_mean_csv_improved(filepath: str, column_name: str) -> Union[float, None]:
    """Calculates the mean of a specified column in a CSV file 

In [8]:
# Show the dict-coercion chain's analysis as rendered Markdown
print_md(result_analysis)

In [9]:
# Show the lambda-coercion chain's analysis as rendered Markdown
print_md(result_lambda)

## 3. The .pipe() Method for Optimization

In [11]:
from langchain_core.runnables import RunnableParallel

# Prompt that asks the model to rewrite the code passed in as {code}
optimization_prompt = ChatPromptTemplate.from_template(
    "Optimize the following Python code to improve performance and readability: {code}"
)

# .pipe() is the method form of `|`; it accepts several runnables at once and
# appends them in order, producing the same sequence as chained .pipe() calls.
composed_chain_with_pipe = RunnableParallel({"code": code_chain}).pipe(
    optimization_prompt,
    model,
    StrOutputParser(),
)

# Generate code for the task, then have the model optimize it
result_pipe = composed_chain_with_pipe.invoke(
    {"task": "Create the Quicksort algorithm and use it in a ML algorithm"}
)
print("Optimized Code:\n", result_pipe)

Optimized Code:
 ```python
import random
import heapq  # For efficient k-NN search

def euclidean_distance(x1, x2):
    """Calculates the Euclidean distance between two data points."""
    return sum((a - b)**2 for a, b in zip(x1, x2))**0.5


def k_nearest_neighbors(X_train, y_train, X_test, k):
    """k-Nearest Neighbors algorithm using a heap for efficiency."""
    y_pred = []
    for test_point in X_test:
        # Use a heap to efficiently find the k-nearest neighbors
        distances = []
        for train_point, label in zip(X_train, y_train):
            dist = euclidean_distance(test_point, train_point)
            if len(distances) < k:
                heapq.heappush(distances, (-dist, label)) #negate distance for min-heap
            else:
                if -distances[0][0] > dist:
                    heapq.heapreplace(distances, (-dist, label))
        
        k_nearest = distances
        labels = [label for _, label in k_nearest]
        y_pred.append(max(set(labels), k

In [13]:
# Show the .pipe() chain's optimized code as rendered Markdown
print_md(result_pipe)