In [5]:
import os

#os.environ["GROQ_API_KEY"] = ""
api_key = os.environ.get("GROQ_API_KEY")
#print(f"GROQ_API_KEY: {api_key}")


In [6]:
import os

from groq import Groq

client = Groq(
    api_key=os.environ.get("GROQ_API_KEY"),
)

chat_completion = client.chat.completions.create(
    messages=[
        {
            "role": "user",
            "content": "Explain the importance of fast language models",
        }
    ],
    model="llama3-8b-8192",
)

print(chat_completion.choices[0].message.content)

Fast language models, also known as efficient language models or Accelerated Language Models, have gained significant attention in recent years due to their importance in various applications. Here are some reasons why fast language models are crucial:

1. **Real-time processing**: Fast language models enable real-time processing of natural language data, making them essential for applications like chatbots, voice assistants, and voice-to-text systems. These systems require immediate responses to user inputs, and fast language models can provide them quickly.
2. **Efficient inference**: Traditional language models are computationally expensive, which can lead to slow inference times. Fast language models, on the other hand, are designed to have faster inference times, making them suitable for applications where speed and efficiency are critical, such as search engines and recommendation systems.
3. **Scalability**: As the amount of user-generated content grows, traditional language mod

### Base Test

In [11]:
%pip install -qU langchain-groq

Note: you may need to restart the kernel to use updated packages.


In [10]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_groq import ChatGroq

chat = ChatGroq(
    temperature=0,
    model="llama3-70b-8192",
    # api_key="" # Optional if not set as an environment variable
)

system = "You are a helpful assistant."
human = "{text}"
prompt = ChatPromptTemplate.from_messages([("system", system), ("human", human)])

chain = prompt | chat
chain.invoke({"text": "Explain the importance of low latency for LLMs."})

AIMessage(content='Low latency is crucial for Large Language Models (LLMs) because it directly impacts the user experience, model performance, and overall efficiency of language-based applications. Here are some reasons why low latency is essential for LLMs:\n\n1. **Real-time Interaction**: LLMs are often used in applications that require real-time interaction, such as chatbots, virtual assistants, and language translation systems. Low latency ensures that the model responds quickly to user input, providing a seamless and engaging experience.\n2. **Conversational Flow**: In conversational AI, latency can disrupt the natural flow of conversation. High latency can lead to awkward pauses, making the interaction feel unnatural and frustrating. Low latency helps maintain a smooth conversation, allowing users to engage more naturally with the model.\n3. **User Engagement**: High latency can lead to user frustration, causing them to abandon the application or lose interest. Low latency, on th

In [11]:
from typing import Optional

from langchain_core.tools import tool


@tool
def get_current_weather(location: str, unit: Optional[str]):
    """Get the current weather in a given location"""
    return "Cloudy with a chance of rain."


tool_model = chat.bind_tools([get_current_weather], tool_choice="auto")

res = tool_model.invoke("What is the weather like in San Francisco and Tokyo?")

res.tool_calls

[{'name': 'get_current_weather',
  'args': {'location': 'San Francisco', 'unit': 'Celsius'},
  'id': 'call_mh86'},
 {'name': 'get_current_weather',
  'args': {'location': 'Tokyo', 'unit': 'Celsius'},
  'id': 'call_yexd'}]

In [12]:
from langchain_core.pydantic_v1 import BaseModel, Field


class Joke(BaseModel):
    """Joke to tell user."""

    setup: str = Field(description="The setup of the joke")
    punchline: str = Field(description="The punchline to the joke")
    rating: Optional[int] = Field(description="How funny the joke is, from 1 to 10")


structured_llm = chat.with_structured_output(Joke)

structured_llm.invoke("Tell me a joke about cats")

Joke(setup='Why did the cat join a band?', punchline='Because it wanted to be the purr-cussionist!', rating=None)

In [13]:
chat = ChatGroq(temperature=0, model="llama3-70b-8192")
prompt = ChatPromptTemplate.from_messages([("human", "Write a Limerick about {topic}")])
chain = prompt | chat
await chain.ainvoke({"topic": "The Sun"})

AIMessage(content='Here is a limerick about the sun:\n\nThere once was a sun in the sky,\nWhose warmth and light caught the eye.\nIt shone bright and bold,\nWith a heat that was told,\nAnd brought life to all, as it flew by.', response_metadata={'token_usage': {'completion_tokens': 54, 'prompt_tokens': 18, 'total_tokens': 72, 'completion_time': 0.144831532, 'prompt_time': 0.005151873, 'queue_time': None, 'total_time': 0.14998340500000001}, 'model_name': 'llama3-70b-8192', 'system_fingerprint': 'fp_753a4aecf6', 'finish_reason': 'stop', 'logprobs': None}, id='run-cb464bc3-7a3b-4a17-8916-cb8952d6301f-0')

In [14]:
chat = ChatGroq(temperature=0, model="llama3-70b-8192")
prompt = ChatPromptTemplate.from_messages([("human", "Write a haiku about {topic}")])
chain = prompt | chat
for chunk in chain.stream({"topic": "The Moon"}):
    print(chunk.content, end="", flush=True)

Silver glowing face
Luna's gentle light descends
Midnight's peaceful hush

In [15]:
chat = ChatGroq(
    model="llama3-70b-8192", model_kwargs={"response_format": {"type": "json_object"}}
)

system = """
You are a helpful assistant.
Always respond with a JSON object with two string keys: "response" and "followup_question".
"""
human = "{question}"
prompt = ChatPromptTemplate.from_messages([("system", system), ("human", human)])

chain = prompt | chat

chain.invoke({"question": "what bear is best?"})

AIMessage(content='{"response": "That\'s a tough question! There are eight species of bears found in the world, and each has its own unique characteristics. However, if I had to pick one, I\'d say the giant panda is a popular favorite due to its distinct black and white markings and gentle nature.", "followup_question": "What do you think makes a bear \'best\' - its size, its habitat, or something else? "}', response_metadata={'token_usage': {'completion_tokens': 89, 'prompt_tokens': 50, 'total_tokens': 139, 'completion_time': 0.244468874, 'prompt_time': 0.011064902, 'queue_time': None, 'total_time': 0.255533776}, 'model_name': 'llama3-70b-8192', 'system_fingerprint': 'fp_c1a4bcec29', 'finish_reason': 'stop', 'logprobs': None}, id='run-157fd267-4523-450f-bb78-c0dbda683d68-0')

### Data Augmentation

In [17]:
import os
import pandas as pd
from langchain_core.prompts import ChatPromptTemplate
from langchain_groq import ChatGroq
import pandas as pd
import re

# Set your API key (ensure you have it set in your environment)
#os.environ['GROQ_API_KEY'] = 'your_api_key_here'

# Initialize the Groq chat model
chat = ChatGroq(
    temperature=0.7,
    model="llama3-70b-8192",
    api_key=os.getenv('GROQ_API_KEY')
)

def sanitize_data(df):
    # Remove rows with NaN values
    df = df.dropna(subset=['concerns', 'anything else'])
    
    # Function to clean text by removing special characters and leading/trailing whitespaces
    def clean_text(text):
        # Remove leading/trailing whitespaces
        text = text.strip()
        # Remove special characters (except common punctuation and spaces)
        text = re.sub(r'[^A-Za-z0-9.,?!\s]', '', text)
        return text

    # Apply cleaning function to 'concerns' and 'anything else' columns
    df['concerns'] = df['concerns'].apply(clean_text)
    df['anything else'] = df['anything else'].apply(clean_text)
    
    return df

# Load the CSV file
input_file = '../../Data/Merged_data.csv'
data = pd.read_csv(input_file)

data = sanitize_data(data)

# Define the prompt template
system = "You are a helpful assistant."
human_template = "Augment this response: {text}"

# Create the prompt template
prompt = ChatPromptTemplate.from_messages([("system", system), ("human", human_template)])

# Function to generate augmented data
def augment_text(text):
    chain = prompt | chat
    augmented_response = chain.invoke({"text": text})
    return augmented_response.content

# Generate augmented data for each row
augmented_data = []

for index, row in data.iterrows():
    augmented_concerns = augment_text(row['concerns'])
    augmented_anything_else = augment_text(row['anything else'])
    
    augmented_data.append({
        'response id': row['response id'],
        'concerns': augmented_concerns,
        'concerns category': row['concerns category'],
        'anything else': augmented_anything_else,
        'anything else category': row['anything else category']
    })

# Convert augmented data to DataFrame
augmented_df = pd.DataFrame(augmented_data)

# Save augmented data to a new CSV file
output_file = '../../Data/augmented_data.csv'
augmented_df.to_csv(output_file, index=False)

print(f"Augmented data saved to {output_file}")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['concerns'] = df['concerns'].apply(clean_text)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['anything else'] = df['anything else'].apply(clean_text)


Augmented data saved to ../../Data/augmented_data.csv


### Synthetic Data Generation

In [18]:
import os
import pandas as pd
from langchain_core.prompts import ChatPromptTemplate
from langchain_groq import ChatGroq
import re

# Set your API key (ensure you have it set in your environment)
#os.environ['GROQ_API_KEY'] = 'your_api_key_here'

# Initialize the Groq chat model
chat = ChatGroq(
    temperature=0.7,
    model="llama3-70b-8192",
    api_key=os.getenv('GROQ_API_KEY')
)

def sanitize_data(df):
    # Remove rows with NaN values
    df = df.dropna(subset=['concerns', 'anything else'])
    
    # Function to clean text by removing special characters and leading/trailing whitespaces
    def clean_text(text):
        # Remove leading/trailing whitespaces
        text = text.strip()
        # Remove special characters (except common punctuation and spaces)
        text = re.sub(r'[^A-Za-z0-9.,?!\s]', '', text)
        return text

    # Apply cleaning function to 'concerns' and 'anything else' columns
    df['concerns'] = df['concerns'].apply(clean_text)
    df['anything else'] = df['anything else'].apply(clean_text)
    
    return df

# Load the CSV file
input_file = '../../Data/Merged_data.csv'
data = pd.read_csv(input_file)

data = sanitize_data(data)

# Define the prompt template for generating synthetic data
system = "You are a helpful assistant that generates realistic synthetic data for a survey."
human_template = "Generate a synthetic response for the concerns category: {concerns_category} and anything else category: {anything_else_category}."

# Create the prompt template
prompt = ChatPromptTemplate.from_messages([("system", system), ("human", human_template)])

# Function to generate synthetic data
def generate_synthetic_data(concerns_category, anything_else_category):
    chain = prompt | chat
    synthetic_response = chain.invoke({
        "concerns_category": concerns_category,
        "anything_else_category": anything_else_category
    })
    return synthetic_response.content

# Generate synthetic data for each row
synthetic_data = []

for index, row in data.iterrows():
    synthetic_response = generate_synthetic_data(row['concerns category'], row['anything else category'])
    synthetic_concerns, synthetic_anything_else = synthetic_response.split('\n', 1)
    
    synthetic_data.append({
        'response id': row['response id'],
        'concerns': synthetic_concerns.strip(),
        'concerns category': row['concerns category'],
        'anything else': synthetic_anything_else.strip(),
        'anything else category': row['anything else category']
    })

# Convert synthetic data to DataFrame
synthetic_df = pd.DataFrame(synthetic_data)

# Save synthetic data to a new CSV file
output_file = '../../Data/synthetic_data.csv'
synthetic_df.to_csv(output_file, index=False)

print(f"Synthetic data saved to {output_file}")


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['concerns'] = df['concerns'].apply(clean_text)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['anything else'] = df['anything else'].apply(clean_text)


Synthetic data saved to ../../Data/synthetic_data.csv


In [20]:
import os
import pandas as pd
from langchain_core.prompts import ChatPromptTemplate
from langchain_groq import ChatGroq
import re

# Set your API key (ensure you have it set in your environment)
#os.environ['GROQ_API_KEY'] = 'your_api_key_here'

# Initialize the Groq chat model
chat = ChatGroq(
    temperature=0.7,
    model="llama3-70b-8192",
    api_key=os.getenv('GROQ_API_KEY')
)

def sanitize_data(df):
    # Remove rows with NaN values
    df = df.dropna(subset=['concerns', 'anything else'])
    
    # Function to clean text by removing special characters and leading/trailing whitespaces
    def clean_text(text):
        # Remove leading/trailing whitespaces
        text = text.strip()
        # Remove special characters (except common punctuation and spaces)
        text = re.sub(r'[^A-Za-z0-9.,?!\s]', '', text)
        return text

    # Apply cleaning function to 'concerns' and 'anything else' columns
    df['concerns'] = df['concerns'].apply(clean_text)
    df['anything else'] = df['anything else'].apply(clean_text)
    
    return df

# Load the CSV file
input_file = '../../Data/Merged_data.csv'
data = pd.read_csv(input_file)

data = sanitize_data(data)

# Define the prompt template for generating synthetic data
system = "You are a helpful assistant that generates realistic synthetic data for a survey. Generate responses that are similar in style and context to the provided examples."
human_template = "Given the following categories: \nConcerns Category: {concerns_category}\nAnything Else Category: {anything_else_category} \
\nHere, AC stands for Academic Concerns, TC for Technical Concerns, PC for Personal Concerns, and NC for No Concerns. The {concerns} and the {anything_else}\
represent open-ended responses entered by students after participating in the start-of-semester survey. {concerns_category} label corresponds to {concerns} and {anything_else_category} label corresponds to {anything_else}.This survey is for a distance Multivariable Calculus course.\
\nGenerate a synthetic response similar to the examples but covering more real-world variations."

# Create the prompt template
prompt = ChatPromptTemplate.from_messages([("system", system), ("human", human_template)])

# Function to generate synthetic data
def generate_synthetic_data(concerns, anything_else, concerns_category, anything_else_category):
    chain = prompt | chat
    synthetic_response = chain.invoke({
        "concerns": concerns,
        "anything_else": anything_else,
        "concerns_category": concerns_category,
        "anything_else_category": anything_else_category
    })
    return synthetic_response.content

# Generate synthetic data for each row
synthetic_data = []

for index, row in data.iterrows():
    # Generate synthetic response for concerns
    synthetic_concerns = generate_synthetic_data(row['concerns category'], row['anything else category'], row['concerns'], row['anything else'])
    
    synthetic_data.append({
        'response id': f'synthetic_{index}',
        'concerns': synthetic_concerns,
        'concerns category': row['concerns category'],
        'anything else': synthetic_concerns,
        'anything else category': row['anything else category']
    })

# Convert synthetic data to DataFrame
synthetic_df = pd.DataFrame(synthetic_data)

# Save synthetic data to a new CSV file
output_file = '../../Data/synthetic_data_final.csv'
synthetic_df.to_csv(output_file, index=False)

print(f"Synthetic data saved to {output_file}")


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['concerns'] = df['concerns'].apply(clean_text)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['anything else'] = df['anything else'].apply(clean_text)


Synthetic data saved to ../../Data/synthetic_data_final.csv


### Other Trial

In [None]:
import pandas as pd
import requests

# Load the dataset
file_path = '../Data/After/2023.02.StartOfSemester.Coded.csv'  # Update with your file path
# Try different encodings if the default 'utf-8' fails
encodings = ['latin1', 'iso-8859-1', 'cp1252']

for encoding in encodings:
    try:
        df = pd.read_csv(file_path, encoding=encoding)
        print(f"File successfully read with encoding: {encoding}")
        break
    except UnicodeDecodeError:
        print(f"Failed to read file with encoding: {encoding}")

# Select the columns to be augmented
columns_to_generate = ['concerns', 'anything else']

def call_groq_api_for_generation(prompt, api_key):
    url = "https://api.groq.com/generate"
    headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json"
    }
    payload = {
        "prompt": prompt,
        "max_length": 200  # Adjust based on your requirements
    }
    response = requests.post(url, headers=headers, json=payload)
    if response.status_code == 200:
        return response.json()["generated_text"]
    else:
        print(f"Error: {response.status_code}, {response.text}")
        return prompt

api_key = os.environ.get("GROQ_API_KEY")  # Replace with your Groq API key

synthetic_data = []

for index, row in df.iterrows():
    for col in columns_to_generate:
        text = row[col]
        synthetic_text = call_groq_api_for_generation(text, api_key)
        synthetic_data.append({col: synthetic_text, 'original_column': col})

synthetic_df = pd.DataFrame(synthetic_data)

synthetic_file_path = '../Data/After/synthetic_file.csv'  # Update with your desired file path
synthetic_df.to_csv(synthetic_file_path, index=False)


In [None]:
import os
import math
import requests
from groq import Groq
import pandas as pd


# Your message data
messages = [
    {
        "role": "user",
        "content": "Hello, how are you?"
    }
]

# Sanitize the data
messages = sanitize_data(messages)

# Initialize the client
client = Groq(api_key=os.environ["GROQ_API_KEY"])

# Correct endpoint for chat completions
try:
    chat_completion = client.chat.completions.create(
        messages=messages,
        model="llama3-8b-8192",
    )
    print(chat_completion.choices[0].message.content)
except requests.exceptions.RequestException as e:
    print(f"Request failed: {e}")
except ValueError as e:
    print(f"Value error: {e}")
