In [17]:
import pandas as pd
import csv
from dotenv import load_dotenv
from groq import Groq

**Pandas**  
When the data is in CSV form, pandas is an efficient and effective way to handle it  
It can be used for a wide range of data handling and manipulation tasks  
That makes it a good choice as a data retrieval mechanism

In [None]:
# Read the student performance CSV into a DataFrame and list each column's dtype
Data = pd.read_csv(filepath_or_buffer='Student_Performance.csv')
print(Data.dtypes)

>Check pandas functions that can be used for data calculations and inference

In [None]:
# A few of the summary computations pandas offers out of the box
print('Number of records : ', Data.shape[0])

# Spread of the daily study hours column: smallest, mean and largest value
study_hours = Data['study_hours_per_day']
print('Min, Avg, Max Study Hours : ', study_hours.min(), study_hours.mean(), study_hours.max())

# Distinct values present in two of the categorical columns
for label, column in (
    ("Values in Paretal Education :", 'parental_education_level'),
    ("Values in Internet Quality :", 'internet_quality'),
):
    print(label, Data[column].unique())

**Filtering**  
Pandas provides conditional filtering / slicing functions that can be used as part of retrieval  
The result is again another pandas DataFrame

In [None]:
# Rows where internet quality is 'Poor' and the exam score is above 90.
# Every column of the matching rows is kept.
has_poor_internet = Data['internet_quality'] == 'Poor'
scored_above_90 = Data['exam_score'] > 90.0
Filtered = Data[has_poor_internet & scored_above_90]
Filtered

In [None]:
# Rows at or above the column mean for both study hours and attendance
# whose exam score is nevertheless below 60
studies_enough = Data['study_hours_per_day'] >= Data['study_hours_per_day'].mean()
attends_enough = Data['attendance_percentage'] >= Data['attendance_percentage'].mean()
scores_low = Data['exam_score'] < 60
Filtered = Data[studies_enough & attends_enough & scores_low]
Filtered

**Use as Context**  
The rows filtered with pandas are passed to the LLM as context  
This kind of data retrieval is quick and effective  


In [None]:
# Load settings from a .env file, then create the Groq client used by the LLM calls below.
# NOTE(review): Groq() is given no explicit credentials here; presumably it reads the API key
# from the environment that load_dotenv() populates. Confirm against the Groq SDK docs.
load_dotenv()
client = Groq()

>Specific instructions in terms of Response

In [19]:
# System prompt for the answer-generation call.
# The text is built from adjacent string literals, which Python joins at
# compile time. A backslash continuation inside the quotes would instead copy
# the indentation of each continuation line into the prompt as a long run of
# spaces after every sentence.
R_Instr = (
    "Using the context given, provide response to the user question or statement. "
    "Context is provided as CSV formatted string. "
    "Answer to the question with details"
)

In [None]:
# Question to put to the model
Prompt = "Why do you think students score low marks despite attending class and studying well?"

# Retrieval step: rows at or above the mean for both study hours and
# attendance whose exam score is still below 60
studies_enough = Data['study_hours_per_day'] >= Data['study_hours_per_day'].mean()
attends_enough = Data['attendance_percentage'] >= Data['attendance_percentage'].mean()
scores_low = Data['exam_score'] < 60
Filtered = Data[studies_enough & attends_enough & scores_low]

# Serialise the matching rows as CSV text: no index column, floats to one decimal
Context = Filtered.to_csv(index=False, float_format='%.1f')

# Chat payload: response instructions as the system message,
# retrieved context followed by the question as the user message
system_message = {"role": "system", "content": R_Instr}
user_message = {"role": "user", "content": "Context : \n" + Context + "Query : \n" + Prompt}
messages = [system_message, user_message]

completion = client.chat.completions.create(model="llama3-70b-8192", messages=messages)

# Show the text of the first returned choice
print(completion.choices[0].message.content)

>Let's try adapting the response

In [89]:
# System prompt for the answer-generation call, asking for a comprehensive reply.
# The text is built from adjacent string literals, which Python joins at
# compile time. A backslash continuation inside the quotes would instead copy
# the indentation of each continuation line into the prompt as a long run of
# spaces after every sentence.
R_Instr = (
    "Using the context given, provide response to the user question or statement. "
    "Context is provided as CSV formatted string. "
    "Provide comprehensive response"
)

In [None]:
# Question to put to the model
Prompt = "Students who have good sleep patterns, are they utlising the time properly?"

# Retrieval step: rows with sleep hours at least 10% above the mean and an
# exam score of 75 or more, reduced to the time-use columns only
sleeps_well = Data['sleep_hours'] >= Data['sleep_hours'].mean()*1.1
scores_well = Data['exam_score'] >= 75
time_use_columns = [
    'study_hours_per_day',
    'social_media_hours',
    'netflix_hours',
    'part_time_job',
    'attendance_percentage',
]
Filtered = Data.loc[sleeps_well & scores_well, time_use_columns]

# Serialise the matching rows as CSV text: no index column, floats to one decimal
Context = Filtered.to_csv(index=False, float_format='%.1f')

# Chat payload: response instructions as the system message,
# retrieved context followed by the question as the user message
system_message = {"role": "system", "content": R_Instr}
user_message = {"role": "user", "content": "Context : \n" + Context + "Query : \n" + Prompt}
messages = [system_message, user_message]

completion = client.chat.completions.create(model="llama3-70b-8192", messages=messages)

# Show the text of the first returned choice
print(completion.choices[0].message.content)

**Get Code from LLM**  
As with SQL, we can ask the LLM to generate the pandas code for us  
This requires specific instructions to be provided for code generation  
The generated code can be used to extract data from the DataFrame, which is then used as context

In [69]:
# System prompt for the code-generation call.
# The text is built from adjacent string literals, which Python joins at
# compile time. A backslash continuation inside the quotes would instead copy
# the indentation of each continuation line into the prompt. Every sentence
# now ends with a full stop so consecutive instructions do not run together.
C_Instr = (
    "For the given pandas dtypes, write code lines that can filter out data to answer user question. "
    "Study the pandas dtypes and user question clearly. "
    "Give only the python code lines for necessary filtering of the Dataframe variable. "
    "Code that can filter and return dataframe. Assign results to 'Result'. "
    "No additional code / string"
)

In [None]:
# Two-step pipeline:
#   1. ask the LLM to write the pandas filter for the question,
#   2. run that filter and ask the LLM to answer from the filtered rows.
Prompt = "Who all have good score despite having attendance below 75?"
# Prompt = "how many scored > 90% with less than 70% attendance?"
# Prompt = "What are the students who score good despite of low study time do?"

# Name under which the DataFrame is described to the model and exposed to the generated code
Data_Frame_Name = "Data"

Dtype = str(Data.dtypes)


def _extract_code(reply):
    """Return the Python source contained in an LLM reply.

    Models often wrap code in a Markdown fence such as ```python ... ```.
    Stripping only the backticks would leave the language tag ("python") as
    the first line of the code, which then fails when executed. This helper
    removes the fence and the tag.

    Parameters
    ----------
    reply : str
        Raw text returned by the model.

    Returns
    -------
    str
        The text inside the first fenced section with any language tag
        removed, or the whitespace-trimmed reply if it contains no fence.
    """
    text = reply.strip()
    fence_start = text.find("```")
    if fence_start == -1:
        return text
    body = text[fence_start + 3:]
    fence_end = body.rfind("```")
    if fence_end != -1:
        body = body[:fence_end]
    # The remainder of the opening fence line is either empty or a language tag
    first_line, newline, remainder = body.partition("\n")
    if newline and first_line.strip().isalnum():
        body = remainder
    return body.strip()


# Step 1: request the filtering code
messages = [
    {"role": "system", "content": C_Instr},
    {
        "role": "user",
        # Each section sits on its own line so the dtype listing does not run
        # into the following label
        "content": (
            f"Data.dtype : \n{Dtype}\n"
            f"Dataframe variable : '{Data_Frame_Name}'\n"
            f"Question : \n{Prompt}"
        ),
    },
]
completion = client.chat.completions.create(
    messages=messages,
    model="llama3-70b-8192",
)

Code = completion.choices[0].message.content
Code_Clean = _extract_code(Code)
print(Code_Clean)

# SECURITY: exec() runs model-generated code with full interpreter privileges.
# Review the printed code before trusting it and never use this pattern with
# untrusted prompts. The code runs in its own namespace so it cannot overwrite
# notebook variables, and a 'Result' left over from an earlier run can never
# be picked up by mistake.
namespace = {"pd": pd, Data_Frame_Name: Data}
exec(Code_Clean, namespace)
Result = namespace.get("Result")
if not isinstance(Result, (pd.DataFrame, pd.Series)):
    raise ValueError(
        "Generated code did not assign a pandas DataFrame/Series to 'Result' "
        f"(got {type(Result).__name__})"
    )

# Serialise the filtered rows as CSV text: no index column, floats to one decimal
Context = Result.to_csv(index=False, float_format='%.1f')

# Step 2: answer the question from the filtered rows, using the response
# instructions held in R_Instr
messages = [
    {"role": "system", "content": R_Instr},
    {"role": "user", "content": "Context : \n" + Context + "Query : \n" + Prompt},
]
completion = client.chat.completions.create(
    messages=messages,
    model="llama3-70b-8192",
)

print(completion.choices[0].message.content)

**Pandas SQL**  
Using the pandasql library, SQL queries can be run on a DataFrame (treating it like a table)  
The pandasql library can handle the basic SQL queries that are typically handled by SQLite  
Since it accepts SQL queries, it can be integrated into a RAG pipeline (queries generated by an LLM etc.)

In [None]:
import pandasql as psql

In [None]:
# SQL text to run; the table named in FROM is the DataFrame variable `Data`
Query = "SELECT * FROM Data WHERE extracurricular_participation = 'Yes'"

# No environment is passed to sqldf, so finding `Data` is left to pandasql.
# NOTE(review): presumably it resolves the name from the calling scope; confirm in the pandasql docs
result = psql.sqldf(Query)

# Plain-text dump of the query result
print(result)