In [37]:
# Step 1: Import Required Libraries
import pandas as pd
import pdfplumber
import pytesseract
from PIL import Image
import docx
import matplotlib.pyplot as plt
import seaborn as sns
import io
import base64
import os

# Step 2: Setup API Key
from together import Together

# The key is read from the environment so it never lands in the notebook file
# or its version history.
# NOTE(review): a literal key was previously committed in this cell; treat it
# as leaked and revoke/rotate it.
_together_api_key = os.environ.get("TOGETHER_API_KEY")
if not _together_api_key:
    raise RuntimeError(
        "Set the TOGETHER_API_KEY environment variable before running this notebook."
    )
client = Together(api_key=_together_api_key)

# Step 3: File Text Extraction
def extract_text_from_file(file_path, file_type):
    """Extract plain text from a file, choosing the reader by extension.

    Args:
        file_path: Path of the file to read.
        file_type: Extension including the leading dot (e.g. ".pdf").
            String values are matched case-insensitively.

    Returns:
        The extracted text, or the literal string "Unsupported file type"
        when ``file_type`` is not one of .txt, .pdf, .docx, .png, .jpg, .jpeg.
    """
    # Normalise case so ".PDF" / ".Jpg" are handled like their lower-case forms.
    if isinstance(file_type, str):
        file_type = file_type.lower()

    if file_type == ".txt":
        with open(file_path, "r", encoding="utf-8") as f:
            return f.read()
    elif file_type == ".pdf":
        with pdfplumber.open(file_path) as pdf:
            # `or ""` keeps the join working for pages that yield no text.
            return "\n".join(page.extract_text() or "" for page in pdf.pages)
    elif file_type == ".docx":
        doc = docx.Document(file_path)
        return "\n".join(p.text for p in doc.paragraphs)
    elif file_type in (".png", ".jpg", ".jpeg"):
        # The context manager releases the image's file handle once OCR is done.
        with Image.open(file_path) as image:
            return pytesseract.image_to_string(image)
    else:
        return "Unsupported file type"
    

# Step 4: Read Structured Tabular Data
def read_tabular_data(file_path, file_type):
    """Load a CSV or Excel file into a DataFrame.

    Args:
        file_path: Path of the file to load.
        file_type: Extension including the leading dot.

    Returns:
        The result of ``pd.read_csv`` for ".csv", of ``pd.read_excel`` for
        ".xlsx", and None for any other ``file_type``.
    """
    reader = None
    if file_type == ".csv":
        reader = pd.read_csv
    elif file_type == ".xlsx":
        reader = pd.read_excel

    if reader is None:
        return None
    return reader(file_path)

# Step 5: Ask Llama via Together SDK
def ask_llama(prompt, stream=False):
    """Send *prompt* to the Llama chat model through the module-level client.

    Args:
        prompt: Text sent as the user message.
        stream: When True, request a streamed reply and print each piece of
            text to stdout as it arrives.

    Returns:
        The reply text when ``stream`` is False. None when ``stream`` is True,
        because the text has already been printed.
    """
    response = client.chat.completions.create(
        model="meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
        messages=[
            {"role": "system", "content": "You are a helpful data analyst assistant."},
            {"role": "user", "content": prompt}
        ],
        stream=stream
    )

    if not stream:
        return response.choices[0].message.content

    for chunk in response:
        # Skip chunks that have no choices or no delta rather than indexing
        # into an empty list (presumably keep-alive / usage-only chunks --
        # TODO confirm which ones the Together API actually emits).
        choices = getattr(chunk, "choices", None)
        if not choices:
            continue
        delta = getattr(choices[0], "delta", None)
        text = getattr(delta, "content", None)
        if text:
            print(text, end='', flush=True)
    return None


In [39]:
# Step 6: Upload Handling and File Type Detection
import os

def detect_file_type(file_path):
    """Return the lower-cased extension of *file_path*, leading dot included.

    A path without an extension yields an empty string.
    """
    extension = os.path.splitext(file_path)[1]
    return extension.lower()

# Step 7: Summarization Function (Context Builder for LLM)
def build_context_from_data(df, max_rows=5):
    """Describe a DataFrame as text suitable for inclusion in an LLM prompt.

    Args:
        df: DataFrame to describe.
        max_rows: Number of leading rows included in the sample.

    Returns:
        A string holding the first ``max_rows`` rows (rendered without the
        index) followed by one line per column giving its dtype and its
        count of missing values.
    """
    preview = df.head(max_rows).to_string(index=False)
    column_lines = [
        f"- {name}: {df[name].dtype}, missing values: {df[name].isnull().sum()}\n"
        for name in df.columns
    ]
    pieces = [
        "Here is a sample of the uploaded data:\n",
        preview,
        "\n\nColumn Summary:\n",
        *column_lines,
    ]
    return "".join(pieces)

# Step 8: Visualization Function
def generate_plot(df, x_col, y_col, chart_type="bar"):
    """Render a chart of ``y_col`` against ``x_col`` and return it as base64 PNG.

    Args:
        df: DataFrame holding the data to plot.
        x_col: Column used for the x axis (the grouping key for pie charts).
        y_col: Column used for the y axis (summed per group for pie charts).
        chart_type: One of "bar", "line", "scatter" or "pie".

    Returns:
        The PNG image encoded as a base64 string, or the literal string
        "Unsupported chart type" when ``chart_type`` is not recognised.
    """
    # Validate before creating a figure so the unsupported case never leaves
    # an open figure behind.
    if chart_type not in ("bar", "line", "scatter", "pie"):
        return "Unsupported chart type"

    fig = plt.figure(figsize=(8, 5))
    try:
        if chart_type == "bar":
            sns.barplot(data=df, x=x_col, y=y_col)
        elif chart_type == "line":
            sns.lineplot(data=df, x=x_col, y=y_col)
        elif chart_type == "scatter":
            sns.scatterplot(data=df, x=x_col, y=y_col)
        else:  # "pie"
            df.groupby(x_col)[y_col].sum().plot.pie(autopct='%1.1f%%')

        plt.title(f"{chart_type.capitalize()} Chart of {y_col} vs {x_col}")
        plt.tight_layout()
        buf = io.BytesIO()
        fig.savefig(buf, format='png')
    finally:
        # Close the figure even when plotting or saving raises.
        plt.close(fig)

    return base64.b64encode(buf.getvalue()).decode('utf-8')  # base64 string of image
def _strip_code_fences(text):
    """Return *text* without a surrounding Markdown code fence, if it has one."""
    lines = text.strip().splitlines()
    if lines and lines[0].lstrip().startswith("```"):
        lines = lines[1:]
        if lines and lines[-1].strip().startswith("```"):
            lines = lines[:-1]
    return "\n".join(lines)


def generate_plot_from_prompt(df, prompt):
    """Ask the model for plotting code for ``df``, run it, and show the figure.

    Args:
        df: DataFrame exposed to the generated code under the name ``df``.
        prompt: Natural-language description of the plot to produce.

    Returns:
        None. The figure, or the error together with the generated code, is
        shown through Streamlit when a module-level ``st`` exists; otherwise
        through ``plt.show()`` / ``print``.
    """
    # Combine the prompt with column info for more context
    full_prompt = (
        f"You are a Python data analyst. Based on the following DataFrame columns: {list(df.columns)}\n"
        f"Write Python code using matplotlib or seaborn to: {prompt}.\n"
        "Only return the code, no explanation or markdown."
    )

    response = client.chat.completions.create(
        model="meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
        messages=[
            {"role": "system", "content": "You are a helpful data analyst assistant."},
            {"role": "user", "content": full_prompt}
        ],
        max_tokens=512,
        temperature=0.2
    )

    # Replies are sometimes wrapped in ``` fences despite the instruction
    # above; strip them so exec() receives plain Python.
    generated_code = _strip_code_fences(response.choices[0].message.content or "")

    # `st` is not imported anywhere in this notebook. Use it only when some
    # other cell/module has defined it, and fall back to plain notebook
    # output otherwise.
    ui = globals().get("st")

    # SECURITY: exec() runs whatever the model returned with full interpreter
    # privileges; restricting the globals below is NOT a sandbox. Only use
    # this with prompts and data you trust.
    try:
        exec_globals = {"df": df, "plt": plt, "sns": sns}
        exec(generated_code, exec_globals)
        if ui is not None:
            ui.pyplot(plt.gcf())
        else:
            plt.show()
    except Exception as e:
        if ui is not None:
            ui.error(f"Error in generated code:\n{e}")
            ui.code(generated_code)
        else:
            print(f"Error in generated code:\n{e}")
            print(generated_code)

# Step 9: Wrapper Function for Asking Questions with Context
def ask_question_about_data(df, question):
    """Answer *question* about ``df`` by sending a data summary plus the question to the model.

    The prompt is the text from ``build_context_from_data(df)`` followed by a
    "Question:" section; the value returned by ``ask_llama`` is passed through.
    """
    data_summary = build_context_from_data(df)
    prompt_text = "\n\nQuestion:\n".join((data_summary, question))
    return ask_llama(prompt_text)



In [40]:
# Step 10: Demo Section – Test with a Sample CSV

# Build a five-row product table to exercise the helpers above
sample_data = dict(
    [
        ("Product", ["Laptop", "Phone", "Tablet", "Monitor", "Keyboard"]),
        ("Sales", [1200, 850, 600, 400, 150]),
        ("Units Sold", [3, 5, 4, 2, 10]),
    ]
)
df_sample = pd.DataFrame(data=sample_data)

# Display the first rows as the cell output
df_sample.head()


Unnamed: 0,Product,Sales,Units Sold
0,Laptop,1200,3
1,Phone,850,5
2,Tablet,600,4
3,Monitor,400,2
4,Keyboard,150,10


In [41]:
# Step 11: Ask the model one question about the sample table
test_question = (
    "Which product had the highest sales "
    "and how many units were sold of it?"
)
llm_response = ask_question_about_data(df=df_sample, question=test_question)

# Bare expression so the notebook shows the returned text
llm_response


"## Step 1: Understand the given data\nThe given data includes a table with three columns: Product, Sales, and Units Sold. The table contains information about different products, their total sales, and the number of units sold.\n\n## Step 2: Identify the task\nWe need to determine which product had the highest sales and the number of units sold for that product.\n\n## Step 3: Analyze the data\nTo find the product with the highest sales, we need to look at the 'Sales' column and identify the row with the maximum value.\n\n## Step 4: Find the product with the highest sales\nBy examining the 'Sales' column, we can see that the values are: 1200 for Laptop, 850 for Phone, 600 for Tablet, 400 for Monitor, and 150 for Keyboard. The highest sales value is 1200, which corresponds to the 'Laptop'.\n\n## Step 5: Determine the units sold for the product with the highest sales\nThe 'Laptop' has the highest sales with 1200. According to the data, 3 units of 'Laptop' were sold.\n\n## Step 6: Provide