In [14]:
from Models import llama3b
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from OprFuncs import csvpd
from OprFuncs import data_infer, extract_code



In [15]:
df = csvpd("Test_Datasets\supply_chain_data.csv")
df.head()

  df = csvpd("Test_Datasets\supply_chain_data.csv")


Unnamed: 0,Product type,SKU,Price,Availability,Number of products sold,Revenue generated,Customer demographics,Stock levels,Supplier Lead times,Order quantities,...,Location,Inventory Lead time,Production volumes,Manufacturing lead time,Manufacturing costs,Inspection results,Defect rates,Transportation modes,Routes,Costs
0,haircare,SKU0,69.808006,55,802,8661.996792,Non-binary,58,7,96,...,Mumbai,29,215,29,46.279879,Pending,0.22641,Road,Route B,187.752075
1,skincare,SKU1,14.843523,95,736,7460.900065,Female,53,30,37,...,Mumbai,23,517,30,33.616769,Pending,4.854068,Road,Route B,503.065579
2,haircare,SKU2,11.319683,34,8,9577.749626,Unknown,1,10,88,...,Mumbai,12,971,27,30.688019,Pending,4.580593,Air,Route C,141.920282
3,skincare,SKU3,61.163343,68,83,7766.836426,Non-binary,23,13,59,...,Kolkata,24,937,18,35.624741,Fail,4.746649,Rail,Route A,254.776159
4,skincare,SKU4,4.805496,26,871,2686.505152,Non-binary,5,3,56,...,Delhi,5,414,3,92.065161,Fail,3.14558,Air,Route A,923.440632


In [16]:
llm = llama3b
llm

OllamaLLM(model='llama3.2:3b')

In [None]:
def analysis_data(dataframe, llm):
    data_info = data_infer(dataframe)

    # Prompt and Chain for Analysis Information for each column in the data
    analysis_prompt = '''
    You are a data analyst. You are provided with a dataset about {data_info}
    Here is the dataset structure:
    {data_info}

    Please analyze the data and provide insights about:
    1. Key trends and patterns in the supply chain (e.g., delays, high-demand items, etc.).
    2. Any anomalies or outliers in the data.
    3. Recommendations or actionable insights based on the analyzed data.
    '''
    analysis_template = PromptTemplate(
        input_variables=["data_info"],
        template=analysis_prompt
    )
    analysis_chain = LLMChain(llm=llm, prompt=analysis_template)

    analysis = analysis_chain.run(data_info=data_info)
    print("Analysis Data:\n", analysis)

In [20]:
analysis_data(dataframe=df, llm=llm)

Analysis Data:
 Based on the provided dataset, here are some key trends, patterns, and insights:

1. Key Trends and Patterns in the Supply Chain:
   - **Supplier Lead Times**: The supplier lead times range from 2 to 20 days, with an average of around 8 days. This indicates that most suppliers take between 2 to 10 days to deliver products after placing orders.
   - **Production Volumes**: The production volumes are generally low, ranging from 100 to 5,000 units per month. This suggests that the product demand is moderate and relatively stable.
   - **Manufacturing Lead Times**: The manufacturing lead times are usually shorter than supplier lead times, taking around 3-7 days to produce products after receiving raw materials.

2. Anomalies or Outliers in the Data:
   - **Inspection Results**: There are some observations with incomplete inspection results, indicating potential issues with data collection or quality.
   - **Stock Levels**: Some suppliers have stock levels above their maximu

In [6]:
def drop_nulls(dataframe, llm):
    data_info = data_infer(dataframe)
    
    # Prompt and Chain for dropping nulls
    drop_nulls_prompt = '''
    create a code to drop the nulls from the DataFrame named 'df',
    only include the dropping part,
    insure that inplace = True, no extra context or reading the file.
    '''
    drop_nulls_template = PromptTemplate(
        input_variables=["data_info"],
        template=drop_nulls_prompt
    )
    drop_nulls_chain = LLMChain(llm=llm, prompt=drop_nulls_template)
    
    # Extracting code for dropping nulls
    drop_nulls_code = extract_code(drop_nulls_chain.run(data_info=data_info))
    print("Code for dropping nulls:\n", drop_nulls_code)
    
    exec_env = {"df": dataframe}
    exec(drop_nulls_code, exec_env)
    updated_df = exec_env["df"]
    return updated_df.info()

In [7]:
drop_nulls(dataframe=df, llm=llm)

Code for dropping nulls:
 df.dropna(inplace=True)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 24 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Product type             100 non-null    object 
 1   SKU                      100 non-null    object 
 2   Price                    100 non-null    float64
 3   Availability             100 non-null    int64  
 4   Number of products sold  100 non-null    int64  
 5   Revenue generated        100 non-null    float64
 6   Customer demographics    100 non-null    object 
 7   Stock levels             100 non-null    int64  
 8   Supplier  Lead times     100 non-null    int64  
 9   Order quantities         100 non-null    int64  
 10  Shipping times           100 non-null    int64  
 11  Shipping carriers        100 non-null    object 
 12  Shipping costs           100 non-null    float64
 13  Supplier name            100 no

In [8]:
def quetions_gen (num,llm, dataframe):
    data_info = data_infer(dataframe)
    
    # Prompt and Chain for Question Generation
    question_prompt = '''
    create {num} anlysis questions about the following data {data_info}
    '''
    
    # Define the prompt template
    question_template = PromptTemplate(
        input_variables=["num", "data_info"],
        template=question_prompt
    )
    
    # Create the chain
    question_chain = LLMChain(
        llm=llm,
        prompt=question_template
    )
    
    # Generate the questions
    questions = question_chain.run(num=num, data_info=data_info)
    
    print("Generated Questions:\n", questions)
    return questions


In [9]:
question = quetions_gen(num=5, llm=llm, dataframe=df)
print(question)

Generated Questions:
 Here are five analysis questions based on the provided data:

1. **What is the average revenue generated per product sold, and how does it compare to the number of products sold?**

To answer this question, you can calculate the mean of the "Revenue generated" column and divide it by the "Number of products sold" column. This will give you an idea of the average revenue generated per unit sold.

2. **Which supplier has the highest lead time, and what is the corresponding stock level?**

To answer this question, you can sort the data by the "Supplier Lead times" column in ascending order and select the first row (i.e., the supplier with the longest lead time). Then, check the value of the "Stock levels" column for that supplier.

3. **What is the correlation between the "Revenue generated" and "Price" columns?**

To answer this question, you can calculate the Pearson correlation coefficient using a statistical function or a library like NumPy or Pandas in Python. T

In [12]:
# Visualization of the Data
def visualiz_ques(llm, data_info, questions):
    # Prompt and Chain for Visualization
    visualization_prompt = '''
    create matplotlib code for each question in:
    {questions}
    with the columns:
    {data_info} in one python script
    '''
    
    # Define the prompt template
    visualization_template = PromptTemplate(
        input_variables=["data_info", "questions"],
        template=visualization_prompt
    )
    
    # Create the chain
    visualization_chain = LLMChain(llm=llm, prompt=visualization_template)
    
    # Generate the visualization
    visualization = visualization_chain.run(data_info=data_info, questions=questions)
    
    print("Generated Visualization:\n", visualization)
    return visualization

In [13]:
visualiz_ques(llm=llm, data_info=df, questions=question)

Generated Visualization:
 Below is a Python code snippet using the pandas library to perform the requested analysis questions:

```python
# Import necessary libraries
import pandas as pd

# Load your data into a DataFrame, assuming it's saved in 'data.csv'
df = pd.read_csv('data.csv')

# 1. What is the average revenue generated per product sold, and how does it compare to the number of products sold?
average_revenue_per_product_sold = df['Revenue generated'].mean() / df['Number of products sold'].mean()
print("Average Revenue Per Product Sold:", round(average_revenue_per_product_sold, 2))

# 2. Which supplier has the highest lead time, and what is the corresponding stock level?
supplier_with_highest_lead_time_and_stock_levels = df.loc[df['Lead times'].idxmax()]
print("Supplier with Highest Lead Time:", supplier_with_highest_lead_time_and_stock_levels['Supplier'])
print("Corresponding Stock Levels:", supplier_with_highest_lead_time_and_stock_levels['Stock levels'])

# 3. What is the cor

'Below is a Python code snippet using the pandas library to perform the requested analysis questions:\n\n```python\n# Import necessary libraries\nimport pandas as pd\n\n# Load your data into a DataFrame, assuming it\'s saved in \'data.csv\'\ndf = pd.read_csv(\'data.csv\')\n\n# 1. What is the average revenue generated per product sold, and how does it compare to the number of products sold?\naverage_revenue_per_product_sold = df[\'Revenue generated\'].mean() / df[\'Number of products sold\'].mean()\nprint("Average Revenue Per Product Sold:", round(average_revenue_per_product_sold, 2))\n\n# 2. Which supplier has the highest lead time, and what is the corresponding stock level?\nsupplier_with_highest_lead_time_and_stock_levels = df.loc[df[\'Lead times\'].idxmax()]\nprint("Supplier with Highest Lead Time:", supplier_with_highest_lead_time_and_stock_levels[\'Supplier\'])\nprint("Corresponding Stock Levels:", supplier_with_highest_lead_time_and_stock_levels[\'Stock levels\'])\n\n# 3. What is