#### COMPANIES NER MODEL TRAINING



In [None]:
import pandas as pd

# Load the company list from a CSV file (the original comment said "Excel",
# but the file is read with read_csv).
companies_df = pd.read_csv("/content/companies.csv")

# Extract the company names from the 'Entity' column. Missing values are
# dropped and surrounding whitespace is removed because the training cell
# builds character spans from these strings: a NaN would raise there, and
# leading/trailing spaces would produce spans that do not line up with tokens.
companies = [
    name
    for name in companies_df['Entity'].dropna().astype(str).str.strip()
    if name
]

In [None]:
import spacy
from spacy.training import Example

# Start from a blank English pipeline
nlp = spacy.blank("en")

# Fetch the NER component, creating it when the pipeline does not have one.
# `ner` is bound on both paths so the add_label call below cannot hit a
# NameError when the pipe already exists.
if "ner" in nlp.pipe_names:
    ner = nlp.get_pipe("ner")
else:
    ner = nlp.add_pipe("ner")

# Register the single label this model predicts
ner.add_label("COMPANY")

# Prepare training data: one templated sentence per company name
TRAIN_DATA = []
for company in companies:
    # The company name opens the sentence, so its character span is
    # always [0, len(company)); no substring search is needed.
    text = f"{company} is a Company."
    entities = [(0, len(company), "COMPANY")]
    doc = nlp.make_doc(text)
    example = Example.from_dict(doc, {"entities": entities})
    TRAIN_DATA.append(example)

# Print a sample of the training data (first few examples only, not the whole list)
print(TRAIN_DATA[:5])

In [None]:
import random

from spacy.util import minibatch

# `initialize` is the spaCy v3 name for the deprecated `begin_training`.
# The label was registered with add_label, so no example data is required here.
optimizer = nlp.initialize()
for epoch in range(10):
    losses = {}
    # Reshuffle every epoch so the model does not see the examples in the
    # same fixed order on each pass.
    random.shuffle(TRAIN_DATA)
    # Update on small batches instead of one example at a time, and pass the
    # optimizer explicitly so it is clear which one performs the updates.
    for batch in minibatch(TRAIN_DATA, size=8):
        nlp.update(batch, sgd=optimizer, drop=0.5, losses=losses)
    print(f"Epoch {epoch}: {losses}")


In [None]:
# Serialize the trained company pipeline to the "company_ner_model" directory,
# relative to the current working directory.
# NOTE(review): later cells load this model from "/content/company_ner_model/"
# and "/content/drive/MyDrive/company_ner_model" — confirm the saved directory
# actually ends up at those locations.
nlp.to_disk("company_ner_model")


#### PARAMETERS NER MODEL TRAINING


In [None]:
import pandas as pd

# Load the parameter list from a CSV file (the original comment said "Excel",
# but the file is read with read_csv).
parameter_df = pd.read_csv("/content/parameters.csv")

# Extract the parameter names from the 'Parameter' column. Missing values are
# dropped and surrounding whitespace is removed because the training cell
# builds character spans from these strings: a NaN would raise there, and
# leading/trailing spaces would produce spans that do not line up with tokens.
parameters = [
    name
    for name in parameter_df['Parameter'].dropna().astype(str).str.strip()
    if name
]

In [None]:
import spacy
from spacy.training import Example

# Start from a blank English pipeline
nlp = spacy.blank("en")

# Fetch the NER component, creating it when the pipeline does not have one.
# `ner` is bound on both paths so the add_label call below cannot hit a
# NameError when the pipe already exists.
if "ner" in nlp.pipe_names:
    ner = nlp.get_pipe("ner")
else:
    ner = nlp.add_pipe("ner")

# Register the single label this model predicts
ner.add_label("PARAMETER")

# Prepare training data: one templated sentence per parameter name
TRAIN_DATA = []
for parameter in parameters:
    # The parameter name opens the sentence, so its character span is
    # always [0, len(parameter)); no substring search is needed.
    text = f"{parameter} is an important metric."
    entities = [(0, len(parameter), "PARAMETER")]
    doc = nlp.make_doc(text)
    example = Example.from_dict(doc, {"entities": entities})
    TRAIN_DATA.append(example)

# Print a sample of the training data (first few examples only, not the whole list)
print(TRAIN_DATA[:5])

In [None]:
import random

from spacy.util import minibatch

# `initialize` is the spaCy v3 name for the deprecated `begin_training`.
# The label was registered with add_label, so no example data is required here.
optimizer = nlp.initialize()
for epoch in range(10):
    losses = {}
    # Reshuffle every epoch so the model does not see the examples in the
    # same fixed order on each pass.
    random.shuffle(TRAIN_DATA)
    # Update on small batches instead of one example at a time, and pass the
    # optimizer explicitly so it is clear which one performs the updates.
    for batch in minibatch(TRAIN_DATA, size=8):
        nlp.update(batch, sgd=optimizer, drop=0.5, losses=losses)
    print(f"Epoch {epoch}: {losses}")


Epoch 0: {'ner': 394.3893275024216}
Epoch 1: {'ner': 20.405189905991286}
Epoch 2: {'ner': 12.463982714850822}
Epoch 3: {'ner': 23.62208231996258}
Epoch 4: {'ner': 10.125772987221822}
Epoch 5: {'ner': 0.012997428566311484}
Epoch 6: {'ner': 2.0766805450501833}
Epoch 7: {'ner': 0.00012774651432994125}
Epoch 8: {'ner': 10.991360883314835}
Epoch 9: {'ner': 0.05212024174994202}


In [None]:
# Serialize the trained parameter pipeline to the "parameter_ner_model"
# directory, relative to the current working directory.
# NOTE(review): a later cell loads this model from
# "/content/drive/MyDrive/parameter_ner_model" — confirm the saved directory
# actually ends up at that location.
nlp.to_disk("parameter_ner_model")


In [None]:
import spacy

# Load the custom company model from disk.
# This rebinds the global `nlp`, which the training cells above also assign;
# extract_companies() below reads whatever `nlp` currently refers to.
nlp = spacy.load("/content/company_ner_model/")

def extract_companies(text, model=None):
    """Return the text of every COMPANY entity found in ``text``.

    Args:
        text: The string to analyse.
        model: Optional callable pipeline to run on ``text``. When omitted, the
            notebook-level ``nlp`` is used, as before. Passing the model
            explicitly avoids depending on whichever pipeline the global
            ``nlp`` happens to reference when the cell runs.

    Returns:
        list[str]: Entity texts whose label is "COMPANY", in the order they
        appear in ``doc.ents``.
    """
    pipeline = nlp if model is None else model
    return [ent.text for ent in pipeline(text).ents if ent.label_ == "COMPANY"]

# Example usage
text = "Apple Inc. and Microsoft Corporation are major tech companies."
# Store the result under its own name: assigning to `companies` would overwrite
# the training list loaded at the top of the notebook, so re-running the
# training cell afterwards would train on this tiny result instead.
detected_companies = extract_companies(text)
print(detected_companies)  # Output should include detected company names

In [None]:
# Placeholder for the session cookies that are passed to Gemini(cookies=...)
# further down. Fill this in locally and do not commit or share real values.
# NOTE(review): the expected type (plain string vs. dict of cookie names) is
# not visible in this notebook — confirm against the python-gemini-api docs.
cookies = 'ENTER YOUR COOKIES'

In [None]:
from datetime import datetime, timedelta

In [None]:
!pip install python-gemini-api

Collecting python-gemini-api
  Downloading python_gemini_api-2.4.12-py3-none-any.whl.metadata (38 kB)
Collecting httpx>=0.20.0 (from httpx[http2]>=0.20.0->python-gemini-api)
  Downloading httpx-0.27.0-py3-none-any.whl.metadata (7.2 kB)
Collecting browser-cookie3 (from python-gemini-api)
  Downloading browser_cookie3-0.19.1-py3-none-any.whl.metadata (632 bytes)
Collecting loguru (from python-gemini-api)
  Downloading loguru-0.7.2-py3-none-any.whl.metadata (23 kB)
Collecting httpcore==1.* (from httpx>=0.20.0->httpx[http2]>=0.20.0->python-gemini-api)
  Downloading httpcore-1.0.5-py3-none-any.whl.metadata (20 kB)
Collecting h11<0.15,>=0.13 (from httpcore==1.*->httpx>=0.20.0->httpx[http2]>=0.20.0->python-gemini-api)
  Downloading h11-0.14.0-py3-none-any.whl.metadata (8.2 kB)
Collecting h2<5,>=3 (from httpx[http2]>=0.20.0->python-gemini-api)
  Downloading h2-4.1.0-py3-none-any.whl.metadata (3.6 kB)
Collecting lz4 (from browser-cookie3->python-gemini-api)
  Downloading lz4-4.3.3-cp310-cp310-m

## Approach 1 with Custom NER Model

In [None]:
def extract_entities(text, models, entity_type="ORG"):
    """Collect the distinct entity texts of one label across one or more models.

    Args:
        text: The string to analyse.
        models: A single callable pipeline, or a list/tuple of them. Each is
            called as ``model(text)`` and must return an object with ``ents``.
        entity_type: Only entities whose ``label_`` equals this value are kept.

    Returns:
        set[str]: The matching entity texts. Being a set, it carries no
        ordering and no duplicates.
    """
    # Wrap a lone model so the loop below is uniform. Tuples are treated as
    # collections too; previously a tuple was wrapped and then called as if it
    # were a model.
    if not isinstance(models, (list, tuple)):
        models = [models]

    return {
        ent.text
        for model in models
        for ent in model(text).ents
        if ent.label_ == entity_type
    }

In [None]:
def _first_mentioned(entities, query):
    """Return the entity that appears earliest in ``query``, or None if there are none."""
    # Ties on position are broken by the text itself so the choice never
    # depends on set iteration order.
    return min(entities, key=lambda ent: (query.find(ent), ent), default=None)


def process_query(client, query, history=None):
    """Turn a free-text query into an entity/parameter/date-range record.

    Args:
        client: Not used by this implementation; kept so existing callers that
            pass it positionally continue to work.
        query: The user's question as plain text.
        history: Records returned for earlier queries, oldest first. The most
            recent one supplies the entity and/or parameter when the current
            query yields none. Defaults to no history.

    Returns:
        dict: Keys "entity", "parameter", "startDate", "endDate".
    """
    # A None default avoids sharing one mutable list across calls.
    if history is None:
        history = []

    # Extract entities directly from the query using the notebook-level models
    company_entities = extract_entities(query, nlp_model_1, entity_type="COMPANY")
    parameter_entities = extract_entities(query, nlp_model_2, entity_type="PARAMETER")

    # extract_entities returns a set, so "take the first element" would be
    # arbitrary; choose the earliest mention in the query instead.
    company_name = _first_mentioned(company_entities, query)
    parameter = _first_mentioned(parameter_entities, query)

    # NOTE(review): parse_dates is not defined in the visible part of this
    # notebook — confirm it is defined before this cell is run.
    start_date, end_date = parse_dates(query)

    # Fall back to the previous record for anything the query did not mention
    if not company_name and history:
        company_name = history[-1].get("entity")
    if not parameter and history:
        parameter = history[-1].get("parameter")

    # Build the result JSON
    return {
        "entity": company_name,
        "parameter": parameter,
        "startDate": start_date,
        "endDate": end_date
    }

In [None]:
def handle_queries(queries, client=None):
    """Process queries in order, letting each one fall back on earlier results.

    Args:
        queries: Iterable of query strings.
        client: Optional value forwarded as the first argument of
            process_query. It defaults to None so this function no longer
            reads a global ``client``, which in this notebook is only created
            in a later cell and therefore raised NameError on a fresh run.

    Returns:
        list[dict]: One record per query, in input order.
    """
    final_output = []
    history = []

    for query in queries:
        result = process_query(client, query, history)
        # Each result becomes context for the queries that follow it
        history.append(result)
        final_output.append(result)

    return final_output

# Example queries
queries = [
    "What is the GMV of Flipkart for 2023?",
    "Compare our product with Amazon's offerings."
]
# Load the two trained pipelines; process_query reads them as the globals
# nlp_model_1 (COMPANY entities) and nlp_model_2 (PARAMETER entities).
# NOTE(review): the training cells save to "company_ner_model" and
# "parameter_ner_model" relative to the working directory, while these paths
# are under /content/drive/MyDrive — confirm the models were copied there.
nlp_model_1 = spacy.load("/content/drive/MyDrive/company_ner_model")
nlp_model_2 = spacy.load("/content/drive/MyDrive/parameter_ner_model")

# Call handle_queries with your queries
output = handle_queries(queries)
print(output)

[{'entity': 'What', 'parameter': 'What', 'startDate': '2023-08-12', 'endDate': '2024-08-11'}, {'entity': "Amazon's", 'parameter': "Compare our product with Amazon's offerings", 'startDate': '2023-08-12', 'endDate': '2024-08-11'}]


## Approach 2: Getting Output from an LLM




In [None]:
from gemini import Gemini


In [None]:
# Create the Gemini client from the `cookies` value defined earlier in the
# notebook. generate_query() below reads this object as the global `client`.
client = Gemini(cookies=cookies)


In [None]:
import re

In [None]:
import json
import re
from datetime import datetime, timedelta

def _strip_code_fence(text):
    """Return the contents of the first Markdown code fence in ``text``, or ``text`` itself if there is none."""
    fenced = re.search(r'```[A-Za-z]*[ \t]*\n(.*?)```', text, re.DOTALL)
    return (fenced.group(1) if fenced else text).strip()


def generate_query(user_query):
    """Ask the Gemini client to extract entity/parameter/dates from a query.

    Args:
        user_query: The user's question as plain text. It is embedded verbatim
            in the prompt.

    Returns:
        str: The text of the first response candidate with any surrounding
        Markdown code fence removed. This is still unparsed text (the caller
        is responsible for json.loads); it is an empty string when the
        response has no candidates or no text.
    """
    prompt = f"""
    Extract the following information from the user query:
    - Entity: Company name
    - Parameter: Performance metric
    - Start Date: Start date of the period (default to one year ago if not mentioned)
    - End Date: End date of the period (default to today if not mentioned)

    User Query: "{user_query}"

    Your response should be in the following JSON format:
    [
        {{
            "entity": "<company_name>",
            "parameter": "<metric_name>",
            "startDate": "<start_date_iso>",
            "endDate": "<end_date_iso>"
        }}
    ]
    """

    # Generate content using the notebook-level Gemini client
    response = client.generate_content(prompt)

    # Print the raw response to debug
    print("Raw response payload:", response.payload)

    # Take the first candidate's text. `or [{}]` also covers a present-but-empty
    # candidates list, which would otherwise raise IndexError on [0].
    candidates = response.payload.get('candidates') or [{}]
    response_text = candidates[0].get('text') or ''

    # Strip Markdown code block formatting; tolerates a missing/different
    # language tag and prose around the fence.
    return _strip_code_fence(response_text)

def process_query(user_query):
    """Run one query through the LLM and return the parsed records.

    Args:
        user_query: The user's question as plain text.

    Returns:
        list: The decoded records on success, each with "startDate" and
        "endDate" filled in (defaulting to one year ago and today) when the
        model left them missing or empty.
        dict: ``{"error": ...}`` when the response is not valid JSON or any
        other exception occurs while post-processing it.
    """
    # Default date window: the 365 days ending today
    today = datetime.now()
    start_date = (today - timedelta(days=365)).strftime("%Y-%m-%d")
    end_date = today.strftime("%Y-%m-%d")

    # Generate the response using Gemini API
    response_text = generate_query(user_query)

    try:
        # Ensure the response is valid JSON
        result = json.loads(response_text)
        # The prompt asks for a list, but a single bare object is a plausible
        # reply; wrap it so it is handled as one record instead of being
        # iterated key by key and ending up in the error path.
        if isinstance(result, dict):
            result = [result]
        # Fill in default dates where the model gave none
        for item in result:
            if not item.get('startDate'):
                item['startDate'] = start_date
            if not item.get('endDate'):
                item['endDate'] = end_date
    except json.JSONDecodeError:
        print("JSON Decode Error:", response_text)
        result = {"error": "Failed to decode JSON from the response"}
    except Exception as e:
        # Deliberately broad: any malformed shape is reported, not raised
        print("Exception:", str(e))
        result = {"error": f"An error occurred: {str(e)}"}

    return result

def handle_queries(queries):
    """Process several queries and merge their records into one list.

    A record without a parameter inherits the most recent parameter seen in
    an earlier record (within the same or a previous query).

    Args:
        queries: Iterable of query strings.

    Returns:
        list: All records from the queries that were processed successfully,
        in order. Queries whose processing returned an error are reported via
        print and contribute nothing.
    """
    combined_result = []
    # Most recent non-empty parameter seen so far, used as the fallback
    last_parameter = None

    for query in queries:
        result = process_query(query)
        if not isinstance(result, list):
            print(f"Error processing query '{query}': {result}")
            continue

        for item in result:
            if item.get('parameter'):
                last_parameter = item['parameter']
            elif last_parameter:
                # Covers None, a missing key and an empty string alike
                item['parameter'] = last_parameter

        combined_result.extend(result)

    return combined_result

# Example usage: two queries sent through the LLM-based pipeline.
# This rebinds the global `queries` that the Approach 1 cell also assigns.
queries = [
    "Get me Tesla revenue for the last one year",
    "Amazon profit for last one year"
]

# Prints the combined list of records; generate_query also prints each raw
# response payload along the way.
print(handle_queries(queries))

Raw response payload: {'metadata': ['c_5baad7c4b0d684a7', 'r_00e275e05d307cc2'], 'prompt_class': None, 'prompt_candidates': [], 'candidates': [{'rcid': 'rc_f3aea45add4cfe47', 'text': '```json\n[\n    {\n        "entity": "Tesla",\n        "parameter": "revenue",\n        "startDate": "2023-08-11",\n        "endDate": "2024-08-11"\n    }\n]\n```\n', 'code': {'snippett_01': '```json\n[\n    {\n        "entity": "Tesla",\n        "parameter": "revenue",\n        "startDate": "2023-08-11",\n        "endDate": "2024-08-11"\n    }\n]\n```'}, 'web_images': [], 'generated_images': []}]}
Raw response payload: {'metadata': ['c_5baad7c4b0d684a7', 'r_ea636d2a05140ca3'], 'prompt_class': None, 'prompt_candidates': [], 'candidates': [{'rcid': 'rc_accc98f6ac036743', 'text': '```json\n[\n    {\n        "entity": "Amazon",\n        "parameter": "profit",\n        "startDate": "2023-08-11",\n        "endDate": "2024-08-11"\n    }\n]\n```\n', 'code': {'snippett_01': '```json\n[\n    {\n        "entity": "