# Pushing Data to ChromaDB

In [12]:
import json
import chromadb
from chromadb.utils import embedding_functions

# Initialize Chroma Client (persistent, on-disk store under ./MasterDatabase)
client = chromadb.PersistentClient(path='./MasterDatabase')

# Load JSON data.
# Decode explicitly as UTF-8 so the result does not depend on the platform's
# default locale encoding (e.g. cp1252 on Windows).
with open("./data.json", "r", encoding="utf-8") as file:
    data = json.load(file)

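# NOTE: disabled first attempt, kept for reference. It passes nested dicts as
# metadata; presumably superseded by the flattened version in the next cell
# because ChromaDB metadata values must be scalars (TODO confirm).
# # Define the collection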
# collection = client.get_or_create_collection("financial_data")

# # Process the JSON data
# for company, details in data.items():
#     # Prepare metadata
#     metadata = {
#         "industry_classification": details.get("Detailed Industry Classification", {}),
#         "high_lows": details.get("Detailed High Lows", {}),
#         "previous_day": details.get("Previous Day", {}),
#         "price_bands": details.get("Price Bands", {}),
#         "market_capitalization": details.get("Market Capitalization", {}),
#         "financials": details.get("Financials", {}),
#         "classification": details.get("Classification", {}),
#         "peer_comparison": details.get("Peer Comparison", {}),
#         "quarterly_results": details.get("Quarterly_Results", {}),
#         "report_url": details.get("report_url", "")
#     }
    
#     # Add data to the collection
#     collection.add(
#         ids=[company],  # Use the company name as ID
#         embeddings=[[0.0] * 128],  # Placeholder embedding (128-dimensional vector of zeros)
#         metadatas=[metadata],  # Metadata with the detailed information
#         documents=[f"Data for {company}"]  # Optional document field
#     )

# print("Data successfully pushed to ChromaDB!")

In [15]:
# Function to flatten nested dictionaries
def flatten_dict(d, parent_key='', sep='__'):
    """
    Collapse a nested dictionary into one level.

    Each leaf is stored under the path of keys leading to it, joined with
    `sep` (and prefixed with `parent_key` when one is given). Only `dict`
    values are descended into; every other value is kept as a leaf as-is.
    """
    flat = {}
    for key, value in d.items():
        # Top-level keys are kept untouched; nested ones get the path prefix.
        full_key = f"{parent_key}{sep}{key}" if parent_key else key
        if not isinstance(value, dict):
            flat[full_key] = value
            continue
        flat.update(flatten_dict(value, full_key, sep=sep))
    return flat


# Define the collection
collection = client.get_or_create_collection(name="stock_data")

# Maps each metadata section name (the prefix of the flattened keys) to the
# key it is read from in the per-company JSON record.
SECTION_SOURCES = {
    "industry_classification": "Detailed Industry Classification",
    "high_lows": "Detailed High Lows",
    "previous_day": "Previous Day",
    "price_bands": "Price Bands",
    "market_capitalization": "Market Capitalization",
    "financials": "Financials",
    "classification": "Classification",
    "peer_comparison": "Peer Comparison",
    "quarterly_results": "Quarterly_Results",
}

# Every record gets the same all-zero 128-dimensional placeholder vector;
# records are later fetched by ID, not by similarity.
PLACEHOLDER_EMBEDDING = [0.0] * 128

# Process the JSON data
for company, details in data.items():
    # Missing sections default to an empty dict, which flattens to nothing.
    sections = {
        section: details.get(source_key, {})
        for section, source_key in SECTION_SOURCES.items()
    }
    sections["report_url"] = details.get("report_url", "")

    # Flatten the nested metadata into a single level of "section__field" keys
    metadata = flatten_dict(sections)

    # upsert (rather than add) keeps this cell idempotent: re-running it against
    # the persistent store refreshes existing companies instead of colliding
    # with the IDs written by a previous run.
    collection.upsert(
        ids=[company],  # Use the company name as ID
        embeddings=[PLACEHOLDER_EMBEDDING],  # Placeholder embedding
        metadatas=[metadata],  # Flattened metadata
        documents=[f"Data for {company}"]  # Optional document field
    )

print("Data successfully pushed to ChromaDB!")

Data successfully pushed to ChromaDB!


In [17]:
# Function to unflatten dictionary (reverse of flatten_dict)
def unflatten_dict(d, sep='__'):
    """
    Rebuild a nested dictionary from a single-level one.

    Every key is split on `sep`; all parts but the last name the chain of
    nested dictionaries to walk (created on demand), and the last part is the
    key under which the value is stored.
    """
    nested = {}
    for flat_key, leaf in d.items():
        *ancestors, last = flat_key.split(sep)
        node = nested
        for name in ancestors:
            # Reuse the intermediate dict if an earlier key already created it.
            node = node.setdefault(name, {})
        node[last] = leaf
    return nested

# Function to get company data
def get_company_data(company_name):
    """
    Retrieves data for a specific company from ChromaDB.

    Looks the record up by ID in the module-level ``collection`` and rebuilds
    the nested structure from its flattened metadata with ``unflatten_dict``.

    Args:
        company_name (str): Name of the company to query; used as the record ID.

    Returns:
        dict | str: Company data in nested format, or the message string
        "No data found for company: <company_name>" when the collection has
        no record with that ID.
    """
    # Only the metadata is consumed below, so documents are not requested.
    result = collection.get(
        ids=[company_name],
        include=['metadatas']
    )

    if not result['ids']:
        return f"No data found for company: {company_name}"

    # A record stored without metadata may come back as None; treat it as
    # empty instead of failing inside unflatten_dict.
    flat_metadata = result['metadatas'][0] or {}

    # Unflatten the metadata
    return unflatten_dict(flat_metadata)

# Example usage: fetch one company and show its nested record
from pprint import pprint  # pretty-printer for the nested result

company_name = "INFY"  # Replace with your company name
company_data = get_company_data(company_name)

# Header line first, then the pretty-printed data
print(f"\nData for {company_name}:")
pprint(company_data)


Data for INFY:
{'classification': {'Basic Industry': 'Computers - Software & Consulting',
                    'Category': 'Listed',
                    'Group / Settlement Type': 'A /T+1',
                    'Index': 'BSE SENSEX'},
 'financials': {'CEPS (TTM)': '73.31',
                'EPS (TTM)': '66.50',
                'Face Value': '5.00',
                'PE/PB': '30.07 / 10.03',
                'ROE': '33.37'},
 'high_lows': {'52 Week High (Unadjusted)': '2006.80 (13/12/2024)',
               '52 Week High (adjusted)': '2,006.80 (13/12/2024)',
               '52 Week Low (Unadjusted)': '1359.10 (04/06/2024)',
               '52 Week Low (adjusted)': '1,359.10 (04/06/2024)',
               'Month H/L': '2006.80 / 1795.75',
               'Week H/L': '2006.80 / 1897.50'},
 'industry_classification': {'Basic Industry': 'Computers - Software & '
                                               'Consulting',
                             'Industry': 'IT - Software',
                  