valid_p_list = {
    ")":"(",
    "]":"[",
    "}":"{"
}
stack = []

def valid_parentheses(s):
    for p in s:
        if p in valid_p_list.values():
            stack.append(p)
        elif stack and stack[-1]==valid_p_list[p]:
            stack.pop()
    return stack == []
    

    s = "()[]{"
    ys= "()[]"
    valid_parentheses(ys)

In [19]:
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("token-classification", model="dslim/bert-large-NER")

Some weights of the model checkpoint at dslim/bert-large-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use mps:0


In [20]:
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
from docx import Document
import re
from typing import List, Dict, Tuple, Set
import numpy as np
from difflib import SequenceMatcher
from sentence_transformers import SentenceTransformer
from sklearn.cluster import DBSCAN
import spacy

class BertNERProcessor:
    def __init__(self):
        """Initialize models and NLP tools"""
        # BERT NER model
        self.tokenizer = AutoTokenizer.from_pretrained("dbmdz/bert-large-cased-finetuned-conll03-english")
        self.model = AutoModelForTokenClassification.from_pretrained("dbmdz/bert-large-cased-finetuned-conll03-english")
        self.label_list = ["O", "B-MISC", "I-MISC", "B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC"]
        
        # Sentence transformer for semantic similarity
        self.semantic_model = SentenceTransformer('all-MiniLM-L6-v2')
        
        # SpaCy for additional NLP tasks
        self.nlp = spacy.load("en_core_web_sm")
        
    def process_docx(self, file_path: str) -> Dict[str, List[Dict[str, List[str]]]]:
        """Process DOCX file and extract grouped entities"""
        doc = Document(file_path)
        full_text = "\n".join([paragraph.text for paragraph in doc.paragraphs])
        chunks = self._split_into_chunks(full_text)
        
        # Extract initial entities
        raw_entities = {
            'PERSON': [],
            'ORGANIZATION': [],
            'LOCATION': [],
            'MISCELLANEOUS': []
        }
        
        # Process chunks and extract entities
        for chunk in chunks:
            entities = self._extract_entities(chunk)
            for entity_type, entities_list in entities.items():
                raw_entities[entity_type].extend(entities_list)
        
        # Group similar entities
        grouped_entities = self._group_entities(raw_entities)
        
        return grouped_entities

    def _group_entities(self, raw_entities: Dict[str, List[Dict[str, str]]]) -> Dict[str, List[Dict[str, List[str]]]]:
        """Group similar entities together using multiple similarity measures"""
        grouped_results = {}
        
        for entity_type, entities in raw_entities.items():
            if not entities:
                grouped_results[entity_type] = []
                continue
                
            # Extract unique entity texts
            unique_entities = list({e['text'] for e in entities})
            
            if len(unique_entities) == 0:
                grouped_results[entity_type] = []
                continue
                
            # Calculate similarity matrix using multiple measures
            embeddings = self.semantic_model.encode(unique_entities)
            
            # Perform clustering
            clusters = self._cluster_entities(embeddings, unique_entities)
            
            # Post-process clusters with rule-based refinements
            refined_clusters = self._refine_clusters(clusters, entity_type)
            
            grouped_results[entity_type] = refined_clusters
            
        return grouped_results

    def _cluster_entities(self, embeddings: np.ndarray, entities: List[str]) -> List[List[str]]:
        """Cluster entities using DBSCAN"""
        # Perform DBSCAN clustering
        clustering = DBSCAN(eps=0.3, min_samples=1, metric='cosine').fit(embeddings)
        
        # Group entities by cluster
        clusters = {}
        for idx, label in enumerate(clustering.labels_):
            if label not in clusters:
                clusters[label] = []
            clusters[label].append(entities[idx])
            
        return list(clusters.values())

    def _refine_clusters(self, clusters: List[List[str]], entity_type: str) -> List[Dict[str, List[str]]]:
        """Apply rule-based refinements to clusters"""
        refined_clusters = []
        
        for cluster in clusters:
            main_entity = self._find_main_entity(cluster, entity_type)
            variations = [e for e in cluster if e != main_entity]
            
            # Apply type-specific rules
            if entity_type == 'PERSON':
                variations = self._refine_person_cluster(main_entity, variations)
            elif entity_type == 'ORGANIZATION':
                variations = self._refine_org_cluster(main_entity, variations)
                
            refined_clusters.append({
                'main': main_entity,
                'variations': variations
            })
            
        return refined_clusters

    def _find_main_entity(self, cluster: List[str], entity_type: str) -> str:
        """Determine the main entity name from a cluster"""
        if entity_type == 'PERSON':
            # Prefer full names
            full_names = [name for name in cluster if len(name.split()) > 1]
            if full_names:
                return max(full_names, key=len)
        
        # Default to longest name
        return max(cluster, key=len)

    def _refine_person_cluster(self, main_entity: str, variations: List[str]) -> List[str]:
        """Apply person-specific refinement rules"""
        main_doc = self.nlp(main_entity)
        refined_variations = set(variations)
        
        # Extract main name components
        main_names = set()
        for token in main_doc:
            if token.pos_ == "PROPN":
                main_names.add(token.text.lower())
        
        # Filter variations
        for var in variations:
            var_doc = self.nlp(var)
            var_names = set()
            for token in var_doc:
                if token.pos_ == "PROPN":
                    var_names.add(token.text.lower())
            
            # Remove if no name overlap
            if not (main_names & var_names):
                refined_variations.discard(var)
                
        return list(refined_variations)

    def _refine_org_cluster(self, main_entity: str, variations: List[str]) -> List[str]:
        """Apply organization-specific refinement rules"""
        main_tokens = set(self.nlp(main_entity.lower()))
        refined_variations = set()
        
        for var in variations:
            var_tokens = set(self.nlp(var.lower()))
            # Keep if significant token overlap
            if len(main_tokens & var_tokens) / len(main_tokens) > 0.3:
                refined_variations.add(var)
                
        return list(refined_variations)

    def _extract_entities(self, text: str) -> Dict[str, List[Dict[str, str]]]:
        """Extract named entities from text chunk"""
        inputs = self.tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
        outputs = self.model(**inputs)
        predictions = torch.argmax(outputs.logits, dim=2)
        
        tokens = self.tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
        labels = [self.label_list[p] for p in predictions[0]]
        
        entities = {
            'PERSON': [],
            'ORGANIZATION': [],
            'LOCATION': [],
            'MISCELLANEOUS': []
        }
        
        current_entity = {'type': None, 'text': ''}
        
        for token, label in zip(tokens, labels):
            if label.startswith('B-'):
                if current_entity['text']:
                    self._add_entity(entities, current_entity)
                current_entity = {
                    'type': label[2:],
                    'text': token.replace('##', '')
                }
            elif label.startswith('I-') and current_entity['text']:
                current_entity['text'] += token.replace('##', '')
            elif label == 'O':
                if current_entity['text']:
                    self._add_entity(entities, current_entity)
                current_entity = {'type': None, 'text': ''}
        
        if current_entity['text']:
            self._add_entity(entities, current_entity)
            
        return entities

    def _split_into_chunks(self, text: str, max_length: int = 400) -> List[str]:
        """Split text into processable chunks"""
        sentences = re.split('([.!?])', text)
        chunks = []
        current_chunk = ""
        
        for sentence in sentences:
            if len(current_chunk) + len(sentence) < max_length:
                current_chunk += sentence
            else:
                if current_chunk:
                    chunks.append(current_chunk.strip())
                current_chunk = sentence
                
        if current_chunk:
            chunks.append(current_chunk.strip())
            
        return chunks

    def _add_entity(self, entities: Dict[str, List[Dict[str, str]]], 
                   entity: Dict[str, str]) -> None:
        """Add entity to appropriate category"""
        if not entity['type']:
            return
            
        entity_text = entity['text'].strip()
        if not entity_text:
            return
            
        entity_type = entity['type']
        entity_dict = {'text': entity_text, 'type': entity_type}
        
        if entity_type == 'PER':
            entities['PERSON'].append(entity_dict)
        elif entity_type == 'ORG':
            entities['ORGANIZATION'].append(entity_dict)
        elif entity_type == 'LOC':
            entities['LOCATION'].append(entity_dict)
        elif entity_type == 'MISC':
            entities['MISCELLANEOUS'].append(entity_dict)

def process_document(file_path: str) -> None:
    """Process document and print grouped entities"""
    processor = BertNERProcessor()
    try:
        entities = processor.process_docx(file_path)
        
        print("\nExtracted and Grouped Named Entities:")
        print("-----------------------------------")
        
        for category, clusters in entities.items():
            if clusters:
                print(f"\n{category}:")
                for idx, cluster in enumerate(clusters, 1):
                    print(f"\nGroup {idx}:")
                    print(f"Main: {cluster['main']}")
                    if cluster['variations']:
                        print("Variations:")
                        for var in cluster['variations']:
                            print(f"- {var}")
                    
    except Exception as e:
        print(f"Error processing document: {str(e)}")

if __name__ == "__main__":
    file_path = "path/to/your/document.docx"
    process_document(file_path)

ModuleNotFoundError: No module named 'sentence_transformers'

In [None]:
import pandas as pd

In [None]:
x =1 
x

1

In [None]:
# SQL 
pays = "FRANCE1234"
F R A N C E 1 2 3 4
1 2 3 4 5 6 7 8 9 10


2

In [None]:
# PYTHON 
pays = "FRANCE1234"
F R A N C E 1 2 3 4
0 1 2 3 4 5 6 7 8 9 

In [None]:
# python 
eleves_list = ["phillipe", "Mohammed", "les"]

for eleve in eleves_list:
    print(eleve)

phillipe
Mohammed
les


In [None]:
eleves_list = ["phillipe", "Mohammed"]
print("STEP1: voici la liste des eleves:", eleves_list, "se termine ici.")
print("hello world")
print("les")

STEP1: voici la liste des eleves: ['phillipe', 'Mohammed'] se termine ici.
hello world
les


In [None]:
for numero in range(1,10):
    print(numero)



1
2
3
4
5
6
7
8
9


In [None]:
import pandas as pd
import numpy as np

# Create a sample DataFrame
df = pd.DataFrame({
    'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Eve'],
    'Age': [25, 30, 35, 28, 22],
    'City': ['New York', 'San Francisco', 'Chicago', 'Boston', 'Seattle'],
    'Salary': [50000, 75000, 60000, 65000, 45000],
    'Department': ['HR', 'IT', 'Finance', 'Marketing', 'Sales'],
    'Performance': [4.5, 4.2, 4.7, 3.9, 4.1]
})

# Optionally, add a date column
df['Hire_Date'] = pd.date_range(start='2022-01-01', periods=5)

# Display the DataFrame
print(df)

      Name  Age           City  Salary Department  Performance  Hire_Date
0    Alice   25       New York   50000         HR          4.5 2022-01-01
1      Bob   30  San Francisco   75000         IT          4.2 2022-01-02
2  Charlie   35        Chicago   60000    Finance          4.7 2022-01-03
3    David   28         Boston   65000  Marketing          3.9 2022-01-04
4      Eve   22        Seattle   45000      Sales          4.1 2022-01-05


In [None]:
list_a = [1,3,4]

if list_a[1] != 5 and list_a[0]!=8:
    print("ok")

ok


In [None]:

categories_article = ["col1", "col2", "col3"]
# Create the activities table if it doesn't exist using BOOLEAN
create_table_query = """
CREATE TABLE IF NOT EXISTS activities (
    entity TEXT,
    activities BOOLEAN,
    timestamp TIMESTAMP,
    comments TEXT,
    flagged BOOLEAN,
    PRIMARY KEY(entity, timestamp)
)
"""
conn.execute(create_table_query)

# Create the table_query_db if it doesn't exist
create_table_query_db = """
CREATE TABLE IF NOT EXISTS table_query_db (
    entity TEXT,
    activities BOOLEAN,
    timestamp TIMESTAMP,
    comments TEXT,
    flagged BOOLEAN,
    PRIMARY KEY(entity, timestamp)
)
"""
conn.execute(create_table_query_db)

insert_data = []
for row in dr.iterrows():
    step_data = row[1]
    entity = step_data["entity"].replace("'", "''") # Escape single quotes
    summary = step_data["summary"].replace("'", "''") # Escape single quotes
    activities = tuple(bool(row["activity"]) for activity in categories_article if activity is "no label") # Convert activities to boolean
    current_timestamp = datetime.now().isoformat() # Get current timestamp in ISO format
    comments = row["comments"].replace("'", "''")
    flagged = bool(row["flagged"])

    data_row = (entity, activities, current_timestamp, comments, flagged)

    # Check if the last entry for this entity is different
    query = f"SELECT * FROM table_query_db WHERE entity = %s ORDER BY timestamp DESC LIMIT 1"
    last_entry = conn.execute(query, (entity,)).fetchone()

    # Check if there is a change in the activities
    if last_entry:
        if last_entry[1] != activities: # Exclude entity and timestamp for comparison
            insert_data.append(data_row)
    else:
        insert_data.append(data_row) # This

if insert_data:
    query = f"""
    INSERT INTO {table_name} VALUES ({', '.join(['?'] * (len(categories_article) + 4))})
    """
    conn.executemany(query,insert_data)
    conn.commit()
    

0 Name                         Alice
Age                             25
City                      New York
Salary                       50000
Department                      HR
Performance                    4.5
Hire_Date      2022-01-01 00:00:00
Name: 0, dtype: object
1 Name                           Bob
Age                             30
City                 San Francisco
Salary                       75000
Department                      IT
Performance                    4.2
Hire_Date      2022-01-02 00:00:00
Name: 1, dtype: object
2 Name                       Charlie
Age                             35
City                       Chicago
Salary                       60000
Department                 Finance
Performance                    4.7
Hire_Date      2022-01-03 00:00:00
Name: 2, dtype: object
3 Name                         David
Age                             28
City                        Boston
Salary                       65000
Department               Marketing
Performance  

In [None]:
# Create the activities table if it doesn't exist using BOOLEAN
create_table_query = """
CREATE TABLE IF NOT EXISTS activities (
    entity TEXT,
    {",".join([f'"{activity}" BOOLEAN' for activity in categories_article if activity != 'no label'])},
    timestamp TIMESTAMP,
    comments TEXT,
    flagged BOOLEAN,
    PRIMARY KEY(entity, timestamp)
)
"""
conn.execute(create_table_query)
# Create the table_query_db if it doesn't exist
create_table_query_db = """
CREATE TABLE IF NOT EXISTS table_query_db (
    entity TEXT,
    {",".join([f'"{activity}" BOOLEAN' for activity in categories_article if activity != 'no label'])},
    timestamp TIMESTAMP,
    comments TEXT,
    flagged BOOLEAN,
    PRIMARY KEY(entity, timestamp)
)
"""
conn.execute(create_table_query_db)
insert_data = []
for row in dr.iterrows():
    step_data = row[1]
    entity = step_data["entity"].replace("'", "''") # Escape single quotes
    summary = step_data["summary"].replace("'", "''") # Escape single quotes
    activities = tuple(bool(row["activity"]) for activity in categories_article if activity is "no label") # Convert activities to boolean
    current_timestamp = datetime.now().isoformat() # Get current timestamp in ISO format
    comments = row["comments"].replace("'", "''")
    flagged = bool(row["flagged"])
    data_row = (entity, activities, current_timestamp, comments, flagged)
    # Check if the last entry for this entity is different
    query = f"SELECT * FROM table_query_db WHERE entity = %s ORDER BY timestamp DESC LIMIT 1"
    last_entry = conn.execute(query, (entity,)).fetchone()
    # Check if there is a change in the activities
    if last_entry:
        if last_entry[1] != activities: # Exclude entity and timestamp for comparison
            insert_data.append(data_row)
    else:
        insert_data.append(data_row) # This is a new entry for an entity

if insert_data:
    query = f"""
    INSERT INTO {table_name} VALUES ({', '.join(['?'] * (len(categories_article) + 4))})
    """
    conn.executemany(query,insert_data)
    conn.commit()
    

In [None]:
for i in range: 
    print(i)
    def(in)

387209.73000000004

In [None]:
!pip install regex


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m25.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [None]:
import regex as re

# Step 1: Define the input string
input_string = "Hello, Wörld! 123. Grüße"

print("Original String:")
print(input_string)

# Step 2: Replace non-alphanumeric characters with whitespace
# Use the Unicode property \p{L} to match any kind of letter and \p{N} to match any kind of number
output_string = re.sub(r'[^\p{L}\p{N}]', ' ', input_string)

print("\nProcessed String:")
print(output_string)

Original String:
Hello, Wörld! 123. Grüße

Processed String:
Hello  Wörld  123  Grüße


In [None]:
import re

# Step 1: Define the input string
input_string = "$2.8M"

print("Original String:")
print(input_string)

# Step 2: Replace non-alphanumeric characters except dots, commas, and dollar signs with whitespace
output_string = re.sub(r'[^a-zA-Z0-9.,]', ' ', input_string)

print("\nProcessed String:")
print(output_string)

Original String:
$2.8M

Processed String:
 2.8M


In [None]:
import re

def clean_text(text):
   # Pattern includes Latin alphabet extensions used in European languages
   pattern = r'[^a-zA-ZàáâäãåąčćęèéêëėįìíîïłńòóôöõøùúûüųūÿýżźñçčšžÀÁÂÄÃÅĄĆČĖĘÈÉÊËÌÍÎÏĮŁŃÒÓÔÖÕØÙÚÛÜŲŪŸÝŻŹÑßÇČŠŽ0-9.,]'
   return re.sub(pattern, ' ', text)

In [None]:
import streamlit as st
import streamlit.components.v1 as components
import json

def load_graph_data():
    # Load your JSON files
    try:
        with open('nodes.json', 'r') as f:
            nodes = json.load(f)
        with open('links.json', 'r') as f:
            links = json.load(f)
    except FileNotFoundError:
        # Default data if files not found
        nodes = [
            {"id": "Person1", "type": "person"},
            {"id": "Person2", "type": "person"},
            {"id": "Person3", "type": "person"},
            {"id": "CompanyA", "type": "company"},
            {"id": "CompanyB", "type": "company"}
        ]
        links = [
            {"source": "Person1", "target": "CompanyA"},
            {"source": "Person2", "target": "CompanyA"},
            {"source": "Person2", "target": "CompanyB"},
            {"source": "Person3", "target": "CompanyB"}
        ]
    return {"nodes": nodes, "links": links}

def force_graph(graph_data):
    html = """
    <!DOCTYPE html>
    <html>
    <head>
        <script src="https://cdnjs.cloudflare.com/ajax/libs/d3/7.8.5/d3.min.js"></script>
        <style>
            .node-person { fill: #69b3a2; }
            .node-company { fill: #404080; }
            .link { stroke: #999; stroke-opacity: 0.6; }
            .node-label { font-size: 12px; }
        </style>
    </head>
    <body>
        <div id="graph"></div>
        <script>
            const data = """ + json.dumps(graph_data) + """;

            const width = 1500;
            const height = 800;
            
            const svg = d3.select("#graph")
                .append("svg")
                .attr("width", width)
                .attr("height", height);

            const simulation = d3.forceSimulation(data.nodes)
                .force("link", d3.forceLink(data.links).id(d => d.id))
                .force("charge", d3.forceManyBody().strength(-400))
                .force("center", d3.forceCenter(width / 2, height / 2))
                .force("y", d3.forceY(d => d.type === "company" ? height * 0.7 : height * 0.3).strength(1));

            const link = svg.append("g")
                .selectAll("line")
                .data(data.links)
                .join("line")
                .attr("class", "link");

            const node = svg.append("g")
                .selectAll("circle")
                .data(data.nodes)
                .join("circle")
                .attr("class", d => "node-" + d.type)
                .attr("r", d => d.type === "company" ? 25 : 20)
                .call(drag(simulation));

            const labels = svg.append("g")
                .selectAll("text")
                .data(data.nodes)
                .join("text")
                .attr("class", "node-label")
                .text(d => d.id)
                .attr("dx", 0)
                .attr("dy", 25)
                .attr("text-anchor", "middle");

            simulation.on("tick", () => {
                link
                    .attr("x1", d => d.source.x)
                    .attr("y1", d => d.source.y)
                    .attr("x2", d => d.target.x)
                    .attr("y2", d => d.target.y);

                node
                    .attr("cx", d => d.x)
                    .attr("cy", d => d.y);

                labels
                    .attr("x", d => d.x)
                    .attr("y", d => d.y);
            });

            function drag(simulation) {
                function dragstarted(event) {
                    if (!event.active) simulation.alphaTarget(0.3).restart();
                    event.subject.fx = event.subject.x;
                    event.subject.fy = event.subject.y;
                }
                
                function dragged(event) {
                    event.subject.fx = event.x;
                    event.subject.fy = event.y;
                }
                
                function dragended(event) {
                    if (!event.active) simulation.alphaTarget(0);
                    // Nodes will stay where they are dragged
                }
                
                return d3.drag()
                    .on("start", dragstarted)
                    .on("drag", dragged)
                    .on("end", dragended);
            }
        </script>
    </body>
    </html>
    """
    
    components.html(html, height=850)

def main():
    st.title("People-Company Network Graph")
    graph_data = load_graph_data()
    force_graph(graph_data)

if __name__ == "__main__":
    main()

In [None]:
def create_bidirectional_dict(data_dict):
   result = data_dict.copy()
   
   for entity, linked in data_dict.items():
       for linked_entity in linked:
           if linked_entity not in result:
               result[linked_entity] = [e for e, l in data_dict.items() 
                                      if linked_entity in l]
   
   return result

# Example
data = {'entity1': ['entity2', 'entity3']}
result = create_bidirectional_dict(data)
result = create_bidirectional_dict(data)
result = create_bidirectional_dict(data)

In [None]:

import pandas as pd

def create_bidirectional_df(data_dict):
    rows = []
    
    # Add initial relationships
    for entity, linked in data_dict.items():
        rows.append({
            'entity': entity,
            'linked_entities': ', '.join(linked)
        })
        
        # Add bidirectional relationships for each linked entity
        for linked_entity in linked:
            # Get all entities that this one is linked to
            related = []
            for e, l in data_dict.items():
                if linked_entity in l:
                    related.append(e)
                if e == linked_entity:
                    related.extend(l)
            
            rows.append({
                'entity': linked_entity,
                'linked_entities': ', '.join(set(related))
            })
    
    return pd.DataFrame(rows).drop_duplicates()

# Example usage
data = {
    'entity1': ['entity2', 'entity3']
}
df = create_bidirectional_df(data)
print(df)



    entity   linked_entities
0  entity1  entity2, entity3
1  entity2           entity1
2  entity3           entity1


In [None]:

import pandas as pd

def create_bidirectional_df(data_dict):
    rows = []
    
    # Add initial relationships
    for entity, linked in data_dict.items():
        rows.append({
            'entity': entity,
            'linked_entities': ', '.join(linked)
        })
        
        # Add bidirectional relationships for each linked entity
        for linked_entity in linked:
            # Get all entities that this one is linked to
            related = []
            for e, l in data_dict.items():
                if linked_entity in l:
                    related.append(e)
                if e == linked_entity:
                    related.extend(l)
            
            rows.append({
                'entity': linked_entity,
                'linked_entities': ', '.join(set(related))
            })
    
    return pd.DataFrame(rows).drop_duplicates()

# Example usage
data = {
    'entity1': ['entity2', 'entity3']
}
df = create_bidirectional_df(data)
print(df)


    entity   linked_entities
0  entity1  entity2, entity3
1  entity2           entity1
2  entity3           entity1


In [None]:

def create_bidirectional_df(data_dict):
    rows = []
    for entity, linked in data_dict.items():
        rows.append({
            'entity': entity, 
            'linked_entities': ', '.join(linked)
        })
        
        # Add rows for linked entities
        for linked_entity in linked:
            other_entities = [e for e in linked if e != linked_entity]
            if entity not in other_entities:
                other_entities.append(entity)
            rows.append({
                'entity': linked_entity,
                'linked_entities': ', '.join(other_entities)
            })
            
    return pd.DataFrame(rows).drop_duplicates()

# Example
data = {'entity1': ['entity2', 'entity3']}
df = create_bidirectional_df(data)
print(df)


    entity   linked_entities
0  entity1  entity2, entity3
1  entity2  entity3, entity1
2  entity3  entity2, entity1


In [None]:
import pandas as pd

# Step 1: Create DataFrames
left_df = pd.DataFrame({
    'key': [1, 2, 3, 4, 5],
    'value': ['A', 'B', 'C', 'D', 'E']
})

right_df = pd.DataFrame({
    'key': [3, 4, 5, 6, 7],
    'value': ['C', 'D', 'E', 'F', 'G']
})

print("Left DataFrame:")
print(left_df)

print("\nRight DataFrame:")
print(right_df)

# Step 2: Perform the left anti join
merged_df = left_df.merge(right_df, on=['key', 'value'], how='left', indicator=True)
left_anti_join_df = merged_df[merged_df['_merge'] == 'left_only'].drop(columns=['_merge'])

print("\nLeft Anti Join Result:")
print(left_anti_join_dfon

In [None]:
import pandas as pd

# Step 1: Create a DataFrame
data = {
    'id': [1, 2, 3],
    'list_column': [['apple', 'banana', 'cherry'], ['dog', 'elephant'], ['fish', 'goat', 'horse']]
}
df = pd.DataFrame(data)

print("Original DataFrame:")
print(df)

# Step 2: Convert list column to string column
df['string_column'] = df['list_column'].apply(lambda x: ' '.join(x))

print("\nDataFrame with String Column:")
print(df)

Original DataFrame:
   id              list_column
0   1  [apple, banana, cherry]
1   2          [dog, elephant]
2   3      [fish, goat, horse]

DataFrame with String Column:
   id              list_column        string_column
0   1  [apple, banana, cherry]  apple banana cherry
1   2          [dog, elephant]         dog elephant
2   3      [fish, goat, horse]      fish goat horse


In [None]:
[
    {
        "source": "Person1",
        "target": "CompanyA"
    },
    {
        "source": "Person2",
        "target": "CompanyA"
    }
]

In [None]:
import streamlit as st
import streamlit.components.v1 as components
import json

def main():
    # File uploader
    uploaded_file = st.file_uploader("Upload JSON file", type=['json'])
    
    if uploaded_file:
        data = json.load(uploaded_file)
        
        html_content = f"""
            <html>
                <head>
                    <script src="https://unpkg.com/3d-force-graph"></script>
                </head>
                <body>
                    <div id="3d-graph" style="width: 100%; height: 600px;"></div>
                    <script>
                        const myDict = {json.dumps(data)};
                        const graphData = {{
                            nodes: [],
                            links: []
                        }};
                        
                        function processDict(dict, parent = null) {{
                            Object.entries(dict).forEach(([key, value]) => {{
                                if (!graphData.nodes.find(n => n.id === key)) {{
                                    graphData.nodes.push({{id: key, name: key, group: 1}});
                                }}
                                
                                if (parent) {{
                                    graphData.links.push({{source: parent, target: key}});
                                }}ca de
                                
                                if (typeof value === 'object' && value !== null) {{
                                    processDict(value, key);
                                }} else {{
                                    if (!graphData.nodes.find(n => n.id === value)) {{
                                        graphData.nodes.push({{id: value, name: value, group: 2}});
                                    }}
                                    graphData.links.push({{source: key, target: value}});
                                }}
                            }});
                        }}
                        
                        processDict(myDict);
                        
                        const graph = ForceGraph3D()
                            .graphData(graphData)
                            .nodeLabel('name')
                            .nodeColor(node => node.group === 1 ? '#ff4444' : '#4444ff')
                            .linkWidth(1)
                            .linkOpacity(0.8)
                            .backgroundColor('#ffffff')
                            (document.getElementById('3d-graph'));
                    </script>
                </body>
            </html>
        """
        
        components.html(html_content, height=600)

if __name__ == "__main__":
    main()

In [None]:
import streamlit as st
import streamlit.components.v1 as components
import json

# Configure the page to use wide mode
st.set_page_config(layout="wide")

def main():
    # Remove default padding
    st.markdown("""
        <style>
            .block-container {
                padding-top: 1rem;
                padding-bottom: 0rem;
                padding-left: 1rem;
                padding-right: 1rem;
            }
        </style>
    """, unsafe_allow_html=True)
    
    # File uploader
    uploaded_file = st.file_uploader("Upload JSON file", type=['json'])
    
    if uploaded_file:
        data = json.load(uploaded_file)
        
        html_content = f"""
            <html>
                <head>
                    <script src="https://unpkg.com/3d-force-graph"></script>
                    <style>
                        #3d-graph {{
                            width: 100vw !important;
                            height: 100vh !important;
                            position: fixed;
                            left: 0;
                            top: 0;
                        }}
                    </style>
                </head>
                <body>
                    <div id="3d-graph"></div>
                    <script>
                        const myDict = {json.dumps(data)};
                        const graphData = {{
                            nodes: [],
                            links: []
                        }};
                        
                        function processDict(dict, parent = null) {{
                            Object.entries(dict).forEach(([key, value]) => {{
                                if (!graphData.nodes.find(n => n.id === key)) {{
                                    graphData.nodes.push({{id: key, name: key, group: 1}});
                                }}
                                
                                if (parent) {{
                                    graphData.links.push({{source: parent, target: key}});
                                }}
                                
                                if (typeof value === 'object' && value !== null) {{
                                    processDict(value, key);
                                }} else {{
                                    if (!graphData.nodes.find(n => n.id === value)) {{
                                        graphData.nodes.push({{id: value, name: value, group: 2}});
                                    }}
                                    graphData.links.push({{source: key, target: value}});
                                }}
                            }});
                        }}
                        
                        processDict(myDict);
                        
                        const graph = ForceGraph3D()
                            .graphData(graphData)
                            .nodeLabel('name')
                            .nodeColor(node => node.group === 1 ? '#ff4444' : '#4444ff')
                            .linkWidth(1)
                            .linkOpacity(0.8)
                            .backgroundColor('#ffffff')
                            (document.getElementById('3d-graph'));
                            
                        // Adjust graph size on window resize
                        window.addEventListener('resize', () => {
                            graph.width(window.innerWidth)
                                .height(window.innerHeight);
                        });
                    </script>
                </body>
            </html>
        """
        
        components.html(html_content, height=800)

if __name__ == "__main__":
    main()

In [None]:
define a new prompt because this one is keeping entity like "officials", "senior officials"

### entity resolution 


In [None]:
# Import required libraries 
import pandas as pd
import numpy as np
from difflib import SequenceMatcher
import re

def clean_text(text):
    """Simple text cleaning function"""
    if not isinstance(text, str):
        return str(text)
    
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)
    text = ' '.join(text.split())
    return text

def calculate_similarity(str1, str2):
    """Calculate string similarity using SequenceMatcher"""
    # Clean both strings
    str1_clean = clean_text(str1)
    str2_clean = clean_text(str2)
    return SequenceMatcher(None, str1_clean, str2_clean).ratio()

def create_blocking_key(row, blocking_fields):
    """Create a simple blocking key from the first 3 letters of each blocking field"""
    key_parts = []
    for field in blocking_fields:
        if field in row and pd.notna(row[field]):
            # Get first 3 letters of cleaned value
            cleaned_value = clean_text(str(row[field]))
            key_parts.append(cleaned_value[:3])
    return '_'.join(key_parts)

def find_matches(df, blocking_fields, comparison_fields, threshold=0.7):
    """Main function to find matching entities"""
    
    # Step 1: Create blocking keys
    print("Creating blocking keys...")
    df['blocking_key'] = df.apply(lambda x: create_blocking_key(x, blocking_fields), axis=1)
    print("\nBlocking keys created:")
    print(df[['blocking_key'] + blocking_fields])
    
    # Step 2: Group by blocking key
    print("\nGrouping records...")
    blocks = df.groupby('blocking_key')
    
    # Step 3: Compare records within blocks
    matches = []
    print("\nComparing records...")
    
    for block_key, block in blocks:
        if len(block) > 1:  # Only process blocks with multiple records
            print(f"\nProcessing block: {block_key} with {len(block)} records")
            # Compare all pairs in the block
            for i in range(len(block)):
                for j in range(i + 1, len(block)):
                    record1 = block.iloc[i]
                    record2 = block.iloc[j]
                    
                    # Calculate similarity for each field
                    field_similarities = {}
                    for field in comparison_fields:
                        if pd.notna(record1[field]) and pd.notna(record2[field]):
                            similarity = calculate_similarity(str(record1[field]), str(record2[field]))
                            field_similarities[field] = similarity
                    
                    if field_similarities:
                        avg_similarity = np.mean(list(field_similarities.values()))
                        
                        # Debug print
                        print(f"\nComparing records {record1.name} and {record2.name}:")
                        print(f"Field similarities: {field_similarities}")
                        print(f"Average similarity: {avg_similarity:.2f}")
                        
                        if avg_similarity >= threshold:
                            matches.append({
                                'record1_id': record1.name,
                                'record2_id': record2.name,
                                'similarity': avg_similarity,
                                'blocking_key': record1['blocking_key'],
                                'field_similarities': field_similarities
                            })
    
    # Convert matches to DataFrame
    matches_df = pd.DataFrame(matches)
    return matches_df

# Test the code with sample data
print("Creating sample data...")
data = {
    'id': range(1, 5),
    'name': ['John Smith', 'Jon Smith', 'Jane Doe', 'J. Smith'],
    'address': ['123 Main St', '123 Main Street', '456 Oak Ave', '123 Main St.'],
    'phone': ['555-0123', '5550123', '555-4567', '555-0123']
}

# Convert to DataFrame
df = pd.DataFrame(data)
print("\nInput data:")
print(df)

# Define fields for blocking and comparison
blocking_fields = ['name']
comparison_fields = ['name', 'address', 'phone']

# Find matches with detailed output
print("\nFinding matches...")
matches = find_matches(df, blocking_fields, comparison_fields, threshold=0.7)

# Display results
if len(matches) > 0:
    print("\nMatches found:")
    for _, match in matches.iterrows():
        print(f"\nMatch pair (similarity: {match['similarity']:.2f}):")
        print("Record 1:")
        print(df.loc[match['record1_id']])
        print("\nRecord 2:")
        print(df.loc[match['record2_id']])
else:
    print("\nNo matches found!")

Creating sample data...

Input data:
   id        name          address     phone
0   1  John Smith      123 Main St  555-0123
1   2   Jon Smith  123 Main Street   5550123
2   3    Jane Doe      456 Oak Ave  555-4567
3   4    J. Smith     123 Main St.  555-0123

Finding matches...
Creating blocking keys...

Blocking keys created:
  blocking_key        name
0          joh  John Smith
1          jon   Jon Smith
2          jan    Jane Doe
3          j s    J. Smith

Grouping records...

Comparing records...

No matches found!


In [None]:

# Similarities for entity and adress 

# Import required libraries 
import pandas as pd
import numpy as np
from difflib import SequenceMatcher
import re

def clean_text(text):
    """Simple text cleaning function"""
    if not isinstance(text, str):
        return str(text)
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)
    text = ' '.join(text.split())
    return text

def calculate_similarity(str1, str2):
    """Calculate string similarity using SequenceMatcher"""
    str1_clean = clean_text(str1)
    str2_clean = clean_text(str2)
    return SequenceMatcher(None, str1_clean, str2_clean).ratio()

def find_matches(df, comparison_fields, threshold=0.7):
    """Simplified matching function that compares all pairs"""
    matches = []
    
    # Compare all pairs
    for i in range(len(df)):
        for j in range(i + 1, len(df)):
            record1 = df.iloc[i]
            record2 = df.iloc[j]
            
            # Calculate similarities for each field
            similarities = []
            for field in comparison_fields:
                if pd.notna(record1[field]) and pd.notna(record2[field]):
                    sim = calculate_similarity(str(record1[field]), str(record2[field]))
                    similarities.append(sim)
                    
            if similarities:
                avg_similarity = np.mean(similarities)
                
                if avg_similarity >= threshold:
                    match = {
                        'Record 1': f"ID: {record1['id']}, Name: {record1['name']}",
                        'Record 2': f"ID: {record2['id']}, Name: {record2['name']}",
                        'Similarity': f"{avg_similarity:.2f}",
                        'Matching Fields': {
                            field: f"{calculate_similarity(str(record1[field]), str(record2[field])):.2f}"
                            for field in comparison_fields
                        }
                    }
                    matches.append(match)
    
    return matches

# Create sample data
data = {
    'id': range(1, 5),
    'name': ['John Smith', 'Jon Smith', 'Jane Doe', 'J. Smith'],
    'address': ['123 Main St', '123 Main Street', '456 Oak Ave', '123 Main St.'],
    'phone': ['555-0123', '5550123', '555-4567', '555-0123']
}

# Convert to DataFrame
df = pd.DataFrame(data)
print("Input Data:")
print(df)
print("\n" + "="*50 + "\n")

# Find matches
comparison_fields = ['name', 'address', 'phone']
matches = find_matches(df, comparison_fields, threshold=0.7)

# Display results
print("Matching Results:")
for match in matches:
    print("\nMatch Found:")
    print(f"Record 1: {match['Record 1']}")
    print(f"Record 2: {match['Record 2']}")
    print(f"Overall Similarity: {match['Similarity']}")
    print("Field-by-field similarities:")
    for field, sim in match['Matching Fields'].items():
        print(f"  {field}: {sim}")

Input Data:
   id        name          address     phone
0   1  John Smith      123 Main St  555-0123
1   2   Jon Smith  123 Main Street   5550123
2   3    Jane Doe      456 Oak Ave  555-4567
3   4    J. Smith     123 Main St.  555-0123


Matching Results:

Match Found:
Record 1: ID: 1, Name: John Smith
Record 2: ID: 2, Name: Jon Smith
Overall Similarity: 0.93
Field-by-field similarities:
  name: 0.95
  address: 0.85
  phone: 1.00

Match Found:
Record 1: ID: 1, Name: John Smith
Record 2: ID: 4, Name: J. Smith
Overall Similarity: 0.94
Field-by-field similarities:
  name: 0.82
  address: 1.00
  phone: 1.00

Match Found:
Record 1: ID: 2, Name: Jon Smith
Record 2: ID: 4, Name: J. Smith
Overall Similarity: 0.91
Field-by-field similarities:
  name: 0.88
  address: 0.85
  phone: 1.00


In [None]:
!pip install jellyfish


Collecting jellyfish
  Using cached jellyfish-1.1.3-cp312-cp312-macosx_11_0_arm64.whl.metadata (2.6 kB)
Using cached jellyfish-1.1.3-cp312-cp312-macosx_11_0_arm64.whl (311 kB)
Installing collected packages: jellyfish
Successfully installed jellyfish-1.1.3

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [None]:
# with name and entity description

# Import required libraries 
import pandas as pd
import numpy as np
from difflib import SequenceMatcher
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import re

def clean_text(text):
    """Clean text by removing special characters and standardizing format"""
    if not isinstance(text, str):
        return str(text)
    
    # Convert to lowercase and remove special characters
    text = text.lower()
    text = re.sub(r'[^\w\s]', ' ', text)
    # Remove extra whitespace
    text = ' '.join(text.split())
    return text

def calculate_name_similarity(name1, name2):
    """Calculate similarity between names using SequenceMatcher"""
    name1_clean = clean_text(name1)
    name2_clean = clean_text(name2)
    return SequenceMatcher(None, name1_clean, name2_clean).ratio()

def calculate_description_similarity(desc1, desc2):
    """Calculate similarity between descriptions using TF-IDF and cosine similarity"""
    # Create TF-IDF vectorizer
    vectorizer = TfidfVectorizer(
        lowercase=True,
        stop_words='english',
        token_pattern=r'\b\w+\b',  # Match whole words only
        min_df=1  # Include all terms since we're only comparing two documents
    )
    
    # Fit and transform the descriptions
    try:
        tfidf_matrix = vectorizer.fit_transform([clean_text(desc1), clean_text(desc2)])
        # Calculate cosine similarity
        similarity = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])[0][0]
        return float(similarity)
    except:
        return 0.0

def find_matches(df, name_threshold=0.8, desc_threshold=0.3, name_weight=0.6):
    """Find matching entities based on name and description similarity
    
    Args:
        df: DataFrame with 'name' and 'description' columns
        name_threshold: Minimum similarity threshold for names
        desc_threshold: Minimum similarity threshold for descriptions
        name_weight: Weight given to name similarity (1 - name_weight for description)
    """
    matches = []
    
    # Compare all pairs
    for i in range(len(df)):
        for j in range(i + 1, len(df)):
            record1 = df.iloc[i]
            record2 = df.iloc[j]
            
            # Calculate name similarity
            name_sim = calculate_name_similarity(record1['name'], record2['name'])
            
            # Only proceed if names are similar enough
            if name_sim >= name_threshold:
                # Calculate description similarity
                desc_sim = calculate_description_similarity(
                    record1['description'], 
                    record2['description']
                )
                
                # Calculate weighted average similarity
                weighted_sim = (name_sim * name_weight + 
                              desc_sim * (1 - name_weight))
                
                # Check if description similarity meets threshold
                if desc_sim >= desc_threshold:
                    match = {
                        'Record 1': f"ID: {record1.name}, Name: {record1['name']}",
                        'Record 2': f"ID: {record2.name}, Name: {record2['name']}",
                        'Overall Similarity': f"{weighted_sim:.2f}",
                        'Similarities': {
                            'name': f"{name_sim:.2f}",
                            'description': f"{desc_sim:.2f}"
                        }
                    }
                    matches.append(match)
    
    return matches

# Example usage with sample data
data = {
    'name': [
        'John Smith',
        'Jon Smith',
        'Jane Doe',
        'J. Smith'
    ],
    'description': [
        'A software engineer with 10 years of experience in Python and Java development',
        'Experienced software developer specializing in Python and Java programming',
        'Marketing professional with expertise in digital campaigns',
        'Senior software engineer with Python and Java background'
    ]
}

# Convert to DataFrame
df = pd.DataFrame(data)
print("Input Data:")
print(df)
print("\n" + "="*80 + "\n")

# Find matches
matches = find_matches(df, name_threshold=0.7, desc_threshold=0.3, name_weight=0.6)

# Display results
print("Matching Results:")
for match in matches:
    print("\nMatch Found:")
    print(f"Record 1: {match['Record 1']}")
    print(f"Record 2: {match['Record 2']}")
    print(f"Overall Similarity: {match['Overall Similarity']}")
    print("Field-by-field similarities:")
    for field, sim in match['Similarities'].items():
        print(f"  {field}: {sim}")

Input Data:
         name                                        description
0  John Smith  A software engineer with 10 years of experienc...
1   Jon Smith  Experienced software developer specializing in...
2    Jane Doe  Marketing professional with expertise in digit...
3    J. Smith  Senior software engineer with Python and Java ...


Matching Results:

Match Found:
Record 1: ID: 0, Name: John Smith
Record 2: ID: 3, Name: J. Smith
Overall Similarity: 0.66
Field-by-field similarities:
  name: 0.82
  description: 0.41

Match Found:
Record 1: ID: 1, Name: Jon Smith
Record 2: ID: 3, Name: J. Smith
Overall Similarity: 0.65
Field-by-field similarities:
  name: 0.88
  description: 0.30


In [None]:
# Import required libraries 
import pandas as pd
import numpy as np
from difflib import SequenceMatcher
import re
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import torch

class EntityMatcher:
    def __init__(self, embedding_model='all-MiniLM-L6-v2'):
        """
        Initialize the entity matcher with specified embedding model
        
        Args:
            embedding_model: Model name for sentence-transformers or 'openai' for OpenAI embeddings
        """
        self.embedding_model = embedding_model
        if embedding_model != 'openai':
            # Load local sentence-transformers model
            self.model = SentenceTransformer(embedding_model)
    
    def clean_text(self, text):
        """Clean text by removing special characters and standardizing format"""
        if not isinstance(text, str):
            return str(text)
        text = text.lower()
        text = re.sub(r'[^\w\s]', ' ', text)
        text = ' '.join(text.split())
        return text

    def calculate_name_similarity(self, name1, name2):
        """Calculate similarity between names using SequenceMatcher"""
        name1_clean = self.clean_text(name1)
        name2_clean = self.clean_text(name2)
        return SequenceMatcher(None, name1_clean, name2_clean).ratio()

    async def get_openai_embedding(self, text):
        """Get embeddings using OpenAI's API"""
        try:
            response = await openai.Embedding.acreate(
                model="text-embedding-ada-002",
                input=text
            )
            return response['data'][0]['embedding']
        except Exception as e:
            print(f"Error getting OpenAI embedding: {e}")
            return None

    def get_local_embedding(self, text):
        """Get embeddings using sentence-transformers"""
        try:
            embedding = self.model.encode(text, convert_to_tensor=True)
            return embedding
        except Exception as e:
            print(f"Error getting local embedding: {e}")
            return None

    def calculate_description_similarity(self, desc1, desc2):
        """Calculate similarity between descriptions using embeddings"""
        # Clean descriptions
        desc1_clean = self.clean_text(desc1)
        desc2_clean = self.clean_text(desc2)
        
        # Get embeddings
        if self.embedding_model == 'openai':
            # Use OpenAI embeddings (requires async)
            emb1 = await self.get_openai_embedding(desc1_clean)
            emb2 = await self.get_openai_embedding(desc2_clean)
        else:
            # Use local model embeddings
            emb1 = self.get_local_embedding(desc1_clean)
            emb2 = self.get_local_embedding(desc2_clean)
        
        if emb1 is None or emb2 is None:
            return 0.0
        
        # Calculate cosine similarity
        if isinstance(emb1, torch.Tensor):
            emb1 = emb1.cpu().numpy()
            emb2 = emb2.cpu().numpy()
        
        similarity = cosine_similarity(
            emb1.reshape(1, -1),
            emb2.reshape(1, -1)
        )[0][0]
        
        return float(similarity)

    def find_matches(self, df, name_threshold=0.8, desc_threshold=0.7, name_weight=0.4):
        """Find matching entities based on name and description similarity"""
        matches = []
        total_comparisons = len(df) * (len(df) - 1) // 2
        
        print(f"Processing {total_comparisons} comparisons...")
        
        # Get all description embeddings first for efficiency
        descriptions = df['description'].apply(self.clean_text).tolist()
        if self.embedding_model != 'openai':
            all_embeddings = self.model.encode(descriptions, convert_to_tensor=True)
        
        # Compare all pairs
        for i in range(len(df)):
            for j in range(i + 1, len(df)):
                record1 = df.iloc[i]
                record2 = df.iloc[j]
                
                # Calculate name similarity
                name_sim = self.calculate_name_similarity(record1['name'], record2['name'])
                
                # Only proceed if names are similar enough
                if name_sim >= name_threshold:
                    # Calculate description similarity using cached embeddings
                    if self.embedding_model != 'openai':
                        desc_sim = float(cosine_similarity(
                            all_embeddings[i].reshape(1, -1),
                            all_embeddings[j].reshape(1, -1)
                        )[0][0])
                    else:
                        desc_sim = self.calculate_description_similarity(
                            record1['description'],
                            record2['description']
                        )
                    
                    # Calculate weighted similarity
                    weighted_sim = (name_sim * name_weight + 
                                  desc_sim * (1 - name_weight))
                    
                    # Check if description similarity meets threshold
                    if desc_sim >= desc_threshold:
                        match = {
                            'Record 1': f"ID: {record1.name}, Name: {record1['name']}",
                            'Record 2': f"ID: {record2.name}, Name: {record2['name']}",
                            'Overall Similarity': f"{weighted_sim:.2f}",
                            'Similarities': {
                                'name': f"{name_sim:.2f}",
                                'description': f"{desc_sim:.2f}"
                            },
                            'Description 1': record1['description'],
                            'Description 2': record2['description']
                        }
                        matches.append(match)
        
        return matches

# Example usage
def main():
    # Sample data
    data = {
        'name': [
            'John Smith',
            'Jon Smith',
            'Jane Doe',
            'J. Smith'
        ],
        'description': [
            'A software engineer with 10 years of experience in Python and Java development',
            'Experienced software developer specializing in Python and Java programming',
            'Marketing professional with expertise in digital campaigns',
            'Senior software engineer with Python and Java background'
        ]
    }

    # Convert to DataFrame
    df = pd.DataFrame(data)
    print("Input Data:")
    print(df)
    print("\n" + "="*80 + "\n")

    # Initialize matcher with chosen model
    matcher = EntityMatcher(embedding_model='all-MiniLM-L6-v2')  # or 'openai' for OpenAI embeddings
    
    # Find matches
    matches = matcher.find_matches(
        df,
        name_threshold=0.7,
        desc_threshold=0.7,
        name_weight=0.4  # Give more weight to description similarity
    )

    # Display results
    print("Matching Results:")
    for match in matches:
        print("\nMatch Found:")
        print(f"Record 1: {match['Record 1']}")
        print(f"Record 2: {match['Record 2']}")
        print(f"Overall Similarity: {match['Overall Similarity']}")
        print("Field-by-field similarities:")
        for field, sim in match['Similarities'].items():
            print(f"  {field}: {sim}")

if __name__ == "__main__":
    main()

In [None]:
import pandas as pd
import numpy as np
from difflib import SequenceMatcher
import openai
import re
import json
from typing import List, Dict, Tuple

class LLMEntityMatcher:
    def __init__(self, api_key: str):
        """Initialize the matcher with OpenAI API key"""
        openai.api_key = api_key
        
    def clean_text(self, text: str) -> str:
        """Basic text cleaning"""
        if not isinstance(text, str):
            return str(text)
        text = text.lower()
        text = re.sub(r'[^\w\s]', ' ', text)
        return ' '.join(text.split())

    def calculate_name_similarity(self, name1: str, name2: str) -> float:
        """Calculate basic name similarity"""
        name1_clean = self.clean_text(name1)
        name2_clean = self.clean_text(name2)
        return SequenceMatcher(None, name1_clean, name2_clean).ratio()

    async def compare_descriptions(self, desc1: str, desc2: str) -> Dict:
        """Use LLM to directly compare two descriptions"""
        prompt = f"""Compare these two entity descriptions and analyze their similarity:

Description 1: {desc1}
Description 2: {desc2}

Provide a JSON response with:
1. A similarity score between 0 and 1
2. Key matching aspects
3. Key differences
4. Confidence in the comparison

Format:
{{
    "similarity_score": float,
    "matching_aspects": [str],
    "differences": [str],
    "confidence": float
}}"""

        try:
            response = await openai.ChatCompletion.acreate(
                model="gpt-4",
                messages=[
                    {"role": "system", "content": "You are an expert in entity resolution and semantic analysis."},
                    {"role": "user", "content": prompt}
                ],
                temperature=0.2
            )
            
            result = json.loads(response.choices[0].message.content)
            return result
            
        except Exception as e:
            print(f"Error in LLM comparison: {e}")
            return {
                "similarity_score": 0.0,
                "matching_aspects": [],
                "differences": ["Error in comparison"],
                "confidence": 0.0
            }

    async def find_matches(
        self,
        df: pd.DataFrame,
        name_threshold: float = 0.7,
        desc_threshold: float = 0.7,
        name_weight: float = 0.3
    ) -> List[Dict]:
        """Find matching entities using LLM comparison"""
        matches = []
        
        for i in range(len(df)):
            for j in range(i + 1, len(df)):
                record1 = df.iloc[i]
                record2 = df.iloc[j]
                
                # First check name similarity to filter obvious non-matches
                name_sim = self.calculate_name_similarity(record1['name'], record2['name'])
                
                if name_sim >= name_threshold:
                    # Get LLM comparison for descriptions
                    llm_result = await self.compare_descriptions(
                        record1['description'],
                        record2['description']
                    )
                    
                    desc_sim = llm_result['similarity_score']
                    
                    # Calculate weighted similarity
                    weighted_sim = (name_sim * name_weight + 
                                  desc_sim * (1 - name_weight))
                    
                    if desc_sim >= desc_threshold:
                        match = {
                            'record1_id': record1.name,
                            'record2_id': record2.name,
                            'record1_name': record1['name'],
                            'record2_name': record2['name'],
                            'overall_similarity': weighted_sim,
                            'name_similarity': name_sim,
                            'description_similarity': desc_sim,
                            'matching_aspects': llm_result['matching_aspects'],
                            'differences': llm_result['differences'],
                            'confidence': llm_result['confidence']
                        }
                        matches.append(match)
        
        return matches

# Example usage:
"""
# Initialize matcher
matcher = LLMEntityMatcher(api_key='your-api-key')

# Sample data
data = {
    'name': [
        'John Smith',
        'Jon Smith',
        'Jane Doe',
        'J. Smith'
    ],
    'description': [
        'A software engineer with 10 years of experience in Python and Java development',
        'Experienced software developer specializing in Python and Java programming',
        'Marketing professional with expertise in digital campaigns',
        'Senior software engineer with Python and Java background'
    ]
}

df = pd.DataFrame(data)

# Find matches
matches = await matcher.find_matches(
    df,
    name_threshold=0.7,
    desc_threshold=0.7,
    name_weight=0.3
)

# Display results
for match in matches:
    print(f"\nMatch Found (Overall Similarity: {match['overall_similarity']:.2f}):")
    print(f"Record 1: {match['record1_name']}")
    print(f"Record 2: {match['record2_name']}")
    print("\nMatching Aspects:")
    for aspect in match['matching_aspects']:
        print(f"- {aspect}")
    print("\nKey Differences:")
    for diff in match['differences']:
        print(f"- {diff}")
    print(f"Confidence: {match['confidence']:.2f}")
"""

# Import required libraries 
import pandas as pd
import numpy as np
from difflib import SequenceMatcher
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import re

def clean_text(text):
    """Clean text by removing special characters and standardizing format"""
    if not isinstance(text, str):
        return str(text)
    
    # Convert to lowercase and remove special characters
    text = text.lower()
    text = re.sub(r'[^\w\s]', ' ', text)
    # Remove extra whitespace
    text = ' '.join(text.split())
    return text

def prepare_dataframe(df):
    """Prepare DataFrame by adding cleaned versions while keeping originals"""
    # Create a copy to avoid modifying the original
    df_prepared = df.copy()
    
    # Add cleaned name while keeping original
    df_prepared['name_cleaned'] = df_prepared['name'].apply(clean_text)
    
    # Add cleaned description while keeping original
    df_prepared['description_cleaned'] = df_prepared['description'].apply(clean_text)
    
    return df_prepared

def calculate_name_similarity(name1, name2):
    """Calculate similarity between names using SequenceMatcher"""
    return SequenceMatcher(None, name1, name2).ratio()

def calculate_description_similarity(desc1, desc2):
    """Calculate similarity between descriptions using TF-IDF and cosine similarity"""
    # Create TF-IDF vectorizer
    vectorizer = TfidfVectorizer(
        lowercase=True,
        stop_words='english',
        token_pattern=r'\b\w+\b',  # Match whole words only
        min_df=1  # Include all terms since we're only comparing two documents
    )
    
    # Fit and transform the descriptions
    try:
        tfidf_matrix = vectorizer.fit_transform([desc1, desc2])
        # Calculate cosine similarity
        similarity = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])[0][0]
        return float(similarity)
    except:
        return 0.0

def find_matches(df, name_threshold=0.8, desc_threshold=0.3, name_weight=0.6):
    """Find matching entities based on name and description similarity
    
    Args:
        df: DataFrame with 'name' and 'description' columns
        name_threshold: Minimum similarity threshold for names
        desc_threshold: Minimum similarity threshold for descriptions
        name_weight: Weight given to name similarity (1 - name_weight for description)
    """
    # Prepare DataFrame with cleaned versions
    df_prepared = prepare_dataframe(df)
    matches = []
    
    # Compare all pairs
    for i in range(len(df_prepared)):
        for j in range(i + 1, len(df_prepared)):
            record1 = df_prepared.iloc[i]
            record2 = df_prepared.iloc[j]
            
            # Calculate name similarity using cleaned names
            name_sim = calculate_name_similarity(
                record1['name_cleaned'],
                record2['name_cleaned']
            )
            
            # Only proceed if names are similar enough
            if name_sim >= name_threshold:
                # Calculate description similarity using cleaned descriptions
                desc_sim = calculate_description_similarity(
                    record1['description_cleaned'],
                    record2['description_cleaned']
                )
                
                # Calculate weighted average similarity
                weighted_sim = (name_sim * name_weight + 
                              desc_sim * (1 - name_weight))
                
                # Check if description similarity meets threshold
                if desc_sim >= desc_threshold:
                    match = {
                        'Record 1': f"ID: {record1.name}, Name: {record1['name']}",  # Original name
                        'Record 2': f"ID: {record2.name}, Name: {record2['name']}",  # Original name
                        'Overall Similarity': f"{weighted_sim:.2f}",
                        'Similarities': {
                            'name': f"{name_sim:.2f}",
                            'description': f"{desc_sim:.2f}"
                        },
                        'Original Names': {
                            'name1': record1['name'],
                            'name2': record2['name']
                        },
                        'Cleaned Names': {
                            'name1': record1['name_cleaned'],
                            'name2': record2['name_cleaned']
                        }
                    }
                    matches.append(match)
    
    return matches

# Example usage with sample data
data = {
    'name': [
        'John Smith, PhD',
        'Jon Smith Jr.',
        'Jane Doe, MBA',
        'J. Smith III'
    ],
    'description': [
        'A software engineer with 10 years of experience in Python and Java development',
        'Experienced software developer specializing in Python and Java programming',
        'Marketing professional with expertise in digital campaigns',
        'Senior software engineer with Python and Java background'
    ]
}

# Convert to DataFrame
df = pd.DataFrame(data)
print("Input Data:")
print(df)
print("\n" + "="*80 + "\n")

# Find matches
matches = find_matches(df, name_threshold=0.7, desc_threshold=0.3, name_weight=0.6)

# Display results
print("Matching Results:")
for match in matches:
    print("\nMatch Found:")
    print(f"Record 1: {match['Record 1']}")
    print(f"Record 2: {match['Record 2']}")
    print(f"Overall Similarity: {match['Overall Similarity']}")
    print("Original Names:")
    print(f"  Name 1: {match['Original Names']['name1']}")
    print(f"  Name 2: {match['Original Names']['name2']}")
    print("Cleaned Names:")
    print(f"  Name 1: {match['Cleaned Names']['name1']}")
    print(f"  Name 2: {match['Cleaned Names']['name2']}")
    print("Field-by-field similarities:")
    for field, sim in match['Similarities'].items():
        print(f"  {field}: {sim}")

In [None]:
def merge_connected_entries(dictionary):
    """
    Merges dictionary entries that share common names.
    
    Args:
        dictionary: A dictionary with keys mapping to lists of names
        
    Returns:
        A new dictionary with connected entries merged
    """
    # Map each name to all keys it appears in
    name_to_keys = {}
    for key, names in dictionary.items():
        for name in names:
            if name not in name_to_keys:
                name_to_keys[name] = []
            name_to_keys[name].append(key)
    
    # Track which keys have been processed
    processed_keys = set()
    result = {}
    
    # Process each key
    for key in dictionary:
        # Skip if already processed
        if key in processed_keys:
            continue
        
        # Start with current key and its names
        connected_keys = [key]
        connected_names = set(dictionary[key])
        
        # Find all connected keys through common names
        changed = True
        while changed:
            changed = False
            
            # For current set of names, find all related keys
            for name in list(connected_names):
                for related_key in name_to_keys[name]:
                    if related_key not in connected_keys:
                        connected_keys.append(related_key)
                        # Add all names from this key
                        for related_name in dictionary[related_key]:
                            if related_name not in connected_names:
                                connected_names.add(related_name)
                                changed = True
        
        # Mark all these keys as processed
        for k in connected_keys:
            processed_keys.add(k)
        
        # Add to result using the first key
        result[key] = list(connected_names)
    
    return result

# Example usage
if __name__ == "__main__":
    dic_a = {
        "match1": ["name1", "name2"],
        "match2": ["name2", "name3"],
        "match3": ["name5", "name4"]
    }
    
    new_dic_a = merge_connected_entries(dic_a)
    print("Original dictionary:", dic_a)
    print("New dictionary:", new_dic_a)

In [None]:
!= 

In [None]:
pip install llama-cpp-python

In [None]:
def filter_one_only(pairs, flagged_entities):
    one_only = []
    for pair in pairs:
        entity1, entity2 = pair
        if (entity1 in flagged_entities) != (entity2 in flagged_entities):
            one_only.append(pair)
    return one_only

# Example usage:
pairs = [("A", "B"), ("C", "D"), ("E", "F"), ("G", "H")]
flagged_entities = ["A", "C", "E", "H"]

result = filter_one_only(pairs, flagged_entities)
print(result)


In [None]:
pairs = [("A", "B"), ("C", "D"), ("E", "F"), ("G", "H")]
flagged_entities = {"A", "E", "G"}

one_only = [pair for pair in pairs if (pair[0] in flagged_entities) ^ (pair[1] in flagged_entities)]

print(one_only)

[('A', 'B'), ('E', 'F'), ('G', 'H')]


In [None]:
pairs = [("B", "A"), ("C", "D"), ("E", "F"), ("G", "H")]
flagged_entities = {"A", "E", "G"}

one_only = [
    (pair[0], pair[1]) if pair[0] in flagged_entities else (pair[1], pair[0])
    for pair in pairs
    if (pair[0] in flagged_entities) ^ (pair[1] in flagged_entities)
]

print(one_only)

[('A', 'B'), ('E', 'F'), ('G', 'H')]


In [None]:
test1 = {
    "desc1":{
        "entities": ["entity1", "entity2"],
        "relationships": "OWNER"
        "reason": "the two entities are related"
    }
    "desc2":{
        "entities": ["entity3", "entity4"],
        "relationships": "OWNER"
        "reason": "the two entities are related"
    }
}

In [None]:
import streamlit as st
from st_link_analysis import st_link_analysis, NodeStyle, EdgeStyle

st.set_page_config(layout="wide")

# Sample Data
elements = {
    "nodes": [
        {"data": {"id": 1, "label": "PERSON", "name": "Streamlit"}},
        {"data": {"id": 2, "label": "PERSON", "name": "Hello"}},
        {"data": {"id": 3, "label": "PERSON", "name": "World"}},
        {"data": {"id": 4, "label": "POST", "content": "x"}},
        {"data": {"id": 5, "label": "POST", "content": "y"}},
    ],
    "edges": [
        {"data": {"id": 6, "label": "FOLLOWS", "source": 1, "target": 2}},
        {"data": {"id": 7, "label": "FOLLOWS", "source": 2, "target": 3}},
        {"data": {"id": 8, "label": "POSTED", "source": 3, "target": 4}},
        {"data": {"id": 9, "label": "POSTED", "source": 1, "target": 5}},
        {"data": {"id": 10, "label": "QUOTES", "source": 5, "target": 4}},
    ],
}

# Style node & edge groups
node_styles = [
    NodeStyle("PERSON", "#FF7F3E", "name", "person"),
    NodeStyle("POST", "#2A629A", "content", "description"),
]

edge_styles = [
    EdgeStyle("OWNER", caption='label', directed=True),
    EdgeStyle("SUPPLIER", caption='label', directed=True),
    EdgeStyle("ASSOCIATE", caption='label', directed=True),
    EdgeStyle("SHAREHOLDER", caption='label', directed=True),
    EdgeStyle("COMPETITOR", caption='label', directed=True),
    EdgeStyle("EMPLOYER", caption='label', directed=True),
    EdgeStyle("ADVISOR", caption='label', directed=True),
    EdgeStyle("CONTRIBUTOR", caption='label', directed=True),
    EdgeStyle("REPRESENTATIVE", caption='label', directed=True),
    EdgeStyle("BENEFICIARY", caption='label', directed=True),
    EdgeStyle("LICENSOR", caption='label', directed=True),
    EdgeStyle("VICTIM", caption='label', directed=True),
    EdgeStyle("WITNESS", caption='label', directed=True),
    EdgeStyle("OTHER TYPES", caption='label', directed=True)
]

# Render the component
st.markdown("### st-link-analysis: Example")
st_link_analysis(elements, "cose", node_styles, edge_styles)


In [None]:
# Input dictionary
test1 = {
    "desc1": {
        "entities": ["entity1", "entity2"],
        "relationships": "OWNER",
        "reason": "the two entities are related"
    },
    "desc2": {
        "entities": ["entity3", "entity4"],
        "relationships": "OWNER",
        "reason": "the two entities are related"
    }
}

# Assign unique IDs to entities
entity_to_id = {}
current_id = 1

# Extract all unique entities and assign IDs
for desc in test1.values():
    for entity in desc["entities"]:
        if entity not in entity_to_id:
            entity_to_id[entity] = current_id
            current_id += 1

# Create node list
nodes = [{"data": {"id": id, "label": "ENTITY", "name": name}} for name, id in entity_to_id.items()]

# Create edge list
edges = [
    {
        "data": {
            "id": idx + 1,
            "label": desc["relationships"],
            "source": entity_to_id[desc["entities"][0]],
            "target": entity_to_id[desc["entities"][1]],
            "reason": desc["reason"]
        }
    }
    for idx, desc in enumerate(test1.values())
]

# Final structure
elements = {
    "nodes": nodes,
    "edges": edges
}

# Print output
import json
print(json.dumps(elements, indent=4))

{
    "nodes": [
        {
            "data": {
                "id": 1,
                "label": "ENTITY",
                "name": "entity1"
            }
        },
        {
            "data": {
                "id": 2,
                "label": "ENTITY",
                "name": "entity2"
            }
        },
        {
            "data": {
                "id": 3,
                "label": "ENTITY",
                "name": "entity3"
            }
        },
        {
            "data": {
                "id": 4,
                "label": "ENTITY",
                "name": "entity4"
            }
        }
    ],
    "edges": [
        {
            "data": {
                "id": 1,
                "label": "OWNER",
                "source": 1,
                "target": 2,
                "reason": "the two entities are related"
            }
        },
        {
            "data": {
                "id": 2,
                "label": "OWNER",
                "source": 3,
       

In [None]:
import subprocess

def compress_pdf(input_pdf, output_pdf):
    gs_command = [
        "gs", "-sDEVICE=pdfwrite", "-dCompatibilityLevel=1.4",
        "-dPDFSETTINGS=/screen",  # Change to /ebook for better quality
        "-dNOPAUSE", "-dQUIET", "-dBATCH",
        f"-sOutputFile={output_pdf}", input_pdf
    ]
    subprocess.run(gs_command)
    print(f"Compressed PDF saved as: {output_pdf}")

compress_pdf("doc.pdf", "output.pdf")

Compressed PDF saved as: output.pdf


In [None]:
import pandas as pd

# Dictionary
test1 = {
    "desc1": {
        "entities": ["entity1", "entity2"],
        "relationships": "OWNER",
        "reason": "the two entities are related"
    },
    "desc2": {
        "entities": ["entity3", "entity4"],
        "relationships": "OWNER",
        "reason": "the two entities are related"
    }
}

# Convert to DataFrame
data = []
for key, value in test1.items():
    entity, entity_connected = value["entities"]  # Extract the two entities
    relationships = value["relationships"]  # Extract relationship type
    data.append([entity, entity_connected, relationships])  # Append as a row

df = pd.DataFrame(data, columns=["entity", "entity connected", "relationships"])

# Display DataFrame
print(df)

SyntaxError: invalid syntax (1650711533.py, line 5)

In [None]:
import streamlit as st
import pandas as pd

# Sample DataFrame (Replace with your actual DataFrame)
test1 = {
    f"desc{i}": {
        "entities": [f"entity{i}", f"entity{i+1}"],
        "relationships": "OWNER",
        "reason": "the two entities are related"
    }
    for i in range(1, 51)  # Creating 50 rows for testing
}

# Convert to DataFrame
data = []
for key, value in test1.items():
    entity, entity_connected = value["entities"]
    relationships = value["relationships"]
    data.append([entity, entity_connected, relationships])

df = pd.DataFrame(data, columns=["Entity", "Entity Connected", "Relationships"])

# **1️⃣ Show a preview (first 10 rows)**
st.subheader("🔹 Data Preview (First 10 Rows)")
st.dataframe(df.head(10), height=300, width=600)

# **2️⃣ Full table inside an expander**
with st.expander("🔍 View Full Data (50 Rows)"):
    st.data_editor(df, height=500, use_container_width=True)

# **3️⃣ Allow user to select number of rows displayed**
num_rows = st.slider("Select number of rows to display:", min_value=5, max_value=50, value=10, step=5)
st.dataframe(df.head(num_rows), height=400, width=700)

In [None]:
my problem is pip dependency resolver does not currently take into account all the packages that are installed. this behaviour is the source of the following depedency conflits. azure cli 2.70 requires smver==2.13.0 but you have semver 3.04 which is incompatible.

In [None]:
pip install semver==2.13.0 --force-reinstall
pip check

pip uninstall azure-cli
pip install azure-cli

python -m venv myenv
source myenv/bin/activate  # On macOS/Linux
myenv\Scripts\activate  # On Windows
pip install azure-cli


pip dependency resolver does not currently take into account all the packages that are installed. flair 0.13.1 requires semver<4.0.0>=3.0.0 but you have semver 2.13.0 which is incomptabile 

In [None]:
Input: nums = [2,7,11,15], target = 9
Output: [0,1]
Explanation: Because nums[0] + nums[1] == 9, we return [0, 1].


In [None]:
nums = [2, 7,11,15]
target=9
seen=[] 
for i,v in nums:
    if v in seen:
        
    



[2, 7, 11, 15]

In [None]:
300*80


24000

In [None]:
# What are the main values of those things 
    # try to f120*ind the best places available in this marke50

100*1000

100000

In [None]:
Reformulate properly: as of today there are 5 hits to migrate.

45000

In [None]:
reformulate and enhance: I have a beeter vision of your question, No indeed in that case it won't generate any hits because we are not considering connected us domicile heirs. In that case, we should update the scenario to take into consideration connected relationships.


In [None]:
chf_price=0.87557
quantity=1
stock_price=184.3350
quantity*stock_price*chf_price

what are the duffer


161.39819595

In [None]:
AAPL APPLE INC 265598 US0378331005 AAPL NASDAQ 1 COMMON
AMZN AMAZON.COM INC 3691937 US0231351067 AMZN NASDAQ 1 COMMON
CMG CHIPOTLE MEXICAN GRILL
INC 37655664 US1696561059 CMG NYSE 1 COMMON
GOOGL ALPHABET INC-CL A 208813719 US02079K3059 GOOGL NASDAQ 1 COMMON
IYW ISHARES USTECHNOLOGY
ETF 10158652 US4642877215 IYW ARCA 1 ETF
MA MASTERCARD INC - A 38685693 US57636Q1040 MA NYSE 1 COMMON
MSFT MICROSOFT CORP 272093 US5949181045 MSFT NASDAQ 1 COMMON
NVDA NVIDIA CORP 4815747 US67066G1040 NVDA NASDAQ 1 COMMON
SOXX ISHARES SEMICONDUCTOR
ETF 12658194 US4642875235 NASDAQ 1 ETF
TSLA TESLA INC 76792991 US88160R1014 NASDAQ 1 COMMON
VOO VANGUARD S&P 500 ETF 136155102 US9229083632 VOO ARCA 1 ETF
VT VANGUARD TOT WORLD STK
ETF 52197301 US9220427424 VT ARCA 1 ETF

0.590909064049588

In [None]:
03.09.2024	USA	1 USD	0.86072
30.09.2024	USA	1 USD	0.84971
23.10.2024	USA	1 USD	0.87557

In [None]:
303.91+157.84+161.39

623.14

In [None]:
795/3

265.0

In [None]:
I have a meet

In [None]:
795-(198.82+157.84)


438.34000000000003

In [None]:
nums = [1,2,3,4]
product_list = []
len_nums = len(nums)
summ = []
print(len_nums)
for i,v in enumerate(nums):
    product = 1
    i_valid = [num for num in range(len(nums)) if num != i]
    print(i_valid)
    for num in i_valid:
        product *= nums[num]
        print(product)
    summ.append(product)
return summ



4
[1, 2, 3]
2
6
24
[0, 2, 3]
1
3
12
[0, 1, 3]
1
2
8
[0, 1, 2]
1
2
6


In [18]:
summ

[24, 12, 8, 6]

In [None]:
Here’s the updated note with a concise question added about data storage:

⸻

Designing a scalable data architecture on Azure and Databricks starts with proper layering. Ingest data from sources like CRMs, apps, APIs, and files using Azure Data Factory for batch or Event Hubs for streaming. Use Databricks Autoloader for file-based incremental loads.

Store everything in Azure Data Lake Storage Gen2 using a medallion architecture—Bronze for raw data, Silver for cleaned/enriched data, and Gold for business-ready aggregates.

Process data with Azure Databricks using Spark notebooks. Apply business logic, schema enforcement, and use Delta Lake for versioning and updates.

Model and analyze data using Databricks SQL or Power BI. Use the Lakehouse approach to unify analytics and AI.

For ML, manage experiments and models with MLflow; optionally use Azure ML. Use Silver/Gold data layers for training features.

Ensure governance with Microsoft Purview for data lineage and cataloging. Secure access with RBAC, POSIX ACLs, and Unity Catalog.

Orchestrate workflows with Data Factory or Databricks Workflows. Integrate Git for CI/CD and automate pipelines.

Key principles: ensure data quality with tools like Great Expectations, build modular pipelines, document lineage, and monitor using Azure Monitor and job alerts.

⸻

When meeting with teams using other architectures, ask:
	•	What are your main data goals (e.g., reporting, ML)?
	•	What platforms and tools do you use (cloud, storage, ETL)?
	•	How is your data stored—data lake, warehouse, or a hybrid? What formats do you use (e.g., Parquet, Delta, CSV)?
	•	Is your setup batch, real-time, or both?
	•	How do you manage ingestion, modeling, and transformations?
	•	What BI tools are used? Is data self-service?
	•	How do you handle ML workflows and deployment?
	•	How is data access, governance, and compliance managed?
	•	Can our systems align or integrate? Are there blockers we can address together?

In [None]:
nums = [2,7,11,15]
target = 9
nums[2]


11

In [None]:

import os 
name, ext = os.path.sploneitext("/ok/ok/ok/ok.txt")
print(name)
print(ext)xsswz 

/ok/ok/ok/ok
.txt


In [None]:
3319 87.330

11000.0

In [9]:
try_ounce = 31.1035
gram = 605
chf_price = 87.280
worth_ounce = gram / try_ounce

In [10]:
gram * chf_price

52804.4

In [12]:
buy = 3504 + 4972 + 9993 + 9815 + 15133 + 2460 + 1324 + 440 + 87 + 174 + 86 + 25615 + 177 + 175 + 164 +442
sold = 12805 + 8534 

In [13]:
buy - sold

53222