In [10]:
pip install fuzzywuzzy python-Levenshtein



In [11]:
pip install sentence_transformers



In [12]:
import spacy
import pandas as pd
import numpy as np
from fuzzywuzzy import fuzz
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import pickle
import re


In [13]:
filepath = '/content/drive/My Drive/MLOps/news_alert'

In [14]:
companies = [
            {'entity_id': 'C001', 'name': 'Microsoft Corporation', 'description': 'Technology company, software, cloud computing', 'ticker': 'MSFT'},
            {'entity_id': 'C002', 'name': 'JPMorgan Chase & Co.', 'description': 'Financial services, banking, investment', 'ticker': 'JPM'},
            {'entity_id': 'C003', 'name': 'Google DeepMind', 'description': 'Artificial intelligence research company', 'ticker': 'GOOGL'},
            {'entity_id': 'C004', 'name': 'Apple Inc.', 'description': 'Technology company, consumer electronics', 'ticker': 'AAPL'},
            {'entity_id': 'C005', 'name': 'Lumier AI', 'description': 'Artificial intelligence startup, enterprise solutions', 'ticker': ''},
            {'entity_id': 'C006', 'name': 'ChainTech', 'description': 'Blockchain technology company', 'ticker': ''},
            {'entity_id': 'C007', 'name': 'Lumier LED', 'description': 'LED lighting manufacturer', 'ticker': ''},
            {'entity_id': 'C008', 'name': 'Acma Retail Inc', 'description': 'Retail company, supply chain', 'ticker': ''},
            {'entity_id': 'C009', 'name': 'Acma Furniture LLC', 'description': 'Furniture manufacturing company', 'ticker': ''}
        ]
compny = pd.DataFrame(companies)
compny.head()

Unnamed: 0,entity_id,name,description,ticker
0,C001,Microsoft Corporation,"Technology company, software, cloud computing",MSFT
1,C002,JPMorgan Chase & Co.,"Financial services, banking, investment",JPM
2,C003,Google DeepMind,Artificial intelligence research company,GOOGL
3,C004,Apple Inc.,"Technology company, consumer electronics",AAPL
4,C005,Lumier AI,"Artificial intelligence startup, enterprise so...",


In [15]:
class EntityLinker:
    def __init__(self):
        # Load spaCy model for NER
        self.nlp = spacy.load("en_core_web_sm")

        # Load sentence transformer for semantic similarity
        self.sentence_model = SentenceTransformer('all-MiniLM-L6-v2')

        # Company knowledge base (in real scenario, this would be JPMC's KG)
        self.companies_df = self._load_company_knowledge_base()

    def _load_company_knowledge_base(self):
        """Load company knowledge base - simulating JPMC's internal KG"""
        companies = [
            {'entity_id': 'C001', 'name': 'Microsoft Corporation', 'description': 'Technology company, software, cloud computing', 'ticker': 'MSFT'},
            {'entity_id': 'C002', 'name': 'JPMorgan Chase & Co.', 'description': 'Financial services, banking, investment', 'ticker': 'JPM'},
            {'entity_id': 'C003', 'name': 'Google DeepMind', 'description': 'Artificial intelligence research company', 'ticker': 'GOOGL'},
            {'entity_id': 'C004', 'name': 'Apple Inc.', 'description': 'Technology company, consumer electronics', 'ticker': 'AAPL'},
            {'entity_id': 'C005', 'name': 'Lumier AI', 'description': 'Artificial intelligence startup, enterprise solutions', 'ticker': ''},
            {'entity_id': 'C006', 'name': 'ChainTech', 'description': 'Blockchain technology company', 'ticker': ''},
            {'entity_id': 'C007', 'name': 'Lumier LED', 'description': 'LED lighting manufacturer', 'ticker': ''},
            {'entity_id': 'C008', 'name': 'Acma Retail Inc', 'description': 'Retail company, supply chain', 'ticker': ''},
            {'entity_id': 'C009', 'name': 'Acma Furniture LLC', 'description': 'Furniture manufacturing company', 'ticker': ''}
        ]
        return pd.DataFrame(companies)

    def extract_mentions(self, text):
        """Extract company mentions from text using spaCy NER"""
        doc = self.nlp(text)
        mentions = []

        for ent in doc.ents:
            if ent.label_ in ['ORG', 'PRODUCT']:
                mentions.append({
                    'text': ent.text,
                    'start_char': ent.start_char,
                    'end_char': ent.end_char,
                    'label': ent.label_
                })

        return mentions

    def calculate_similarity_score(self, mention, entity_name, entity_description):
        """Calculate combined similarity score using fuzzy matching and semantic similarity"""
        # String similarity (Wide component - like JEL)
        string_similarity = fuzz.token_set_ratio(mention.lower(), entity_name.lower()) / 100.0

        # Semantic similarity (Deep component - like JEL)
        mention_embedding = self.sentence_model.encode([mention])
        entity_embedding = self.sentence_model.encode([f"{entity_name} {entity_description}"])
        semantic_similarity = cosine_similarity(mention_embedding, entity_embedding)[0][0]

        # Combined score (like JEL's Wide & Deep)
        combined_score = 0.6 * string_similarity + 0.4 * semantic_similarity

        return combined_score

    def link_entities(self, text):
        """Link extracted mentions to knowledge base entities"""
        mentions = self.extract_mentions(text)
        linked_entities = []

        for mention in mentions:
            mention_text = mention['text']
            best_match = None
            best_score = 0
            best_entity = None

            for _, entity in self.companies_df.iterrows():
                score = self.calculate_similarity_score(
                    mention_text,
                    entity['name'],
                    entity['description']
                )

                if score > best_score and score > 0.3:  # Threshold
                    best_score = score
                    best_match = mention_text
                    best_entity = entity

            if best_entity is not None:
                linked_entities.append({
                    'mention': best_match,
                    'entity_id': best_entity['entity_id'],
                    'entity_name': best_entity['name'],
                    'confidence': best_score,
                    'description': best_entity['description']
                })

        return linked_entities

    def save_model(self, filepath):
        """Save the entity linker model"""
        with open(filepath, 'wb') as f:
            pickle.dump(self, f)

    @classmethod
    def load_model(cls, filepath):
        """Load the entity linker model"""
        with open(filepath, 'rb') as f:
            return pickle.load(f)

# if __name__ == "__main__":
linker = EntityLinker()

# Test
test_text = "Microsoft acquires AI startup Lumier for $500 million"
entities = linker.link_entities(test_text)
print("Linked entities:", entities)

Linked entities: [{'mention': 'Microsoft', 'entity_id': 'C001', 'entity_name': 'Microsoft Corporation', 'confidence': np.float32(0.8441816), 'description': 'Technology company, software, cloud computing'}, {'mention': 'Lumier', 'entity_id': 'C007', 'entity_name': 'Lumier LED', 'confidence': np.float32(0.8061914), 'description': 'LED lighting manufacturer'}]


In [16]:
import joblib
joblib.dump(linker,f"{filepath}/entityLinker.pkl")

['/content/drive/My Drive/MLOps/news_alert/entityLinker.pkl']