In [2]:
valid_p_list = {
    ")":"(",
    "]":"[",
    "}":"{"
}
stack = []

def valid_parentheses(s):
    for p in s:
        if p in valid_p_list.values():
            stack.append(p)
        elif stack and stack[-1]==valid_p_list[p]:
            stack.pop()
    return stack == []
    

s = "()[]{"
ys= "()[]"
valid_parentheses(ys)

True

In [1]:
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("token-classification", model="dslim/bert-large-NER")

  from .autonotebook import tqdm as notebook_tqdm
Some weights of the model checkpoint at dslim/bert-large-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


In [2]:
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
from docx import Document
import re
from typing import List, Dict, Tuple, Set
import numpy as np
from difflib import SequenceMatcher
from sentence_transformers import SentenceTransformer
from sklearn.cluster import DBSCAN
import spacy

class BertNERProcessor:
    def __init__(self):
        """Initialize models and NLP tools"""
        # BERT NER model
        self.tokenizer = AutoTokenizer.from_pretrained("dbmdz/bert-large-cased-finetuned-conll03-english")
        self.model = AutoModelForTokenClassification.from_pretrained("dbmdz/bert-large-cased-finetuned-conll03-english")
        self.label_list = ["O", "B-MISC", "I-MISC", "B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC"]
        
        # Sentence transformer for semantic similarity
        self.semantic_model = SentenceTransformer('all-MiniLM-L6-v2')
        
        # SpaCy for additional NLP tasks
        self.nlp = spacy.load("en_core_web_sm")
        
    def process_docx(self, file_path: str) -> Dict[str, List[Dict[str, List[str]]]]:
        """Process DOCX file and extract grouped entities"""
        doc = Document(file_path)
        full_text = "\n".join([paragraph.text for paragraph in doc.paragraphs])
        chunks = self._split_into_chunks(full_text)
        
        # Extract initial entities
        raw_entities = {
            'PERSON': [],
            'ORGANIZATION': [],
            'LOCATION': [],
            'MISCELLANEOUS': []
        }
        
        # Process chunks and extract entities
        for chunk in chunks:
            entities = self._extract_entities(chunk)
            for entity_type, entities_list in entities.items():
                raw_entities[entity_type].extend(entities_list)
        
        # Group similar entities
        grouped_entities = self._group_entities(raw_entities)
        
        return grouped_entities

    def _group_entities(self, raw_entities: Dict[str, List[Dict[str, str]]]) -> Dict[str, List[Dict[str, List[str]]]]:
        """Group similar entities together using multiple similarity measures"""
        grouped_results = {}
        
        for entity_type, entities in raw_entities.items():
            if not entities:
                grouped_results[entity_type] = []
                continue
                
            # Extract unique entity texts
            unique_entities = list({e['text'] for e in entities})
            
            if len(unique_entities) == 0:
                grouped_results[entity_type] = []
                continue
                
            # Calculate similarity matrix using multiple measures
            embeddings = self.semantic_model.encode(unique_entities)
            
            # Perform clustering
            clusters = self._cluster_entities(embeddings, unique_entities)
            
            # Post-process clusters with rule-based refinements
            refined_clusters = self._refine_clusters(clusters, entity_type)
            
            grouped_results[entity_type] = refined_clusters
            
        return grouped_results

    def _cluster_entities(self, embeddings: np.ndarray, entities: List[str]) -> List[List[str]]:
        """Cluster entities using DBSCAN"""
        # Perform DBSCAN clustering
        clustering = DBSCAN(eps=0.3, min_samples=1, metric='cosine').fit(embeddings)
        
        # Group entities by cluster
        clusters = {}
        for idx, label in enumerate(clustering.labels_):
            if label not in clusters:
                clusters[label] = []
            clusters[label].append(entities[idx])
            
        return list(clusters.values())

    def _refine_clusters(self, clusters: List[List[str]], entity_type: str) -> List[Dict[str, List[str]]]:
        """Apply rule-based refinements to clusters"""
        refined_clusters = []
        
        for cluster in clusters:
            main_entity = self._find_main_entity(cluster, entity_type)
            variations = [e for e in cluster if e != main_entity]
            
            # Apply type-specific rules
            if entity_type == 'PERSON':
                variations = self._refine_person_cluster(main_entity, variations)
            elif entity_type == 'ORGANIZATION':
                variations = self._refine_org_cluster(main_entity, variations)
                
            refined_clusters.append({
                'main': main_entity,
                'variations': variations
            })
            
        return refined_clusters

    def _find_main_entity(self, cluster: List[str], entity_type: str) -> str:
        """Determine the main entity name from a cluster"""
        if entity_type == 'PERSON':
            # Prefer full names
            full_names = [name for name in cluster if len(name.split()) > 1]
            if full_names:
                return max(full_names, key=len)
        
        # Default to longest name
        return max(cluster, key=len)

    def _refine_person_cluster(self, main_entity: str, variations: List[str]) -> List[str]:
        """Apply person-specific refinement rules"""
        main_doc = self.nlp(main_entity)
        refined_variations = set(variations)
        
        # Extract main name components
        main_names = set()
        for token in main_doc:
            if token.pos_ == "PROPN":
                main_names.add(token.text.lower())
        
        # Filter variations
        for var in variations:
            var_doc = self.nlp(var)
            var_names = set()
            for token in var_doc:
                if token.pos_ == "PROPN":
                    var_names.add(token.text.lower())
            
            # Remove if no name overlap
            if not (main_names & var_names):
                refined_variations.discard(var)
                
        return list(refined_variations)

    def _refine_org_cluster(self, main_entity: str, variations: List[str]) -> List[str]:
        """Apply organization-specific refinement rules"""
        main_tokens = set(self.nlp(main_entity.lower()))
        refined_variations = set()
        
        for var in variations:
            var_tokens = set(self.nlp(var.lower()))
            # Keep if significant token overlap
            if len(main_tokens & var_tokens) / len(main_tokens) > 0.3:
                refined_variations.add(var)
                
        return list(refined_variations)

    def _extract_entities(self, text: str) -> Dict[str, List[Dict[str, str]]]:
        """Extract named entities from text chunk"""
        inputs = self.tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
        outputs = self.model(**inputs)
        predictions = torch.argmax(outputs.logits, dim=2)
        
        tokens = self.tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
        labels = [self.label_list[p] for p in predictions[0]]
        
        entities = {
            'PERSON': [],
            'ORGANIZATION': [],
            'LOCATION': [],
            'MISCELLANEOUS': []
        }
        
        current_entity = {'type': None, 'text': ''}
        
        for token, label in zip(tokens, labels):
            if label.startswith('B-'):
                if current_entity['text']:
                    self._add_entity(entities, current_entity)
                current_entity = {
                    'type': label[2:],
                    'text': token.replace('##', '')
                }
            elif label.startswith('I-') and current_entity['text']:
                current_entity['text'] += token.replace('##', '')
            elif label == 'O':
                if current_entity['text']:
                    self._add_entity(entities, current_entity)
                current_entity = {'type': None, 'text': ''}
        
        if current_entity['text']:
            self._add_entity(entities, current_entity)
            
        return entities

    def _split_into_chunks(self, text: str, max_length: int = 400) -> List[str]:
        """Split text into processable chunks"""
        sentences = re.split('([.!?])', text)
        chunks = []
        current_chunk = ""
        
        for sentence in sentences:
            if len(current_chunk) + len(sentence) < max_length:
                current_chunk += sentence
            else:
                if current_chunk:
                    chunks.append(current_chunk.strip())
                current_chunk = sentence
                
        if current_chunk:
            chunks.append(current_chunk.strip())
            
        return chunks

    def _add_entity(self, entities: Dict[str, List[Dict[str, str]]], 
                   entity: Dict[str, str]) -> None:
        """Add entity to appropriate category"""
        if not entity['type']:
            return
            
        entity_text = entity['text'].strip()
        if not entity_text:
            return
            
        entity_type = entity['type']
        entity_dict = {'text': entity_text, 'type': entity_type}
        
        if entity_type == 'PER':
            entities['PERSON'].append(entity_dict)
        elif entity_type == 'ORG':
            entities['ORGANIZATION'].append(entity_dict)
        elif entity_type == 'LOC':
            entities['LOCATION'].append(entity_dict)
        elif entity_type == 'MISC':
            entities['MISCELLANEOUS'].append(entity_dict)

def process_document(file_path: str) -> None:
    """Process document and print grouped entities"""
    processor = BertNERProcessor()
    try:
        entities = processor.process_docx(file_path)
        
        print("\nExtracted and Grouped Named Entities:")
        print("-----------------------------------")
        
        for category, clusters in entities.items():
            if clusters:
                print(f"\n{category}:")
                for idx, cluster in enumerate(clusters, 1):
                    print(f"\nGroup {idx}:")
                    print(f"Main: {cluster['main']}")
                    if cluster['variations']:
                        print("Variations:")
                        for var in cluster['variations']:
                            print(f"- {var}")
                    
    except Exception as e:
        print(f"Error processing document: {str(e)}")

if __name__ == "__main__":
    file_path = "path/to/your/document.docx"
    process_document(file_path)

ModuleNotFoundError: No module named 'docx'

In [2]:
import pandas as pd

In [2]:
x =1 
x

1

In [3]:
# SQL 
pays = "FRANCE1234"
F R A N C E 1 2 3 4
1 2 3 4 5 6 7 8 9 10


2

In [None]:
# PYTHON 
pays = "FRANCE1234"
F R A N C E 1 2 3 4
0 1 2 3 4 5 6 7 8 9 

In [10]:
# python 
eleves_list = ["phillipe", "Mohammed", "les"]

for eleve in eleves_list:
    print(eleve)

phillipe
Mohammed
les


In [8]:
eleves_list = ["phillipe", "Mohammed"]
print("STEP1: voici la liste des eleves:", eleves_list, "se termine ici.")
print("hello world")
print("les")

STEP1: voici la liste des eleves: ['phillipe', 'Mohammed'] se termine ici.
hello world
les


In [11]:
for numero in range(1,10):
    print(numero)



1
2
3
4
5
6
7
8
9


In [1]:
import pandas as pd
import numpy as np

# Create a sample DataFrame
df = pd.DataFrame({
    'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Eve'],
    'Age': [25, 30, 35, 28, 22],
    'City': ['New York', 'San Francisco', 'Chicago', 'Boston', 'Seattle'],
    'Salary': [50000, 75000, 60000, 65000, 45000],
    'Department': ['HR', 'IT', 'Finance', 'Marketing', 'Sales'],
    'Performance': [4.5, 4.2, 4.7, 3.9, 4.1]
})

# Optionally, add a date column
df['Hire_Date'] = pd.date_range(start='2022-01-01', periods=5)

# Display the DataFrame
print(df)

      Name  Age           City  Salary Department  Performance  Hire_Date
0    Alice   25       New York   50000         HR          4.5 2022-01-01
1      Bob   30  San Francisco   75000         IT          4.2 2022-01-02
2  Charlie   35        Chicago   60000    Finance          4.7 2022-01-03
3    David   28         Boston   65000  Marketing          3.9 2022-01-04
4      Eve   22        Seattle   45000      Sales          4.1 2022-01-05


In [11]:
list_a = [1,3,4]

if list_a[1] != 5 and list_a[0]!=8:
    print("ok")

ok


In [6]:

categories_article = ["col1", "col2", "col3"]
# Create the activities table if it doesn't exist using BOOLEAN
create_table_query = """
CREATE TABLE IF NOT EXISTS activities (
    entity TEXT,
    activities BOOLEAN,
    timestamp TIMESTAMP,
    comments TEXT,
    flagged BOOLEAN,
    PRIMARY KEY(entity, timestamp)
)
"""
conn.execute(create_table_query)

# Create the table_query_db if it doesn't exist
create_table_query_db = """
CREATE TABLE IF NOT EXISTS table_query_db (
    entity TEXT,
    activities BOOLEAN,
    timestamp TIMESTAMP,
    comments TEXT,
    flagged BOOLEAN,
    PRIMARY KEY(entity, timestamp)
)
"""
conn.execute(create_table_query_db)

insert_data = []
for row in dr.iterrows():
    step_data = row[1]
    entity = step_data["entity"].replace("'", "''") # Escape single quotes
    summary = step_data["summary"].replace("'", "''") # Escape single quotes
    activities = tuple(bool(row["activity"]) for activity in categories_article if activity is "no label") # Convert activities to boolean
    current_timestamp = datetime.now().isoformat() # Get current timestamp in ISO format
    comments = row["comments"].replace("'", "''")
    flagged = bool(row["flagged"])

    data_row = (entity, activities, current_timestamp, comments, flagged)

    # Check if the last entry for this entity is different
    query = f"SELECT * FROM table_query_db WHERE entity = %s ORDER BY timestamp DESC LIMIT 1"
    last_entry = conn.execute(query, (entity,)).fetchone()

    # Check if there is a change in the activities
    if last_entry:
        if last_entry[1] != activities: # Exclude entity and timestamp for comparison
            insert_data.append(data_row)
    else:
        insert_data.append(data_row) # This

if insert_data:
    query = f"""
    INSERT INTO {table_name} VALUES ({', '.join(['?'] * (len(categories_article) + 4))})
    """
    conn.executemany(query,insert_data)
    conn.commit()
    

0 Name                         Alice
Age                             25
City                      New York
Salary                       50000
Department                      HR
Performance                    4.5
Hire_Date      2022-01-01 00:00:00
Name: 0, dtype: object
1 Name                           Bob
Age                             30
City                 San Francisco
Salary                       75000
Department                      IT
Performance                    4.2
Hire_Date      2022-01-02 00:00:00
Name: 1, dtype: object
2 Name                       Charlie
Age                             35
City                       Chicago
Salary                       60000
Department                 Finance
Performance                    4.7
Hire_Date      2022-01-03 00:00:00
Name: 2, dtype: object
3 Name                         David
Age                             28
City                        Boston
Salary                       65000
Department               Marketing
Performance  

In [None]:
# Create the activities table if it doesn't exist using BOOLEAN
create_table_query = """
CREATE TABLE IF NOT EXISTS activities (
    entity TEXT,
    {",".join([f'"{activity}" BOOLEAN' for activity in categories_article if activity != 'no label'])},
    timestamp TIMESTAMP,
    comments TEXT,
    flagged BOOLEAN,
    PRIMARY KEY(entity, timestamp)
)
"""
conn.execute(create_table_query)
# Create the table_query_db if it doesn't exist
create_table_query_db = """
CREATE TABLE IF NOT EXISTS table_query_db (
    entity TEXT,
    {",".join([f'"{activity}" BOOLEAN' for activity in categories_article if activity != 'no label'])},
    timestamp TIMESTAMP,
    comments TEXT,
    flagged BOOLEAN,
    PRIMARY KEY(entity, timestamp)
)
"""
conn.execute(create_table_query_db)
insert_data = []
for row in dr.iterrows():
    step_data = row[1]
    entity = step_data["entity"].replace("'", "''") # Escape single quotes
    summary = step_data["summary"].replace("'", "''") # Escape single quotes
    activities = tuple(bool(row["activity"]) for activity in categories_article if activity is "no label") # Convert activities to boolean
    current_timestamp = datetime.now().isoformat() # Get current timestamp in ISO format
    comments = row["comments"].replace("'", "''")
    flagged = bool(row["flagged"])
    data_row = (entity, activities, current_timestamp, comments, flagged)
    # Check if the last entry for this entity is different
    query = f"SELECT * FROM table_query_db WHERE entity = %s ORDER BY timestamp DESC LIMIT 1"
    last_entry = conn.execute(query, (entity,)).fetchone()
    # Check if there is a change in the activities
    if last_entry:
        if last_entry[1] != activities: # Exclude entity and timestamp for comparison
            insert_data.append(data_row)
    else:
        insert_data.append(data_row) # This is a new entry for an entity

if insert_data:
    query = f"""
    INSERT INTO {table_name} VALUES ({', '.join(['?'] * (len(categories_article) + 4))})
    """
    conn.executemany(query,insert_data)
    conn.commit()
    

In [12]:
for i in range: 
    print(i)
    def(in)

387209.73000000004

In [4]:
!pip install regex


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m25.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [6]:
import regex as re

# Step 1: Define the input string
input_string = "Hello, Wörld! 123. Grüße"

print("Original String:")
print(input_string)

# Step 2: Replace non-alphanumeric characters with whitespace
# Use the Unicode property \p{L} to match any kind of letter and \p{N} to match any kind of number
output_string = re.sub(r'[^\p{L}\p{N}]', ' ', input_string)

print("\nProcessed String:")
print(output_string)

Original String:
Hello, Wörld! 123. Grüße

Processed String:
Hello  Wörld  123  Grüße


In [7]:
import re

# Step 1: Define the input string
input_string = "$2.8M"

print("Original String:")
print(input_string)

# Step 2: Replace non-alphanumeric characters except dots, commas, and dollar signs with whitespace
output_string = re.sub(r'[^a-zA-Z0-9.,]', ' ', input_string)

print("\nProcessed String:")
print(output_string)

Original String:
$2.8M

Processed String:
 2.8M


In [8]:
import re

def clean_text(text):
   # Pattern includes Latin alphabet extensions used in European languages
   pattern = r'[^a-zA-ZàáâäãåąčćęèéêëėįìíîïłńòóôöõøùúûüųūÿýżźñçčšžÀÁÂÄÃÅĄĆČĖĘÈÉÊËÌÍÎÏĮŁŃÒÓÔÖÕØÙÚÛÜŲŪŸÝŻŹÑßÇČŠŽ0-9.,]'
   return re.sub(pattern, ' ', text)