In [4]:
# Install the fuzzy-matching dependency with the %pip magic and pin the
# version for reproducibility.
%pip install fuzzywuzzy==0.18.0

Collecting fuzzywuzzy
  Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl.metadata (4.9 kB)
Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl (18 kB)
Installing collected packages: fuzzywuzzy
Successfully installed fuzzywuzzy-0.18.0


 **Example Workflow with Fuzzy Matching + NER**

In [93]:
# Known project names.
# The list is kept in descending lexicographic order, so among names that
# share a prefix the longer one comes first (e.g. "Projekt Mozart III" before
# "Projekt Mozart II" before "Projekt Mozart I").
projects = sorted(
    [
        "Projekt Mozart II",
        "Projekt Mozart I",
        "The Five",
        "DFI Zukunftspark Oberfranken IV",
        "Berliner Flair in Friedrichshain II",
        "DFI Zukunftspark Oberfranken V",
        "DFI Zukunftspark Oberfranken III",
        "Am Akkonplatz",
        "Berliner Flair in Friedrichshain",
        "Projekt Mozart III",
        "Tonhallen-Passage II",
    ],
    reverse=True,
)

In [65]:
# Function to preprocess the email text
def preprocess_email(email):
    """Normalise an email body before project-name matching.

    The only normalisation applied is lower-casing, which makes later
    substring comparisons case-insensitive.

    Args:
        email: The email text as a string.

    Returns:
        The lower-cased email text.
    """
    # Further cleaning (e.g. stripping greetings or signatures) could go here.
    return email.lower()

In [69]:
from transformers import pipeline
from fuzzywuzzy import process

# Function to extract project names from an email
def _get_ner_pipeline():
    """Return the German token-classification pipeline, loading it on first use only."""
    if not hasattr(_get_ner_pipeline, "_instance"):
        # NOTE(review): the recorded run warns that the classifier weights of
        # this checkpoint are newly initialised, so its entity labels and
        # scores are not trained predictions.
        _get_ner_pipeline._instance = pipeline("ner", model="deepset/gbert-base")
    return _get_ner_pipeline._instance


def extract_project_names(email, projects):
    """Find which known project(s) an email refers to.

    Matching is attempted from strictest to loosest:

    1. Case-insensitive substring match of each project name in the email.
       The first hit (in the order of ``projects``) is returned immediately.
    2. NER: entities whose text contains a project name are collected.
    3. Fuzzy match of the whole email against ``projects``; the best
       candidate is kept only if its score is above 70.

    Args:
        email: Raw email text.
        projects: List of known project names. Order matters for step 1.

    Returns:
        list: On an exact match, ``[{"project_name": <name>, "confidence": 100}]``.
        Otherwise a de-duplicated, unordered list that may contain
        ``(word, score)`` tuples from the NER step and the fuzzy-matched
        project name. Empty if nothing matched or ``projects`` is empty.
    """
    email = preprocess_email(email)

    # Step 1: exact (case-insensitive) substring matching
    exact_matches = [project for project in projects if project.lower() in email]
    if exact_matches:
        # Keys are string literals; using the variables themselves as keys
        # would yield {"<name>": "<name>", 100: 100}.
        return [{"project_name": exact_matches[0], "confidence": 100}]

    # Nothing to compare against: skip the model load and the fuzzy step
    # (process.extractOne returns None for an empty choice list).
    if not projects:
        return []

    # Step 2: NER to detect project-like entities. The model is only needed
    # once exact matching has failed, so it is loaded lazily and cached.
    # NOTE(review): the pipeline emits one entry per token (see recorded
    # output), so a multi-word project name is unlikely to be contained in a
    # single entity's 'word' -- confirm whether this step ever matches.
    entities = _get_ner_pipeline()(email)
    ner_matches = [
        (entity['word'], entity['score'])
        for entity in entities
        if any(proj.lower() in entity['word'].lower() for proj in projects)
    ]

    # Step 3: fuzzy matching of the whole email against the project names.
    best = process.extractOne(email, projects)
    # Only accept the candidate if the score is reasonably high.
    fuzzy_matches = [best[0]] if best is not None and best[1] > 70 else []

    # Combine and de-duplicate the fallback results.
    return list(set(ner_matches + fuzzy_matches))


In [94]:
# Example: a German customer email naming a project that is not in `projects`.
#
# Alternative project mentions to try in the second sentence:
#   - DFI Park Oberfranken V.
#   - Projekt Musik III
email = "".join(
    [
        "Hallo, ",
        "ich habe eine Frage zu den Zinsen für das Projekt Musik III. ",
        "Könnten Sie mir bitte den aktuellen Zinssatz nennen? ",
        "Vielen Dank und viele Grüße, ",
        "Max Mustermann",
    ]
)

# Run the extraction against the known project list and show what came back.
project_names = extract_project_names(email, projects)
print(f"Extracted Project Names: {project_names}")

Some weights of BertForTokenClassification were not initialized from the model checkpoint at deepset/gbert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Extracted Project Names: ['DFI Zukunftspark Oberfranken III']


In [29]:
from transformers import pipeline
from fuzzywuzzy import process

# German BERT checkpoint wrapped in a Hugging Face "ner" pipeline
ner = pipeline("ner", model="deepset/gbert-base")

# Known project names, kept in descending lexicographic order so that longer
# names sharing a prefix come first (e.g. "... Oberfranken VI" before
# "... Oberfranken V")
projects = sorted(
    [
        "Projekt Mozart II",
        "Projekt Mozart I",
        "The Five",
        "Berliner Flair in Friedrichshain II",
        "DFI Zukunftspark Oberfranken V",
        "Projekt Mozart III",
        "Am Akkonplatz",
        "Berliner Flair in Friedrichshain",
        "DFI Zukunftspark Oberfranken VI",
        "DFI Zukunftspark Oberfranken IV",
        "Tonhallen-Passage II",
    ],
    reverse=True,
)
print(projects)

# User's question: three sample emails joined into one text.
# Adjacent string literals are concatenated without any separator, so every
# fragment except the last must carry its own trailing space/punctuation.
# Single-project alternative:
# question = "Was ist der Zinssatz für das Projekt The Five?"
question = (
    "Hallo, "
    "ich habe eine Frage zu den Zinsen für das Projekt X. "
    "Könnten Sie mir bitte den aktuellen Zinssatz nennen? "
    "Vielen Dank und viele Grüße, "
    "Max Mustermann "

    "Hallo, "
    "ich habe eine Frage zu den Zinsen für das Projekt Y. "
    "Könnten Sie mir bitte den aktuellen Zinssatz nennen? "
    "Vielen Dank und viele Grüße, "
    "Max Mustermann "

    "Hallo, "
    "ich habe eine Frage zu den Zinsen für das DFI Zukunftspark Oberfranken VI. "
    "Könnten Sie mir bitte den aktuellen Zinssatz nennen? "
    "Vielen Dank und viele Grüße, "
    "Max Mustermann"
)


# Step 1: run the token-classification pipeline over the whole text
entities = ner(question)

# Step 2: case-insensitive substring search for every known project name
question_lower = question.lower()
matched_projects = [name for name in projects if name.lower() in question_lower]

# Step 3: the first substring hit wins with confidence 100 (exact match);
# otherwise fall back to the best fuzzy candidate and its score
project_name, confidence = (
    (matched_projects[0], 100)
    if matched_projects
    else process.extractOne(question, projects)
)

# Report both results. Note the second label reads "Fuzzy Matched" even when
# the project came from the exact-match branch.
print(f"Entities found: {entities}")
print(f"Fuzzy Matched Project: {project_name} with confidence: {confidence}")

# Step 4: Use the project name to query the database
# query_database(project_name)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at deepset/gbert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


['Tonhallen-Passage II', 'The Five', 'Projekt Mozart III', 'Projekt Mozart II', 'Projekt Mozart I', 'DFI Zukunftspark Oberfranken VI', 'DFI Zukunftspark Oberfranken V', 'DFI Zukunftspark Oberfranken IV', 'Berliner Flair in Friedrichshain II', 'Berliner Flair in Friedrichshain', 'Am Akkonplatz']
Entities found: [{'entity': 'LABEL_1', 'score': np.float32(0.8397179), 'index': 1, 'word': 'Hallo', 'start': 0, 'end': 5}, {'entity': 'LABEL_1', 'score': np.float32(0.7619211), 'index': 2, 'word': ',', 'start': 5, 'end': 6}, {'entity': 'LABEL_1', 'score': np.float32(0.8200022), 'index': 3, 'word': 'ich', 'start': 7, 'end': 10}, {'entity': 'LABEL_1', 'score': np.float32(0.8509396), 'index': 4, 'word': 'habe', 'start': 11, 'end': 15}, {'entity': 'LABEL_1', 'score': np.float32(0.79300916), 'index': 5, 'word': 'eine', 'start': 16, 'end': 20}, {'entity': 'LABEL_1', 'score': np.float32(0.6879423), 'index': 6, 'word': 'Frage', 'start': 21, 'end': 26}, {'entity': 'LABEL_1', 'score': np.float32(0.692166)

# The approach below uses cosine similarity and looks promising.

In [120]:
from transformers import AutoTokenizer, AutoModel
import torch
from sklearn.metrics.pairwise import cosine_similarity

# Multilingual sentence-embedding checkpoint, loaded as a plain tokenizer + model pair
model_name = "sentence-transformers/distiluse-base-multilingual-cased-v1"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

# Candidate project names the input text is compared against
project_names = [
    "Projekt Mozart II",
    "Projekt Mozart I",
    "The Five",
    "DFI Zukunftspark Oberfranken IV",
    "Berliner Flair in Friedrichshain II",
    "DFI Zukunftspark Oberfranken V",
    "DFI Zukunftspark Oberfranken III",
    "Am Akkonplatz",
    "Berliner Flair in Friedrichshain",
    "Projekt Mozart III",
    "Tonhallen-Passage II",
]

# The email below refers to the project loosely ("Park Oberfranken 3");
# `expected_text` is the catalogue name it should resolve to.
expected_text = "DFI Zukunftspark Oberfranken III"
input_text = (
    "Hallo, ich habe eine Frage zu den Zinsen für das Park Oberfranken 3. "
    "Ich würde gerne wissen, wie hoch der aktuelle Zinssatz ist. "
    "Könnten Sie mir bitte den aktuellen Zinssatz nennen? "
    "Vielen Dank und viele Grüße, Max Mustermann"
)

# Embedding function: tokenize, run the model, mean-pool the hidden states
def get_embeddings(text):
    """Embed `text` as the mean of the model's last hidden states.

    Uses the module-level `tokenizer` and `model`. The average is taken over
    dim 1 of `last_hidden_state` (presumably the token axis -- TODO confirm)
    without applying the attention mask, and size-1 dimensions are squeezed
    away.

    Args:
        text: Text to embed.

    Returns:
        torch.Tensor: The pooled embedding.
    """
    encoded = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    # Inference only: no gradients are needed.
    with torch.no_grad():
        hidden_states = model(**encoded).last_hidden_state
    pooled = hidden_states.mean(dim=1)
    return pooled.squeeze()

# Embed the query text once, then every candidate project name
input_embedding = get_embeddings(input_text)
project_embeddings = [get_embeddings(candidate) for candidate in project_names]

# Cosine similarity of the query against each candidate; unsqueeze(0) adds a
# leading axis so each vector is passed as a single-row matrix
similarities = []
for proj_emb in project_embeddings:
    score = cosine_similarity(input_embedding.unsqueeze(0), proj_emb.unsqueeze(0))
    similarities.append(score.item())

top_similarity = max(similarities)
print(f"Max Similarity: {top_similarity}")

# The candidate with the highest similarity wins (first one on ties)
best_match_index = similarities.index(top_similarity)
best_project_name = project_names[best_match_index]

# Report the winner and check it against the expected catalogue name
print(f"Extracted Project Name: {best_project_name}")
assert best_project_name == expected_text




Max Similarity: 0.31990915536880493
Extracted Project Name: DFI Zukunftspark Oberfranken III


In [None]:
from transformers import AutoTokenizer, AutoModel
import torch
from sklearn.metrics.pairwise import cosine_similarity

# German BERT (gbert-base) tokenizer and encoder
model_name = "deepset/gbert-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

# Candidate project names the input text is compared against
project_names = [
    "Projekt Mozart II",
    "Projekt Mozart I",
    "The Five",
    "DFI Zukunftspark Oberfranken IV",
    "Berliner Flair in Friedrichshain II",
    "DFI Zukunftspark Oberfranken V",
    "DFI Zukunftspark Oberfranken III",
    "Am Akkonplatz",
    "Berliner Flair in Friedrichshain",
    "Projekt Mozart III",
    "Tonhallen-Passage II",
]

# Example email that names a catalogue project verbatim
input_text = (
    "Hallo, ich habe eine Frage zu den Zinsen für das Projekt Mozart III. "
    "Könnten Sie mir bitte den aktuellen Zinssatz nennen? "
    "Vielen Dank und viele Grüße, Max Mustermann"
)

# Embedding function: tokenize, run the model, mean-pool the hidden states
def get_embeddings(text):
    """Embed `text` as the mean of the model's last hidden states.

    Uses the module-level `tokenizer` and `model`. The average is taken over
    dim 1 of `last_hidden_state` (presumably the token axis -- TODO confirm)
    without applying the attention mask, and size-1 dimensions are squeezed
    away.

    Args:
        text: Text to embed.

    Returns:
        torch.Tensor: The pooled embedding.
    """
    encoded = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    # Inference only: no gradients are needed.
    with torch.no_grad():
        hidden_states = model(**encoded).last_hidden_state
    pooled = hidden_states.mean(dim=1)
    return pooled.squeeze()

# Embed the query text once, then every candidate project name
input_embedding = get_embeddings(input_text)
project_embeddings = [get_embeddings(candidate) for candidate in project_names]

# Cosine similarity of the query against each candidate; unsqueeze(0) adds a
# leading axis so each vector is passed as a single-row matrix
similarities = []
for proj_emb in project_embeddings:
    score = cosine_similarity(input_embedding.unsqueeze(0), proj_emb.unsqueeze(0))
    similarities.append(score.item())

# The candidate with the highest similarity wins (first one on ties)
top_similarity = max(similarities)
best_match_index = similarities.index(top_similarity)
best_project_name = project_names[best_match_index]

# Report the winner
print(f"Extracted Project Name: {best_project_name}")

In [92]:
from fuzzywuzzy import process

# Candidate project names, in descending lexicographic order
project_names = sorted(
    [
        "Projekt Mozart II",
        "Projekt Mozart I",
        "The Five",
        "DFI Zukunftspark Oberfranken IV",
        "Berliner Flair in Friedrichshain II",
        "DFI Zukunftspark Oberfranken V",
        "DFI Zukunftspark Oberfranken III",
        "Am Akkonplatz",
        "Berliner Flair in Friedrichshain",
        "Projekt Mozart III",
        "Tonhallen-Passage II",
    ],
    reverse=True,
)

# Sample emails, each fuzzy-matched as a whole against the project names.
# In the recorded run all four return a score of 86, including those that
# name a project absent from the list, so the score alone does not separate
# good matches from bad ones here.
test_inputs = [
    # Test 1: short email, project name not in the list
    "Hallo, ich habe eine Frage zu den Zinsen für das Projekt Musik III.",
    # Test 2: full email, abbreviated project name
    "Hallo, ich habe eine Frage zu den Zinsen für das DFI Park Oberfranken V. Könnten Sie mir bitte den aktuellen Zinssatz nennen? Vielen Dank und viele Grüße, Max Mustermann",
    # Test 3: full email, same project mention as Test 1
    "Hallo, ich habe eine Frage zu den Zinsen für das Projekt Musik III. Könnten Sie mir bitte den aktuellen Zinssatz nennen? Vielen Dank und viele Grüße, Max Mustermann",
    # Test 4: full email, project name not in the list
    "Hallo, ich habe eine Frage zu den Zinsen für das Unterfranken III. Könnten Sie mir bitte den aktuellen Zinssatz nennen? Vielen Dank und viele Grüße, Max Mustermann",
]

for input_text in test_inputs:
    # Best (name, score) candidate for this email
    best_match = process.extractOne(input_text, project_names)
    print(f"Best Project Name Match: {best_match}")


Best Project Name Match: ('Projekt Mozart III', 86)
Best Project Name Match: ('DFI Zukunftspark Oberfranken V', 86)
Best Project Name Match: ('DFI Zukunftspark Oberfranken III', 86)
Best Project Name Match: ('DFI Zukunftspark Oberfranken III', 86)
