The goal of the notebook to evaluate the new function that does exact matching with the cosine similarity. The donation projects are excluded from the evaluation.

In [22]:
import pandas as pd
import string
from tqdm import tqdm
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


from project_name_extractor_utils import get_project_list
from dtos.project import Project

from importlib import reload
from dtos.identified_project import IdentifiedProject

In [23]:
similarity_threshold: float = 0.185
ground_truth_csv_path = "/home/jovyan/work/notebook/project_name_extractor_evaluation/ground_truth_v2.csv"

## Load dataset

In [24]:
# Read Ground Truth CSV Data
data = pd.read_csv(ground_truth_csv_path, delimiter=";")

# Load the data into a DataFrame
df = pd.DataFrame(data)

Replace "NaN" with an empty string in the expected project names to avoid issues with comparison.

In [25]:
df = df.fillna("")

In [26]:
df.describe()

Unnamed: 0,source_system,project_id,project_name,subject,body,comment
count,240,240.0,240.0,240,240,240.0
unique,1,42.0,42.0,220,234,5.0
top,evdi,,,Welche Sicherheiten gibt es für Investoren?,"Könnten Sie mir bitte mitteilen, wie lange die...",
freq,240,28.0,28.0,4,2,218.0


## Load existing projects

In [27]:
projects = get_project_list()
projects = [project for project in projects if 'Spendenprojekt - Green Forest Fund' not in project.name]

In [28]:
projects[0:10]

[Project(id=UUID('54ee27f7-bfb3-49bd-9438-2be412c8d8a0'), name='DFI Zukunftspark Oberfranken VI'),
 Project(id=UUID('716867b4-c28c-425e-94be-59886d853d49'), name='Berliner Flair in Friedrichshain II'),
 Project(id=UUID('e2fefd3e-6841-434f-af12-6e700d7c60d3'), name='DFI Zukunftspark Nordbayern IV'),
 Project(id=UUID('d1f21f84-9eec-4d0b-a63a-bf656a28a256'), name='DFI Zukunftspark Oberfranken V'),
 Project(id=UUID('811dc8a3-c453-48a0-82dd-58df3ad52a6d'), name='Am Akkonplatz'),
 Project(id=UUID('83ff1d1c-6a7f-45bb-adff-0e42c26463a4'), name='Berliner Flair in Friedrichshain'),
 Project(id=UUID('e6ea9000-8561-4f86-8795-a60032f239f4'), name='DFI Zukunftspark Oberfranken IV'),
 Project(id=UUID('4973e74a-e88e-4e1b-b534-36615368d4a6'), name='Tonhallen-Passage II'),
 Project(id=UUID('35f259f1-4160-4768-9c7a-9ecbda485ba0'), name='DFI Zukunftspark Nordbayern III'),
 Project(id=UUID('41e842bb-6963-4ce9-bc24-1c0a58648d7a'), name='DFI Zukunftspark Oberfranken III')]

## Preprocess and sort the project names

In [29]:
def preprocess_text(text: str) -> str:
    text = text.replace("\n", " ").replace("\r", " ").replace("\t", " ")
    text = text.lower().strip()
    punctuations = string.punctuation
    for char in punctuations:
        text = text.replace(char, "")

    return text.strip()

In [30]:
def create_project(project: Project) -> Project:
    project_name = preprocess_text(project.name)
    project = Project(project.id, project_name)
    return project

preprocessed_projects = [ create_project(project) for project in projects ]
preprocessed_projects[45:55]

[Project(id=UUID('8be34dc2-3f4f-6b44-4b7a-e2e70d4e82ac'), name='stadtleben am bodensee ii'),
 Project(id=UUID('a1375e89-14cd-45a3-a1e4-8df17a8d82a1'), name='rheinknieresidenz bonn vi'),
 Project(id=UUID('6bf14376-dd72-49b7-a76f-4b1e0b65d38a'), name='metropolkomplex dortmund ii'),
 Project(id=UUID('9d9ee4a0-8bbf-4870-9382-c63b4d90ac12'), name='weidengrund hauptstadt iii'),
 Project(id=UUID('02c7b6db-9a66-4293-9c26-5b5bff0dd9ef'), name='altstadtblick bamberg i'),
 Project(id=UUID('45d23980-7ad6-4df1-8186-c8f1d83d3f87'), name='waldsee residenz oberstdorf'),
 Project(id=UUID('7f5fee56-a4ff-4d66-ac5f-fdb6ef7ce1b1'), name='schlossviertel weimar'),
 Project(id=UUID('f59b1a60-5eef-4e46-8e7a-3c8c15b2d850'), name='mainuferpanorama frankfurt'),
 Project(id=UUID('b8b56e01-b594-4c14-99a8-65fa36203fcc'), name='bergkulisse oberbayern'),
 Project(id=UUID('2cd2f6c0-8c0a-4f7b-be21-7aadfa8f0b0e'), name='straßburgblick kehl')]

In [31]:
sorted_projects_with_preprocessed_names = sorted(preprocessed_projects, key=lambda p: len(p.name), reverse=True)

In [32]:
sorted_projects_with_preprocessed_names[45:44]

[]

In [33]:
from typing import NamedTuple
from uuid import UUID

class ProjectMatch(NamedTuple):
    name: str
    id: UUID
    similarity: float

In [34]:
from typing import List
from transformers import AutoTokenizer, AutoModel
import torch
from sklearn.metrics.pairwise import cosine_similarity

from dtos.identified_project import IdentifiedProject


class ProjectIdentifierService:
    def __init__(self, projects: List[Project]):
        self._projects = projects

        self._model_name = "sentence-transformers/distiluse-base-multilingual-cased-v1"
        self._tokenizer = AutoTokenizer.from_pretrained(self._model_name)
        self._model = AutoModel.from_pretrained(self._model_name)

        # Get embeddings for each project name
        self._project_embeddings = [self._get_embeddings(project.name) for project in self._projects]

    def extract_project_name(self, query: str) -> ProjectMatch:
        matched_projects = [project for project in self._projects if project.name in query]

        if matched_projects:
            project = matched_projects[0]
            confidence = 1  # Exact match, so confidence is 100%

            return ProjectMatch(project.name, project.id, confidence)

        # Step 2: If no exact match, fall back to similarity matching
        project_match = self._extract_project_using_embeddings(query)

        confidence = (project_match.similarity + 1) / 2

        return ProjectMatch(project_match.name, project_match.id, confidence)

    def _extract_project_using_embeddings(self, input_text: str) -> ProjectMatch | None:
        # Get embeddings for the input text
        input_embedding = self._get_embeddings(input_text)

        # Compute cosine similarities between input text and each project name
        similarities: List[float] = [cosine_similarity(input_embedding.unsqueeze(0), proj_emb.unsqueeze(0)).item() for proj_emb in self._project_embeddings]

        max_similarity_value: float = max(similarities)

        # Find the most similar project name
        best_match_index = similarities.index(max_similarity_value)
        best_project = self._projects[best_match_index]

        return ProjectMatch(best_project.name, best_project.id, max_similarity_value)

    # Tokenization function
    def _get_embeddings(self, text: str):
        inputs = self._tokenizer(text, return_tensors="pt", truncation=True, padding=True)
        with torch.no_grad():
            outputs = self._model(**inputs)
        return outputs.last_hidden_state.mean(dim=1).squeeze()

## Get project name using preprocessed project names and query
The function below uses the preprocessed project names and the query to find the best match. The function returns the best match and the cosine similarity score.

In [35]:
project_identifier_service = ProjectIdentifierService(sorted_projects_with_preprocessed_names)

In [36]:
def get_project_by_project_match(project_match: ProjectMatch, projects: List[Project]) -> Project:
    for project in projects:
        if project_match.id == project.id:
            return project

    raise None

In [37]:
from pandas import Series


def process_row(row: Series) -> ProjectMatch:
    row_as_dict = row.to_dict()
    query = preprocess_text(row_as_dict["subject"] + " " + row_as_dict["body"])
    extracted_project = project_identifier_service.extract_project_name(query)

    return extracted_project

### Test the approach for one row

In [38]:
item = df.iloc[123]
item

source_system                                                 evdi
project_id                    180770C4-5FE2-481B-B2CF-6BEF10A9B400
project_name                                  Stadthaus Mozart III
subject                 Renditeaussichten für Stadthaus Mozart III
body             Sehr geehrtes Team, ich interessiere mich für ...
comment                                                           
Name: 123, dtype: object

In [39]:
project_match = process_row(item)
project_name = get_project_by_project_match(project_match, projects)
project_name

Project(id=UUID('180770c4-5fe2-481b-b2cf-6bef10a9b400'), name='Stadthaus Mozart III')

In [40]:
for idx, row in tqdm(df.iterrows(), total=df.shape[0]):
    project_match = process_row(row)
    extracted_project_name = get_project_by_project_match(project_match, projects).name
    df.at[idx, "extracted_project_name"] = extracted_project_name
    df.at[idx, "extracted_project_similarity"] = project_match.similarity

100%|██████████| 240/240 [00:12<00:00, 18.47it/s]


In [41]:
mask = df["project_name"] != df["extracted_project_name"]
different_values_df = df[mask]
len(different_values_df)

38

In [42]:
different_values_df

Unnamed: 0,source_system,project_id,project_name,subject,body,comment,extracted_project_name,extracted_project_similarity
39,evdi,83FF1D1C-6A7F-45BB-ADFF-0E42C26463A4,Berliner Flair in Friedrichshain,Renditeerwartungen und Laufzeit des Berliner F...,"Guten Tag, ich habe großes Interesse an einer ...",,Berliner Flair in Friedrichshain II,0.591991
104,evdi,E391251B-C6C5-46D9-AA54-6B0385CC530C,DFI Zukunftspark Dreiländereck,Laufzeit und Auszahlungsmodalitäten des Dreilä...,"Guten Tag, ich habe großes Interesse an einer ...",,DFI Zukunftspark Dreiländereck II,0.632009
130,evdi,04DF54CA-2B08-4545-ADD1-EA4D6301E993,Büro-Hochhaus am Scheidemannplatz II,Nachhaltigkeit und Energieeffizienz bei Büropr...,"Guten Tag, ich habe eine Frage bezüglich der N...",,Eco Living Lichtenrade,0.598434
136,evdi,04DF54CA-2B08-4545-ADD1-EA4D6301E993,Büro-Hochhaus am Scheidemannplatz II,Nachhaltigkeit und Energieeffizienz bei Büropr...,"Guten Tag, ich habe eine Frage bezüglich der N...",Replaced ü with ue,Eco Living Lichtenrade,0.598762
137,evdi,04DF54CA-2B08-4545-ADD1-EA4D6301E993,Büro-Hochhaus am Scheidemannplatz II,Investitionsmöglichkeiten und Renditeerwartungen,"Sehr geehrtes Team, ich interessiere mich für ...",Replaced ü with ue,Atelier-Wohnungen an der Burg II,0.630056
138,evdi,04DF54CA-2B08-4545-ADD1-EA4D6301E993,Büro-Hochhaus am Scheidemannplatz II,Sicherheiten und Risikoabsicherung beim Immobi...,"Sehr geehrte Damen und Herren, als potenzielle...",Replaced ü with ue,Atelier-Wohnungen an der Burg II,0.606869
182,evdi,CC215696-CDDF-43B2-BB79-BD204EFD64F3,Spendenprojekt - Green Forest Fund,Wie wird die Rendite beim Spendenprojekt - Gre...,Wie wird die Rendite in diesem Projekt berechn...,,Berggrün Anwesen Garmisch,0.596013
183,evdi,CC215696-CDDF-43B2-BB79-BD204EFD64F3,Spendenprojekt - Green Forest Fund,Wie hoch ist das Risiko für das Spendenprojekt...,Wie wird das Risiko dieses Projekts bewertet u...,,Waldsee Residenz Oberstdorf,0.586608
184,evdi,CC215696-CDDF-43B2-BB79-BD204EFD64F3,Spendenprojekt - Green Forest Fund,Sicherheit und Renditepotenzial?,Welche Sicherheitsmaßnahmen gibt es beim Green...,,Berggrün Anwesen Garmisch,0.613154
205,evdi,118BBDEC-3A71-4E5F-B725-BDC85E4A31EB,Stadthaus 'Mozart',Welche Sicherheiten gibt es für Investoren?,Ich interessiere mich für die Sicherheitsmaßna...,,Stadthaus 'Mozart' II,0.617017
