In [1]:
# Bootstrap step run before the cell that imports pyspark.
# NOTE(review): findspark.init() is expected to locate the local Spark install and make
# `pyspark` importable — confirm it is still needed if pyspark is pip-installed.
import findspark
findspark.init()

In [2]:
import os
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import StringType, StructType, StructField, IntegerType, FloatType, DateType

from fuzzywuzzy import fuzz
from fuzzywuzzy import process

import unicodedata
import re

In [11]:
# Input CSV lives in ../Data relative to the notebook's current working directory.
path = os.path.join(os.getcwd(), "..", "Data", "output_search_DB_embedded.csv")

# Create (or reuse) the Spark session used by every later cell.
spark = (
    SparkSession.builder
    .appName("Fuzzy Matching")
    .config("spark.driver.memory", "4g")
    .config("spark.executor.memory", "2g")
    .config("spark.executor.cores", "2")
    .config("spark.executor.instances", "2")
    .getOrCreate()
)

# Read the CSV with a header row and inferred column types, then discard the
# "embedding" and "text" columns, which the fuzzy matching below does not use.
df = spark.read.csv(path, header=True, inferSchema=True).drop("embedding", "text")
df.show(truncate=False)

+--------+-------------------+-------------------------------------------------------------+------------------------------------------------------------------------------------------+
|order_id|order_customer_name|product_name                                                 |part_type_name                                                                            |
+--------+-------------------+-------------------------------------------------------------+------------------------------------------------------------------------------------------+
|2200006 |得意先_1           |A　2023年3月号 定期演奏会                                    |['本文1']                                                                                 |
|2107551 |得意先_7           |アーティストリスト2022年                                     |['本文1', '本文1', '本文2', '本文2', '表紙1', '表紙1']                                    |
|2200898 |得意先_8           |ミュージアムリーフレット                                     |['本文']                                    

In [12]:
# Turn on Arrow-based columnar transfer for this session.
# NOTE(review): per the Spark docs this flag appears to govern toPandas()/createDataFrame(pandas);
# the UDF in fuzzy_match opts into Arrow separately via useArrow=True — confirm this is still needed.
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")

In [13]:
# Bare expression: renders the active SparkSession's summary as the cell output.
spark

In [14]:
def normalize(text):
    """Return ``text`` as an NFKC-folded string with all whitespace removed.

    The argument is coerced with ``str()`` first, so non-string values
    (including ``None``) are normalized from their string representation.
    """
    folded = unicodedata.normalize("NFKC", str(text))
    # NFKC maps compatibility characters (e.g. full-width forms) to their
    # canonical equivalents; every run of whitespace is then stripped out.
    return re.sub(r'\s+', '', folded)

In [None]:
def fuzzy_match(input_text, df, flag=False, top_k=10, scorer='ratio'):
    """Rank (row, column) pairs of ``df`` by fuzzy similarity to ``input_text``.

    Every column of ``df`` except ``order_id`` is scored independently with the
    selected fuzzywuzzy scorer. The per-column results are stacked and the
    ``top_k`` highest-scoring pairs are returned. The result is a lazy Spark
    DataFrame: no job runs until an action (``show``, ``collect``, ...) is
    called on it.

    Args:
        input_text: Query to match. ``None`` yields a score of 0 everywhere.
        df: Spark DataFrame that contains at least ``order_id``,
            ``order_customer_name``, ``product_name`` and ``part_type_name``.
        flag: When True, both the query and each cell value are passed
            through ``normalize`` before being compared.
        top_k: Maximum number of rows kept in the result.
        scorer: One of ``'ratio'``, ``'partial_ratio'``,
            ``'token_sort_ratio'`` or ``'token_set_ratio'``.

    Returns:
        DataFrame with columns ``order_id``, ``order_customer_name``,
        ``product_name``, ``part_type_name``, ``matched_column`` and ``score``,
        sorted by ``score`` descending (ties broken by ``order_id`` then
        ``matched_column``) and truncated to ``top_k`` rows.

    Raises:
        ValueError: If ``scorer`` is not supported, or if ``df`` has no column
            other than ``order_id`` to score.
    """
    mapping = {
        'ratio': fuzz.ratio,
        'partial_ratio': fuzz.partial_ratio,
        'token_sort_ratio': fuzz.token_sort_ratio,
        'token_set_ratio': fuzz.token_set_ratio,
    }

    if scorer not in mapping:
        raise ValueError(f"Scorer '{scorer}' is not supported. Choose from {list(mapping.keys())}.")
    scorer_func = mapping[scorer]

    # The query is the same for every row, so normalize it once on the driver
    # and capture it in the UDF closure instead of shipping it as a literal
    # column and re-normalizing it per row.
    if flag and input_text is not None:
        query = normalize(input_text)
    else:
        query = input_text

    def fuzzy_score(current_text):
        """Score one cell value against the pre-processed query (0 on null)."""
        if query is None or current_text is None:
            return 0
        try:
            current_proc = normalize(current_text) if flag else current_text
            return scorer_func(query, current_proc)
        except Exception:
            # Deliberate best effort: a value the scorer cannot handle (for
            # example a non-string cell) scores 0 rather than failing the job.
            return 0

    fuzzy_score_udf = F.udf(fuzzy_score, IntegerType(), useArrow=True)

    output_columns = ['order_id', 'order_customer_name', 'product_name', 'part_type_name']
    scored_columns = [name for name in df.columns if name not in ["order_id"]]
    if not scored_columns:
        # Previously this surfaced as an opaque IndexError on an empty list.
        raise ValueError("df has no columns to score besides 'order_id'.")

    result_df = None
    for name in scored_columns:
        per_column_df = df.select(
            *output_columns,
            F.lit(name).alias("matched_column"),
            fuzzy_score_udf(F.col(name)).alias("score"),
        )
        # union() is positional; every per-column frame has the same layout.
        result_df = per_column_df if result_df is None else result_df.union(per_column_df)

    # Secondary sort keys make the top_k cut reproducible when many rows share
    # the same score; without them the rows kept among ties are arbitrary.
    result_df = result_df.orderBy(
        F.desc('score'), F.asc('order_id'), F.asc('matched_column')
    ).limit(top_k)

    return result_df

In [38]:
# Query string to look for in the order columns.
input_text = "ーセプル"
# Build the top-90 result with the plain `ratio` scorer and no normalization (flag=False).
# fuzzy_match only assembles a DataFrame; nothing is computed until an action such as show().
spark_result_df = fuzzy_match(input_text, df, flag=False, top_k=90, scorer='ratio')

In [41]:
# Show only the rows whose score was computed against `part_type_name`.
# The filter is applied to the already-truncated top-90 result, so part_type_name
# matches that ranked below the cut-off do not appear here.
spark_result_df.filter(F.col("matched_column").contains("part_type_name")).show(truncate=False)

+--------+-------------------+-----------------------------------------------+--------------------------------------------------+--------------+-----+
|order_id|order_customer_name|product_name                                   |part_type_name                                    |matched_column|score|
+--------+-------------------+-----------------------------------------------+--------------------------------------------------+--------------+-----+
|2204644 |得意先_148         |T大学第一小学校　2024学校案内　ポスター＆チラシ|['ポスター&チラシ']                               |part_type_name|12   |
|2204248 |得意先_200         |B　POP                                         |['Ne-tak', 'しらおい上質86.5kg', 'クリスパー0.25']|part_type_name|5    |
|2204784 |得意先_200         |D　8週（2/21付）　POP                          |['Ne-tak', 'しらおい上質86.5kg', 'クリスパー0.25']|part_type_name|5    |
|2204785 |得意先_200         |D　9週（2/28付）　POP                          |['Ne-tak', 'しらおい上質86.5kg', 'クリスパー0.25']|part_type_name|5    |
|2204787 |得意先_200         |D　

In [46]:
# Same `part_type_name` view, with exact duplicate rows removed by distinct().
# select('*') keeps every column, so it does not change the result.
spark_result_df.select('*').where(F.col("matched_column").contains("part_type_name")).distinct().show(truncate=False)

+--------+-------------------+-----------------------------------------------+--------------------------------------------------+--------------+-----+
|order_id|order_customer_name|product_name                                   |part_type_name                                    |matched_column|score|
+--------+-------------------+-----------------------------------------------+--------------------------------------------------+--------------+-----+
|2204644 |得意先_148         |T大学第一小学校　2024学校案内　ポスター＆チラシ|['ポスター&チラシ']                               |part_type_name|12   |
|2204248 |得意先_200         |B　POP                                         |['Ne-tak', 'しらおい上質86.5kg', 'クリスパー0.25']|part_type_name|5    |
|2204784 |得意先_200         |D　8週（2/21付）　POP                          |['Ne-tak', 'しらおい上質86.5kg', 'クリスパー0.25']|part_type_name|5    |
|2204785 |得意先_200         |D　9週（2/28付）　POP                          |['Ne-tak', 'しらおい上質86.5kg', 'クリスパー0.25']|part_type_name|5    |
|2204787 |得意先_200         |D　

In [49]:
# Rows of the top-k result whose `part_type_name` value contains "クリスパ"
# (SQL LIKE with % wildcards), whichever column produced the score.
spark_result_df.where(spark_result_df.part_type_name.like("%クリスパ%")).show(truncate=False)

+--------+-------------------+---------------------+--------------------------------------------------+-------------------+-----+
|order_id|order_customer_name|product_name         |part_type_name                                    |matched_column     |score|
+--------+-------------------+---------------------+--------------------------------------------------+-------------------+-----+
|2204248 |得意先_200         |B　POP               |['Ne-tak', 'しらおい上質86.5kg', 'クリスパー0.25']|part_type_name     |5    |
|2204784 |得意先_200         |D　8週（2/21付）　POP|['Ne-tak', 'しらおい上質86.5kg', 'クリスパー0.25']|part_type_name     |5    |
|2204785 |得意先_200         |D　9週（2/28付）　POP|['Ne-tak', 'しらおい上質86.5kg', 'クリスパー0.25']|part_type_name     |5    |
|2204787 |得意先_200         |D　10週（3/7付）　POP|['Ne-tak', 'しらおい上質86.5kg', 'クリスパー0.25']|part_type_name     |5    |
|2204248 |得意先_200         |B　POP               |['Ne-tak', 'しらおい上質86.5kg', 'クリスパー0.25']|order_customer_name|0    |
+--------+-------------------+--------------------