In [18]:
!pip install thefuzz

Collecting thefuzz
  Downloading thefuzz-0.22.1-py3-none-any.whl.metadata (3.9 kB)
Collecting rapidfuzz<4.0.0,>=3.0.0 (from thefuzz)
  Downloading rapidfuzz-3.14.0-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (12 kB)
Downloading thefuzz-0.22.1-py3-none-any.whl (8.2 kB)
Downloading rapidfuzz-3.14.0-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (3.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m33.5 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: rapidfuzz, thefuzz
Successfully installed rapidfuzz-3.14.0 thefuzz-0.22.1


In [19]:
import pandas as pd
from thefuzz import fuzz, process
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


In [4]:
new_queries = pd.read_csv("/kaggle/input/query-nlp/new_queries.csv")
resolved_queries = pd.read_csv("/kaggle/input/query-nlp/resolved_queries.csv")

In [5]:
new_queries, resolved_queries

(                              Variation_Query  Matches_With_Query_ID
 0            Unabel to conect to the internet                      1
 1                   Can’t connect to internet                      1
 2                         Intenet not working                      1
 3                Payment failed while chekout                      2
 4   Payment did not go through during chckout                      2
 5                  Payment issue at check out                      2
 6    Application crashes when opening setings                      3
 7            App crash when going to settings                      3
 8            Settings cause the app to chrash                      3
 9               Forgot passwrd and cant reset                      4
 10        Forgotten password, unable to reset                      4
 11                  I can’t reset my password                      4
 12             Unable to uplod file to server                      5
 13        Can't upl

In [10]:
new_queries["Variation_Query"] = new_queries["Variation_Query"].str.lower().str.strip()
resolved_queries["Pre_Resolved_Query"] = resolved_queries["Pre_Resolved_Query"].str.lower().str.strip()


In [11]:
def get_all_fuzzy_scores(query, df, threshold=70):
    choices = df["Pre_Resolved_Query"].tolist()
    scores = {
        "ratio": process.extractOne(query, choices, scorer=fuzz.ratio),
        "partial_ratio": process.extractOne(query, choices, scorer=fuzz.partial_ratio),
        "token_sort_ratio": process.extractOne(query, choices, scorer=fuzz.token_sort_ratio),
        "token_set_ratio": process.extractOne(query, choices, scorer=fuzz.token_set_ratio),
    }
    best_method, best_result = max(scores.items(), key=lambda x: x[1][1])
    if best_result and best_result[1] >= threshold:
        # Find the Query_ID for the matched text
        qid = df.loc[df["Pre_Resolved_Query"] == best_result[0], "Query_ID"].values[0]
        return best_method, best_result[0], qid, best_result[1]
    return None, None, None, None

fuzzy_results = []
for uq in new_queries["Variation_Query"]:
    method, match, qid, score = get_all_fuzzy_scores(uq, resolved_queries)
    fuzzy_results.append((uq, method, match, qid, score))

fuzzy_df = pd.DataFrame(fuzzy_results, columns=[
    "Unresolved_Query", "Best_Method", "Fuzzy_Match", "Fuzzy_Query_ID", "Fuzzy_Score"
])

In [12]:
vectorizer = TfidfVectorizer(stop_words="english")
tfidf_matrix = vectorizer.fit_transform(
    new_queries["Variation_Query"].tolist() + resolved_queries["Pre_Resolved_Query"].tolist()
)

n_unresolved = len(new_queries)
unresolved_vecs = tfidf_matrix[:n_unresolved]
resolved_vecs = tfidf_matrix[n_unresolved:]

cosine_sim = cosine_similarity(unresolved_vecs, resolved_vecs)

tfidf_results = []
for i, uq in enumerate(new_queries["Variation_Query"]):
    best_idx = cosine_sim[i].argmax()
    best_score = cosine_sim[i][best_idx]
    matched_row = resolved_queries.iloc[best_idx]
    tfidf_results.append((uq, matched_row["Pre_Resolved_Query"], matched_row["Query_ID"], best_score))

tfidf_df = pd.DataFrame(tfidf_results, columns=[
    "Unresolved_Query", "TFIDF_Match", "TFIDF_Query_ID", "TFIDF_Score"
])

In [13]:
combined = pd.merge(fuzzy_df, tfidf_df, on="Unresolved_Query", how="inner")

In [14]:
def pick_final(row, fuzzy_thresh=75, tfidf_thresh=0.65):
    if row["Fuzzy_Score"] and row["Fuzzy_Score"] >= fuzzy_thresh:
        return row["Fuzzy_Match"], row["Fuzzy_Query_ID"], f"Fuzzy-{row['Best_Method']}"
    elif row["TFIDF_Score"] >= tfidf_thresh:
        return row["TFIDF_Match"], row["TFIDF_Query_ID"], "TFIDF"
    else:
        return None, None, "No Match"

combined[["Final_Match", "Final_Query_ID", "Method_Used"]] = combined.apply(
    pick_final, axis=1, result_type="expand"
)

In [15]:
combined[['Unresolved_Query', 'Fuzzy_Match', 'Method_Used']]

Unnamed: 0,Unresolved_Query,Fuzzy_Match,Method_Used
0,unabel to conect to the internet,unable to connect to the internet,Fuzzy-ratio
1,can’t connect to internet,unable to connect to the internet,Fuzzy-token_set_ratio
2,intenet not working,,No Match
3,payment failed while chekout,payment failed during checkout,Fuzzy-ratio
4,payment did not go through during chckout,payment failed during checkout,No Match
5,payment issue at check out,payment failed during checkout,No Match
6,application crashes when opening setings,app crashes when opening settings,Fuzzy-partial_ratio
7,app crash when going to settings,app crashes when opening settings,Fuzzy-ratio
8,settings cause the app to chrash,,No Match
9,forgot passwrd and cant reset,forgot password and unable to reset,Fuzzy-ratio


# **TASK 2**

In [8]:
import re

In [9]:
name_variations = pd.read_csv("/kaggle/input/variations/name_variations.csv")
base_names = pd.read_csv("/kaggle/input/variations/base_names.csv")

In [10]:
base_names, name_variations

(    Base_Name_ID          Base_Name
 0              1         John Smith
 1              2     Jennifer Brown
 2              3   Michael O'Connor
 3              4       Maria Garcia
 4              5         Robert Lee
 5              6      Linda Johnson
 6              7      William Davis
 7              8   Elizabeth Wilson
 8              9     David Martinez
 9             10        Susan Clark
 10            11    James Rodriguez
 11            12         Mary Lewis
 12            13         Paul Allen
 13            14        Karen Young
 14            15        Thomas King
 15            16       Nancy Wright
 16            17       Daniel Scott
 17            18        Sandra Hill
 18            19  Christopher Green
 19            20      Jessica Adams,
           Variation Matches_With_Base_Name
 0      Thomas  King            Thomas King
 1        ThomasKing            Thomas King
 2      Maria Garcia           Maria Garcia
 3         MaryLewis             Mary Lewis
 4

In [11]:
def normalize_name(name: str) -> str:
    if pd.isna(name):
        return ""
    name = name.strip()
    name = re.sub(r'(?<=[a-z])(?=[A-Z])', ' ', name)
    name = name.lower()
    name = re.sub(r'[^a-z\s]', '', name)
    name = re.sub(r'\s+', ' ', name).strip()
    return name

In [12]:
print(name_variations.columns)
print(base_names.columns)


Index(['Variation', 'Matches_With_Base_Name'], dtype='object')
Index(['Base_Name_ID', 'Base_Name'], dtype='object')


In [13]:
print(name_variations.head())


      Variation Matches_With_Base_Name
0  Thomas  King            Thomas King
1    ThomasKing            Thomas King
2  Maria Garcia           Maria Garcia
3     MaryLewis             Mary Lewis
4      Nancy W.           Nancy Wright


In [14]:

name_variations.columns = name_variations.columns.str.strip()
base_names.columns = base_names.columns.str.strip()

# Define the normalization function
def normalize_name(name):
    name = name.lower()  # convert to lowercase
    name = ''.join(e for e in name if e.isalpha() or e == ' ')  # keep only letters and spaces
    name = ' '.join(name.split())  # remove extra spaces
    return name

# Confirm the columns are correct
print("Columns in name_variations:", name_variations.columns)
print("Columns in base_names:", base_names.columns)

# Apply normalization safely
if 'Variation' in name_variations.columns:
    name_variations["Normalized"] = name_variations["Variation"].astype(str).apply(normalize_name)
else:
    print("Error: 'Variation' column not found in name_variations")

if 'Base_Name' in base_names.columns:
    base_names["Normalized"] = base_names["Base_Name"].astype(str).apply(normalize_name)
else:
    print("Error: 'Base_Name' column not found in base_names")

# Display the resulting DataFrames
print("\nNormalized name_variations:")
print(name_variations.head())

print("\nNormalized base_names:")
print(base_names.head())


Columns in name_variations: Index(['Variation', 'Matches_With_Base_Name'], dtype='object')
Columns in base_names: Index(['Base_Name_ID', 'Base_Name'], dtype='object')

Normalized name_variations:
      Variation Matches_With_Base_Name    Normalized
0  Thomas  King            Thomas King   thomas king
1    ThomasKing            Thomas King    thomasking
2  Maria Garcia           Maria Garcia  maria garcia
3     MaryLewis             Mary Lewis     marylewis
4      Nancy W.           Nancy Wright       nancy w

Normalized base_names:
   Base_Name_ID         Base_Name       Normalized
0             1        John Smith       john smith
1             2    Jennifer Brown   jennifer brown
2             3  Michael O'Connor  michael oconnor
3             4      Maria Garcia     maria garcia
4             5        Robert Lee       robert lee


In [15]:
def get_best_match(name, base_names, threshold=80):
    match = process.extractOne(
        name,
        base_names["Normalized"].tolist(),
        scorer=fuzz.token_sort_ratio
    )
    if match and match[1] >= threshold:
        # Get the original base name for reporting
        matched_row = base_names.loc[base_names["Normalized"] == match[0], "Base_Name"].values[0]
        return matched_row, match[1]
    return None, None

In [20]:
results = []
for name, norm in zip(name_variations["Variation"], name_variations["Normalized"]):
    matched_name, score = get_best_match(norm, base_names)
    results.append((name, matched_name, score))


In [21]:
matches_df = pd.DataFrame(results, columns=["Variation_Name", "Matched_Base_Name", "Score"])
matches_df.head(15)

  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


Unnamed: 0,Variation_Name,Matched_Base_Name,Score
0,Thomas King,Thomas King,100.0
1,ThomasKing,,
2,Maria Garcia,Maria Garcia,100.0
3,MaryLewis,,
4,Nancy W.,,
5,Dani3l Scott,Daniel Scott,96.0
6,JOHN smith,John Smith,100.0
7,linda johnson,Linda Johnson,100.0
8,N@ncy Wright,Nancy Wright,96.0
9,William Davis,William Davis,100.0


## Results

In [22]:
from sklearn.metrics import accuracy_score

y_true = name_variations['Matches_With_Base_Name'].fillna("No Match")
y_pred = matches_df['Matched_Base_Name'].fillna("No Match")

accuracy_score(y_true, y_pred)

0.9