In [1]:
import pandas as pd
import numpy as np

# Load datasets
resolved_df = pd.read_csv('resolved_queries.csv')
unresolved_df = pd.read_csv('new_queries.csv')

In [46]:
from thefuzz import fuzz

# Fuzzy Matching - Partial Token Sort Ratio
unresolved_df['Max_Fuzzy_Score'] = 0

for q_idx in range(len(unresolved_df)):
    for r_idx in range(len(resolved_df)):
        fuzzy_score = fuzz.partial_token_sort_ratio(unresolved_df['Variation_Query'][q_idx], resolved_df['Pre_Resolved_Query'][r_idx])
        if fuzzy_score > 60 and fuzzy_score > unresolved_df.loc[q_idx, 'Max_Fuzzy_Score']:
            unresolved_df.loc[q_idx, 'Max_Fuzzy_Score'] = fuzzy_score
            unresolved_df.loc[q_idx, 'Matched_Query_ID_ptsr'] = r_idx + 1

unresolved_df.drop(columns=['Max_Fuzzy_Score'], inplace=True)

In [47]:
# Fuzzy Matching - Partial Ratio
unresolved_df['Max_Fuzzy_Score'] = 0

for q_idx in range(len(unresolved_df)):
    for r_idx in range(len(resolved_df)):
        fuzzy_score = fuzz.partial_ratio(unresolved_df['Variation_Query'][q_idx], resolved_df['Pre_Resolved_Query'][r_idx])
        if fuzzy_score > 60 and fuzzy_score > unresolved_df.loc[q_idx, 'Max_Fuzzy_Score']:
            unresolved_df.loc[q_idx, 'Max_Fuzzy_Score'] = fuzzy_score
            unresolved_df.loc[q_idx, 'Matched_Query_ID_pr'] = r_idx + 1

unresolved_df.drop(columns=['Max_Fuzzy_Score'], inplace=True)

In [48]:
# Fuzzy Matching - Token Set Ratio
unresolved_df['Max_Fuzzy_Score'] = 0

for q_idx in range(len(unresolved_df)):
    for r_idx in range(len(resolved_df)):
        fuzzy_score = fuzz.token_set_ratio(unresolved_df['Variation_Query'][q_idx], resolved_df['Pre_Resolved_Query'][r_idx])
        if fuzzy_score > 60 and fuzzy_score > unresolved_df.loc[q_idx, 'Max_Fuzzy_Score']:
            unresolved_df.loc[q_idx, 'Max_Fuzzy_Score'] = fuzzy_score
            unresolved_df.loc[q_idx, 'Matched_Query_ID_tsr'] = r_idx + 1

unresolved_df.drop(columns=['Max_Fuzzy_Score'], inplace=True)

In [49]:
# Fuzzy Matching - Token Sort Ratio
unresolved_df['Max_Fuzzy_Score'] = 0

for q_idx in range(len(unresolved_df)):
    for r_idx in range(len(resolved_df)):
        fuzzy_score = fuzz.token_sort_ratio(unresolved_df['Variation_Query'][q_idx], resolved_df['Pre_Resolved_Query'][r_idx])
        if fuzzy_score > 60 and fuzzy_score > unresolved_df.loc[q_idx, 'Max_Fuzzy_Score']:
            unresolved_df.loc[q_idx, 'Max_Fuzzy_Score'] = fuzzy_score
            unresolved_df.loc[q_idx, 'Matched_Query_ID_tsr2'] = r_idx + 1

unresolved_df.drop(columns=['Max_Fuzzy_Score'], inplace=True)

In [50]:
# TF-IDF Cosine Similarity
unresolved_df['Max_Similarity'] = 0

In [51]:
combined_queries = list(unresolved_df['Variation_Query']) + list(resolved_df['Pre_Resolved_Query'])

In [52]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(combined_queries)

In [53]:
# Split TF-IDF matrix
tfidf_queries = tfidf_matrix[:len(unresolved_df)]
tfidf_resolved = tfidf_matrix[len(unresolved_df):]

In [39]:
# Split the matrix for resolved and unresolved queries
resolved_tfidf = tfidf_matrix[:len(resolved_df)]
unresolved_tfidf = tfidf_matrix[len(resolved_df):]

In [54]:
from sklearn.metrics.pairwise import cosine_similarity

# Calculate cosine similarities and update results
for q_idx in range(len(unresolved_df)):
    for r_idx in range(len(resolved_df)):
        similarity_score = cosine_similarity(tfidf_queries[q_idx], tfidf_resolved[r_idx])[0][0]
        if similarity_score > unresolved_df.loc[q_idx, 'Max_Similarity']:
            unresolved_df.loc[q_idx, 'Max_Similarity'] = similarity_score
            unresolved_df.loc[q_idx, 'Matched_Query_ID_tfidf'] = r_idx + 1


  unresolved_df.loc[q_idx, 'Max_Similarity'] = similarity_score


In [55]:
# Display the dataframe with matched IDs
print(unresolved_df[['Variation_Query', 'Matched_Query_ID_tfidf', 'Max_Similarity']])

                              Variation_Query  Matched_Query_ID_tfidf  \
0            Unabel to conect to the internet                     1.0   
1                   Can’t connect to internet                     1.0   
2                         Intenet not working                     NaN   
3                Payment failed while chekout                     2.0   
4   Payment did not go through during chckout                     2.0   
5                  Payment issue at check out                     2.0   
6    Application crashes when opening setings                     3.0   
7            App crash when going to settings                     3.0   
8            Settings cause the app to chrash                     3.0   
9               Forgot passwrd and cant reset                     4.0   
10        Forgotten password, unable to reset                     4.0   
11                  I can’t reset my password                     4.0   
12             Unable to uplod file to server      

In [56]:
# Accuracy Calculation for each method
for col in unresolved_df.columns[2:]:
    correct_matches = 0
    for idx in range(len(unresolved_df)):
        if unresolved_df[col][idx] == unresolved_df['Matches_With_Query_ID'][idx]:
            correct_matches += 1
    match_accuracy = (correct_matches / len(unresolved_df)) * 100
    print(f'Accuracy for {col}: {match_accuracy:.2f}%')

Accuracy for Cleaned_Query: 0.00%
Accuracy for Best_Fuzzy_Match: 100.00%
Accuracy for Fuzzy_Score: 0.00%
Accuracy for Best_TFIDF_Match: 100.00%
Accuracy for TFIDF_Similarity_Score: 0.00%
Accuracy for Matched_Query_ID_ptsr: 85.00%
Accuracy for Matched_Query_ID_pr: 60.00%
Accuracy for Matched_Query_ID_tsr: 85.00%
Accuracy for Matched_Query_ID_tsr2: 80.00%
Accuracy for Max_Similarity: 0.00%
Accuracy for Matched_Query_ID_tfidf: 95.00%
