In [1]:
from google.colab import files

In [2]:
# Upload a file via google.colab's files.upload(); the return value is kept in
# d1, which is not referenced again in the visible cells. The recorded cell
# output shows the file being saved as new_queries.csv.
d1 = files.upload()

Saving new_queries.csv to new_queries.csv


In [3]:
# Upload a file via google.colab's files.upload(); the return value is kept in
# d2, which is not referenced again in the visible cells. The recorded cell
# output shows the file being saved as resolved_queries.csv.
d2 = files.upload()

Saving resolved_queries.csv to resolved_queries.csv


In [4]:
import pandas as pd
import numpy as np

In [6]:
# Load the two CSV files from the working directory into DataFrames.
# NOTE(review): later cells read 'Variation_Query' from new_queries and
# 'Pre_Resolved_Query' / 'Query_ID' from resolved_queries, so both files must
# provide those columns.
new_queries = pd.read_csv("new_queries.csv")
resolved_queries = pd.read_csv("resolved_queries.csv")

In [7]:
import re

def preprocess(text):
    """Normalise a query string before matching.

    The text is lower-cased and every character that is neither a word
    character nor whitespace (i.e. punctuation and symbols) is removed.

    Parameters
    ----------
    text : str, None or float
        Raw query text. ``None`` and NaN (what pandas produces for empty CSV
        cells) are treated as an empty query; any other non-string value is
        converted with ``str`` first.

    Returns
    -------
    str
        The normalised text; ``''`` for missing input.
    """
    # Missing value: None, or a float NaN (NaN is the only float != itself).
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)
    return re.sub(r'[^\w\s]', '', text.lower())


# Add a normalised copy of each query column.
# Each entry is (frame, raw text column, cleaned text column).
for frame, raw_col, clean_col in (
    (resolved_queries, 'Pre_Resolved_Query', 'Processed_Query'),
    (new_queries, 'Variation_Query', 'Processed_Variation_Query'),
):
    frame[clean_col] = frame[raw_col].apply(preprocess)

# Preview raw vs. processed text for the first rows of both frames.
(
    resolved_queries[['Pre_Resolved_Query', 'Processed_Query']].head(),
    new_queries[['Variation_Query', 'Processed_Variation_Query']].head(),
)


(                     Pre_Resolved_Query                       Processed_Query
 0     Unable to connect to the internet     unable to connect to the internet
 1        Payment failed during checkout        payment failed during checkout
 2     App crashes when opening settings     app crashes when opening settings
 3   Forgot password and unable to reset   forgot password and unable to reset
 4  Unable to upload files to the server  unable to upload files to the server,
                              Variation_Query  \
 0           Unabel to conect to the internet   
 1                  Can’t connect to internet   
 2                        Intenet not working   
 3               Payment failed while chekout   
 4  Payment did not go through during chckout   
 
                    Processed_Variation_Query  
 0           unabel to conect to the internet  
 1                   cant connect to internet  
 2                        intenet not working  
 3               payment failed while

In [10]:
from fuzzywuzzy import fuzz, process

def fuzzy_match(variation_query, resolved_queries, method='ratio'):
    """Find the resolved query that is most similar to ``variation_query``.

    Parameters
    ----------
    variation_query : str
        Pre-processed text of the incoming query.
    resolved_queries : pandas.DataFrame
        Candidate rows. Must provide the columns 'Processed_Query' (text that
        is compared against) and 'Query_ID' (identifier that is returned).
    method : str, default 'ratio'
        Name of the ``fuzz`` scorer to use: 'ratio', 'partial_ratio',
        'token_sort_ratio' or 'token_set_ratio'.

    Returns
    -------
    tuple
        ``(query_id, score)`` of the best-scoring row. When several rows share
        the top score, the first of them is returned.

    Raises
    ------
    ValueError
        If ``method`` is not one of the supported scorers, or if
        ``resolved_queries`` has no rows.
    """
    scorers = {
        'ratio': fuzz.ratio,
        'partial_ratio': fuzz.partial_ratio,
        'token_sort_ratio': fuzz.token_sort_ratio,
        'token_set_ratio': fuzz.token_set_ratio,
    }
    if method not in scorers:
        raise ValueError(
            f"Unknown method {method!r}; expected one of {sorted(scorers)}"
        )
    scorer = scorers[method]

    scores = resolved_queries['Processed_Query'].apply(
        lambda candidate: scorer(variation_query, candidate)
    )
    if scores.empty:
        raise ValueError("resolved_queries has no rows to match against")

    # Work with the row *position* rather than the index label: the frame's
    # index is not guaranteed to be 0..n-1 (e.g. after filtering), so a label
    # must not be fed to the positional .iloc accessor.
    best_pos = int(scores.to_numpy().argmax())
    best_match_score = scores.iloc[best_pos]
    best_match_query_id = resolved_queries['Query_ID'].iloc[best_pos]

    return best_match_query_id, best_match_score



In [11]:
methods = ['ratio', 'partial_ratio', 'token_sort_ratio', 'token_set_ratio']
results = {}

# Column dict for the comparison table; insertion order fixes the column order:
# the original query first, then (matched id, score) per scorer.
columns = {'Variation_Query': new_queries['Variation_Query']}

for method in methods:
    # Series of (query_id, score) tuples, one per new query.
    matches = new_queries['Processed_Variation_Query'].apply(
        lambda query: fuzzy_match(query, resolved_queries, method)
    )
    results[method] = matches

    # 'token_sort_ratio' -> 'Token_Sort_Ratio', used as the column suffix.
    suffix = method.title()
    columns[f'Matched_Query_ID_{suffix}'] = [query_id for query_id, _ in matches]
    columns[f'Match_Score_{suffix}'] = [score for _, score in matches]

fuzzy_results = pd.DataFrame(columns)

fuzzy_results.head()

Unnamed: 0,Variation_Query,Matched_Query_ID_Ratio,Match_Score_Ratio,Matched_Query_ID_Partial_Ratio,Match_Score_Partial_Ratio,Matched_Query_ID_Token_Sort_Ratio,Match_Score_Token_Sort_Ratio,Matched_Query_ID_Token_Set_Ratio,Match_Score_Token_Set_Ratio
0,Unabel to conect to the internet,1,95,1,94,1,95,1,95
1,Can’t connect to internet,1,77,1,83,1,67,1,88
2,Intenet not working,2,33,1,52,1,35,1,37
3,Payment failed while chekout,2,83,2,79,2,76,2,83
4,Payment did not go through during chckout,2,68,2,61,2,65,2,68


In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# TF-IDF vectorizer created with the library's default settings; it is fitted
# on the resolved queries in a later cell.
vectorizer = TfidfVectorizer()

In [16]:
# Fit the TF-IDF vocabulary on the resolved queries only, then project the new
# queries into that same vector space.
tfidf_resolved = vectorizer.fit_transform(resolved_queries['Processed_Query'])
tfidf_new = vectorizer.transform(new_queries['Processed_Variation_Query'])

# Similarity matrix: one row per new query, one column per resolved query.
cosine_similarities = cosine_similarity(tfidf_new, tfidf_resolved)

best_match_indices = cosine_similarities.argmax(axis=1)
best_match_scores = cosine_similarities.max(axis=1)

# argmax yields row positions, hence the positional .iloc lookup.
matched_query_ids = resolved_queries.iloc[best_match_indices]['Query_ID'].values

# NOTE(review): when a new query has zero similarity to every resolved query,
# argmax falls back to position 0, so a row with Match_Score 0.0 is "no match"
# rather than a genuine match to the first resolved query.
tfidf_results = pd.DataFrame({
    'Variation Query': new_queries['Variation_Query'],
    'Matched Query': matched_query_ids,
    # No trailing space in the key, so tfidf_results['Match_Score'] resolves.
    'Match_Score': best_match_scores
})

tfidf_results


Unnamed: 0,Variation Query,Matched Query,Match_Score
0,Unabel to conect to the internet,1,0.839042
1,Can’t connect to internet,1,0.836936
2,Intenet not working,1,0.0
3,Payment failed while chekout,2,0.707107
4,Payment did not go through during chckout,2,0.707107
5,Payment issue at check out,2,0.5
6,Application crashes when opening setings,3,0.774597
7,App crash when going to settings,3,0.722471
8,Settings cause the app to chrash,3,0.508047
9,Forgot passwrd and cant reset,4,0.782698
