In [1]:
import numpy as np
import pandas as pd

In [2]:
# Read both input CSVs: the already-resolved queries and the new, still-unresolved ones
resolved_queries, unresolved_queries = (
    pd.read_csv(csv_name) for csv_name in ('resolved_queries.csv', 'new_queries.csv')
)

In [3]:
# Preprocessing function
def preprocess_text(text):
    """Normalize a query string so that near-identical queries compare equal.

    The text is lower-cased, every character that is neither alphanumeric
    nor whitespace is dropped, and runs of whitespace are collapsed to a
    single space (leading/trailing whitespace is removed).

    Parameters
    ----------
    text : str or None or float
        The raw query. Missing values (``None`` or float NaN, which is what
        pandas yields for empty CSV cells) are treated as an empty query.
        Any other non-string value is converted with ``str()`` first.

    Returns
    -------
    str
        The normalized text; ``''`` for missing or empty input.
    """
    # NaN is the only float that is not equal to itself
    if text is None or (isinstance(text, float) and text != text):
        return ''

    lowered = str(text).lower()
    # Dropping punctuation (rather than replacing it with a space) means
    # "can't" becomes "cant", not "can t"
    kept = ''.join(ch for ch in lowered if ch.isalnum() or ch.isspace())
    # split() with no argument splits on any whitespace run and discards empties
    return ' '.join(kept.split())

In [4]:
# Add a normalized-text column to each frame; the matching cells compare these columns
resolved_queries['Processed_Pre_Resolved_Query'] = resolved_queries['Pre_Resolved_Query'].map(preprocess_text)
unresolved_queries['Processed_Variation_Query'] = unresolved_queries['Variation_Query'].map(preprocess_text)

In [5]:
# Fuzzy Matching Function
from fuzzywuzzy import fuzz
def fuzzy_query_match(unresolved_query, resolved_df):
    """Find the resolved query whose text is closest to ``unresolved_query``.

    Every entry of ``resolved_df['Processed_Pre_Resolved_Query']`` is scored
    against ``unresolved_query`` with ``fuzz.ratio``; the row with the highest
    score wins (the first such row when several tie).

    Returns
    -------
    tuple
        ``(Query_ID of the winning row, its fuzz.ratio score)``.
    """
    similarity_by_row = []
    for candidate_text in resolved_df['Processed_Pre_Resolved_Query']:
        similarity_by_row.append(fuzz.ratio(unresolved_query, candidate_text))

    # np.argmax gives a row *position*, hence the positional .iloc lookup
    top_position = np.argmax(similarity_by_row)
    top_row = resolved_df.iloc[top_position]
    return top_row['Query_ID'], max(similarity_by_row)



In [6]:
# Run the fuzzy matcher on every processed query and unzip the
# (match id, score) pairs into two new columns
unresolved_queries['Fuzzy_Match_ID'], unresolved_queries['Fuzzy_Score'] = zip(
    *(
        fuzzy_query_match(processed_query, resolved_queries)
        for processed_query in unresolved_queries['Processed_Variation_Query']
    )
)

In [7]:
# Function to find each unresolved query's best-matching resolved query by cosine similarity, using BoW and TF-IDF vectors
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def _best_match_positions(query_vectors, candidate_vectors):
    """For each row of ``query_vectors``, return the row position in
    ``candidate_vectors`` with the highest cosine similarity."""
    # One call yields the full (n_queries, n_candidates) similarity matrix.
    # np.argmax returns the first maximum, so ties (including rows whose
    # similarities are all zero) resolve to the earliest candidate.
    similarity = cosine_similarity(query_vectors, candidate_vectors)
    return np.argmax(similarity, axis=1)


def calculate_vector_similarity(unresolved_df, resolved_df):
    """Match every unresolved query to its most similar resolved query.

    Two text representations are used independently: bag-of-words counts
    (``CountVectorizer``) and TF-IDF weights (``TfidfVectorizer``). For each
    representation, the vectorizer is fitted on the resolved and unresolved
    texts together so both sides share one vocabulary, and each unresolved
    query is assigned the resolved query with the highest cosine similarity
    *within that representation*.

    Parameters
    ----------
    unresolved_df : pandas.DataFrame
        Must contain a ``'Processed_Variation_Query'`` text column.
    resolved_df : pandas.DataFrame
        Must contain a ``'Processed_Pre_Resolved_Query'`` text column. If it
        also has a ``'Query_ID'`` column, the returned IDs are read from it;
        otherwise IDs fall back to the 1-based row position.

    Returns
    -------
    tuple of (list, list)
        ``(bow_matches, tfidf_matches)``: one matched query ID per row of
        ``unresolved_df``, in row order. Both lists are empty when
        ``unresolved_df`` has no rows.

    Raises
    ------
    ValueError
        If there are unresolved queries but ``resolved_df`` is empty, so
        there is nothing to match against.
    """
    n_resolved = len(resolved_df)
    if len(unresolved_df) == 0:
        return [], []
    if n_resolved == 0:
        raise ValueError("resolved_df is empty: there are no resolved queries to match against")

    # Resolved texts come first so the fitted matrix can be split by position
    corpus = list(resolved_df['Processed_Pre_Resolved_Query']) + list(unresolved_df['Processed_Variation_Query'])

    # Translate row positions into real IDs instead of assuming ID == position + 1
    if 'Query_ID' in resolved_df.columns:
        query_ids = resolved_df['Query_ID'].to_numpy()
    else:
        query_ids = np.arange(1, n_resolved + 1)

    def match_ids(vectorizer):
        # Queries and candidates are sliced from the SAME matrix, so a query
        # vector is never compared against a different representation
        matrix = vectorizer.fit_transform(corpus)
        positions = _best_match_positions(matrix[n_resolved:], matrix[:n_resolved])
        return list(query_ids[positions])

    bow_matches = match_ids(CountVectorizer())
    tfidf_matches = match_ids(TfidfVectorizer())
    return bow_matches, tfidf_matches

In [8]:
# Store the best match found by each vector representation in its own column
(
    unresolved_queries['BoW_Match_ID'],
    unresolved_queries['TfIDF_Match_ID'],
) = calculate_vector_similarity(unresolved_df=unresolved_queries, resolved_df=resolved_queries)

In [9]:
# Show each raw query with its Matches_With_Query_ID value next to the ID
# picked by the fuzzy, BoW and TF-IDF matchers
unresolved_queries.loc[
    :,
    ['Variation_Query', 'Matches_With_Query_ID', 'Fuzzy_Match_ID', 'BoW_Match_ID', 'TfIDF_Match_ID'],
]

Unnamed: 0,Variation_Query,Matches_With_Query_ID,Fuzzy_Match_ID,BoW_Match_ID,TfIDF_Match_ID
0,Unabel to conect to the internet,1,1,1,1
1,Can’t connect to internet,1,1,1,1
2,Intenet not working,1,2,1,1
3,Payment failed while chekout,2,2,2,2
4,Payment did not go through during chckout,2,2,2,2
5,Payment issue at check out,2,2,2,2
6,Application crashes when opening setings,3,3,3,3
7,App crash when going to settings,3,3,3,3
8,Settings cause the app to chrash,3,1,1,1
9,Forgot passwrd and cant reset,4,4,4,4
