In [6]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from fuzzywuzzy import fuzz
import warnings
warnings.filterwarnings("ignore")

TASK 1

In [2]:
# Load the Task 1 inputs: the reference set of already-resolved queries and the
# new query variations that are matched against it further down.
# NOTE(review): both paths are absolute Kaggle input locations — adjust when running elsewhere.
resolved_queries = pd.read_csv('/kaggle/input/queries/resolved_queries.csv')
unresolved_queries = pd.read_csv('/kaggle/input/queries/new_queries.csv')

In [3]:
# Preview the first three rows of the resolved-queries frame.
resolved_queries.head(3)

Unnamed: 0,Query_ID,Pre_Resolved_Query
0,1,Unable to connect to the internet
1,2,Payment failed during checkout
2,3,App crashes when opening settings


In [4]:
# Preview the first three rows of the unresolved-queries frame.
unresolved_queries.head(3)

Unnamed: 0,Variation_Query,Matches_With_Query_ID
0,Unabel to conect to the internet,1
1,Can’t connect to internet,1
2,Intenet not working,1


In [9]:
def preprocess(text):
    """Normalise a query string before similarity matching.

    The text is lower-cased, every character that is neither alphanumeric nor
    whitespace is dropped, and runs of whitespace are collapsed to a single
    space (leading and trailing whitespace is removed).

    Parameters
    ----------
    text : str
        Raw query text.

    Returns
    -------
    str
        The normalised text.
    """
    text = text.lower()  # Case-insensitive comparison
    kept = ''.join(char for char in text if char.isalnum() or char.isspace())  # Remove punctuation
    # Dropping punctuation can leave doubled or dangling spaces ("a - b" -> "a  b").
    # Character-level fuzzy scores are sensitive to those, so squeeze them out.
    return ' '.join(kept.split())

# Run the same normalisation over both text columns so the two sides are compared on equal footing.
unresolved_queries['Processed_Variation_Query'] = unresolved_queries['Variation_Query'].map(preprocess)
resolved_queries['Processed_Pre_Resolved_Query'] = resolved_queries['Pre_Resolved_Query'].map(preprocess)

In [11]:
# Preview the unresolved queries, now including the processed text column.
unresolved_queries.head()

Unnamed: 0,Variation_Query,Matches_With_Query_ID,Processed_Variation_Query
0,Unabel to conect to the internet,1,unabel to conect to the internet
1,Can’t connect to internet,1,cant connect to internet
2,Intenet not working,1,intenet not working
3,Payment failed while chekout,2,payment failed while chekout
4,Payment did not go through during chckout,2,payment did not go through during chckout


In [12]:
# Preview the resolved queries, now including the processed text column.
resolved_queries.head()

Unnamed: 0,Query_ID,Pre_Resolved_Query,Processed_Pre_Resolved_Query
0,1,Unable to connect to the internet,unable to connect to the internet
1,2,Payment failed during checkout,payment failed during checkout
2,3,App crashes when opening settings,app crashes when opening settings
3,4,Forgot password and unable to reset,forgot password and unable to reset
4,5,Unable to upload files to the server,unable to upload files to the server


In [13]:
# Character-level fuzzy matching (fuzzywuzzy)
def fuzzy_match(unresolved, resolved_df):
    """Find the resolved query whose processed text is closest to ``unresolved``.

    Every row of ``resolved_df['Processed_Pre_Resolved_Query']`` is scored
    against ``unresolved`` with ``fuzz.ratio``; the first row holding the
    highest score is selected.

    Returns a ``(Query_ID, score)`` tuple for that row.
    """
    candidates = resolved_df['Processed_Pre_Resolved_Query']
    ratios = [fuzz.ratio(unresolved, candidate) for candidate in candidates]
    top_position = np.argmax(ratios)  # first occurrence wins on ties
    winning_row = resolved_df.iloc[top_position]
    return winning_row['Query_ID'], ratios[top_position]

# Apply fuzzy matching to each unresolved query.
# fuzzy_match returns one (Query_ID, score) tuple per row; zip(*...) transposes those
# per-row tuples into two sequences that become the two new columns.
unresolved_queries['Fuzzy_Match_ID'], unresolved_queries['Fuzzy_Score'] = zip(*unresolved_queries['Processed_Variation_Query'].apply(
    lambda x: fuzzy_match(x, resolved_queries)))

# Display the results: the fuzzy prediction and its score next to 'Matches_With_Query_ID'
# (presumably the expected Query_ID for each variation — confirm against the data source).
unresolved_queries[['Variation_Query', 'Matches_With_Query_ID', 'Fuzzy_Match_ID', 'Fuzzy_Score']]


Unnamed: 0,Variation_Query,Matches_With_Query_ID,Fuzzy_Match_ID,Fuzzy_Score
0,Unabel to conect to the internet,1,1,95
1,Can’t connect to internet,1,1,77
2,Intenet not working,1,2,33
3,Payment failed while chekout,2,2,83
4,Payment did not go through during chckout,2,2,68
5,Payment issue at check out,2,2,68
6,Application crashes when opening setings,3,3,88
7,App crash when going to settings,3,3,86
8,Settings cause the app to chrash,3,1,37
9,Forgot passwrd and cant reset,4,4,84


In [14]:
# Cosine-similarity matching using BoW and TF-IDF representations
def vector_similarity(unresolved_queries, resolved_queries):
    """Match each unresolved query to its most similar resolved query.

    Two vector representations are fitted on the combined corpus (resolved
    queries followed by unresolved queries): raw term counts (BoW) and TF-IDF
    weights. Within each representation, every unresolved query is compared
    with every resolved query by cosine similarity and the highest-scoring
    resolved query is selected (the first one on ties).

    Parameters
    ----------
    unresolved_queries : pandas.DataFrame
        Must contain a 'Processed_Variation_Query' text column.
    resolved_queries : pandas.DataFrame
        Must contain a 'Processed_Pre_Resolved_Query' text column. When a
        'Query_ID' column is present, its values are reported as the match
        IDs; otherwise the 1-based row position is reported.

    Returns
    -------
    tuple of (list, list)
        ``(results_bow, results_tfidf)``: the matched ID for each unresolved
        query, in row order, under the BoW and the TF-IDF representation.

    Raises
    ------
    ValueError
        If there are unresolved queries but no resolved queries to match.
    """
    resolved_texts = list(resolved_queries['Processed_Pre_Resolved_Query'])
    unresolved_texts = list(unresolved_queries['Processed_Variation_Query'])
    n_resolved = len(resolved_texts)

    # Nothing to match: return empty results rather than asking sklearn to
    # compare against a zero-row matrix.
    if not unresolved_texts:
        return [], []
    if n_resolved == 0:
        raise ValueError("resolved_queries is empty; there is nothing to match against")

    # Report the real Query_ID of the winning row instead of assuming that
    # IDs are exactly "row position + 1".
    if 'Query_ID' in resolved_queries.columns:
        match_ids = resolved_queries['Query_ID'].to_numpy()
    else:
        match_ids = np.arange(1, n_resolved + 1)

    # Both vectorizers are fitted on the combined corpus so resolved and
    # unresolved queries share one vocabulary per representation.
    corpus = resolved_texts + unresolved_texts

    def _best_match_ids(vectorizer):
        """Vectorise the corpus and return the best resolved ID per unresolved row."""
        matrix = vectorizer.fit_transform(corpus)
        resolved_vecs = matrix[:n_resolved]
        unresolved_vecs = matrix[n_resolved:]
        # Compare like with like: both operands come from the same vectorizer.
        scores = cosine_similarity(unresolved_vecs, resolved_vecs)
        return match_ids[np.argmax(scores, axis=1)].tolist()

    results_bow = _best_match_ids(CountVectorizer())
    results_tfidf = _best_match_ids(TfidfVectorizer())

    return results_bow, results_tfidf

# Apply vector similarity matching.
# vector_similarity returns two lists (BoW matches, TF-IDF matches) with one entry per
# unresolved query, which are unpacked straight into the two new columns.
unresolved_queries['BoW_Match_ID'], unresolved_queries['TfIDF_Match_ID'] = vector_similarity(unresolved_queries, resolved_queries)

# Display results: the three predicted IDs side by side with 'Matches_With_Query_ID'.
unresolved_queries[['Variation_Query', 'Matches_With_Query_ID', 'Fuzzy_Match_ID', 'BoW_Match_ID', 'TfIDF_Match_ID']]


Unnamed: 0,Variation_Query,Matches_With_Query_ID,Fuzzy_Match_ID,BoW_Match_ID,TfIDF_Match_ID
0,Unabel to conect to the internet,1,1,1,1
1,Can’t connect to internet,1,1,1,1
2,Intenet not working,1,2,1,1
3,Payment failed while chekout,2,2,2,2
4,Payment did not go through during chckout,2,2,2,2
5,Payment issue at check out,2,2,2,2
6,Application crashes when opening setings,3,3,3,3
7,App crash when going to settings,3,3,3,3
8,Settings cause the app to chrash,3,1,1,1
9,Forgot passwrd and cant reset,4,4,4,4


TASK 2

In [15]:
# Load the Task 2 inputs: the name variations to be matched and the base names
# they are matched against further down.
# NOTE(review): both paths are absolute Kaggle input locations — adjust when running elsewhere.
name_variations = pd.read_csv('/kaggle/input/names-data/name_variations.csv')
base_names = pd.read_csv('/kaggle/input/names-data/base_names.csv')

In [18]:
def preprocess(name):
    """Normalise a name for matching.

    Lower-cases the input, keeps only alphanumeric and whitespace characters,
    and squeezes any run of whitespace down to a single space (trimming both
    ends).
    """
    kept_chars = [ch for ch in name.lower() if ch.isalnum() or ch.isspace()]
    words = ''.join(kept_chars).split()
    return ' '.join(words)

# Normalise the variation and base-name columns with the same routine.
name_variations['Processed_Variation'] = name_variations['Variation'].map(preprocess)
base_names['Processed_Base_Name'] = base_names['Base_Name'].map(preprocess)

# Preview the variations together with their processed form.
name_variations.head()

Unnamed: 0,Variation,Matches_With_Base_Name,Processed_Variation
0,Thomas King,Thomas King,thomas king
1,ThomasKing,Thomas King,thomasking
2,Maria Garcia,Maria Garcia,maria garcia
3,MaryLewis,Mary Lewis,marylewis
4,Nancy W.,Nancy Wright,nancy w


In [17]:
# Preview the first rows of the base-names frame.
base_names.head()

Unnamed: 0,Base_Name_ID,Base_Name,Processed_Base_Name
0,1,John Smith,john smith
1,2,Jennifer Brown,jennifer brown
2,3,Michael O'Connor,michael oconnor
3,4,Maria Garcia,maria garcia
4,5,Robert Lee,robert lee


In [19]:
def find_best_fuzzy_match(variation, base_names_df):
    """Pick the base name that is most similar to ``variation``.

    ``variation`` is scored with ``fuzz.ratio`` against each entry of
    ``base_names_df['Processed_Base_Name']``. The first row with the highest
    score wins.

    Returns a ``(Base_Name, score)`` tuple, where ``Base_Name`` is taken from
    the winning row's 'Base_Name' column.
    """
    similarity_by_row = []
    for candidate in base_names_df['Processed_Base_Name']:
        similarity_by_row.append(fuzz.ratio(variation, candidate))

    winner = np.argmax(similarity_by_row)  # first occurrence wins on ties
    winning_row = base_names_df.iloc[winner]
    return winning_row['Base_Name'], similarity_by_row[winner]


# Match every processed variation against the base names.
# find_best_fuzzy_match returns one (Base_Name, score) tuple per row; zip(*...) transposes
# those tuples into the two sequences assigned to 'Best_Match' and 'Match_Score'.
name_variations['Best_Match'], name_variations['Match_Score'] = zip(*name_variations['Processed_Variation'].apply(
    lambda x: find_best_fuzzy_match(x, base_names)))

In [20]:
# Show each variation with its 'Matches_With_Base_Name' value, the fuzzy best match and its score.
name_variations[['Variation', 'Matches_With_Base_Name', 'Best_Match', 'Match_Score']]

Unnamed: 0,Variation,Matches_With_Base_Name,Best_Match,Match_Score
0,Thomas King,Thomas King,Thomas King,100
1,ThomasKing,Thomas King,Thomas King,95
2,Maria Garcia,Maria Garcia,Maria Garcia,100
3,MaryLewis,Mary Lewis,Mary Lewis,95
4,Nancy W.,Nancy Wright,Nancy Wright,74
...,...,...,...,...
95,Jennifer- Brown,Jennifer Brown,Jennifer Brown,100
96,Daniel- Scott,Daniel Scott,Daniel Scott,100
97,David M.,David Martinez,David Martinez,67
98,Paul Allen.,Paul Allen,Paul Allen,100


In [21]:
# NOTE(review): this statement repeats the matching already performed in an earlier cell
# (same input column, same target columns), so 'Best_Match' / 'Match_Score' are simply
# overwritten with recomputed values — presumably identical; confirm and consider
# dropping the recomputation.
name_variations['Best_Match'], name_variations['Match_Score'] = zip(*name_variations['Processed_Variation'].apply(
    lambda x: find_best_fuzzy_match(x, base_names)))

# Display the results with the original intended match for comparison
name_variations[['Variation', 'Matches_With_Base_Name', 'Best_Match', 'Match_Score']]

Unnamed: 0,Variation,Matches_With_Base_Name,Best_Match,Match_Score
0,Thomas King,Thomas King,Thomas King,100
1,ThomasKing,Thomas King,Thomas King,95
2,Maria Garcia,Maria Garcia,Maria Garcia,100
3,MaryLewis,Mary Lewis,Mary Lewis,95
4,Nancy W.,Nancy Wright,Nancy Wright,74
...,...,...,...,...
95,Jennifer- Brown,Jennifer Brown,Jennifer Brown,100
96,Daniel- Scott,Daniel Scott,Daniel Scott,100
97,David M.,David Martinez,David Martinez,67
98,Paul Allen.,Paul Allen,Paul Allen,100
