## Section 1: Setup and Installations
### Install the necessary libraries if they are not already installed
Run the command below in a code cell if needed (`openpyxl` is required by pandas to read and write `.xlsx` files):

    %pip install pandas scikit-learn transformers torch numpy openpyxl


## Section 2: Import Libraries

In [None]:
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import normalize
from transformers import BertModel, BertTokenizer
import torch
import numpy as np

## Section 3: Load the Dataset

In [None]:
def load_data(filepath):
    """Read an Excel workbook into a DataFrame.

    Args:
        filepath: Location of the Excel file; forwarded unchanged to
            ``pd.read_excel``.

    Returns:
        The object produced by ``pd.read_excel(filepath)`` with all other
        arguments left at their defaults.
    """
    frame = pd.read_excel(filepath)
    return frame

# Load the search-query workbook; the relative path is resolved against the
# notebook's working directory. Later cells read the 'Query' column of `data`.
data = load_data('AOL_Search_Query_dataset.xlsx')


## Section 4: Text Vectorization Using BERT Model

In [None]:
import pandas as pd
from transformers import BertTokenizer, BertModel
import torch



# Tokenizer and encoder are loaded from the same pretrained checkpoint so the
# vocabulary used for tokenization matches the model's embedding table.
BERT_CHECKPOINT = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(BERT_CHECKPOINT)
model = BertModel.from_pretrained(BERT_CHECKPOINT)

def bert_encode(texts):
    """Embed a batch of texts with the module-level BERT tokenizer and model.

    Args:
        texts: The text(s) to embed, passed straight to ``tokenizer``. Inputs
            are padded to a common length and truncated to 128 tokens.

    Returns:
        A NumPy array holding, for every input text, the last-layer hidden
        state at token position 0 (the first token of each sequence).
    """
    tokens = tokenizer(
        texts,
        return_tensors='pt',
        padding=True,
        truncation=True,
        max_length=128,
    )
    # Inference only: skip gradient tracking so the result can be converted
    # to NumPy directly.
    with torch.no_grad():
        hidden_states = model(**tokens).last_hidden_state
    first_token_vectors = hidden_states[:, 0, :]
    return first_token_vectors.numpy()

def clean_query(text):
    """Normalize a raw query cell into a stripped string.

    Args:
        text: A single cell value from the query column. Excel cells can be
            parsed as strings, numbers, booleans, dates or other objects.

    Returns:
        ``" "`` (a single space) when the value is missing, otherwise the
        string form of the value with leading and trailing whitespace removed.
    """
    if pd.isna(text):
        return " "

    # Convert every non-string value, not just int/float/bool: dates,
    # timestamps, Decimals, etc. have no .strip() and would otherwise raise
    # AttributeError.
    if not isinstance(text, str):
        text = str(text)

    return text.strip()


# Drop rows without a query. The explicit .copy() makes `data` an independent
# frame, so the column assignments below cannot trigger chained-assignment
# warnings about writing into a view.
data = data.dropna(subset=['Query']).copy()
data['Query'] = data['Query'].apply(clean_query)

# Vectorize the queries.
# The query log repeats the same strings many times, so each distinct query is
# encoded once and the resulting vector is shared by every row that holds it.
# Each query is still encoded on its own, exactly as before, so the vectors
# are unchanged; only the redundant forward passes are removed.
unique_vectors = {
    query: bert_encode([query])[0] for query in data['Query'].unique()
}
data['vectors'] = data['Query'].apply(lambda query: unique_vectors[query])



## Section 5: Clustering with Normalized BERT Embeddings

In [None]:
# Normalize the vectors
# Stack the per-row embedding arrays into one 2-D matrix (one row per query),
# then rescale the rows with scikit-learn's `normalize` (unit L2 norm by default).
vectors = np.stack(data['vectors'].values)
normalized_vectors = normalize(vectors)

# Clustering
# n_init is set explicitly because scikit-learn's default changed from 10 to
# 'auto' in version 1.4; pinning it keeps the clustering reproducible across
# library versions (together with the fixed random_state).
kmeans = KMeans(n_clusters=15, n_init=10, random_state=42)
data['Cluster'] = kmeans.fit_predict(normalized_vectors)

print("Clustering complete with BERT embeddings.")



Clustering complete with BERT embeddings.


## Section 6: Displaying Queries Grouped by Clusters

In [None]:
# Print every query assigned to each cluster, one cluster at a time.
# The cluster count is read from the fitted KMeans model instead of repeating
# the literal, so this loop stays in sync if n_clusters is changed.
for cluster_id in range(kmeans.n_clusters):
    print(f"\nCluster {cluster_id} Queries:")
    print(data.loc[data['Cluster'] == cluster_id, 'Query'].tolist())


Cluster 0 Queries:
['kbb.com', 'kbb.com', 'kbb.com', 'kbb.com', 'kbb.com', 'kbb.com', 'kbb.com', 'kbb.com', 'disney.com', 'playhousedisney.com', 'playhousedisney.com', "disney channel's playhouse disney.com", "disney channel's playhouse disney.com", "disney channel's playhouse disney.com", "disney channel's playhouse disney.com", "disney channel's playhouse disney.com", "disney channel's playhouse disney.com", "disney channel's playhouse disney.com", 'redirect.virtumundo.com', 'org.co-motion.com', 'org.co-motion.com', 'comments worldchiropracticalliance.org', 'google.com', 'njdobi.gov', 'superiorcourt.nj.gov', 'monmouthcounty.gov', 'njcounty.gov', 'njlegalforms.com', 'smallclaims.nj.com', 'm.mlxchange.com', 'google.com', 'girliezone.com', 'webmaste.drunkenmature.net', 'webmaste.drunkenmature.net', 'cherryteenthumbs.com', 'dl.weeklydialer.com', 'shelbyvirgins.com', 'galleries.amateursexhunters.com', 'galleries.amateursexhunters.com', 'galleries.amateursexhunters.com', 'promo.teenfuns.c

## Section 7: Saving Clustered Data to Excel

In [None]:
# Write the clustered frame (queries, vectors and cluster labels) to an Excel
# workbook. Saving is best-effort: a failure is reported, not raised.
output_path = 'QueryGuard_KMeans_Clustering.xlsx'
try:
    data.to_excel(output_path)
except Exception as err:
    print(f"Failed to save data: {err}")
else:
    print("Data saved successfully.")

Data saved successfully.
