## Section 1: Setup and Installations
### Install the required libraries if they are not already installed (run the command below in a terminal, or in a notebook cell prefixed with `%pip`)
#### pip install pandas numpy scikit-learn openpyxl

## Section 2: Import Libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import normalize
from sklearn.feature_extraction.text import TfidfVectorizer



## Section 3: Loading Data from Excel

In [4]:
def load_data(filepath):
    """Read an Excel workbook into a DataFrame.

    Parameters
    ----------
    filepath
        Location of the workbook; handed unchanged to ``pd.read_excel``.

    Returns
    -------
    pandas.DataFrame
        The parsed data, or an empty DataFrame when reading raised any
        exception. The error is printed instead of being propagated, so
        callers must check ``.empty`` to detect a failed load.
    """
    try:
        loaded = pd.read_excel(filepath)
    except Exception as e:
        # Best-effort load: report the problem and fall back to an empty frame.
        print(f"Failed to load data: {e}")
        loaded = pd.DataFrame()
    return loaded

# Workbook read into `data`; load_data prints the error and returns an empty
# DataFrame if the read fails.
input_workbook = 'QueryGuard_KMeans_Clustering.xlsx'
data = load_data(input_workbook)

## Section 4: Verifying Data Load

In [5]:
# Report the load outcome: an empty frame means nothing usable was read.
print(
    "No data loaded. Check the file path and format."
    if data.empty
    else "Data loaded successfully."
)


Data loaded successfully.


## Section 5: Generalizing Query Time to Year-Month

In [6]:
# Generalize 'QueryTime' to year-month
def generalize_to_month(timestamps):
    """Coarsen a Series of timestamps to monthly periods.

    The conversion is safe to apply more than once, so re-running this cell
    does not fail: a Series that already has a period dtype is re-expressed at
    monthly frequency instead of being sent through ``pd.to_datetime`` again
    (``pd.to_datetime`` does not accept period-typed input).

    Parameters
    ----------
    timestamps : pandas.Series
        Values parseable by ``pd.to_datetime``, or an already period-typed
        Series.

    Returns
    -------
    pandas.Series
        Period values at monthly ('M') frequency, same index as the input.
    """
    if isinstance(timestamps.dtype, pd.PeriodDtype):
        # Already generalized by an earlier run of this cell.
        return timestamps.dt.asfreq('M')
    return pd.to_datetime(timestamps).dt.to_period('M')


try:
    data['QueryTime'] = generalize_to_month(data['QueryTime'])
    print("Time generalization applied.")
except Exception as e:
    # Errors are reported, not raised; if this prints, 'QueryTime' was left
    # unchanged and later cells will see the original values.
    print(f"Error processing time data: {e}")


Time generalization applied.


## Section 6: Generalizing Queries within Cluster

In [7]:
# Replace numeric cluster ids with human-readable category names.
try:
    # Position in this list is the cluster id (0 through 14).
    cluster_labels = [
        'General Services and Information Portals',
        'Specialized Hobby and Lifestyle Products',
        'Educational and Government Online Services',
        'Educational and Philosophical Research',
        'Relationship Revenge and Miscellaneous Interests',
        'Healthcare and Nursing Education',
        'Miscellaneous Personal Interests',
        'Government and Personal Services',
        'Entertainment and Personal Vendettas',
        'Undefined or Missing Queries',
        'Web Navigation and E-commerce',
        'Entertainment, Education, and Services',
        'Niche and Detailed Inquiries',
        'Automotive Customization and Tech Products',
        'New Jersey Services and Shopping',
    ]
    cluster_names = dict(enumerate(cluster_labels))

    # Cluster ids absent from the mapping become NaN in 'GeneralizedQuery'.
    data['GeneralizedQuery'] = data['Cluster'].map(cluster_names)

    print("Cluster names mapped. Here are some examples:")
    preview = data[['Cluster', 'GeneralizedQuery']].head()
    print(preview)
except Exception as e:
    print(f"Error mapping cluster names: {e}")

Cluster names mapped. Here are some examples:
   Cluster                        GeneralizedQuery
0        6        Miscellaneous Personal Interests
1        3  Educational and Philosophical Research
2        3  Educational and Philosophical Research
3        8    Entertainment and Personal Vendettas
4        8    Entertainment and Personal Vendettas


In [8]:
# Overwrite the raw 'Query' text with the generalized cluster name, then
# remove the helper column so only the generalized value remains.
if 'GeneralizedQuery' not in data.columns:
    print("GeneralizedQuery column not found.")
else:
    data['Query'] = data['GeneralizedQuery']
    del data['GeneralizedQuery']
    print("Query column replaced with generalized names.")


Query column replaced with generalized names.


## Section 7: Group k-Anonymity on Dataset

In [9]:

# Minimum number of rows that must share a (Query, QueryTime) combination.
k = 5
group_keys = ['Query', 'QueryTime']
try:
    grouped = data.groupby(group_keys)
    # Keep only the rows belonging to groups with at least k members;
    # rows in smaller groups are dropped from the result.
    anonymized_data = grouped.filter(lambda group: len(group) >= k)
    print(f"Data anonymized to {k}-anonymity. Number of records: {len(anonymized_data)}")
except Exception as e:
    print(f"Error implementing k-anonymity: {e}")


Data anonymized to 5-anonymity. Number of records: 19994


## Section 8: Output the Anonymized Data

In [10]:
# Write the filtered frame to an Excel workbook; failures are reported, not raised.
output_workbook = 'QueryGuard_K_Anonymity.xlsx'
try:
    anonymized_data.to_excel(output_workbook)
except Exception as e:
    print(f"Failed to save anonymized data: {e}")
else:
    print("Anonymized data saved successfully.")


Anonymized data saved successfully.


## Section 9: Utility Calculation - `Distortion` and `Precision`

In [11]:
# Per-attribute values consumed by the utility functions in this cell, which
# treat each value as that attribute's level.
anonymity_counts = dict(Query=1, QueryTime=2)

def distortion_ultimate(anonymity_counts):
    """Average the per-attribute ratio ``level / (level + 1)``.

    Parameters
    ----------
    anonymity_counts : dict
        Maps attribute name to its numeric level; only the values are used.

    Returns
    -------
    float
        Mean of the ratios, rounded to two decimals.

    Raises
    ------
    ZeroDivisionError
        If ``anonymity_counts`` is empty.
    """
    levels = list(anonymity_counts.values())
    ratio_sum = 0
    for level in levels:
        # The maximum level is taken to be one above the attribute's level.
        ratio_sum += level / (level + 1)
    return round(ratio_sum / len(levels), 2)

def precision_ultimate(anonymity_counts, pt_rows):
    """Return one minus the row-weighted average of ``level / (level + 1)``.

    Parameters
    ----------
    anonymity_counts : dict
        Maps attribute name to its numeric level; only the values are used.
    pt_rows : int
        Row count applied as the weight of every attribute.

    Returns
    -------
    float
        ``1 - sum(pt_rows * level / (level + 1)) / (pt_rows * n_attributes)``,
        rounded to two decimals.

    Raises
    ------
    ZeroDivisionError
        If ``pt_rows`` is 0 or ``anonymity_counts`` is empty.
    """
    weighted_loss = 0
    for level in anonymity_counts.values():
        weighted_loss += pt_rows * level / (level + 1)
    total_cells = pt_rows * len(anonymity_counts)
    return round(1 - (weighted_loss / total_cells), 2)

# Row count passed to precision_ultimate. Hard-coded; it matches the record
# count printed by the k-anonymity cell in this run, so update it if the
# input data changes.
pt_rows = 19994

distortion_score = distortion_ultimate(anonymity_counts)
precision_score = precision_ultimate(anonymity_counts, pt_rows)
print("Distortion:", distortion_score)
print("Precision:", precision_score)


Distortion: 0.58
Precision: 0.42
