In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics.pairwise import cosine_similarity
from annoy import AnnoyIndex

# Load and preprocess the data
def load_data(file_path):
    """Read the CSV file at ``file_path`` and return it as a pandas DataFrame."""
    frame = pd.read_csv(file_path)
    return frame

def preprocess_data(df):
    """Fit a scaler/encoder pipeline on ``df`` and return the transformed data.

    Columns are split by dtype: ``object`` columns are one-hot encoded
    (with ``handle_unknown='ignore'``), all remaining columns are passed
    through a ``StandardScaler``.

    Returns a ``(processed_data, preprocessor)`` pair: the result of
    ``fit_transform`` on ``df`` and the fitted ``ColumnTransformer``, which
    can be reused to transform new rows the same way.
    """
    # Partition the columns by dtype
    object_columns = df.select_dtypes(include=['object']).columns
    other_columns = df.select_dtypes(exclude=['object']).columns

    # One (name, transformer, columns) entry per column group
    steps = [
        ('num', StandardScaler(), other_columns),
        ('cat', OneHotEncoder(handle_unknown='ignore'), object_columns),
    ]
    preprocessor = ColumnTransformer(transformers=steps)

    return preprocessor.fit_transform(df), preprocessor

# Annoy search setup and query function
def create_annoy_index(data_matrix, metric='angular', n_trees=10):
    """Build an Annoy index containing one item per row of ``data_matrix``.

    Parameters
    ----------
    data_matrix : 2-D array-like
        Feature matrix with a ``shape`` attribute. Both dense NumPy arrays
        and SciPy sparse matrices are accepted; sparse input is densified
        one row at a time so the whole matrix is never expanded in memory.
    metric : str, default 'angular'
        Annoy distance metric. Use 'angular' for cosine similarity.
    n_trees : int, default 10
        Number of trees to build; can be increased for higher accuracy.

    Returns
    -------
    AnnoyIndex
        The built index; item ``i`` corresponds to row ``i`` of the input.
    """
    # len() is undefined for SciPy sparse matrices, so read the row count
    # from the shape instead.
    n_rows, dim = data_matrix.shape[0], data_matrix.shape[1]
    index = AnnoyIndex(dim, metric)

    is_sparse = hasattr(data_matrix, "tocsr")
    if is_sparse:
        # CSR gives cheap row slicing regardless of the incoming sparse format.
        data_matrix = data_matrix.tocsr()

    for i in range(n_rows):
        row = data_matrix[i]
        if is_sparse:
            row = row.toarray()
        # Annoy expects a flat list of numbers for each item.
        index.add_item(i, np.asarray(row).ravel().tolist())

    index.build(n_trees)
    return index

def find_top_k_similar(index, query_vector, k=10):
    """Return the ``k`` nearest neighbours of ``query_vector`` in ``index``.

    Parameters
    ----------
    index : object
        Index exposing ``get_nns_by_vector(vector, n, include_distances=...)``.
    query_vector : array-like
        The query point. May be a list, a 1-D array, a single-row 2-D array,
        or a single-row sparse matrix; it is flattened to a plain list
        before being handed to the index.
    k : int, default 10
        Number of neighbours to retrieve.

    Returns
    -------
    tuple
        ``(indices, distances)`` as returned by the index.
    """
    # Sparse rows have no flatten()/tolist(); densify them first.
    if hasattr(query_vector, "toarray"):
        query_vector = query_vector.toarray()
    # asarray + ravel also covers plain lists and nested single-row input.
    flat_query = np.asarray(query_vector).ravel().tolist()

    indices, distances = index.get_nns_by_vector(flat_query, k, include_distances=True)
    return indices, distances




In [None]:
# Main program
def main():
    """Load the dataset, index it with Annoy, and print the rows nearest to a query.

    Reads ``large_dataset.csv``, fits the preprocessing pipeline on it,
    builds an Annoy index over the processed rows, transforms one
    hand-written query record with the same fitted preprocessor, and prints
    the 10 most similar rows of the original DataFrame.
    """
    # Load and preprocess data
    df = load_data("large_dataset.csv")
    data_matrix, preprocessor = preprocess_data(df)

    # Create Annoy index
    index = create_annoy_index(data_matrix)

    # Define a new data point for the query.
    # Replace with actual values: the record needs a value for every column
    # the preprocessor was fitted on.
    new_data_point = {"column1": "value1", "column2": "value2"}
    new_data_df = pd.DataFrame([new_data_point])
    query_vector = preprocessor.transform(new_data_df)
    # transform() may return either a sparse matrix or a dense array;
    # only the sparse result has toarray().
    if hasattr(query_vector, "toarray"):
        query_vector = query_vector.toarray()

    # Find top 10 similar rows (distances are not displayed)
    indices, _ = find_top_k_similar(index, query_vector)

    # Output the most similar rows
    print("Top 10 similar rows:")
    print(df.iloc[indices])

# Entry point: call main() only when this code runs as the top-level module
# (i.e. __name__ is "__main__"), not when it is imported from elsewhere.
if __name__ == "__main__":
    main()
