In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics.pairwise import cosine_similarity
import faiss

# Load and preprocess the data
def load_data(file_path):
    """Read the CSV file at ``file_path`` and return it as a pandas DataFrame."""
    frame = pd.read_csv(file_path)
    return frame

def preprocess_data(df):
    """Fit a scaling / one-hot-encoding transformer on ``df`` and apply it.

    Columns with dtype ``object`` are one-hot encoded (with
    ``handle_unknown='ignore'``); every other column is passed through
    ``StandardScaler``.

    Returns:
        A ``(processed_data, preprocessor)`` tuple: the transformed feature
        matrix and the fitted ``ColumnTransformer``, so the same transformation
        can later be applied to new rows.

    NOTE(review): per the scikit-learn docs, ``ColumnTransformer`` may return
    either a SciPy sparse matrix or a dense array depending on output density
    (``sparse_threshold``); callers should not assume one or the other.
    """
    # Split the columns by dtype: object columns are treated as categorical.
    object_columns = df.select_dtypes(include=['object']).columns
    other_columns = df.select_dtypes(exclude=['object']).columns

    # One named step per column group.
    scaling_step = ('num', StandardScaler(), other_columns)
    encoding_step = ('cat', OneHotEncoder(handle_unknown='ignore'), object_columns)
    preprocessor = ColumnTransformer(transformers=[scaling_step, encoding_step])

    return preprocessor.fit_transform(df), preprocessor

# Similarity function using cosine similarity
def compute_similarity(data_matrix, query_vector):
    """
    Score every row of ``data_matrix`` against ``query_vector``.

    The query is reshaped into a single row, compared with each row of
    ``data_matrix`` via cosine similarity, and the result is flattened so
    that one score is returned per row of ``data_matrix``.
    """
    query_row = query_vector.reshape(1, -1)
    pairwise_scores = cosine_similarity(data_matrix, query_row)
    return pairwise_scores.flatten()

# FAISS search setup and query function
def create_faiss_index(data_matrix):
    """Build an exact FAISS L2 (Euclidean) index over the rows of ``data_matrix``.

    Args:
        data_matrix: 2-D feature matrix with one row per item. May be a NumPy
            array or any object exposing ``toarray()`` (e.g. the SciPy sparse
            matrix a ``ColumnTransformer`` can return).

    Returns:
        A ``faiss.IndexFlatL2`` containing every row of ``data_matrix``.
    """
    # FAISS only accepts dense input; a sparse matrix survives ``astype`` as
    # sparse, so densify explicitly before handing it over.
    if hasattr(data_matrix, "toarray"):
        data_matrix = data_matrix.toarray()

    # FAISS expects C-contiguous float32 rows; ``astype`` alone preserves a
    # Fortran-ordered layout, ``ascontiguousarray`` does not.
    vectors = np.ascontiguousarray(data_matrix, dtype=np.float32)

    dim = vectors.shape[1]
    index = faiss.IndexFlatL2(dim)  # brute-force search on squared L2 distance
    index.add(vectors)
    return index

def find_top_k_similar(index, query_vector, k=10):
    """Return the (up to) ``k`` nearest neighbours of ``query_vector`` in ``index``.

    Args:
        index: Object exposing ``search(queries, k)`` that returns a
            ``(distances, indices)`` pair, such as a FAISS index.
        query_vector: A single query; it is flattened into one row. May be a
            NumPy array, a sequence, or an object exposing ``toarray()``
            (e.g. a SciPy sparse row).
        k: Maximum number of neighbours to return.

    Returns:
        ``(indices, distances)`` as 1-D arrays in the order produced by
        ``index.search``. Fewer than ``k`` entries are returned when the
        index holds fewer than ``k`` vectors.
    """
    # Densify sparse queries; FAISS cannot search with a sparse matrix.
    if hasattr(query_vector, "toarray"):
        query_vector = query_vector.toarray()
    query = np.ascontiguousarray(query_vector, dtype=np.float32).reshape(1, -1)

    distances, indices = index.search(query, k)
    indices = indices.flatten()
    distances = distances.flatten()

    # FAISS pads missing results with label -1. Used as a positional index
    # (e.g. ``df.iloc[-1]``) that would silently select the last row, so drop
    # the padded slots instead of returning them.
    found = indices >= 0
    return indices[found], distances[found]



In [None]:
# Main program
def main():
    """Run the end-to-end demo: load data, index it, and print nearest rows.

    Loads ``large_dataset.csv``, fits the preprocessing pipeline, builds a
    FAISS index over the transformed rows, transforms one example query row
    with the same fitted preprocessor, and prints the rows of the original
    DataFrame that the index reports as closest to it.
    """
    # Load and preprocess data
    df = load_data("large_dataset.csv")
    data_matrix, preprocessor = preprocess_data(df)
    # The transformer may return a sparse matrix or a dense array depending
    # on output density; FAISS needs dense input, so densify only if needed.
    if hasattr(data_matrix, "toarray"):
        data_matrix = data_matrix.toarray()

    # Create FAISS index
    index = create_faiss_index(data_matrix)

    # Define a new data point for the query.
    # Replace with actual values; presumably it must provide every column the
    # preprocessor was fitted on - verify against the dataset's schema.
    new_data_point = {"column1": "value1", "column2": "value2"}
    new_data_df = pd.DataFrame([new_data_point])
    query_vector = preprocessor.transform(new_data_df)
    # Same sparse-or-dense caveat as above: only call toarray() when present.
    if hasattr(query_vector, "toarray"):
        query_vector = query_vector.toarray()

    # Find top 10 similar rows (distances are not displayed)
    indices, _distances = find_top_k_similar(index, query_vector)

    # Output the most similar rows
    print("Top 10 similar rows:")
    print(df.iloc[indices])

# Run the demo only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
