# Document Generation #

In [None]:
import zipfile
import os

def unzip_document(zip_path, extract_to='documentos'):
    # Create the target directory if it doesn't exist
    if not os.path.exists(extract_to):
        os.makedirs(extract_to)

    # Unzip the file
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(extract_to)
    print(f"Files extracted to {extract_to}")

# Example usage
zip_path = 'Documentos.zip'  # Replace with your zip file path
unzip_document(zip_path)

In [None]:
# prompt: do a pip install unicode

!pip install unidecode
!python -m spacy download es_core_news_lg

In [None]:
# prompt: delete the fder documentos_procesados

import shutil

def delete_directory(dir_path):
    try:
        shutil.rmtree(dir_path)
        print(f"Directory '{dir_path}' deleted successfully.")
    except FileNotFoundError:
        print(f"Directory '{dir_path}' not found.")
    except OSError as e:
        print(f"Error deleting directory '{dir_path}': {e}")

# Example usage
delete_directory('documentos_procesados')

In [None]:
import os
import re
import nltk
import spacy
from nltk.corpus import stopwords
from unidecode import unidecode

# Download necessary NLTK resources
nltk.download('stopwords')

# Initialize spaCy with Spanish model
nlp = spacy.load("es_core_news_lg")

# Initialize Spanish stopwords
stop_words = set(stopwords.words('spanish'))

# Define source and destination folders
carpeta_origen = 'documentos'
carpeta_destino = 'documentos_procesados'

# Create destination folder if it doesn't exist
os.makedirs(carpeta_destino, exist_ok=True)

def preprocesar_nombre_archivo(nombre_archivo):
    # Split the filename by '_'
    partes = nombre_archivo.split('_')

    # Process each part
    partes_procesadas = []
    for parte in partes:
        # Remove non-alphanumeric characters
        parte = re.sub(r'\W+', ' ', parte)

        # Convert to lowercase and remove accents
        parte = unidecode(parte.lower())

        # Process with spaCy
        doc = nlp(parte)

        # Filter stopwords and get lemmas
        palabras = [token.lemma_ for token in doc if token.is_alpha and token.text not in stop_words]

        # Join processed words for this part
        parte_procesado = '_'.join(palabras)

        if parte_procesado:
            partes_procesadas.append(parte_procesado)

    # Join all processed parts
    nombre_procesado = '_'.join(partes_procesadas)

    return nombre_procesado

def preprocesar_texto(texto):
    # Convert to lowercase and remove accents
    texto = unidecode(texto.lower())

    # Remove non-alphanumeric characters
    texto = re.sub(r'\W+', ' ', texto)

    # Process with spaCy
    doc = nlp(texto)

    # Filter words and get lemmas
    palabras = [token.lemma_ for token in doc if token.is_alpha and token.text not in stop_words]

    # Join processed words
    texto_procesado = ' '.join(palabras)

    return texto_procesado

def procesar_archivos(carpeta_origen, carpeta_destino):
    for archivo in os.listdir(carpeta_origen):
        if archivo.endswith('.txt'):
            ruta_origen = os.path.join(carpeta_origen, archivo)

            # Preprocess filename
            nombre_procesado = preprocesar_nombre_archivo(archivo)
            ruta_destino = os.path.join(carpeta_destino, f"{archivo}.txt")

            try:
                # Read original file
                with open(ruta_origen, 'r', encoding='utf-8') as f:
                    contenido = f.read()

                # Process file content
                contenido_procesado = preprocesar_texto(contenido)

                # Save processed content with processed filename
                with open(ruta_destino, 'w', encoding='utf-8') as f:
                    f.write(contenido_procesado)

                print(f"Processed: {archivo} -> {nombre_procesado}.txt")

            except Exception as e:
                print(f"Error processing {archivo}: {e}")

if __name__ == "__main__":
    procesar_archivos(carpeta_origen, carpeta_destino)

In [None]:
import os
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import joblib  # Para guardar el vectorizador

def calculate_tf_idf(folder_path, output_file, vectorizer_file):
    """
    Calculates TF-IDF vectors for documents in a folder, saves them to a CSV file,
    and saves the TF-IDF vectorizer as a .joblib file.
    """

    # Initialize TF-IDF vectorizer
    vectorizer = TfidfVectorizer()

    # Store filenames and text contents
    filenames = []
    documents = []

    for filename in os.listdir(folder_path):
        if filename.endswith(".txt"):
            filepath = os.path.join(folder_path, filename)
            try:
                with open(filepath, 'r', encoding='utf-8') as file:
                    text = file.read()
                    filenames.append(filename)
                    documents.append(text)
            except Exception as e:
                print(f"Error processing file {filename}: {e}")

    # Fit and transform the documents
    tfidf_matrix = vectorizer.fit_transform(documents)

    # Save the TF-IDF vectorizer to a .joblib file
    joblib.dump(vectorizer, vectorizer_file)
    print(f"TF-IDF vectorizer saved to {vectorizer_file}")

    # Create a DataFrame from the TF-IDF matrix
    tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=vectorizer.get_feature_names_out())

    # Insert filenames as the first column
    tfidf_df.insert(0, 'filename', filenames)

    # Save the DataFrame to a CSV file
    tfidf_df.to_csv(output_file, index=False)
    print(f"TF-IDF vectors saved to {output_file}")

# Example usage
folder_path = 'documentos_procesados'
output_file = 'tf_idf_documentos.csv'
vectorizer_file = 'tfidf_vectorizer.joblib'
calculate_tf_idf(folder_path, output_file, vectorizer_file)


In [None]:
import pandas as pd
import numpy as np
import os
from sklearn.feature_extraction.text import CountVectorizer
import joblib  # Para guardar el vectorizador

def calculate_log_tf_idf(folder_path, output_file, vectorizer_file):
    """
    Calculates logarithmic TF and standard IDF vectors for documents in a folder,
    and saves the CountVectorizer as a .joblib file.
    """
    # Store filenames and text contents
    filenames = []
    documents = []

    for filename in os.listdir(folder_path):
        if filename.endswith(".txt"):
            filepath = os.path.join(folder_path, filename)
            try:
                with open(filepath, 'r', encoding='utf-8') as file:
                    text = file.read()
                    filenames.append(filename)
                    documents.append(text)
            except Exception as e:
                print(f"Error processing file {filename}: {e}")

    # Create count vectorizer
    count_vectorizer = CountVectorizer()
    term_freq_matrix = count_vectorizer.fit_transform(documents)

    # Save the CountVectorizer to a .joblib file
    joblib.dump(count_vectorizer, vectorizer_file)
    print(f"CountVectorizer saved to {vectorizer_file}")

    # Get feature names
    feature_names = count_vectorizer.get_feature_names_out()

    # Calculate logarithmic TF
    log_tf_matrix = np.log1p(term_freq_matrix.toarray())

    # Calculate IDF
    doc_count = len(documents)
    idf_vector = np.log(doc_count / (np.sum(term_freq_matrix.toarray() > 0, axis=0) + 1))

    # Calculate TF-IDF
    tfidf_matrix = log_tf_matrix * idf_vector

    # Create DataFrame
    tfidf_df = pd.DataFrame(tfidf_matrix, columns=feature_names)
    tfidf_df.insert(0, 'filename', filenames)

    # Save to CSV
    tfidf_df.to_csv(output_file, index=False)
    print(f"Logarithmic TF-IDF vectors saved to {output_file}")

# Example usage
folder_path = 'documentos_procesados'
output_file = 'tf_idf_documentos_2.csv'
vectorizer_file = 'count_vectorizer.joblib'
calculate_log_tf_idf(folder_path, output_file, vectorizer_file)


In [None]:
queries = [
    "curso de programación, diseño gráfico y cocina internacional con técnicas avanzadas de cocina y recetas modernas",
    "Curso de escritura creativa y técnica para estudiantes de música interesados en mejorar sus habilidades de composición",
    "curso de idiomas inicial para estudiantes que quieren aprender inglés de manera interactiva y efectiva con enfoque en conversación",
    "curso sobre inteligencia artificial, tecnología de vanguardia, innovación disruptiva y desarrollo de aplicaciones inteligentes en el mercado",
    "curso para estudios de matemáticas aplicadas y artes liberales, combinando teoría matemática avanzada con análisis crítico cultural",
]

queries_procesadas = []

for query in queries:
    # Convert to lowercase and remove accents
    query_pre = unidecode(query.lower())

    # Remove non-alphanumeric characters
    query_pre = re.sub(r'\W+', ' ', query_pre)

    # Process with spaCy
    doc = nlp(query_pre)

    # Filter words and get lemmas
    palabras = [token.lemma_ for token in doc if token.is_alpha and token.text not in stop_words]

    # Join processed words
    query_preprocesada = ' '.join(palabras)

    queries_procesadas.append(query_preprocesada)

for query in queries_procesadas:
    print(query)


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Cargar el vectorizador entrenado (asegúrate de haberlo guardado previamente)
import joblib
tfidf_vectorizer = joblib.load('tfidf_vectorizer.joblib')

queries_vector = []

for query in queries_procesadas:
    query_vector = tfidf_vectorizer.transform([query])
    print(query_vector)
    queries_vector.append(query_vector)

In [None]:
# prompt: save queries_vector as a csv file, the first column needs to be the index of the vector

import pandas as pd
import numpy as np

# Assuming queries_vector is already defined as in your provided code

# Create an empty list to store the data for the CSV file
data_for_csv = []

# Iterate through the queries_vector and their corresponding indices
for i, query_vector in enumerate(queries_vector):
    # Convert the sparse matrix to a dense array
    dense_vector = query_vector.toarray()[0]

    # Create a row for the CSV, with the index as the first element
    row = [i] + list(dense_vector)
    data_for_csv.append(row)

# Create column names
column_names = ['index'] + [f'feature_{i}' for i in range(len(dense_vector))]

# Create a pandas DataFrame from the data
df = pd.DataFrame(data_for_csv, columns=column_names)

# Save the DataFrame to a CSV file
df.to_csv('queries_vector.csv', index=False)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Cargar el vectorizador entrenado (asegúrate de haberlo guardado previamente)
import joblib
tfidf_vectorizer = joblib.load('count_vectorizer.joblib')

queries_vector_2 = []

for query in queries_procesadas:
    query_vector = tfidf_vectorizer.transform([query])
    print(query_vector)
    queries_vector_2.append(query_vector)

In [None]:
# prompt: save queries_vector as a csv file, the first column needs to be the index of the vector

import pandas as pd
import numpy as np

# Assuming queries_vector is already defined as in your provided code

# Create an empty list to store the data for the CSV file
data_for_csv = []

# Iterate through the queries_vector and their corresponding indices
for i, query_vector in enumerate(queries_vector_2):
    # Convert the sparse matrix to a dense array
    dense_vector = query_vector.toarray()[0]

    # Create a row for the CSV, with the index as the first element
    row = [i] + list(dense_vector)
    data_for_csv.append(row)

# Create column names
column_names = ['index'] + [f'feature_{i}' for i in range(len(dense_vector))]

# Create a pandas DataFrame from the data
df = pd.DataFrame(data_for_csv, columns=column_names)

# Save the DataFrame to a CSV file
df.to_csv('queries_vector_2.csv', index=False)

# Dimensionality Reduction #

## For tf_idf ##

In [None]:
# prompt: save my csv file in a dataframe

import pandas as pd

# Assuming your CSV file is named 'your_file.csv'
# Replace 'your_file.csv' with the actual filename
try:
  df = pd.read_csv('tf_idf_documentos.csv')
  print("File loaded successfully!")
except FileNotFoundError:
  print("Error: File not found. Please check the filename and ensure it exists in the current directory.")
except pd.errors.EmptyDataError:
  print("Error: The CSV file is empty.")
except pd.errors.ParserError:
  print("Error: There was an issue parsing the CSV file. Please check its format.")
except Exception as e:
  print(f"An unexpected error occurred: {e}")
else:
  # You can now work with the DataFrame 'df'
  print(df.head()) # Print the first few rows to verify
  # ... your further code ...


In [None]:
# prompt: visualize my df with tsne

import pandas as pd
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

# Assuming your data for t-SNE is in columns 1 onwards (exclude the first column if it's an index)
X = df.iloc[:, 1:]  # Adjust the slicing if your data is in different columns


tsne = TSNE(n_components=2, random_state=42)  # Adjust parameters as needed
X_tsne = tsne.fit_transform(X)

plt.figure(figsize=(10, 8))
plt.scatter(X_tsne[:, 0], X_tsne[:, 1])
plt.title('t-SNE Visualization')
plt.xlabel('t-SNE Component 1')
plt.ylabel('t-SNE Component 2')
plt.show()

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler

def read_and_prepare_data(file_path, normalization_method='minmax'):
   """
   Read CSV, remove first column, and normalize data

   Args:
       file_path (str): Path to CSV file
       normalization_method (str): 'minmax' or 'standard'

   Returns:
       tuple: Normalized DataFrame, first column
   """
   try:
       # Read CSV file
       df = pd.read_csv(file_path)

       # Extract first column
       first_column = df.iloc[:, 0]

       # Remove first column from DataFrame
       X = df.iloc[:, 1:]

       # Normalize data
       if normalization_method == 'minmax':
           scaler = MinMaxScaler()
           X_normalized = pd.DataFrame(
               scaler.fit_transform(X),
               columns=X.columns
           )
       elif normalization_method == 'standard':
           from sklearn.preprocessing import StandardScaler
           scaler = StandardScaler()
           X_normalized = pd.DataFrame(
               scaler.fit_transform(X),
               columns=X.columns
           )
       else:
           raise ValueError("Invalid normalization method")

       return X_normalized, first_column

   except Exception as e:
       print(f"Error processing data: {e}")
       return None, None

In [None]:
file_path = 'tf_idf_documentos.csv'
X, first_column = read_and_prepare_data(file_path, 'standard')


In [None]:
import os
import pandas as pd
from sklearn.manifold import TSNE, Isomap
from sklearn.metrics import pairwise_distances
import matplotlib.pyplot as plt
import numpy as np
from itertools import product
import math

def run_isomap_with_tsne(X_scaled, param_grid, first_column):
    """Run Isomap with multiple parameter combinations, plot reduced space with t-SNE, and save results."""
    results = []
    min_error = float('inf')  # Track the minimum reconstruction error
    best_idx = None           # Track the index of the best parameter combination

    # Folder for saving CSVs
    output_folder = "Isomap_2"
    os.makedirs(output_folder, exist_ok=True)

    # Get the total number of parameter combinations
    param_combinations = list(product(param_grid['n_components'], param_grid['n_neighbors']))
    num_combinations = len(param_combinations)

    # Create a grid of subplots
    cols = 3  # Number of columns
    rows = math.ceil(num_combinations / cols)  # Number of rows
    fig, axes = plt.subplots(rows, cols, figsize=(cols * 5, rows * 5))
    axes = axes.flatten()  # Flatten axes for easy iteration

    for idx, (n_components, n_neighbors) in enumerate(param_combinations):
        try:
            print(f"Running Isomap with n_components={n_components}, n_neighbors={n_neighbors}")

            # Validate parameters
            if n_components > X_scaled.shape[1]:
                print(f"Skipping n_components={n_components}: exceeds feature dimensions.")
                continue
            if n_neighbors >= X_scaled.shape[0]:
                print(f"Skipping n_neighbors={n_neighbors}: exceeds number of data points.")
                continue

            # Run Isomap
            isomap = Isomap(n_components=n_components, n_neighbors=n_neighbors, n_jobs=-1)
            transformed_data = isomap.fit_transform(X_scaled)

            # Calculate reconstruction error
            original_distances = pairwise_distances(X_scaled)
            reduced_distances = pairwise_distances(transformed_data)
            reconstruction_error = np.mean((original_distances - reduced_distances) ** 2)

            # Update the best error and index
            if reconstruction_error < min_error:
                min_error = reconstruction_error
                best_idx = idx

            # Create DataFrame with first column
            output_df = pd.DataFrame(transformed_data)
            output_df.insert(0, 'original_column', first_column)

            # Save the reduced space to a CSV file
            file_name = f"isomap_ncomp{n_components}_nneigh{n_neighbors}_error{reconstruction_error:.4f}.csv"
            file_path = os.path.join(output_folder, file_name)
            output_df.to_csv(file_path, index=False)
            print(f"Saved Isomap result to {file_path}")

            # Apply t-SNE on the Isomap-transformed data
            tsne = TSNE(n_components=2, random_state=42)
            tsne_result = tsne.fit_transform(transformed_data)

            # Plot the reduced space in the corresponding subplot
            ax = axes[idx]
            ax.scatter(tsne_result[:, 0], tsne_result[:, 1], c='blue', alpha=0.6, edgecolors='k')
            ax.set_title(f"n_components={n_components}, n_neighbors={n_neighbors}\nMSE: {reconstruction_error:.4f}")
            ax.set_xlabel("t-SNE Dimension 1")
            ax.set_ylabel("t-SNE Dimension 2")
            ax.grid(True)

            # Store results
            results.append({
                'params': {'n_components': n_components, 'n_neighbors': n_neighbors},
                'transformed_data': transformed_data,
                'reconstruction_error': reconstruction_error
            })

        except ValueError as ve:
            print(f"ValueError for n_components={n_components}, n_neighbors={n_neighbors}: {str(ve)}")
        except Exception as e:
            print(f"Unexpected error for n_components={n_components}, n_neighbors={n_neighbors}: {str(e)}")

    # Highlight the best subplot
    if best_idx is not None:
        axes[best_idx].set_title(
            axes[best_idx].get_title(),
            fontweight='bold',
            color='red'
        )

    # Remove empty subplots
    for i in range(len(param_combinations), len(axes)):
        fig.delaxes(axes[i])

    # Adjust layout and display the plot
    plt.tight_layout()
    plt.show()

    return results


In [None]:
!pip install umap-learn

In [None]:
param_grid = {'n_components': [2, 3, 10, 20, 30, 40, 50, 60], 'n_neighbors': [10, 20, 30, 40, 50, 60]}
results = run_isomap_with_tsne(X, param_grid, first_column)

# Display reconstruction errors
for result in results:
    print(f"Params: {result['params']}, Reconstruction Error: {result['reconstruction_error']}")


In [None]:
import os
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from itertools import product
from sklearn.manifold import TSNE
from sklearn.model_selection import train_test_split
from keras.models import Model
from keras.layers import Input, Dense, Dropout
from keras import regularizers
import math

def build_autoencoder(X_scaled, params):
    """Build autoencoder with given parameters."""
    input_dim = X_scaled.shape[1]
    input_layer = Input(shape=(input_dim,))

    # Encoder
    if params['type'] == 'sparse':
        encoded = Dense(params['encoding_dim'], activation='relu',
                        activity_regularizer=regularizers.l1(params['parameter']))(input_layer)
    else:  # denoising
        encoded = Dense(params['encoding_dim'], activation='relu')(
            Dropout(params['parameter'])(input_layer))

    # Decoder
    decoded = Dense(input_dim, activation='sigmoid')(encoded)

    autoencoder = Model(input_layer, decoded)
    autoencoder.compile(optimizer='adam', loss='mse')

    return autoencoder

def run_autoencoder(X_scaled, params):
    """Run autoencoder with given parameters."""
    try:
        # Split data without labels
        X_train, X_test = train_test_split(
            X_scaled, test_size=0.2, random_state=42)

        # Build and train autoencoder
        autoencoder = build_autoencoder(X_scaled, params)
        history = autoencoder.fit(
            X_train, X_train,
            epochs=150,
            batch_size=32,
            validation_split=0.2,
            verbose=0
        )

        # Get latent space
        encoder = Model(autoencoder.input, autoencoder.layers[1].output)
        latent_space = encoder.predict(X_scaled)

        # Calculate reconstruction error
        reconstructed = autoencoder.predict(X_scaled)
        reconstruction_error = np.mean((X_scaled - reconstructed) ** 2)

        return {
            'params': params,
            'history': history.history,
            'latent_space': latent_space,
            'reconstruction_error': reconstruction_error
        }
    except Exception as e:
        print(f"Error in Autoencoder with params {params}: {str(e)}")
        return None

def run_autoencoders_with_tsne(X_scaled, param_grid, first_column):
    """Run autoencoders with multiple parameter combinations and plot results with t-SNE."""
    results = []
    min_error = float('inf')  # Track the minimum reconstruction error
    best_idx = None           # Track the index of the best parameter combination

    if param_grid['type'][0] == 'sparse':
        # Get the total number of parameter combinations
        param_combinations = list(product(
            param_grid['type'],
            param_grid['encoding_dim'],
            param_grid['sparsity']
        ))
    else:
        param_combinations = list(product(
            param_grid['type'],
            param_grid['encoding_dim'],
            param_grid['dropout_rate']
        ))
    num_combinations = len(param_combinations)

    # Create a grid of subplots
    cols = 3  # Number of columns
    rows = math.ceil(num_combinations / cols)  # Number of rows
    fig, axes = plt.subplots(rows, cols, figsize=(cols * 5, rows * 5))
    axes = axes.flatten()  # Flatten axes for easy iteration

    for idx, (ae_type, encoding_dim, parameter) in enumerate(param_combinations):
        try:
            print(f"Running Autoencoder with type={ae_type}, encoding_dim={encoding_dim}, parameter={parameter}")

            params = {
                'type': ae_type,
                'encoding_dim': encoding_dim,
                'parameter': parameter
            }

            # Build and train the autoencoder
            result = run_autoencoder(X_scaled, params)
            if result is None:
                continue

            latent_space = result['latent_space']
            reconstruction_error = result['reconstruction_error']

            # Update the best error and index
            if reconstruction_error < min_error:
                min_error = reconstruction_error
                best_idx = idx

            # Save latent space to CSV
            output_folder = "Autoencoders_2"
            os.makedirs(output_folder, exist_ok=True)

            # Create DataFrame with first column
            output_df = pd.DataFrame(latent_space)
            output_df.insert(0, 'original_column', first_column)

            file_name = f"autoencoder_type{ae_type}_dim{encoding_dim}_param{parameter}_error{reconstruction_error:.4f}.csv"
            file_path = os.path.join(output_folder, file_name)
            pd.DataFrame(output_df).to_csv(file_path, index=False)
            print(f"Saved latent space to {file_path}")

            # Reduce latent space to 2D with t-SNE
            tsne = TSNE(n_components=2, random_state=42)
            tsne_result = tsne.fit_transform(latent_space)

            # Plot the reduced latent space
            ax = axes[idx]
            ax.scatter(tsne_result[:, 0], tsne_result[:, 1], c='blue', alpha=0.6, edgecolors='k')
            if ae_type == 'sparse':
                ax.set_title(f"type={ae_type}, dim={encoding_dim}, sparsity={parameter}\nMSE: {reconstruction_error:.4f}")
            else:
                ax.set_title(f"type={ae_type}, dim={encoding_dim}, dropout_rate = {parameter}\nMSE: {reconstruction_error:.4f}")
            ax.set_xlabel("t-SNE Dimension 1")
            ax.set_ylabel("t-SNE Dimension 2")
            ax.grid(True)

            # Store results
            results.append({
                'params': params,
                'reconstruction_error': reconstruction_error
            })

        except Exception as e:
            print(f"Error for type={ae_type}, dim={encoding_dim}: {str(e)}")

    # Highlight the best subplot
    if best_idx is not None:
        axes[best_idx].set_title(
            axes[best_idx].get_title(),
            fontweight='bold',
            color='red'
        )

    # Remove empty subplots
    for i in range(len(param_combinations), len(axes)):
        fig.delaxes(axes[i])

    # Adjust layout and display the plot
    plt.tight_layout()
    plt.show()

    return results


In [None]:
param_grid_sparse = {
    'type': ['sparse'],
    'encoding_dim': [2, 3, 10, 20, 30, 40, 50, 60],
    'sparsity': [1e-4, 1e-5, 1e-6, 1e-7]  # Only used for sparse autoencoder
}

param_grid_denoising = {
    'type': ['denoising'],
    'encoding_dim': [2, 3, 10, 20, 30, 40, 50, 60],
    'dropout_rate': [0.1, 0.2, 0.3, 0.4]  # Only used for denoising autoencoder
}

# Run the function
results_sparse = run_autoencoders_with_tsne(X, param_grid_sparse, first_column)
results_denoising = run_autoencoders_with_tsne(X, param_grid_denoising, first_column)


In [None]:
import umap
import os
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.manifold import TSNE
from sklearn.metrics import pairwise_distances
from itertools import product
import math

def run_umap_with_tsne(X_scaled, param_grid, first_column):
    """Run UMAP with different parameter combinations and plot reduced space with t-SNE."""
    results = []
    min_error = float('inf')  # Track the minimum reconstruction error
    best_idx = None           # Track the index of the best parameter combination

    # Folder for saving CSVs
    output_folder = "Umap_2"
    os.makedirs(output_folder, exist_ok=True)

    # Get the total number of parameter combinations
    param_combinations = list(product(
        param_grid['n_components'],
        param_grid['n_neighbors'],
        param_grid['min_dist'],
        param_grid['metric']
    ))
    num_combinations = len(param_combinations)

    # Create a grid of subplots
    cols = 3  # Number of columns
    rows = math.ceil(num_combinations / cols)  # Number of rows
    fig, axes = plt.subplots(rows, cols, figsize=(cols * 5, rows * 5))
    axes = axes.flatten()  # Flatten axes for easy iteration

    # Iterate through each combination of n_components, n_neighbors, and min_dist
    for idx, (n_components, n_neighbors, min_dist, metric) in enumerate(param_combinations):
        try:
            print(f"Running UMAP with n_components={n_components}, n_neighbors={n_neighbors}, min_dist={min_dist}, metric={metric}")

            # Validate parameters
            if n_components > X_scaled.shape[1]:
                print(f"Skipping n_components={n_components}: exceeds feature dimensions.")
                continue
            if n_neighbors >= X_scaled.shape[0]:
                print(f"Skipping n_neighbors={n_neighbors}: exceeds number of data points.")
                continue

            # Run UMAP with the new parameter
            umap_model = umap.UMAP(
                n_neighbors=n_neighbors,
                n_components=n_components,
                min_dist=min_dist,
                metric=metric,
                random_state=42
            )
            umap_transformed_data = umap_model.fit_transform(X_scaled)

            # Apply t-SNE on the UMAP-transformed data
            tsne = TSNE(n_components=2, random_state=42)
            tsne_result = tsne.fit_transform(umap_transformed_data)

            # Calculate reconstruction error (using pairwise distance as a proxy)
            original_distances = pairwise_distances(X_scaled)
            reduced_distances = pairwise_distances(umap_transformed_data)
            reconstruction_error = np.mean((original_distances - reduced_distances) ** 2)

            # Update the best error and index
            if reconstruction_error < min_error:
                min_error = reconstruction_error
                best_idx = idx

            # Create DataFrame with first column
            output_df = pd.DataFrame(umap_transformed_data)
            output_df.insert(0, 'original_column', first_column)

            # Save the reduced space to a CSV file
            file_name = f"umap_ncomp{n_components}_nneigh{n_neighbors}_mindist{min_dist}_metric{metric}_error{reconstruction_error:.4f}.csv"
            file_path = os.path.join(output_folder, file_name)
            pd.DataFrame(output_df).to_csv(file_path, index=False)
            print(f"Saved Umap result to {file_path}")

            # Plot the reduced space in the corresponding subplot
            ax = axes[idx]
            ax.scatter(tsne_result[:, 0], tsne_result[:, 1], c='blue', alpha=0.6, edgecolors='k')
            ax.set_title(f"n_components={n_components}, n_neighbors={n_neighbors}, min_dist={min_dist}, metric={metric}\nError: {reconstruction_error:.4f}")
            ax.set_xlabel("t-SNE Dimension 1")
            ax.set_ylabel("t-SNE Dimension 2")
            ax.grid(True)

            # Store results
            results.append({
                'params': {'n_components': n_components, 'n_neighbors': n_neighbors, 'min_dist': min_dist, 'metric': metric},
                'transformed_data': umap_transformed_data,
                'reconstruction_error': reconstruction_error
            })


        except ValueError as ve:
            print(f"ValueError for n_components={n_components}, n_neighbors={n_neighbors}, min_dist={min_dist}: {str(ve)}")
        except Exception as e:
            print(f"Unexpected error for n_components={n_components}, n_neighbors={n_neighbors}, min_dist={min_dist}: {str(e)}")

    # Highlight the best subplot
    if best_idx is not None:
        axes[best_idx].set_title(
            axes[best_idx].get_title(),
            fontweight='bold',
            color='red'
        )

    # Remove empty subplots
    for i in range(len(param_combinations), len(axes)):
        fig.delaxes(axes[i])

    plt.tight_layout()
    plt.show()

    return results


In [None]:
# Example of how to call the function with a parameter grid
param_grid = {
    'n_components': [2, 3, 5, 10, 20],
    'n_neighbors': [20, 40, 60],
    'min_dist': [0.1, 0.3, 0.7],
    'metric': ['euclidean', 'cosine']
}

# Run the function
results = run_umap_with_tsne(X, param_grid, first_column)


## Tf_idf_2 ##

In [None]:
# prompt: save my csv file in a dataframe

import pandas as pd

# Assuming your CSV file is named 'your_file.csv'
# Replace 'your_file.csv' with the actual filename
try:
  df = pd.read_csv('tf_idf_documentos_2.csv')
  print("File loaded successfully!")
except FileNotFoundError:
  print("Error: File not found. Please check the filename and ensure it exists in the current directory.")
except pd.errors.EmptyDataError:
  print("Error: The CSV file is empty.")
except pd.errors.ParserError:
  print("Error: There was an issue parsing the CSV file. Please check its format.")
except Exception as e:
  print(f"An unexpected error occurred: {e}")
else:
  # You can now work with the DataFrame 'df'
  print(df.head()) # Print the first few rows to verify

In [None]:
# prompt: visualize my df with tsne

import pandas as pd
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

# Assuming your data for t-SNE is in columns 1 onwards (exclude the first column if it's an index)
X = df.iloc[:, 1:]  # Adjust the slicing if your data is in different columns


tsne = TSNE(n_components=2, random_state=42)  # Adjust parameters as needed
X_tsne = tsne.fit_transform(X)

plt.figure(figsize=(10, 8))
plt.scatter(X_tsne[:, 0], X_tsne[:, 1])
plt.title('t-SNE Visualization')
plt.xlabel('t-SNE Component 1')
plt.ylabel('t-SNE Component 2')
plt.show()

In [None]:
file_path = 'tf_idf_documentos_2.csv'
X, first_column = read_and_prepare_data(file_path, 'standard')

# Optional: print DataFrame details
print("DataFrame shape:", X.shape)
print("First column details:", first_column)

In [None]:
param_grid = {'n_components': [2, 3, 10, 20, 30, 40, 50, 60], 'n_neighbors': [10, 20, 30, 40, 50, 60]}
results = run_isomap_with_tsne(X, param_grid, first_column)

# Display reconstruction errors
for result in results:
    print(f"Params: {result['params']}, Reconstruction Error: {result['reconstruction_error']}")

In [None]:
param_grid_sparse = {
    'type': ['sparse'],
    'encoding_dim': [2, 3, 10, 20, 30, 40, 50, 60],
    'sparsity': [1e-4, 1e-5, 1e-6, 1e-7]  # Only used for sparse autoencoder
}

param_grid_denoising = {
    'type': ['denoising'],
    'encoding_dim': [2, 3, 10, 20, 30, 40, 50, 60],
    'dropout_rate': [0.1, 0.2, 0.3, 0.4]  # Only used for denoising autoencoder
}

# Run the function
results_sparse = run_autoencoders_with_tsne(X, param_grid_sparse, first_column)
results_denoising = run_autoencoders_with_tsne(X, param_grid_denoising, first_column)


In [None]:
# Example of how to call the function with a parameter grid
param_grid = {
    'n_components': [2, 3, 5, 10, 20],
    'n_neighbors': [20, 40, 60],
    'min_dist': [0.1, 0.3, 0.7],
    'metric': ['euclidean', 'cosine']
}

# Run the function
results = run_umap_with_tsne(X, param_grid, first_column)


In [None]:
# prompt: zip my three folders Autoencoders, Isomap and Umap into a single zip file but keep each folder

import shutil
import os

def zip_folders(folder_names, zip_file_name):
  """Zips multiple folders into a single zip file, preserving folder structure."""
  import zipfile

  try:
    with zipfile.ZipFile(zip_file_name, 'w', zipfile.ZIP_DEFLATED) as zipf:
      for folder_name in folder_names:
        for root, _, files in os.walk(folder_name):
          for file in files:
            file_path = os.path.join(root, file)
            arcname = os.path.relpath(file_path, os.path.dirname(folder_name)) # Preserve folder structure
            zipf.write(file_path, arcname=arcname)
    print(f"Successfully zipped folders into '{zip_file_name}'")

  except FileNotFoundError:
    print(f"One or more folders not found.")
  except Exception as e:
    print(f"An error occurred: {e}")


# Example usage
folder_names = ["Autoencoders", "Isomap", "Umap", "Autoencoders_2", "Isomap_2", "Umap_2"]  # Replace with your actual folder names
zip_file_name = "dimensionality_reduction_results.zip"
zip_folders(folder_names, zip_file_name)


# Clustering #

In [None]:
from sklearn.metrics import (silhouette_score, calinski_harabasz_score, davies_bouldin_score)
from sklearn.cluster import KMeans
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import joblib


def kmeans_clustering(X, k_range=range(2, 9), title_prefix="", model_name=""):
    # Convertir X a DataFrame si no lo es
    if isinstance(X, np.ndarray):
        X = pd.DataFrame(X, columns=[f'Feature_{i+1}' for i in range(X.shape[1])])

    # Inicializar listas para almacenar las métricas
    inertia = []
    silhouette_scores = []
    calinski_scores = []
    davies_bouldin_scores = []

    # Configurar subplots para una cuadrícula de 3x3 con un tamaño de 15x15 pulgadas
    fig, axes = plt.subplots(7, 3, figsize=(20, 50))
    axes = axes.flatten()  # Aplanar para iterar fácilmente

    for i, k in enumerate(k_range):
        kmeans = KMeans(n_clusters=k, random_state=42)
        kmeans.fit(X)

        # Guardar la inercia (para el Elbow Plot)
        inertia.append(kmeans.inertia_)

        # Calcular y guardar las métricas
        labels = kmeans.labels_
        silhouette = silhouette_score(X, labels)
        silhouette_scores.append(silhouette)

        calinski = calinski_harabasz_score(X, labels)
        calinski_scores.append(calinski)

        davies_bouldin = davies_bouldin_score(X, labels)
        davies_bouldin_scores.append(davies_bouldin)

        # Guardar el modelo con nombre basado en las métricas
        model_filename = f"classification_model/{model_name}_k{k}_silh{silhouette:.2f}_calinski{calinski:.2f}_davies{davies_bouldin:.2f}.joblib"
        joblib.dump(kmeans, model_filename)
        print(f"Model saved as: {model_filename}")

        # Visualizar clustering en 2D con métricas
        axes[i].scatter(X.iloc[:, 0], X.iloc[:, 1], c=labels, cmap='viridis', s=30, alpha=0.6)
        axes[i].scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1],
                        c='red', marker='x', s=100, linewidths=2, label='Centroids')
        axes[i].set_title(f'{title_prefix}Clusters: {k}\nSilhouette: {silhouette:.2f}\n'
                          f'Calinski: {calinski:.2f}\nDavies-Bouldin: {davies_bouldin:.2f}')
        axes[i].legend()

    # Ocultar cualquier subplot adicional no utilizado
    for j in range(i + 1, len(axes)):
        fig.delaxes(axes[j])

    plt.tight_layout()
    plt.show()

    # Crear el Elbow Plot
    plt.figure(figsize=(10, 6))
    plt.plot(k_range, inertia, marker='o', color='b', label='Inercia (Elbow)')
    plt.xlabel('Número de clusters (k)')
    plt.ylabel('Inercia')
    plt.title('Método Elbow para el número óptimo de clusters')
    plt.grid(True)
    plt.legend()
    plt.show()

    # Graficar métricas adicionales
    fig, axes = plt.subplots(1, 3, figsize=(18, 6))

    # Silhouette Score
    axes[0].plot(k_range, silhouette_scores, marker='o', color='g', label='Silhouette Score')
    axes[0].set_xlabel('Número de clusters (k)')
    axes[0].set_ylabel('Silhouette Score')
    axes[0].set_title('Silhouette Score (más alto es mejor)')
    axes[0].grid(True)
    axes[0].legend()

    # Calinski-Harabasz Score
    axes[1].plot(k_range, calinski_scores, marker='o', color='r', label='Calinski-Harabasz Score')
    axes[1].set_xlabel('Número de clusters (k)')
    axes[1].set_ylabel('Calinski-Harabasz Score')
    axes[1].set_title('Calinski-Harabasz Score (más alto es mejor)')
    axes[1].grid(True)
    axes[1].legend()

    # Davies-Bouldin Score
    axes[2].plot(k_range, davies_bouldin_scores, marker='o', color='purple', label='Davies-Bouldin Score')
    axes[2].set_xlabel('Número de clusters (k)')
    axes[2].set_ylabel('Davies-Bouldin Score')
    axes[2].set_title('Davies-Bouldin Score (más bajo es mejor)')
    axes[2].grid(True)
    axes[2].legend()

    plt.tight_layout()
    plt.show()


In [None]:
from sklearn.mixture import GaussianMixture
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import joblib

def gmm_clustering_with_metrics(X, k_range=range(2, 9), title_prefix="", model_name=""):
    # Convertir X a DataFrame si no lo es
    if isinstance(X, np.ndarray):
        X = pd.DataFrame(X, columns=[f'Feature_{i+1}' for i in range(X.shape[1])])

    # Inicializar listas para almacenar las métricas
    silhouette_scores = []
    calinski_scores = []
    davies_bouldin_scores = []

    # Configurar subplots para una cuadrícula de 3x3 con un tamaño de 15x15 pulgadas
    fig, axes = plt.subplots(7, 3, figsize=(20, 50))
    axes = axes.flatten()  # Aplanar para iterar fácilmente

    for i, k in enumerate(k_range):
        gmm = GaussianMixture(n_components=k, random_state=42)
        gmm.fit(X)

        # Obtener las etiquetas de clúster
        labels = gmm.predict(X)

        # Calcular y guardar las métricas
        silhouette = silhouette_score(X, labels)
        silhouette_scores.append(silhouette)

        calinski = calinski_harabasz_score(X, labels)
        calinski_scores.append(calinski)

        davies_bouldin = davies_bouldin_score(X, labels)
        davies_bouldin_scores.append(davies_bouldin)

        # Guardar el modelo con nombre basado en las métricas
        model_filename = f"classification_model/{model_name}_k{k}_silh{silhouette:.2f}_calinski{calinski:.2f}_davies{davies_bouldin:.2f}.joblib"
        joblib.dump(gmm, model_filename)
        print(f"Model saved as: {model_filename}")

        # Visualizar clustering en 2D con métricas
        axes[i].scatter(X.iloc[:, 0], X.iloc[:, 1], c=labels, cmap='viridis', s=30, alpha=0.6)
        axes[i].scatter(gmm.means_[:, 0], gmm.means_[:, 1],
                        c='red', marker='x', s=100, linewidths=2, label='Centroids')
        axes[i].set_title(f'{title_prefix}Clusters: {k}\nSilhouette: {silhouette:.2f}\n'
                          f'Calinski: {calinski:.2f}\nDavies-Bouldin: {davies_bouldin:.2f}')
        axes[i].legend()

    # Ocultar cualquier subplot adicional no utilizado
    for j in range(i + 1, len(axes)):
        fig.delaxes(axes[j])

    plt.tight_layout()
    plt.show()

    # Graficar métricas adicionales
    fig, axes = plt.subplots(1, 3, figsize=(18, 6))

    # Silhouette Score
    axes[0].plot(k_range, silhouette_scores, marker='o', color='g', label='Silhouette Score')
    axes[0].set_xlabel('Número de clusters (k)')
    axes[0].set_ylabel('Silhouette Score')
    axes[0].set_title('Silhouette Score (más alto es mejor)')
    axes[0].grid(True)
    axes[0].legend()

    # Calinski-Harabasz Score
    axes[1].plot(k_range, calinski_scores, marker='o', color='r', label='Calinski-Harabasz Score')
    axes[1].set_xlabel('Número de clusters (k)')
    axes[1].set_ylabel('Calinski-Harabasz Score')
    axes[1].set_title('Calinski-Harabasz Score (más alto es mejor)')
    axes[1].grid(True)
    axes[1].legend()

    # Davies-Bouldin Score
    axes[2].plot(k_range, davies_bouldin_scores, marker='o', color='purple', label='Davies-Bouldin Score')
    axes[2].set_xlabel('Número de clusters (k)')
    axes[2].set_ylabel('Davies-Bouldin Score')
    axes[2].set_title('Davies-Bouldin Score (más bajo es mejor)')
    axes[2].grid(True)
    axes[2].legend()

    plt.tight_layout()
    plt.show()


In [None]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler
def read_dataset(file_path):
    df = pd.read_csv(file_path)
    X_latent = df.drop('original_column', axis=1).values
    scaler = StandardScaler()
    X_latent = scaler.fit_transform(X_latent)
    return X_latent

In [None]:
def evaluate_space_selection(path, k_range=range(2, 23), title_prefix="Kmeans", model_name = ""):
    X = read_dataset(path)
    kmeans_clustering(X, k_range, title_prefix = 'Kmeans', model_name = 'Kmeans')
    gmm_clustering_with_metrics(X, k_range, title_prefix = 'GMM', model_name = 'GMM')


In [None]:
# prompt: create a folder called classification_model

import os

# Create the directory if it doesn't exist
if not os.path.exists("classification_model"):
    os.makedirs("classification_model")
    print("Directory 'classification_model' created successfully")
else:
    print("Directory 'classification_model' already exists")

## Tf_idf ##

In [None]:
path = '/content/isomap_ncomp20_nneigh40_error346.4654.csv'
evaluate_space_selection(path)

In [None]:
path = '/content/autoencoder_typesparse_dim60_param1e-05_error0.8782.csv'
evaluate_space_selection(path)

In [None]:
path = '/content/umap_ncomp2_nneigh20_mindist0.7_metriccosine_error2006.8442.csv'
evaluate_space_selection(path)

## Tf_idf_2 ##

In [None]:
path = '/content/isomap_ncomp30_nneigh60_error288.6896.csv'
evaluate_space_selection(path)

In [None]:
path = '/content/autoencoder_typesparse_dim60_param1e-05_error0.8737.csv'
evaluate_space_selection(path)

In [None]:
path = '/content/umap_ncomp2_nneigh20_mindist0.7_metriccosine_error2009.5221.csv'
evaluate_space_selection(path)

# Reduction Test for Umap #

In [None]:
import umap
import os
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.manifold import TSNE
from sklearn.metrics import pairwise_distances
from itertools import product
import math

def run_umap_with_tsne(X_scaled, X_query, param_grid, first_column_doc, first_column_query):
    """Run UMAP with different parameter combinations and plot reduced space with t-SNE."""
    results = []
    min_error = float('inf')  # Track the minimum reconstruction error
    best_idx = None           # Track the index of the best parameter combination

    # Folder for saving CSVs
    output_folder = "Umap_2"
    os.makedirs(output_folder, exist_ok=True)

    # Get the total number of parameter combinations
    param_combinations = list(product(
        param_grid['n_components'],
        param_grid['n_neighbors'],
        param_grid['min_dist'],
        param_grid['metric']
    ))

    # Iterate through each combination of n_components, n_neighbors, and min_dist
    for idx, (n_components, n_neighbors, min_dist, metric) in enumerate(param_combinations):
        try:

            # Validate parameters
            if n_components > X_scaled.shape[1]:
                print(f"Skipping n_components={n_components}: exceeds feature dimensions.")
                continue
            if n_neighbors >= X_scaled.shape[0]:
                print(f"Skipping n_neighbors={n_neighbors}: exceeds number of data points.")
                continue

            # Run UMAP with the new parameter
            umap_model = umap.UMAP(
                n_neighbors=n_neighbors,
                n_components=n_components,
                min_dist=min_dist,
                metric=metric,
                random_state=42
            )
            umap_transformed_data = umap_model.fit_transform(X_scaled)

            # Create DataFrame with first column
            output_df = pd.DataFrame(umap_transformed_data)
            output_df.insert(0, 'original_column', first_column_doc)

            # Save the reduced space to a CSV file
            file_name = f"umap_ncomp{n_components}_nneigh{n_neighbors}_mindist{min_dist}_metric{metric}.csv"
            file_path = os.path.join(output_folder, file_name)
            pd.DataFrame(output_df).to_csv(file_path, index=False)
            print(f"Saved Umap doc result to {file_path}")

            umap_transformed_data_query = umap_model.transform(X_query)

            output_df_query = pd.DataFrame(umap_transformed_data_query)
            output_df_query.insert(0, 'original_column', first_column_query)

            file_name_query = f"umap_query_ncomp{n_components}_nneigh{n_neighbors}_mindist{min_dist}_metric{metric}.csv"
            file_path_query = os.path.join(output_folder, file_name_query)
            pd.DataFrame(output_df_query).to_csv(file_path_query, index=False)
            print(f"Saved Umap query result to {file_path_query}")

        except ValueError as ve:
            print(f"ValueError for n_components={n_components}, n_neighbors={n_neighbors}, min_dist={min_dist}: {str(ve)}")
        except Exception as e:
            print(f"Unexpected error for n_components={n_components}, n_neighbors={n_neighbors}, min_dist={min_dist}: {str(e)}")

    return results


In [None]:
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.metrics import pairwise_distances_argmin_min

def cluster_documents_and_assign_queries(doc_csv, query_csv, n_clusters, output_csv):
    """
    Clusters documents using K-Means and assigns queries to the closest cluster.

    Parameters:
    - doc_csv: Path to the CSV file containing reduced dimensionality of documents.
    - query_csv: Path to the CSV file containing reduced dimensionality of queries.
    - n_clusters: Number of clusters for K-Means.
    - output_csv: File path to save query-to-cluster assignment results.
    """
    # Load the reduced dimensions for documents and queries
    doc_data = pd.read_csv(doc_csv)
    query_data = pd.read_csv(query_csv)

    # Extract feature vectors and document/query names
    document_vectors = doc_data[['0', '1']].values
    document_names = doc_data['original_column'].values  # Document names

    query_vectors = query_data[['0', '1']].values
    query_names = query_data['original_column'].values  # Query names

    # Apply K-Means clustering to documents
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    kmeans.fit(document_vectors)

    # Get the cluster assignments for documents
    doc_clusters = kmeans.labels_

    # Assign queries to the closest cluster
    query_cluster_indices, _ = pairwise_distances_argmin_min(query_vectors, kmeans.cluster_centers_)

    # Prepare query-cluster matches
    query_matches = []
    for query_name, query_cluster_idx in zip(query_names, query_cluster_indices):
        # Get documents belonging to the same cluster
        matched_docs = [
            document_names[idx] for idx, cluster in enumerate(doc_clusters) if cluster == query_cluster_idx
        ]
        query_matches.append({
            'Query': query_name,
            'Assigned_Cluster': query_cluster_idx,
            'Matched_Documents': ', '.join(matched_docs)  # Join matched document names
        })

    # Save results to a CSV
    matches_df = pd.DataFrame(query_matches)
    matches_df.to_csv(output_csv, index=False)
    print(f"Query-to-cluster assignments saved to {output_csv}")


## Tf_idf ##

In [None]:
file_path_documents = 'tf_idf_documentos.csv'
X_doc, first_column_doc = read_and_prepare_data(file_path_documents, 'standard')

file_path_queries = 'queries_vector.csv'
X_queries, first_column_queries = read_and_prepare_data(file_path_queries, 'standard')

In [None]:
param_grid = {
    'n_components': [2],
    'n_neighbors': [20],
    'min_dist': [0.7],
    'metric': ['cosine']
}

run_umap_with_tsne(X_doc, X_queries, param_grid, first_column_doc, first_column_queries)

In [None]:
# Example usage
doc_csv = "/content/Umap/umap_ncomp2_nneigh20_mindist0.7_metriccosine.csv"   # CSV with reduced dimensions of documents
query_csv = "/content/Umap/umap_query_ncomp2_nneigh20_mindist0.7_metriccosine.csv"   # CSV with reduced dimensions of queries
output_csv = "query_cluster_matches.csv"  # Output CSV with matches
n_clusters = 6  # Number of neighbors for KNN

cluster_documents_and_assign_queries(doc_csv, query_csv, n_clusters, output_csv)


In [None]:
import pandas as pd
import numpy as np

# Load the queries CSV
query_csv = "/content/Umap/umap_query_ncomp2_nneigh20_mindist0.7_metriccosine.csv"
query_data = pd.read_csv(query_csv)

document_csv = "/content/Umap/umap_ncomp2_nneigh20_mindist0.7_metriccosine.csv"
document_data = pd.read_csv(document_csv)


In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

def calculate_similarity_for_selected_documents(documents_df, query_vector, selected_documents):
    # Filter the DataFrame to include only the selected documents
    selected_df = documents_df[documents_df['original_column'].isin(selected_documents)]

    # Extract the vectors of the selected documents
    selected_vectors = selected_df.iloc[:, 1:].values  # Excludes 'original_column'

    # Compute cosine similarity between each selected document vector and the query vector
    similarities = cosine_similarity(selected_vectors, query_vector.reshape(1, -1)).flatten()

    # Create a result DataFrame with document names and their similarity scores
    results = pd.DataFrame({
        'document': selected_df['original_column'].values,
        'similarity': similarities
    })

    # Sort the results by similarity in descending order
    results = results.sort_values(by='similarity', ascending=False).reset_index(drop=True)

    return results

In [None]:
import numpy as np

def get_similar_documents_manhattan(query_vector, document_data, selected_documents):

    # Filter the document data to include only the selected documents
    filtered_data = document_data[document_data['original_column'].isin(selected_documents)]

    # Extract the document names and their corresponding vectors
    document_names = filtered_data['original_column'].values
    document_vectors = filtered_data[['0', '1']].values  # Adjust if there are more dimensions

    # Compute Manhattan distances between the query vector and all selected document vectors
    distances = np.sum(np.abs(document_vectors - query_vector), axis=1)

    # Combine document names with their distances and sort by distance
    results = list(zip(document_names, distances))
    results.sort(key=lambda x: x[1])  # Sort by distance (ascending)

    return results


In [None]:
import numpy as np

def get_similar_documents_pearson(query_vector, document_data, selected_documents):
    # Filter the document data to include only the selected documents
    filtered_data = document_data[document_data['original_column'].isin(selected_documents)]

    # Extract the document names and their corresponding vectors
    document_names = filtered_data['original_column'].values
    document_vectors = filtered_data[['0', '1']].values  # Adjust if there are more dimensions

    # Calculate Pearson correlation between the query vector and all document vectors
    correlations = []
    for doc_vector in document_vectors:
        # Pearson correlation: np.corrcoef returns a correlation matrix; take the off-diagonal element
        correlation = np.corrcoef(query_vector, doc_vector)[0, 1]
        correlations.append(correlation)

    # Combine document names with their correlations and sort by correlation (descending)
    results = list(zip(document_names, correlations))
    results.sort(key=lambda x: x[1], reverse=True)  # Sort by correlation (descending)

    return results


### Cosine ###

In [None]:
# Example usage
document_csv = "/content/Umap/umap_ncomp2_nneigh20_mindist0.7_metriccosine.csv"
query_csv = "/content/Umap/umap_query_ncomp2_nneigh20_mindist0.7_metriccosine.csv"

# Load data
document_data = pd.read_csv(document_csv)
query_data = pd.read_csv(query_csv)

# Example: Use the first query vector
query_vector = query_data[['0', '1']].iloc[0].values

# Select specific document names
selected_documents = [
    "32_Emprendimiento.txt.txt",
    "65_Alta_Cocina_Mundial.txt.txt",
    "110_Creatividad_Empresarial.txt.txt",
    "63_Servicio_y_Protocolo.txt.txt",
    "67_Innovacion_Culinaria.txt.txt",
    "47_Conceptos_y_Tecnicas_1.txt.txt",
    "62_Pasteleria.txt.txt",
    "100_Principios_de_Seguros.txt.txt",
    "56_Carniceria.txt.txt",
    "64_Practica_Culinaria_2.txt.txt",
    "135_Gestion_y_Produccion_Cultural.txt.txt",
    "66_Reposteria_y_Chocolateria.txt.txt",
    "53_Conceptos_y_Tecnicas_2_+PRA.txt.txt",
    "54_Practica_Culinaria_1.txt.txt",
    "50_Nutricion_Humana_+Lab.txt.txt",
    "115_Tributacion_y_Entorno_Legal.txt.txt",
    "69_Administracion_de_Eventos.txt.txt",
    "61_Alta_Cocina_Francesa.txt.txt",
    "97_Principios_de_Marketing.txt.txt",
    "57_Alta_Cocina_Ecuatoriana.txt.txt",
    "68_Enologia_y_Cocteleria.txt.txt",
    "45_Proyecto_Integrador_CMP.txt.txt",
    "113_Negocios_Internacionales.txt.txt",
    "112_Gestion_del_Talento.txt.txt",
    "111_Proyectos_Empresariales.txt.txt",
    "104_Finanzas_Corporativas.txt.txt",
    "49_Panaderia.txt.txt"
]
 # Replace with the actual document names

# Calculate similarity for selected documents
similarity_results = calculate_similarity_for_selected_documents(document_data, query_vector, selected_documents)

print(similarity_results)

In [None]:
# Example: Use the first query vector
query_vector = query_data[['0', '1']].iloc[1].values

# Select specific document names
selected_documents = [
    "7_Ingles_Nivel_2.txt.txt",
    "119_Composicion_Visual_1.txt.txt",
    "178_Sonido.txt.txt",
    "125_Nuevos_Medios.txt.txt",
    "176_Lenguaje_Visual_y_Montaje.txt.txt",
    "117_Herramientas_Digitales_1.txt.txt",
    "123_Fundamentos_de_Escultura.txt.txt",
    "180_DiseÃ±o_de_Produccion.txt.txt",
    "169_Improvisacion.txt.txt",
    "14_Programacion_Avanzada_en_C++.txt.txt",
    "143_Programacion_Para_DiseÃ±o_1.txt.txt",
    "145_Programacion_Para_DiseÃ±o_3.txt.txt",
    "94_Zoologia_+Lab.txt.txt",
    "95_Fisiologia_+Lab.txt.txt",
    "173_Performance.txt.txt",
    "96_Evolucion.txt.txt",
    "172_Composicion.txt.txt",
    "122_Fotografia_1.txt.txt",
    "144_Programacion_Para_DiseÃ±o_2.txt.txt",
    "170_Danza_Moderna_2.txt.txt",
    "175_Lenguaje_del_Cine.txt.txt",
    "177_Cinematografia.txt.txt",
    "6_Ingles_Nivel_1.txt.txt",
    "179_Storytelling.txt.txt",
    "146_Juegos_y_Narrativa.txt.txt"
]
  # Replace with the actual document names

# Calculate similarity for selected documents
similarity_results = calculate_similarity_for_selected_documents(document_data, query_vector, selected_documents)

print(similarity_results)

In [None]:
# Example: Use the first query vector
query_vector = query_data[['0', '1']].iloc[2].values

# Select specific document names
selected_documents = [
    "7_Ingles_Nivel_2.txt.txt",
    "119_Composicion_Visual_1.txt.txt",
    "178_Sonido.txt.txt",
    "125_Nuevos_Medios.txt.txt",
    "176_Lenguaje_Visual_y_Montaje.txt.txt",
    "117_Herramientas_Digitales_1.txt.txt",
    "123_Fundamentos_de_Escultura.txt.txt",
    "180_DiseÃ±o_de_Produccion.txt.txt",
    "169_Improvisacion.txt.txt",
    "14_Programacion_Avanzada_en_C++.txt.txt",
    "143_Programacion_Para_DiseÃ±o_1.txt.txt",
    "145_Programacion_Para_DiseÃ±o_3.txt.txt",
    "94_Zoologia_+Lab.txt.txt",
    "95_Fisiologia_+Lab.txt.txt",
    "173_Performance.txt.txt",
    "96_Evolucion.txt.txt",
    "172_Composicion.txt.txt",
    "122_Fotografia_1.txt.txt",
    "144_Programacion_Para_DiseÃ±o_2.txt.txt",
    "170_Danza_Moderna_2.txt.txt",
    "175_Lenguaje_del_Cine.txt.txt",
    "177_Cinematografia.txt.txt",
    "6_Ingles_Nivel_1.txt.txt",
    "179_Storytelling.txt.txt",
    "146_Juegos_y_Narrativa.txt.txt"
]
# Replace with the actual document names

# Calculate similarity for selected documents
similarity_results = calculate_similarity_for_selected_documents(document_data, query_vector, selected_documents)

print(similarity_results)

In [None]:
# Example: Use the first query vector
query_vector = query_data[['0', '1']].iloc[3].values

# Select specific document names
selected_documents = [
    "105_Marketing_Digital.txt.txt",
    "185_Ingenieria_de_la_Calidad_+_Lab.txt.txt",
    "101_Analisis_de_Datos.txt.txt",
    "183_Procesos,_Metodos_y_Estandares.txt.txt",
    "116_Analisis_Estrategico_ADM.txt.txt",
    "44_Aplicaciones_Distribuidas.txt.txt",
    "36_Aprendizaje_Automatico.txt.txt",
    "181_Inv._de_Operaciones_1_+Lab.txt.txt",
    "186_Sistemas_Lean.txt.txt",
    "55_Administracion_de_A_&_B.txt.txt",
    "42_Seguridad_Informatica.txt.txt",
    "99_Gerencia_de_Costos.txt.txt",
    "30_Programacion_Avanzada_de_Apps.txt.txt",
    "182_Control_de_Produccion.txt.txt",
    "106_Negociacion.txt.txt",
    "109_Investigacion_de_Mercados.txt.txt",
    "21_Programacion_de_Apps.txt.txt",
    "43_Mineria_de_Datos.txt.txt",
    "16_Introduccion_a_la_Economia.txt.txt",
    "108_Economia_y_Negocios.txt.txt",
    "98_Estadistica_Empresarial_+Lab.txt.txt",
    "48_Principios_de_Administracion.txt.txt",
    "52_Contabilidad_Empresarial.txt.txt",
    "107_Operaciones_+Lab.txt.txt",
    "31_Organizacion_de_Computadores.txt.txt",
    "46_Matematica_Empresarial_+Ej.txt.txt",
    "39_Proyectos__Gerencia_y_Analisis.txt.txt",
    "27_Estructuras_de_Datos.txt.txt",
    "59_Introduccion_al_Marketing_HSP.txt.txt",
    "38_Sistemas_Operativos.txt.txt",
    "70_Gerencia_Financiera_HSP.txt.txt",
    "159_Combinatoria_y_Grafos.txt.txt",
    "51_Introduccion_a_la_Hospitalidad.txt.txt",
    "37_Redes_+Lab.txt.txt",
    "33_DiseÃ±o_de_Sistemas.txt.txt"
]
  # Replace with the actual document names

# Calculate similarity for selected documents
similarity_results = calculate_similarity_for_selected_documents(document_data, query_vector, selected_documents)

print(similarity_results)

In [None]:
# Example: Use the first query vector
query_vector = query_data[['0', '1']].iloc[4].values

# Select specific document names
selected_documents = [
    "34_Inteligencia_Artificial.txt.txt",
    "3_Calculo_Diferencial_+_Ej.txt.txt",
    "155_Logica_y_Teoria_de_Conjuntos.txt.txt",
    "26_Electronica_Basica_+Lab.txt.txt",
    "161_Algebra_Lineal_2.txt.txt",
    "35_Base_de_Datos.txt.txt",
    "151_Ecuaciones_Diferenciales.txt.txt",
    "18_Calculo_Vectorial.txt.txt",
    "28_Probabilidad_y_Estadistica_+Ej.txt.txt",
    "58_Introduccion_a_la_Biologia_+Ej.txt.txt",
    "4_Quimica_General_1_+Lab_Ej.txt.txt",
    "162_Algebra_Abstracta_1.txt.txt",
    "17_Fisica_para_Ingenieria_1_+Lab_Ej.txt.txt",
    "164_Analisis_Funcional.txt.txt",
    "153_Introduccion_a_Probabilidades.txt.txt",
    "150_Variable_Compleja.txt.txt",
    "165_Algebra_Abstracta_2.txt.txt",
    "15_Matematicas_Discretas.txt.txt",
    "163_Geometria_Diferencial.txt.txt",
    "22_Algebra_Lineal_1_+Ej.txt.txt",
    "137_Matematicas_Cotidianas.txt.txt",
    "154_Fundamentos_de_Geometria.txt.txt",
    "149_Teoria_de_Grupos.txt.txt",
    "152_Calculo_para_Ciencias_2.txt.txt",
    "8_Programacion_en_C++_+Ej.txt.txt",
    "160_Ecuaciones_Diferenciales_Parciales.txt.txt",
    "76_Estadistica_para_CCSS.txt.txt",
    "9_Calculo_Integral_+_Ej.txt.txt",
    "23_Fisica_para_Ingenieria_2_+Lab_Ej.txt.txt",
    "166_Topologia_1.txt.txt",
    "148_Calculo_para_Ciencias_1.txt.txt",
    "147_Modelado_3D_1.txt.txt",
    "157_Analisis_Numerico.txt.txt",
    "156_Inferencia_Estadistica.txt.txt",
    "168_Analisis_Real.txt.txt",
    "167_Topologia_2.txt.txt",
    "158_Teoria_de_Numeros.txt.txt"
]
 # Replace with the actual document names

# Calculate similarity for selected documents
similarity_results = calculate_similarity_for_selected_documents(document_data, query_vector, selected_documents)

print(similarity_results)

### Manhattan ##

In [None]:
# Example usage
document_csv = "/content/Umap/umap_ncomp2_nneigh20_mindist0.7_metriccosine.csv"
query_csv = "/content/Umap/umap_query_ncomp2_nneigh20_mindist0.7_metriccosine.csv"

# Load data
document_data = pd.read_csv(document_csv)
query_data = pd.read_csv(query_csv)

# Example: Use the first query vector
query_vector = query_data[['0', '1']].iloc[0].values

# Select specific document names
selected_documents = [
    "32_Emprendimiento.txt.txt",
    "65_Alta_Cocina_Mundial.txt.txt",
    "110_Creatividad_Empresarial.txt.txt",
    "63_Servicio_y_Protocolo.txt.txt",
    "67_Innovacion_Culinaria.txt.txt",
    "47_Conceptos_y_Tecnicas_1.txt.txt",
    "62_Pasteleria.txt.txt",
    "100_Principios_de_Seguros.txt.txt",
    "56_Carniceria.txt.txt",
    "64_Practica_Culinaria_2.txt.txt",
    "135_Gestion_y_Produccion_Cultural.txt.txt",
    "66_Reposteria_y_Chocolateria.txt.txt",
    "53_Conceptos_y_Tecnicas_2_+PRA.txt.txt",
    "54_Practica_Culinaria_1.txt.txt",
    "50_Nutricion_Humana_+Lab.txt.txt",
    "115_Tributacion_y_Entorno_Legal.txt.txt",
    "69_Administracion_de_Eventos.txt.txt",
    "61_Alta_Cocina_Francesa.txt.txt",
    "97_Principios_de_Marketing.txt.txt",
    "57_Alta_Cocina_Ecuatoriana.txt.txt",
    "68_Enologia_y_Cocteleria.txt.txt",
    "45_Proyecto_Integrador_CMP.txt.txt",
    "113_Negocios_Internacionales.txt.txt",
    "112_Gestion_del_Talento.txt.txt",
    "111_Proyectos_Empresariales.txt.txt",
    "104_Finanzas_Corporativas.txt.txt",
    "49_Panaderia.txt.txt"
]
  # Replace with the actual document names

# Calculate similarity for selected documents
similarity_results = get_similar_documents_manhattan(query_vector, document_data, selected_documents)

for i in range(len(similarity_results)):
    print(f"Document: {similarity_results[i][0]}, Distance: {similarity_results[i][1]}")

In [None]:
# Example: Use the first query vector
query_vector = query_data[['0', '1']].iloc[1].values

# Select specific document names
selected_documents = [
    "7_Ingles_Nivel_2.txt.txt",
    "119_Composicion_Visual_1.txt.txt",
    "178_Sonido.txt.txt",
    "125_Nuevos_Medios.txt.txt",
    "176_Lenguaje_Visual_y_Montaje.txt.txt",
    "117_Herramientas_Digitales_1.txt.txt",
    "123_Fundamentos_de_Escultura.txt.txt",
    "180_DiseÃ±o_de_Produccion.txt.txt",
    "169_Improvisacion.txt.txt",
    "14_Programacion_Avanzada_en_C++.txt.txt",
    "143_Programacion_Para_DiseÃ±o_1.txt.txt",
    "145_Programacion_Para_DiseÃ±o_3.txt.txt",
    "94_Zoologia_+Lab.txt.txt",
    "95_Fisiologia_+Lab.txt.txt",
    "173_Performance.txt.txt",
    "96_Evolucion.txt.txt",
    "172_Composicion.txt.txt",
    "122_Fotografia_1.txt.txt",
    "144_Programacion_Para_DiseÃ±o_2.txt.txt",
    "170_Danza_Moderna_2.txt.txt",
    "175_Lenguaje_del_Cine.txt.txt",
    "177_Cinematografia.txt.txt",
    "6_Ingles_Nivel_1.txt.txt",
    "179_Storytelling.txt.txt",
    "146_Juegos_y_Narrativa.txt.txt"
]
 # Replace with the actual document names

# Calculate similarity for selected documents
similarity_results = get_similar_documents_manhattan(query_vector, document_data, selected_documents)

for i in range(len(similarity_results)):
    print(f"Document: {similarity_results[i][0]}, Distance: {similarity_results[i][1]}")

In [None]:
# Example: Use the first query vector
query_vector = query_data[['0', '1']].iloc[2].values

# Select specific document names
selected_documents = [
    "7_Ingles_Nivel_2.txt.txt",
    "119_Composicion_Visual_1.txt.txt",
    "178_Sonido.txt.txt",
    "125_Nuevos_Medios.txt.txt",
    "176_Lenguaje_Visual_y_Montaje.txt.txt",
    "117_Herramientas_Digitales_1.txt.txt",
    "123_Fundamentos_de_Escultura.txt.txt",
    "180_DiseÃ±o_de_Produccion.txt.txt",
    "169_Improvisacion.txt.txt",
    "14_Programacion_Avanzada_en_C++.txt.txt",
    "143_Programacion_Para_DiseÃ±o_1.txt.txt",
    "145_Programacion_Para_DiseÃ±o_3.txt.txt",
    "94_Zoologia_+Lab.txt.txt",
    "95_Fisiologia_+Lab.txt.txt",
    "173_Performance.txt.txt",
    "96_Evolucion.txt.txt",
    "172_Composicion.txt.txt",
    "122_Fotografia_1.txt.txt",
    "144_Programacion_Para_DiseÃ±o_2.txt.txt",
    "170_Danza_Moderna_2.txt.txt",
    "175_Lenguaje_del_Cine.txt.txt",
    "177_Cinematografia.txt.txt",
    "6_Ingles_Nivel_1.txt.txt",
    "179_Storytelling.txt.txt",
    "146_Juegos_y_Narrativa.txt.txt"
]
# Replace with the actual document names

# Calculate similarity for selected documents
similarity_results = get_similar_documents_manhattan(query_vector, document_data, selected_documents)

for i in range(len(similarity_results)):
    print(f"Document: {similarity_results[i][0]}, Distance: {similarity_results[i][1]}")

In [None]:
# Example: Use the first query vector
query_vector = query_data[['0', '1']].iloc[3].values

# Select specific document names
selected_documents = [
    "105_Marketing_Digital.txt.txt",
    "185_Ingenieria_de_la_Calidad_+_Lab.txt.txt",
    "101_Analisis_de_Datos.txt.txt",
    "183_Procesos,_Metodos_y_Estandares.txt.txt",
    "116_Analisis_Estrategico_ADM.txt.txt",
    "44_Aplicaciones_Distribuidas.txt.txt",
    "36_Aprendizaje_Automatico.txt.txt",
    "181_Inv._de_Operaciones_1_+Lab.txt.txt",
    "186_Sistemas_Lean.txt.txt",
    "55_Administracion_de_A_&_B.txt.txt",
    "42_Seguridad_Informatica.txt.txt",
    "99_Gerencia_de_Costos.txt.txt",
    "30_Programacion_Avanzada_de_Apps.txt.txt",
    "182_Control_de_Produccion.txt.txt",
    "106_Negociacion.txt.txt",
    "109_Investigacion_de_Mercados.txt.txt",
    "21_Programacion_de_Apps.txt.txt",
    "43_Mineria_de_Datos.txt.txt",
    "16_Introduccion_a_la_Economia.txt.txt",
    "108_Economia_y_Negocios.txt.txt",
    "98_Estadistica_Empresarial_+Lab.txt.txt",
    "48_Principios_de_Administracion.txt.txt",
    "52_Contabilidad_Empresarial.txt.txt",
    "107_Operaciones_+Lab.txt.txt",
    "31_Organizacion_de_Computadores.txt.txt",
    "46_Matematica_Empresarial_+Ej.txt.txt",
    "39_Proyectos__Gerencia_y_Analisis.txt.txt",
    "27_Estructuras_de_Datos.txt.txt",
    "59_Introduccion_al_Marketing_HSP.txt.txt",
    "38_Sistemas_Operativos.txt.txt",
    "70_Gerencia_Financiera_HSP.txt.txt",
    "159_Combinatoria_y_Grafos.txt.txt",
    "51_Introduccion_a_la_Hospitalidad.txt.txt",
    "37_Redes_+Lab.txt.txt",
    "33_DiseÃ±o_de_Sistemas.txt.txt"
]
 # Replace with the actual document names

# Calculate similarity for selected documents
similarity_results = get_similar_documents_manhattan(query_vector, document_data, selected_documents)

for i in range(len(similarity_results)):
    print(f"Document: {similarity_results[i][0]}, Distance: {similarity_results[i][1]}")

In [None]:
# Example: Use the first query vector
query_vector = query_data[['0', '1']].iloc[4].values

# Select specific document names
selected_documents = [
    "34_Inteligencia_Artificial.txt.txt",
    "3_Calculo_Diferencial_+_Ej.txt.txt",
    "155_Logica_y_Teoria_de_Conjuntos.txt.txt",
    "26_Electronica_Basica_+Lab.txt.txt",
    "161_Algebra_Lineal_2.txt.txt",
    "35_Base_de_Datos.txt.txt",
    "151_Ecuaciones_Diferenciales.txt.txt",
    "18_Calculo_Vectorial.txt.txt",
    "28_Probabilidad_y_Estadistica_+Ej.txt.txt",
    "58_Introduccion_a_la_Biologia_+Ej.txt.txt",
    "4_Quimica_General_1_+Lab_Ej.txt.txt",
    "162_Algebra_Abstracta_1.txt.txt",
    "17_Fisica_para_Ingenieria_1_+Lab_Ej.txt.txt",
    "164_Analisis_Funcional.txt.txt",
    "153_Introduccion_a_Probabilidades.txt.txt",
    "150_Variable_Compleja.txt.txt",
    "165_Algebra_Abstracta_2.txt.txt",
    "15_Matematicas_Discretas.txt.txt",
    "163_Geometria_Diferencial.txt.txt",
    "22_Algebra_Lineal_1_+Ej.txt.txt",
    "137_Matematicas_Cotidianas.txt.txt",
    "154_Fundamentos_de_Geometria.txt.txt",
    "149_Teoria_de_Grupos.txt.txt",
    "152_Calculo_para_Ciencias_2.txt.txt",
    "8_Programacion_en_C++_+Ej.txt.txt",
    "160_Ecuaciones_Diferenciales_Parciales.txt.txt",
    "76_Estadistica_para_CCSS.txt.txt",
    "9_Calculo_Integral_+_Ej.txt.txt",
    "23_Fisica_para_Ingenieria_2_+Lab_Ej.txt.txt",
    "166_Topologia_1.txt.txt",
    "148_Calculo_para_Ciencias_1.txt.txt",
    "147_Modelado_3D_1.txt.txt",
    "157_Analisis_Numerico.txt.txt",
    "156_Inferencia_Estadistica.txt.txt",
    "168_Analisis_Real.txt.txt",
    "167_Topologia_2.txt.txt",
    "158_Teoria_de_Numeros.txt.txt"
]
 # Replace with the actual document names

# Calculate similarity for selected documents
similarity_results = get_similar_documents_manhattan(query_vector, document_data, selected_documents)

for i in range(len(similarity_results)):
    print(f"Document: {similarity_results[i][0]}, Distance: {similarity_results[i][1]}")

## Tf_idf_2 ##

In [None]:
file_path_documents = 'tf_idf_documentos_2.csv'
X_doc, first_column_doc = read_and_prepare_data(file_path_documents, 'standard')

file_path_queries = 'queries_vector_2.csv'
X_queries, first_column_queries = read_and_prepare_data(file_path_queries, 'standard')

In [None]:
param_grid = {
    'n_components': [2],
    'n_neighbors': [20],
    'min_dist': [0.7],
    'metric': ['cosine']
}

run_umap_with_tsne(X_doc, X_queries, param_grid, first_column_doc, first_column_queries)

In [None]:
# Example usage
doc_csv = "/content/Umap_2/umap_ncomp2_nneigh20_mindist0.7_metriccosine.csv"   # CSV with reduced dimensions of documents
query_csv = "/content/Umap_2/umap_query_ncomp2_nneigh20_mindist0.7_metriccosine.csv"   # CSV with reduced dimensions of queries
output_csv = "query_cluster_matches_2.csv"  # Output CSV with matches
n_clusters = 3  # Number of neighbors for KNN

cluster_documents_and_assign_queries(doc_csv, query_csv, n_clusters, output_csv)

### Cosine ###

In [None]:
# Example usage
document_csv = "/content/Umap_2/umap_ncomp2_nneigh20_mindist0.7_metriccosine.csv"
query_csv = "/content/Umap_2/umap_query_ncomp2_nneigh20_mindist0.7_metriccosine.csv"

# Load data
document_data = pd.read_csv(document_csv)
query_data = pd.read_csv(query_csv)

In [None]:
# Example: Use the first query vector
query_vector = query_data[['0', '1']].iloc[0].values

# Select specific document names
selected_documents = [
    "105_Marketing_Digital.txt.txt",
    "32_Emprendimiento.txt.txt",
    "24_Aprendizaje_y_Servicio_PASEC.txt.txt",
    "65_Alta_Cocina_Mundial.txt.txt",
    "110_Creatividad_Empresarial.txt.txt",
    "63_Servicio_y_Protocolo.txt.txt",
    "67_Innovacion_Culinaria.txt.txt",
    "116_Analisis_Estrategico_ADM.txt.txt",
    "47_Conceptos_y_Tecnicas_1.txt.txt",
    "62_Pasteleria.txt.txt",
    "100_Principios_de_Seguros.txt.txt",
    "56_Carniceria.txt.txt",
    "102_Principios_de_Finanzas.txt.txt",
    "25_Cultura_Gastronomica.txt.txt",
    "55_Administracion_de_A_&_B.txt.txt",
    "64_Practica_Culinaria_2.txt.txt",
    "135_Gestion_y_Produccion_Cultural.txt.txt",
    "99_Gerencia_de_Costos.txt.txt",
    "66_Reposteria_y_Chocolateria.txt.txt",
    "53_Conceptos_y_Tecnicas_2_+PRA.txt.txt",
    "54_Practica_Culinaria_1.txt.txt",
    "50_Nutricion_Humana_+Lab.txt.txt",
    "182_Control_de_Produccion.txt.txt",
    "106_Negociacion.txt.txt",
    "115_Tributacion_y_Entorno_Legal.txt.txt",
    "93_Practica_4.txt.txt",
    "69_Administracion_de_Eventos.txt.txt",
    "61_Alta_Cocina_Francesa.txt.txt",
    "97_Principios_de_Marketing.txt.txt",
    "109_Investigacion_de_Mercados.txt.txt",
    "57_Alta_Cocina_Ecuatoriana.txt.txt",
    "16_Introduccion_a_la_Economia.txt.txt",
    "75_Planificacion_y_Evaluacion_1.txt.txt",
    "68_Enologia_y_Cocteleria.txt.txt",
    "108_Economia_y_Negocios.txt.txt",
    "48_Principios_de_Administracion.txt.txt",
    "90_Practica_3.txt.txt",
    "52_Contabilidad_Empresarial.txt.txt",
    "94_Zoologia_+Lab.txt.txt",
    "113_Negocios_Internacionales.txt.txt",
    "112_Gestion_del_Talento.txt.txt",
    "77_Neurociencia_y_Educacion.txt.txt",
    "95_Fisiologia_+Lab.txt.txt",
    "114_Innovacion_y_Sustentabilidad.txt.txt",
    "39_Proyectos__Gerencia_y_Analisis.txt.txt",
    "87_Practica_2.txt.txt",
    "96_Evolucion.txt.txt",
    "59_Introduccion_al_Marketing_HSP.txt.txt",
    "111_Proyectos_Empresariales.txt.txt",
    "104_Finanzas_Corporativas.txt.txt",
    "92_Proyecto_Integrador_EDU.txt.txt",
    "38_Sistemas_Operativos.txt.txt",
    "70_Gerencia_Financiera_HSP.txt.txt",
    "49_Panaderia.txt.txt",
    "71_Identidad_Culinaria.txt.txt",
    "51_Introduccion_a_la_Hospitalidad.txt.txt"
]
# Replace with the actual document names

# Calculate similarity for selected documents
similarity_results = calculate_similarity_for_selected_documents(document_data, query_vector, selected_documents)

print(similarity_results)

In [None]:
# Example: Use the first query vector
query_vector = query_data[['0', '1']].iloc[1].values

# Select specific document names
selected_documents = ["184_Ergonomia.txt.txt",
    "11_Ser_y_Cosmos.txt.txt",
    "89_Ensenanza_Integrada_de_CITIAM.txt.txt",
    "7_Ingles_Nivel_2.txt.txt",
    "171_Barra_para_danza_contemporanea.txt.txt",
    "129_Coloquios_ART.txt.txt",
    "10_Autoconocimiento.txt.txt",
    "119_Composicion_Visual_1.txt.txt",
    "79_Coloquios_EDU.txt.txt",
    "130_Arte_y_Contexto_Social.txt.txt",
    "178_Sonido.txt.txt",
    "125_Nuevos_Medios.txt.txt",
    "72_Fundamentos_de_la_Educacion.txt.txt",
    "85_Arte_y_Educacion.txt.txt",
    "138_Enfasis_3.txt.txt",
    "127_Laboratorio_de_Creacion_2.txt.txt",
    "176_Lenguaje_Visual_y_Montaje.txt.txt",
    "78_Planificacion_y_Evaluacion_2.txt.txt",
    "80_Ensenanza_de_Lenguaje.txt.txt",
    "60_Coloquios_Gastr.txt.txt",
    "118_Fundamentos_de_las_Artes.txt.txt",
    "91_Inclusion_y_Diversidad.txt.txt",
    "117_Herramientas_Digitales_1.txt.txt",
    "83_Ensenanza_de_Matematicas.txt.txt",
    "140_Arte_y_Educacion___Curaduria.txt.txt",
    "1_Escritura_Academica.txt.txt",
    "2_Taller_de_Ing._Cs._Computacion.txt.txt",
    "123_Fundamentos_de_Escultura.txt.txt",
    "126_Arte_Contemporaneo.txt.txt",
    "86_Bilingualism.txt.txt",
    "180_DiseÃ±o_de_Produccion.txt.txt",
    "169_Improvisacion.txt.txt",
    "13_Ingles_Nivel_4.txt.txt",
    "142_Produccion_&_Exhibicion.txt.txt",
    "81_Practica_1.txt.txt",
    "82_Metodologias_de_Ensenanza.txt.txt",
    "121_Dibujo_para_Arte_y_Diseno.txt.txt",
    "45_Proyecto_Integrador_CMP.txt.txt",
    "131_Taller_de_Investigacion.txt.txt",
    "84_Ensenanza_de_Ciencias_Sociales.txt.txt",
    "132_Enfasis_1.txt.txt",
    "20_Ingles_Nivel_6.txt.txt",
    "134_Enfasis_2.txt.txt",
    "174_Proyecto_final_en_Danza.txt.txt",
    "173_Performance.txt.txt",
    "41_Coloquios_ING.txt.txt",
    "88_Ensenanza_de_Ciencias.txt.txt",
    "29_Teoria_de_la_Computacion.txt.txt",
    "73_Teorias_del_Aprendizaje.txt.txt",
    "74_Desarrollo__NiÃ±o_y_Adolescente.txt.txt",
    "40_Practica_Pre-Profesional_PASEM.txt.txt",
    "172_Composicion.txt.txt",
    "136_Laboratorio_de_Creacion_3.txt.txt",
    "5_Cosmos.txt.txt",
    "122_Fotografia_1.txt.txt",
    "128_Taller_de_Arte_1.txt.txt",
    "170_Danza_Moderna_2.txt.txt",
    "103_Coloquios_adm.txt.txt",
    "120_Laboratorio_de_Creacion_1.txt.txt",
    "175_Lenguaje_del_Cine.txt.txt",
    "19_Ingles_Nivel_5.txt.txt",
    "177_Cinematografia.txt.txt",
    "133_Taller_de_Arte_2.txt.txt",
    "6_Ingles_Nivel_1.txt.txt",
    "179_Storytelling.txt.txt",
    "139_Taller_de_Arte_3.txt.txt",
    "141_Temas_en_Comunicacion_y_Arte.txt.txt",
    "124_Teoria_Critica_1__Arte_&_Media.txt.txt"
]
# Replace with the actual document names

# Calculate similarity for selected documents
similarity_results = calculate_similarity_for_selected_documents(document_data, query_vector, selected_documents)

print(similarity_results)

In [None]:
# Example: Use the first query vector
query_vector = query_data[['0', '1']].iloc[2].values

# Select specific document names
selected_documents = ["184_Ergonomia.txt.txt",
    "11_Ser_y_Cosmos.txt.txt",
    "89_Ensenanza_Integrada_de_CITIAM.txt.txt",
    "7_Ingles_Nivel_2.txt.txt",
    "171_Barra_para_danza_contemporanea.txt.txt",
    "129_Coloquios_ART.txt.txt",
    "10_Autoconocimiento.txt.txt",
    "119_Composicion_Visual_1.txt.txt",
    "79_Coloquios_EDU.txt.txt",
    "130_Arte_y_Contexto_Social.txt.txt",
    "178_Sonido.txt.txt",
    "125_Nuevos_Medios.txt.txt",
    "72_Fundamentos_de_la_Educacion.txt.txt",
    "85_Arte_y_Educacion.txt.txt",
    "138_Enfasis_3.txt.txt",
    "127_Laboratorio_de_Creacion_2.txt.txt",
    "176_Lenguaje_Visual_y_Montaje.txt.txt",
    "78_Planificacion_y_Evaluacion_2.txt.txt",
    "80_Ensenanza_de_Lenguaje.txt.txt",
    "60_Coloquios_Gastr.txt.txt",
    "118_Fundamentos_de_las_Artes.txt.txt",
    "91_Inclusion_y_Diversidad.txt.txt",
    "117_Herramientas_Digitales_1.txt.txt",
    "83_Ensenanza_de_Matematicas.txt.txt",
    "140_Arte_y_Educacion___Curaduria.txt.txt",
    "1_Escritura_Academica.txt.txt",
    "2_Taller_de_Ing._Cs._Computacion.txt.txt",
    "123_Fundamentos_de_Escultura.txt.txt",
    "126_Arte_Contemporaneo.txt.txt",
    "86_Bilingualism.txt.txt",
    "180_DiseÃ±o_de_Produccion.txt.txt",
    "169_Improvisacion.txt.txt",
    "13_Ingles_Nivel_4.txt.txt",
    "142_Produccion_&_Exhibicion.txt.txt",
    "81_Practica_1.txt.txt",
    "82_Metodologias_de_Ensenanza.txt.txt",
    "121_Dibujo_para_Arte_y_Diseno.txt.txt",
    "45_Proyecto_Integrador_CMP.txt.txt",
    "131_Taller_de_Investigacion.txt.txt",
    "84_Ensenanza_de_Ciencias_Sociales.txt.txt",
    "132_Enfasis_1.txt.txt",
    "20_Ingles_Nivel_6.txt.txt",
    "134_Enfasis_2.txt.txt",
    "174_Proyecto_final_en_Danza.txt.txt",
    "173_Performance.txt.txt",
    "41_Coloquios_ING.txt.txt",
    "88_Ensenanza_de_Ciencias.txt.txt",
    "29_Teoria_de_la_Computacion.txt.txt",
    "73_Teorias_del_Aprendizaje.txt.txt",
    "74_Desarrollo__NiÃ±o_y_Adolescente.txt.txt",
    "40_Practica_Pre-Profesional_PASEM.txt.txt",
    "172_Composicion.txt.txt",
    "136_Laboratorio_de_Creacion_3.txt.txt",
    "5_Cosmos.txt.txt",
    "122_Fotografia_1.txt.txt",
    "128_Taller_de_Arte_1.txt.txt",
    "170_Danza_Moderna_2.txt.txt",
    "103_Coloquios_adm.txt.txt",
    "120_Laboratorio_de_Creacion_1.txt.txt",
    "175_Lenguaje_del_Cine.txt.txt",
    "19_Ingles_Nivel_5.txt.txt",
    "177_Cinematografia.txt.txt",
    "133_Taller_de_Arte_2.txt.txt",
    "6_Ingles_Nivel_1.txt.txt",
    "179_Storytelling.txt.txt",
    "139_Taller_de_Arte_3.txt.txt",
    "141_Temas_en_Comunicacion_y_Arte.txt.txt",
    "124_Teoria_Critica_1__Arte_&_Media.txt.txt"
]

  # Replace with the actual document names

# Calculate similarity for selected documents
similarity_results = calculate_similarity_for_selected_documents(document_data, query_vector, selected_documents)

print(similarity_results)

In [None]:
# Example: Use the first query vector
query_vector = query_data[['0', '1']].iloc[3].values

# Select specific document names
selected_documents = [
    "185_Ingenieria_de_la_Calidad_+_Lab.txt.txt",
    "34_Inteligencia_Artificial.txt.txt",
    "3_Calculo_Diferencial_+_Ej.txt.txt",
    "155_Logica_y_Teoria_de_Conjuntos.txt.txt",
    "26_Electronica_Basica_+Lab.txt.txt",
    "101_Analisis_de_Datos.txt.txt",
    "161_Algebra_Lineal_2.txt.txt",
    "183_Procesos,_Metodos_y_Estandares.txt.txt",
    "35_Base_de_Datos.txt.txt",
    "44_Aplicaciones_Distribuidas.txt.txt",
    "151_Ecuaciones_Diferenciales.txt.txt",
    "18_Calculo_Vectorial.txt.txt",
    "36_Aprendizaje_Automatico.txt.txt",
    "28_Probabilidad_y_Estadistica_+Ej.txt.txt",
    "58_Introduccion_a_la_Biologia_+Ej.txt.txt",
    "181_Inv._de_Operaciones_1_+Lab.txt.txt",
    "186_Sistemas_Lean.txt.txt",
    "4_Quimica_General_1_+Lab_Ej.txt.txt",
    "42_Seguridad_Informatica.txt.txt",
    "162_Algebra_Abstracta_1.txt.txt",
    "17_Fisica_para_Ingenieria_1_+Lab_Ej.txt.txt",
    "30_Programacion_Avanzada_de_Apps.txt.txt",
    "164_Analisis_Funcional.txt.txt",
    "153_Introduccion_a_Probabilidades.txt.txt",
    "150_Variable_Compleja.txt.txt",
    "165_Algebra_Abstracta_2.txt.txt",
    "15_Matematicas_Discretas.txt.txt",
    "12_Ingles_Nivel_3.txt.txt",
    "163_Geometria_Diferencial.txt.txt",
    "22_Algebra_Lineal_1_+Ej.txt.txt",
    "21_Programacion_de_Apps.txt.txt",
    "137_Matematicas_Cotidianas.txt.txt",
    "14_Programacion_Avanzada_en_C++.txt.txt",
    "154_Fundamentos_de_Geometria.txt.txt",
    "43_Mineria_de_Datos.txt.txt",
    "149_Teoria_de_Grupos.txt.txt",
    "152_Calculo_para_Ciencias_2.txt.txt",
    "8_Programacion_en_C++_+Ej.txt.txt",
    "143_Programacion_Para_DiseÃ±o_1.txt.txt",
    "98_Estadistica_Empresarial_+Lab.txt.txt",
    "160_Ecuaciones_Diferenciales_Parciales.txt.txt",
    "76_Estadistica_para_CCSS.txt.txt",
    "9_Calculo_Integral_+_Ej.txt.txt",
    "23_Fisica_para_Ingenieria_2_+Lab_Ej.txt.txt",
    "107_Operaciones_+Lab.txt.txt",
    "145_Programacion_Para_DiseÃ±o_3.txt.txt",
    "166_Topologia_1.txt.txt",
    "148_Calculo_para_Ciencias_1.txt.txt",
    "31_Organizacion_de_Computadores.txt.txt",
    "46_Matematica_Empresarial_+Ej.txt.txt",
    "147_Modelado_3D_1.txt.txt",
    "27_Estructuras_de_Datos.txt.txt",
    "157_Analisis_Numerico.txt.txt",
    "156_Inferencia_Estadistica.txt.txt",
    "168_Analisis_Real.txt.txt",
    "144_Programacion_Para_DiseÃ±o_2.txt.txt",
    "159_Combinatoria_y_Grafos.txt.txt",
    "37_Redes_+Lab.txt.txt",
    "146_Juegos_y_Narrativa.txt.txt",
    "167_Topologia_2.txt.txt",
    "33_DiseÃ±o_de_Sistemas.txt.txt",
    "158_Teoria_de_Numeros.txt.txt"
]

# Replace with the actual document names

# Calculate similarity for selected documents
similarity_results = calculate_similarity_for_selected_documents(document_data, query_vector, selected_documents)

print(similarity_results)


In [None]:
# Example: Use the first query vector
query_vector = query_data[['0', '1']].iloc[4].values

# Select specific document names
selected_documents = ["184_Ergonomia.txt.txt",
    "11_Ser_y_Cosmos.txt.txt",
    "89_Ensenanza_Integrada_de_CITIAM.txt.txt",
    "7_Ingles_Nivel_2.txt.txt",
    "171_Barra_para_danza_contemporanea.txt.txt",
    "129_Coloquios_ART.txt.txt",
    "10_Autoconocimiento.txt.txt",
    "119_Composicion_Visual_1.txt.txt",
    "79_Coloquios_EDU.txt.txt",
    "130_Arte_y_Contexto_Social.txt.txt",
    "178_Sonido.txt.txt",
    "125_Nuevos_Medios.txt.txt",
    "72_Fundamentos_de_la_Educacion.txt.txt",
    "85_Arte_y_Educacion.txt.txt",
    "138_Enfasis_3.txt.txt",
    "127_Laboratorio_de_Creacion_2.txt.txt",
    "176_Lenguaje_Visual_y_Montaje.txt.txt",
    "78_Planificacion_y_Evaluacion_2.txt.txt",
    "80_Ensenanza_de_Lenguaje.txt.txt",
    "60_Coloquios_Gastr.txt.txt",
    "118_Fundamentos_de_las_Artes.txt.txt",
    "91_Inclusion_y_Diversidad.txt.txt",
    "117_Herramientas_Digitales_1.txt.txt",
    "83_Ensenanza_de_Matematicas.txt.txt",
    "140_Arte_y_Educacion___Curaduria.txt.txt",
    "1_Escritura_Academica.txt.txt",
    "2_Taller_de_Ing._Cs._Computacion.txt.txt",
    "123_Fundamentos_de_Escultura.txt.txt",
    "126_Arte_Contemporaneo.txt.txt",
    "86_Bilingualism.txt.txt",
    "180_DiseÃ±o_de_Produccion.txt.txt",
    "169_Improvisacion.txt.txt",
    "13_Ingles_Nivel_4.txt.txt",
    "142_Produccion_&_Exhibicion.txt.txt",
    "81_Practica_1.txt.txt",
    "82_Metodologias_de_Ensenanza.txt.txt",
    "121_Dibujo_para_Arte_y_Diseno.txt.txt",
    "45_Proyecto_Integrador_CMP.txt.txt",
    "131_Taller_de_Investigacion.txt.txt",
    "84_Ensenanza_de_Ciencias_Sociales.txt.txt",
    "132_Enfasis_1.txt.txt",
    "20_Ingles_Nivel_6.txt.txt",
    "134_Enfasis_2.txt.txt",
    "174_Proyecto_final_en_Danza.txt.txt",
    "173_Performance.txt.txt",
    "41_Coloquios_ING.txt.txt",
    "88_Ensenanza_de_Ciencias.txt.txt",
    "29_Teoria_de_la_Computacion.txt.txt",
    "73_Teorias_del_Aprendizaje.txt.txt",
    "74_Desarrollo__NiÃ±o_y_Adolescente.txt.txt",
    "40_Practica_Pre-Profesional_PASEM.txt.txt",
    "172_Composicion.txt.txt",
    "136_Laboratorio_de_Creacion_3.txt.txt",
    "5_Cosmos.txt.txt",
    "122_Fotografia_1.txt.txt",
    "128_Taller_de_Arte_1.txt.txt",
    "170_Danza_Moderna_2.txt.txt",
    "103_Coloquios_adm.txt.txt",
    "120_Laboratorio_de_Creacion_1.txt.txt",
    "175_Lenguaje_del_Cine.txt.txt",
    "19_Ingles_Nivel_5.txt.txt",
    "177_Cinematografia.txt.txt",
    "133_Taller_de_Arte_2.txt.txt",
    "6_Ingles_Nivel_1.txt.txt",
    "179_Storytelling.txt.txt",
    "139_Taller_de_Arte_3.txt.txt",
    "141_Temas_en_Comunicacion_y_Arte.txt.txt",
    "124_Teoria_Critica_1__Arte_&_Media.txt.txt"
]

# Replace with the actual document names

# Calculate similarity for selected documents
similarity_results = calculate_similarity_for_selected_documents(document_data, query_vector, selected_documents)

print(similarity_results)


### Manhattan ###

In [None]:
# Example usage
document_csv = "/content/Umap_2/umap_ncomp2_nneigh20_mindist0.7_metriccosine.csv"
query_csv = "/content/Umap_2/umap_query_ncomp2_nneigh20_mindist0.7_metriccosine.csv"

# Load data
document_data = pd.read_csv(document_csv)
query_data = pd.read_csv(query_csv)

In [None]:
# Example: Use the first query vector
query_vector = query_data[['0', '1']].iloc[0].values

# Select specific document names
selected_documents = [
    "105_Marketing_Digital.txt.txt",
    "32_Emprendimiento.txt.txt",
    "24_Aprendizaje_y_Servicio_PASEC.txt.txt",
    "65_Alta_Cocina_Mundial.txt.txt",
    "110_Creatividad_Empresarial.txt.txt",
    "63_Servicio_y_Protocolo.txt.txt",
    "67_Innovacion_Culinaria.txt.txt",
    "116_Analisis_Estrategico_ADM.txt.txt",
    "47_Conceptos_y_Tecnicas_1.txt.txt",
    "62_Pasteleria.txt.txt",
    "100_Principios_de_Seguros.txt.txt",
    "56_Carniceria.txt.txt",
    "102_Principios_de_Finanzas.txt.txt",
    "25_Cultura_Gastronomica.txt.txt",
    "55_Administracion_de_A_&_B.txt.txt",
    "64_Practica_Culinaria_2.txt.txt",
    "135_Gestion_y_Produccion_Cultural.txt.txt",
    "99_Gerencia_de_Costos.txt.txt",
    "66_Reposteria_y_Chocolateria.txt.txt",
    "53_Conceptos_y_Tecnicas_2_+PRA.txt.txt",
    "54_Practica_Culinaria_1.txt.txt",
    "50_Nutricion_Humana_+Lab.txt.txt",
    "182_Control_de_Produccion.txt.txt",
    "106_Negociacion.txt.txt",
    "115_Tributacion_y_Entorno_Legal.txt.txt",
    "93_Practica_4.txt.txt",
    "69_Administracion_de_Eventos.txt.txt",
    "61_Alta_Cocina_Francesa.txt.txt",
    "97_Principios_de_Marketing.txt.txt",
    "109_Investigacion_de_Mercados.txt.txt",
    "57_Alta_Cocina_Ecuatoriana.txt.txt",
    "16_Introduccion_a_la_Economia.txt.txt",
    "75_Planificacion_y_Evaluacion_1.txt.txt",
    "68_Enologia_y_Cocteleria.txt.txt",
    "108_Economia_y_Negocios.txt.txt",
    "48_Principios_de_Administracion.txt.txt",
    "90_Practica_3.txt.txt",
    "52_Contabilidad_Empresarial.txt.txt",
    "94_Zoologia_+Lab.txt.txt",
    "113_Negocios_Internacionales.txt.txt",
    "112_Gestion_del_Talento.txt.txt",
    "77_Neurociencia_y_Educacion.txt.txt",
    "95_Fisiologia_+Lab.txt.txt",
    "114_Innovacion_y_Sustentabilidad.txt.txt",
    "39_Proyectos__Gerencia_y_Analisis.txt.txt",
    "87_Practica_2.txt.txt",
    "96_Evolucion.txt.txt",
    "59_Introduccion_al_Marketing_HSP.txt.txt",
    "111_Proyectos_Empresariales.txt.txt",
    "104_Finanzas_Corporativas.txt.txt",
    "92_Proyecto_Integrador_EDU.txt.txt",
    "38_Sistemas_Operativos.txt.txt",
    "70_Gerencia_Financiera_HSP.txt.txt",
    "49_Panaderia.txt.txt",
    "71_Identidad_Culinaria.txt.txt",
    "51_Introduccion_a_la_Hospitalidad.txt.txt"
]
 # Replace with the actual document names

# Calculate similarity for selected documents
similarity_results = get_similar_documents_manhattan(query_vector, document_data, selected_documents)

for i in range(len(similarity_results)):
    print(f"Document: {similarity_results[i][0]}, Distance: {similarity_results[i][1]}")

In [None]:
query_vector = query_data[['0', '1']].iloc[1].values

# Select specific document names
selected_documents = ["184_Ergonomia.txt.txt",
    "11_Ser_y_Cosmos.txt.txt",
    "89_Ensenanza_Integrada_de_CITIAM.txt.txt",
    "7_Ingles_Nivel_2.txt.txt",
    "171_Barra_para_danza_contemporanea.txt.txt",
    "129_Coloquios_ART.txt.txt",
    "10_Autoconocimiento.txt.txt",
    "119_Composicion_Visual_1.txt.txt",
    "79_Coloquios_EDU.txt.txt",
    "130_Arte_y_Contexto_Social.txt.txt",
    "178_Sonido.txt.txt",
    "125_Nuevos_Medios.txt.txt",
    "72_Fundamentos_de_la_Educacion.txt.txt",
    "85_Arte_y_Educacion.txt.txt",
    "138_Enfasis_3.txt.txt",
    "127_Laboratorio_de_Creacion_2.txt.txt",
    "176_Lenguaje_Visual_y_Montaje.txt.txt",
    "78_Planificacion_y_Evaluacion_2.txt.txt",
    "80_Ensenanza_de_Lenguaje.txt.txt",
    "60_Coloquios_Gastr.txt.txt",
    "118_Fundamentos_de_las_Artes.txt.txt",
    "91_Inclusion_y_Diversidad.txt.txt",
    "117_Herramientas_Digitales_1.txt.txt",
    "83_Ensenanza_de_Matematicas.txt.txt",
    "140_Arte_y_Educacion___Curaduria.txt.txt",
    "1_Escritura_Academica.txt.txt",
    "2_Taller_de_Ing._Cs._Computacion.txt.txt",
    "123_Fundamentos_de_Escultura.txt.txt",
    "126_Arte_Contemporaneo.txt.txt",
    "86_Bilingualism.txt.txt",
    "180_DiseÃ±o_de_Produccion.txt.txt",
    "169_Improvisacion.txt.txt",
    "13_Ingles_Nivel_4.txt.txt",
    "142_Produccion_&_Exhibicion.txt.txt",
    "81_Practica_1.txt.txt",
    "82_Metodologias_de_Ensenanza.txt.txt",
    "121_Dibujo_para_Arte_y_Diseno.txt.txt",
    "45_Proyecto_Integrador_CMP.txt.txt",
    "131_Taller_de_Investigacion.txt.txt",
    "84_Ensenanza_de_Ciencias_Sociales.txt.txt",
    "132_Enfasis_1.txt.txt",
    "20_Ingles_Nivel_6.txt.txt",
    "134_Enfasis_2.txt.txt",
    "174_Proyecto_final_en_Danza.txt.txt",
    "173_Performance.txt.txt",
    "41_Coloquios_ING.txt.txt",
    "88_Ensenanza_de_Ciencias.txt.txt",
    "29_Teoria_de_la_Computacion.txt.txt",
    "73_Teorias_del_Aprendizaje.txt.txt",
    "74_Desarrollo__NiÃ±o_y_Adolescente.txt.txt",
    "40_Practica_Pre-Profesional_PASEM.txt.txt",
    "172_Composicion.txt.txt",
    "136_Laboratorio_de_Creacion_3.txt.txt",
    "5_Cosmos.txt.txt",
    "122_Fotografia_1.txt.txt",
    "128_Taller_de_Arte_1.txt.txt",
    "170_Danza_Moderna_2.txt.txt",
    "103_Coloquios_adm.txt.txt",
    "120_Laboratorio_de_Creacion_1.txt.txt",
    "175_Lenguaje_del_Cine.txt.txt",
    "19_Ingles_Nivel_5.txt.txt",
    "177_Cinematografia.txt.txt",
    "133_Taller_de_Arte_2.txt.txt",
    "6_Ingles_Nivel_1.txt.txt",
    "179_Storytelling.txt.txt",
    "139_Taller_de_Arte_3.txt.txt",
    "141_Temas_en_Comunicacion_y_Arte.txt.txt",
    "124_Teoria_Critica_1__Arte_&_Media.txt.txt"
]
# Replace with the actual document names

# Calculate similarity for selected documents
similarity_results = get_similar_documents_manhattan(query_vector, document_data, selected_documents)

for i in range(len(similarity_results)):
    print(f"Document: {similarity_results[i][0]}, Distance: {similarity_results[i][1]}")

In [None]:
query_vector = query_data[['0', '1']].iloc[2].values

# Select specific document names
selected_documents = ["184_Ergonomia.txt.txt",
    "11_Ser_y_Cosmos.txt.txt",
    "89_Ensenanza_Integrada_de_CITIAM.txt.txt",
    "7_Ingles_Nivel_2.txt.txt",
    "171_Barra_para_danza_contemporanea.txt.txt",
    "129_Coloquios_ART.txt.txt",
    "10_Autoconocimiento.txt.txt",
    "119_Composicion_Visual_1.txt.txt",
    "79_Coloquios_EDU.txt.txt",
    "130_Arte_y_Contexto_Social.txt.txt",
    "178_Sonido.txt.txt",
    "125_Nuevos_Medios.txt.txt",
    "72_Fundamentos_de_la_Educacion.txt.txt",
    "85_Arte_y_Educacion.txt.txt",
    "138_Enfasis_3.txt.txt",
    "127_Laboratorio_de_Creacion_2.txt.txt",
    "176_Lenguaje_Visual_y_Montaje.txt.txt",
    "78_Planificacion_y_Evaluacion_2.txt.txt",
    "80_Ensenanza_de_Lenguaje.txt.txt",
    "60_Coloquios_Gastr.txt.txt",
    "118_Fundamentos_de_las_Artes.txt.txt",
    "91_Inclusion_y_Diversidad.txt.txt",
    "117_Herramientas_Digitales_1.txt.txt",
    "83_Ensenanza_de_Matematicas.txt.txt",
    "140_Arte_y_Educacion___Curaduria.txt.txt",
    "1_Escritura_Academica.txt.txt",
    "2_Taller_de_Ing._Cs._Computacion.txt.txt",
    "123_Fundamentos_de_Escultura.txt.txt",
    "126_Arte_Contemporaneo.txt.txt",
    "86_Bilingualism.txt.txt",
    "180_DiseÃ±o_de_Produccion.txt.txt",
    "169_Improvisacion.txt.txt",
    "13_Ingles_Nivel_4.txt.txt",
    "142_Produccion_&_Exhibicion.txt.txt",
    "81_Practica_1.txt.txt",
    "82_Metodologias_de_Ensenanza.txt.txt",
    "121_Dibujo_para_Arte_y_Diseno.txt.txt",
    "45_Proyecto_Integrador_CMP.txt.txt",
    "131_Taller_de_Investigacion.txt.txt",
    "84_Ensenanza_de_Ciencias_Sociales.txt.txt",
    "132_Enfasis_1.txt.txt",
    "20_Ingles_Nivel_6.txt.txt",
    "134_Enfasis_2.txt.txt",
    "174_Proyecto_final_en_Danza.txt.txt",
    "173_Performance.txt.txt",
    "41_Coloquios_ING.txt.txt",
    "88_Ensenanza_de_Ciencias.txt.txt",
    "29_Teoria_de_la_Computacion.txt.txt",
    "73_Teorias_del_Aprendizaje.txt.txt",
    "74_Desarrollo__NiÃ±o_y_Adolescente.txt.txt",
    "40_Practica_Pre-Profesional_PASEM.txt.txt",
    "172_Composicion.txt.txt",
    "136_Laboratorio_de_Creacion_3.txt.txt",
    "5_Cosmos.txt.txt",
    "122_Fotografia_1.txt.txt",
    "128_Taller_de_Arte_1.txt.txt",
    "170_Danza_Moderna_2.txt.txt",
    "103_Coloquios_adm.txt.txt",
    "120_Laboratorio_de_Creacion_1.txt.txt",
    "175_Lenguaje_del_Cine.txt.txt",
    "19_Ingles_Nivel_5.txt.txt",
    "177_Cinematografia.txt.txt",
    "133_Taller_de_Arte_2.txt.txt",
    "6_Ingles_Nivel_1.txt.txt",
    "179_Storytelling.txt.txt",
    "139_Taller_de_Arte_3.txt.txt",
    "141_Temas_en_Comunicacion_y_Arte.txt.txt",
    "124_Teoria_Critica_1__Arte_&_Media.txt.txt"
]

  # Replace with the actual document names

# Calculate similarity for selected documents
similarity_results = get_similar_documents_manhattan(query_vector, document_data, selected_documents)

for i in range(len(similarity_results)):
    print(f"Document: {similarity_results[i][0]}, Distance: {similarity_results[i][1]}")

In [None]:
query_vector = query_data[['0', '1']].iloc[3].values

# Select specific document names
selected_documents = [
    "185_Ingenieria_de_la_Calidad_+_Lab.txt.txt",
    "34_Inteligencia_Artificial.txt.txt",
    "3_Calculo_Diferencial_+_Ej.txt.txt",
    "155_Logica_y_Teoria_de_Conjuntos.txt.txt",
    "26_Electronica_Basica_+Lab.txt.txt",
    "101_Analisis_de_Datos.txt.txt",
    "161_Algebra_Lineal_2.txt.txt",
    "183_Procesos,_Metodos_y_Estandares.txt.txt",
    "35_Base_de_Datos.txt.txt",
    "44_Aplicaciones_Distribuidas.txt.txt",
    "151_Ecuaciones_Diferenciales.txt.txt",
    "18_Calculo_Vectorial.txt.txt",
    "36_Aprendizaje_Automatico.txt.txt",
    "28_Probabilidad_y_Estadistica_+Ej.txt.txt",
    "58_Introduccion_a_la_Biologia_+Ej.txt.txt",
    "181_Inv._de_Operaciones_1_+Lab.txt.txt",
    "186_Sistemas_Lean.txt.txt",
    "4_Quimica_General_1_+Lab_Ej.txt.txt",
    "42_Seguridad_Informatica.txt.txt",
    "162_Algebra_Abstracta_1.txt.txt",
    "17_Fisica_para_Ingenieria_1_+Lab_Ej.txt.txt",
    "30_Programacion_Avanzada_de_Apps.txt.txt",
    "164_Analisis_Funcional.txt.txt",
    "153_Introduccion_a_Probabilidades.txt.txt",
    "150_Variable_Compleja.txt.txt",
    "165_Algebra_Abstracta_2.txt.txt",
    "15_Matematicas_Discretas.txt.txt",
    "12_Ingles_Nivel_3.txt.txt",
    "163_Geometria_Diferencial.txt.txt",
    "22_Algebra_Lineal_1_+Ej.txt.txt",
    "21_Programacion_de_Apps.txt.txt",
    "137_Matematicas_Cotidianas.txt.txt",
    "14_Programacion_Avanzada_en_C++.txt.txt",
    "154_Fundamentos_de_Geometria.txt.txt",
    "43_Mineria_de_Datos.txt.txt",
    "149_Teoria_de_Grupos.txt.txt",
    "152_Calculo_para_Ciencias_2.txt.txt",
    "8_Programacion_en_C++_+Ej.txt.txt",
    "143_Programacion_Para_DiseÃ±o_1.txt.txt",
    "98_Estadistica_Empresarial_+Lab.txt.txt",
    "160_Ecuaciones_Diferenciales_Parciales.txt.txt",
    "76_Estadistica_para_CCSS.txt.txt",
    "9_Calculo_Integral_+_Ej.txt.txt",
    "23_Fisica_para_Ingenieria_2_+Lab_Ej.txt.txt",
    "107_Operaciones_+Lab.txt.txt",
    "145_Programacion_Para_DiseÃ±o_3.txt.txt",
    "166_Topologia_1.txt.txt",
    "148_Calculo_para_Ciencias_1.txt.txt",
    "31_Organizacion_de_Computadores.txt.txt",
    "46_Matematica_Empresarial_+Ej.txt.txt",
    "147_Modelado_3D_1.txt.txt",
    "27_Estructuras_de_Datos.txt.txt",
    "157_Analisis_Numerico.txt.txt",
    "156_Inferencia_Estadistica.txt.txt",
    "168_Analisis_Real.txt.txt",
    "144_Programacion_Para_DiseÃ±o_2.txt.txt",
    "159_Combinatoria_y_Grafos.txt.txt",
    "37_Redes_+Lab.txt.txt",
    "146_Juegos_y_Narrativa.txt.txt",
    "167_Topologia_2.txt.txt",
    "33_DiseÃ±o_de_Sistemas.txt.txt",
    "158_Teoria_de_Numeros.txt.txt"
]
# Replace with the actual document names

# Calculate similarity for selected documents
similarity_results = get_similar_documents_manhattan(query_vector, document_data, selected_documents)

for i in range(len(similarity_results)):
    print(f"Document: {similarity_results[i][0]}, Distance: {similarity_results[i][1]}")

In [None]:
query_vector = query_data[['0', '1']].iloc[4].values

# Select specific document names
selected_documents = ["184_Ergonomia.txt.txt",
    "11_Ser_y_Cosmos.txt.txt",
    "89_Ensenanza_Integrada_de_CITIAM.txt.txt",
    "7_Ingles_Nivel_2.txt.txt",
    "171_Barra_para_danza_contemporanea.txt.txt",
    "129_Coloquios_ART.txt.txt",
    "10_Autoconocimiento.txt.txt",
    "119_Composicion_Visual_1.txt.txt",
    "79_Coloquios_EDU.txt.txt",
    "130_Arte_y_Contexto_Social.txt.txt",
    "178_Sonido.txt.txt",
    "125_Nuevos_Medios.txt.txt",
    "72_Fundamentos_de_la_Educacion.txt.txt",
    "85_Arte_y_Educacion.txt.txt",
    "138_Enfasis_3.txt.txt",
    "127_Laboratorio_de_Creacion_2.txt.txt",
    "176_Lenguaje_Visual_y_Montaje.txt.txt",
    "78_Planificacion_y_Evaluacion_2.txt.txt",
    "80_Ensenanza_de_Lenguaje.txt.txt",
    "60_Coloquios_Gastr.txt.txt",
    "118_Fundamentos_de_las_Artes.txt.txt",
    "91_Inclusion_y_Diversidad.txt.txt",
    "117_Herramientas_Digitales_1.txt.txt",
    "83_Ensenanza_de_Matematicas.txt.txt",
    "140_Arte_y_Educacion___Curaduria.txt.txt",
    "1_Escritura_Academica.txt.txt",
    "2_Taller_de_Ing._Cs._Computacion.txt.txt",
    "123_Fundamentos_de_Escultura.txt.txt",
    "126_Arte_Contemporaneo.txt.txt",
    "86_Bilingualism.txt.txt",
    "180_DiseÃ±o_de_Produccion.txt.txt",
    "169_Improvisacion.txt.txt",
    "13_Ingles_Nivel_4.txt.txt",
    "142_Produccion_&_Exhibicion.txt.txt",
    "81_Practica_1.txt.txt",
    "82_Metodologias_de_Ensenanza.txt.txt",
    "121_Dibujo_para_Arte_y_Diseno.txt.txt",
    "45_Proyecto_Integrador_CMP.txt.txt",
    "131_Taller_de_Investigacion.txt.txt",
    "84_Ensenanza_de_Ciencias_Sociales.txt.txt",
    "132_Enfasis_1.txt.txt",
    "20_Ingles_Nivel_6.txt.txt",
    "134_Enfasis_2.txt.txt",
    "174_Proyecto_final_en_Danza.txt.txt",
    "173_Performance.txt.txt",
    "41_Coloquios_ING.txt.txt",
    "88_Ensenanza_de_Ciencias.txt.txt",
    "29_Teoria_de_la_Computacion.txt.txt",
    "73_Teorias_del_Aprendizaje.txt.txt",
    "74_Desarrollo__NiÃ±o_y_Adolescente.txt.txt",
    "40_Practica_Pre-Profesional_PASEM.txt.txt",
    "172_Composicion.txt.txt",
    "136_Laboratorio_de_Creacion_3.txt.txt",
    "5_Cosmos.txt.txt",
    "122_Fotografia_1.txt.txt",
    "128_Taller_de_Arte_1.txt.txt",
    "170_Danza_Moderna_2.txt.txt",
    "103_Coloquios_adm.txt.txt",
    "120_Laboratorio_de_Creacion_1.txt.txt",
    "175_Lenguaje_del_Cine.txt.txt",
    "19_Ingles_Nivel_5.txt.txt",
    "177_Cinematografia.txt.txt",
    "133_Taller_de_Arte_2.txt.txt",
    "6_Ingles_Nivel_1.txt.txt",
    "179_Storytelling.txt.txt",
    "139_Taller_de_Arte_3.txt.txt",
    "141_Temas_en_Comunicacion_y_Arte.txt.txt",
    "124_Teoria_Critica_1__Arte_&_Media.txt.txt"
]
 # Replace with the actual document names

# Calculate similarity for selected documents
similarity_results = get_similar_documents_manhattan(query_vector, document_data, selected_documents)

for i in range(len(similarity_results)):
    print(f"Document: {similarity_results[i][0]}, Distance: {similarity_results[i][1]}")

# Reduction Test for Isomap #

In [None]:
import os
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.manifold import Isomap
from itertools import product

def run_isomap_with_tsne(X_scaled, X_query, param_grid, first_column_doc, first_column_query):
    """Run Isomap with different parameter combinations and save the reduced space."""
    results = []
    min_error = float('inf')  # Track the minimum reconstruction error
    best_idx = None           # Track the index of the best parameter combination

    # Folder for saving CSVs
    output_folder = "Isomap_Results"
    os.makedirs(output_folder, exist_ok=True)

    # Convert inputs to numpy arrays if they are DataFrames
    if isinstance(X_scaled, pd.DataFrame):
        X_scaled = X_scaled.to_numpy()
    if isinstance(X_query, pd.DataFrame):
        X_query = X_query.to_numpy()

    # Get the total number of parameter combinations
    param_combinations = list(product(
        param_grid['n_components'],
        param_grid['n_neighbors']
    ))

    # Iterate through each combination of n_components and n_neighbors
    for idx, (n_components, n_neighbors) in enumerate(param_combinations):
        try:
            # Validate parameters
            if n_components > X_scaled.shape[1]:
                print(f"Skipping n_components={n_components}: exceeds feature dimensions.")
                continue
            if n_neighbors >= X_scaled.shape[0]:
                print(f"Skipping n_neighbors={n_neighbors}: exceeds number of data points.")
                continue

            # Run Isomap with the new parameters
            isomap_model = Isomap(
                n_neighbors=n_neighbors,
                n_components=n_components
            )
            isomap_transformed_data = isomap_model.fit_transform(X_scaled)

            # Create DataFrame with first column
            output_df = pd.DataFrame(isomap_transformed_data)
            output_df.insert(0, 'original_column', first_column_doc)

            # Save the reduced space to a CSV file
            file_name = f"isomap_ncomp{n_components}_nneigh{n_neighbors}.csv"
            file_path = os.path.join(output_folder, file_name)
            pd.DataFrame(output_df).to_csv(file_path, index=False)
            print(f"Saved Isomap doc result to {file_path}")

            # Transform the query data using the same Isomap model
            isomap_transformed_data_query = isomap_model.transform(X_query)

            output_df_query = pd.DataFrame(isomap_transformed_data_query)
            output_df_query.insert(0, 'original_column', first_column_query)

            file_name_query = f"isomap_query_ncomp{n_components}_nneigh{n_neighbors}.csv"
            file_path_query = os.path.join(output_folder, file_name_query)
            pd.DataFrame(output_df_query).to_csv(file_path_query, index=False)
            print(f"Saved Isomap query result to {file_path_query}")

        except ValueError as ve:
            print(f"ValueError for n_components={n_components}, n_neighbors={n_neighbors}: {str(ve)}")
        except Exception as e:
            print(f"Unexpected error for n_components={n_components}, n_neighbors={n_neighbors}: {str(e)}")

    return results


## Tf_idf ##

In [None]:
file_path_documents = 'tf_idf_documentos.csv'
X_doc, first_column_doc = read_and_prepare_data(file_path_documents, 'standard')

file_path_queries = 'queries_vector.csv'
X_queries, first_column_queries = read_and_prepare_data(file_path_queries, 'standard')

In [None]:
param_grid = {
    'n_components': [20],
    'n_neighbors': [40]
}

run_isomap_with_tsne(X_doc, X_queries, param_grid, first_column_doc, first_column_queries)

In [None]:
# Example usage
doc_csv = "/content/Isomap_Results/isomap_ncomp20_nneigh40.csv"   # CSV with reduced dimensions of documents
query_csv = "/content/Isomap_Results/isomap_query_ncomp20_nneigh40.csv"   # CSV with reduced dimensions of queries
output_csv = "query_cluster_matches_isomap.csv"  # Output CSV with matches
n_clusters = 22  # Number of neighbors for KNN

cluster_documents_and_assign_queries(doc_csv, query_csv, n_clusters, output_csv)


In [None]:
import pandas as pd
import numpy as np

# Load the queries CSV
query_csv = "/content/Isomap_Results/isomap_ncomp20_nneigh40.csv"
query_data = pd.read_csv(query_csv)

document_csv = "/content/Isomap_Results/isomap_query_ncomp20_nneigh40.csv"
document_data = pd.read_csv(document_csv)


In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

def calculate_similarity_for_selected_documents(documents_df, query_vector, selected_documents):
    # Filter the DataFrame to include only the selected documents
    selected_df = documents_df[documents_df['original_column'].isin(selected_documents)]

    # Extract the vectors of the selected documents
    selected_vectors = selected_df.iloc[:, 1:].values  # Excludes 'original_column'

    # Compute cosine similarity between each selected document vector and the query vector
    similarities = cosine_similarity(selected_vectors, query_vector.reshape(1, -1)).flatten()

    # Create a result DataFrame with document names and their similarity scores
    results = pd.DataFrame({
        'document': selected_df['original_column'].values,
        'similarity': similarities
    })

    # Sort the results by similarity in descending order
    results = results.sort_values(by='similarity', ascending=False).reset_index(drop=True)

    return results

In [None]:
import numpy as np

def get_similar_documents_manhattan(query_vector, document_data, selected_documents):

    # Filter the document data to include only the selected documents
    filtered_data = document_data[document_data['original_column'].isin(selected_documents)]

    # Extract the document names and their corresponding vectors
    document_names = filtered_data['original_column'].values
    document_vectors = filtered_data[['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10',
                                 '11', '12', '13', '14', '15', '16', '17', '18', '19']].values  # Adjust if there are more dimensions

    # Compute Manhattan distances between the query vector and all selected document vectors
    distances = np.sum(np.abs(document_vectors - query_vector), axis=1)

    # Combine document names with their distances and sort by distance
    results = list(zip(document_names, distances))
    results.sort(key=lambda x: x[1])  # Sort by distance (ascending)

    return results


### Cosine ###

In [None]:
# Example usage
document_csv = "/content/Isomap_Results/isomap_ncomp20_nneigh40.csv"
query_csv = "/content/Isomap_Results/isomap_query_ncomp20_nneigh40.csv"

# Load data
document_data = pd.read_csv(document_csv)
query_data = pd.read_csv(query_csv)

# Example: Use the first query vector
query_vector = query_data[['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10',
                                 '11', '12', '13', '14', '15', '16', '17', '18', '19']].iloc[0].values

# Select specific document names
selected_documents = [
    "185_Ingenieria_de_la_Calidad_+_Lab.txt.txt",
    "24_Aprendizaje_y_Servicio_PASEC.txt.txt",
    "11_Ser_y_Cosmos.txt.txt",
    "155_Logica_y_Teoria_de_Conjuntos.txt.txt",
    "171_Barra_para_danza_contemporanea.txt.txt",
    "129_Coloquios_ART.txt.txt",
    "10_Autoconocimiento.txt.txt",
    "35_Base_de_Datos.txt.txt",
    "130_Arte_y_Contexto_Social.txt.txt",
    "44_Aplicaciones_Distribuidas.txt.txt",
    "62_Pasteleria.txt.txt",
    "151_Ecuaciones_Diferenciales.txt.txt",
    "125_Nuevos_Medios.txt.txt",
    "100_Principios_de_Seguros.txt.txt",
    "72_Fundamentos_de_la_Educacion.txt.txt",
    "102_Principios_de_Finanzas.txt.txt",
    "181_Inv._de_Operaciones_1_+Lab.txt.txt",
    "4_Quimica_General_1_+Lab_Ej.txt.txt",
    "138_Enfasis_3.txt.txt",
    "135_Gestion_y_Produccion_Cultural.txt.txt",
    "99_Gerencia_de_Costos.txt.txt",
    "17_Fisica_para_Ingenieria_1_+Lab_Ej.txt.txt",
    "127_Laboratorio_de_Creacion_2.txt.txt",
    "176_Lenguaje_Visual_y_Montaje.txt.txt",
    "80_Ensenanza_de_Lenguaje.txt.txt",
    "66_Reposteria_y_Chocolateria.txt.txt",
    "30_Programacion_Avanzada_de_Apps.txt.txt",
    "53_Conceptos_y_Tecnicas_2_+PRA.txt.txt",
    "50_Nutricion_Humana_+Lab.txt.txt",
    "182_Control_de_Produccion.txt.txt",
    "118_Fundamentos_de_las_Artes.txt.txt",
    "117_Herramientas_Digitales_1.txt.txt",
    "165_Algebra_Abstracta_2.txt.txt",
    "15_Matematicas_Discretas.txt.txt",
    "115_Tributacion_y_Entorno_Legal.txt.txt",
    "2_Taller_de_Ing._Cs._Computacion.txt.txt",
    "12_Ingles_Nivel_3.txt.txt",
    "126_Arte_Contemporaneo.txt.txt",
    "21_Programacion_de_Apps.txt.txt",
    "14_Programacion_Avanzada_en_C++.txt.txt",
    "154_Fundamentos_de_Geometria.txt.txt",
    "149_Teoria_de_Grupos.txt.txt",
    "152_Calculo_para_Ciencias_2.txt.txt",
    "16_Introduccion_a_la_Economia.txt.txt",
    "68_Enologia_y_Cocteleria.txt.txt",
    "143_Programacion_Para_DiseÃ±o_1.txt.txt",
    "108_Economia_y_Negocios.txt.txt",
    "98_Estadistica_Empresarial_+Lab.txt.txt",
    "121_Dibujo_para_Arte_y_Diseno.txt.txt",
    "45_Proyecto_Integrador_CMP.txt.txt",
    "131_Taller_de_Investigacion.txt.txt",
    "9_Calculo_Integral_+_Ej.txt.txt",
    "23_Fisica_para_Ingenieria_2_+Lab_Ej.txt.txt",
    "107_Operaciones_+Lab.txt.txt",
    "145_Programacion_Para_DiseÃ±o_3.txt.txt",
    "94_Zoologia_+Lab.txt.txt",
    "112_Gestion_del_Talento.txt.txt",
    "77_Neurociencia_y_Educacion.txt.txt",
    "166_Topologia_1.txt.txt",
    "148_Calculo_para_Ciencias_1.txt.txt",
    "31_Organizacion_de_Computadores.txt.txt",
    "46_Matematica_Empresarial_+Ej.txt.txt",
    "39_Proyectos__Gerencia_y_Analisis.txt.txt",
    "27_Estructuras_de_Datos.txt.txt",
    "41_Coloquios_ING.txt.txt",
    "111_Proyectos_Empresariales.txt.txt",
    "157_Analisis_Numerico.txt.txt",
    "104_Finanzas_Corporativas.txt.txt",
    "29_Teoria_de_la_Computacion.txt.txt",
    "92_Proyecto_Integrador_EDU.txt.txt",
    "40_Practica_Pre-Profesional_PASEM.txt.txt",
    "70_Gerencia_Financiera_HSP.txt.txt",
    "136_Laboratorio_de_Creacion_3.txt.txt",
    "5_Cosmos.txt.txt",
    "144_Programacion_Para_DiseÃ±o_2.txt.txt",
    "159_Combinatoria_y_Grafos.txt.txt",
    "103_Coloquios_adm.txt.txt",
    "120_Laboratorio_de_Creacion_1.txt.txt",
    "175_Lenguaje_del_Cine.txt.txt",
    "133_Taller_de_Arte_2.txt.txt",
    "37_Redes_+Lab.txt.txt",
    "146_Juegos_y_Narrativa.txt.txt",
    "167_Topologia_2.txt.txt",
    "139_Taller_de_Arte_3.txt.txt",
    "141_Temas_en_Comunicacion_y_Arte.txt.txt",
    "124_Teoria_Critica_1__Arte_&_Media.txt.txt"
]

  # Replace with the actual document names

# Calculate similarity for selected documents
similarity_results = calculate_similarity_for_selected_documents(document_data, query_vector, selected_documents)

print(similarity_results)

In [None]:
# Example: Use the first query vector
query_vector = query_data[['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10',
                                 '11', '12', '13', '14', '15', '16', '17', '18', '19']].iloc[1].values

# Select specific document names
selected_documents = [
    "185_Ingenieria_de_la_Calidad_+_Lab.txt.txt",
    "24_Aprendizaje_y_Servicio_PASEC.txt.txt",
    "11_Ser_y_Cosmos.txt.txt",
    "155_Logica_y_Teoria_de_Conjuntos.txt.txt",
    "171_Barra_para_danza_contemporanea.txt.txt",
    "129_Coloquios_ART.txt.txt",
    "10_Autoconocimiento.txt.txt",
    "35_Base_de_Datos.txt.txt",
    "130_Arte_y_Contexto_Social.txt.txt",
    "44_Aplicaciones_Distribuidas.txt.txt",
    "62_Pasteleria.txt.txt",
    "151_Ecuaciones_Diferenciales.txt.txt",
    "125_Nuevos_Medios.txt.txt",
    "100_Principios_de_Seguros.txt.txt",
    "72_Fundamentos_de_la_Educacion.txt.txt",
    "102_Principios_de_Finanzas.txt.txt",
    "181_Inv._de_Operaciones_1_+Lab.txt.txt",
    "4_Quimica_General_1_+Lab_Ej.txt.txt",
    "138_Enfasis_3.txt.txt",
    "135_Gestion_y_Produccion_Cultural.txt.txt",
    "99_Gerencia_de_Costos.txt.txt",
    "17_Fisica_para_Ingenieria_1_+Lab_Ej.txt.txt",
    "127_Laboratorio_de_Creacion_2.txt.txt",
    "176_Lenguaje_Visual_y_Montaje.txt.txt",
    "80_Ensenanza_de_Lenguaje.txt.txt",
    "66_Reposteria_y_Chocolateria.txt.txt",
    "30_Programacion_Avanzada_de_Apps.txt.txt",
    "53_Conceptos_y_Tecnicas_2_+PRA.txt.txt",
    "50_Nutricion_Humana_+Lab.txt.txt",
    "182_Control_de_Produccion.txt.txt",
    "118_Fundamentos_de_las_Artes.txt.txt",
    "117_Herramientas_Digitales_1.txt.txt",
    "165_Algebra_Abstracta_2.txt.txt",
    "15_Matematicas_Discretas.txt.txt",
    "115_Tributacion_y_Entorno_Legal.txt.txt",
    "2_Taller_de_Ing._Cs._Computacion.txt.txt",
    "12_Ingles_Nivel_3.txt.txt",
    "126_Arte_Contemporaneo.txt.txt",
    "21_Programacion_de_Apps.txt.txt",
    "14_Programacion_Avanzada_en_C++.txt.txt",
    "154_Fundamentos_de_Geometria.txt.txt",
    "149_Teoria_de_Grupos.txt.txt",
    "152_Calculo_para_Ciencias_2.txt.txt",
    "16_Introduccion_a_la_Economia.txt.txt",
    "68_Enologia_y_Cocteleria.txt.txt",
    "143_Programacion_Para_DiseÃ±o_1.txt.txt",
    "108_Economia_y_Negocios.txt.txt",
    "98_Estadistica_Empresarial_+Lab.txt.txt",
    "121_Dibujo_para_Arte_y_Diseno.txt.txt",
    "45_Proyecto_Integrador_CMP.txt.txt",
    "131_Taller_de_Investigacion.txt.txt",
    "9_Calculo_Integral_+_Ej.txt.txt",
    "23_Fisica_para_Ingenieria_2_+Lab_Ej.txt.txt",
    "107_Operaciones_+Lab.txt.txt",
    "145_Programacion_Para_DiseÃ±o_3.txt.txt",
    "94_Zoologia_+Lab.txt.txt",
    "112_Gestion_del_Talento.txt.txt",
    "77_Neurociencia_y_Educacion.txt.txt",
    "166_Topologia_1.txt.txt",
    "148_Calculo_para_Ciencias_1.txt.txt",
    "31_Organizacion_de_Computadores.txt.txt",
    "46_Matematica_Empresarial_+Ej.txt.txt",
    "39_Proyectos__Gerencia_y_Analisis.txt.txt",
    "27_Estructuras_de_Datos.txt.txt",
    "41_Coloquios_ING.txt.txt",
    "111_Proyectos_Empresariales.txt.txt",
    "157_Analisis_Numerico.txt.txt",
    "104_Finanzas_Corporativas.txt.txt",
    "29_Teoria_de_la_Computacion.txt.txt",
    "92_Proyecto_Integrador_EDU.txt.txt",
    "40_Practica_Pre-Profesional_PASEM.txt.txt",
    "70_Gerencia_Financiera_HSP.txt.txt",
    "136_Laboratorio_de_Creacion_3.txt.txt",
    "5_Cosmos.txt.txt",
    "144_Programacion_Para_DiseÃ±o_2.txt.txt",
    "159_Combinatoria_y_Grafos.txt.txt",
    "103_Coloquios_adm.txt.txt",
    "120_Laboratorio_de_Creacion_1.txt.txt",
    "175_Lenguaje_del_Cine.txt.txt",
    "133_Taller_de_Arte_2.txt.txt",
    "37_Redes_+Lab.txt.txt",
    "146_Juegos_y_Narrativa.txt.txt",
    "167_Topologia_2.txt.txt",
    "139_Taller_de_Arte_3.txt.txt",
    "141_Temas_en_Comunicacion_y_Arte.txt.txt",
    "124_Teoria_Critica_1__Arte_&_Media.txt.txt"
]# Replace with the actual document names

# Calculate similarity for selected documents
similarity_results = calculate_similarity_for_selected_documents(document_data, query_vector, selected_documents)

print(similarity_results)

In [None]:
# Example: Use the first query vector
query_vector = query_data[['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10',
                                 '11', '12', '13', '14', '15', '16', '17', '18', '19']].iloc[2].values

# Select specific document names
selected_documents = [
    "185_Ingenieria_de_la_Calidad_+_Lab.txt.txt",
    "24_Aprendizaje_y_Servicio_PASEC.txt.txt",
    "11_Ser_y_Cosmos.txt.txt",
    "155_Logica_y_Teoria_de_Conjuntos.txt.txt",
    "171_Barra_para_danza_contemporanea.txt.txt",
    "129_Coloquios_ART.txt.txt",
    "10_Autoconocimiento.txt.txt",
    "35_Base_de_Datos.txt.txt",
    "130_Arte_y_Contexto_Social.txt.txt",
    "44_Aplicaciones_Distribuidas.txt.txt",
    "62_Pasteleria.txt.txt",
    "151_Ecuaciones_Diferenciales.txt.txt",
    "125_Nuevos_Medios.txt.txt",
    "100_Principios_de_Seguros.txt.txt",
    "72_Fundamentos_de_la_Educacion.txt.txt",
    "102_Principios_de_Finanzas.txt.txt",
    "181_Inv._de_Operaciones_1_+Lab.txt.txt",
    "4_Quimica_General_1_+Lab_Ej.txt.txt",
    "138_Enfasis_3.txt.txt",
    "135_Gestion_y_Produccion_Cultural.txt.txt",
    "99_Gerencia_de_Costos.txt.txt",
    "17_Fisica_para_Ingenieria_1_+Lab_Ej.txt.txt",
    "127_Laboratorio_de_Creacion_2.txt.txt",
    "176_Lenguaje_Visual_y_Montaje.txt.txt",
    "80_Ensenanza_de_Lenguaje.txt.txt",
    "66_Reposteria_y_Chocolateria.txt.txt",
    "30_Programacion_Avanzada_de_Apps.txt.txt",
    "53_Conceptos_y_Tecnicas_2_+PRA.txt.txt",
    "50_Nutricion_Humana_+Lab.txt.txt",
    "182_Control_de_Produccion.txt.txt",
    "118_Fundamentos_de_las_Artes.txt.txt",
    "117_Herramientas_Digitales_1.txt.txt",
    "165_Algebra_Abstracta_2.txt.txt",
    "15_Matematicas_Discretas.txt.txt",
    "115_Tributacion_y_Entorno_Legal.txt.txt",
    "2_Taller_de_Ing._Cs._Computacion.txt.txt",
    "12_Ingles_Nivel_3.txt.txt",
    "126_Arte_Contemporaneo.txt.txt",
    "21_Programacion_de_Apps.txt.txt",
    "14_Programacion_Avanzada_en_C++.txt.txt",
    "154_Fundamentos_de_Geometria.txt.txt",
    "149_Teoria_de_Grupos.txt.txt",
    "152_Calculo_para_Ciencias_2.txt.txt",
    "16_Introduccion_a_la_Economia.txt.txt",
    "68_Enologia_y_Cocteleria.txt.txt",
    "143_Programacion_Para_DiseÃ±o_1.txt.txt",
    "108_Economia_y_Negocios.txt.txt",
    "98_Estadistica_Empresarial_+Lab.txt.txt",
    "121_Dibujo_para_Arte_y_Diseno.txt.txt",
    "45_Proyecto_Integrador_CMP.txt.txt",
    "131_Taller_de_Investigacion.txt.txt",
    "9_Calculo_Integral_+_Ej.txt.txt",
    "23_Fisica_para_Ingenieria_2_+Lab_Ej.txt.txt",
    "107_Operaciones_+Lab.txt.txt",
    "145_Programacion_Para_DiseÃ±o_3.txt.txt",
    "94_Zoologia_+Lab.txt.txt",
    "112_Gestion_del_Talento.txt.txt",
    "77_Neurociencia_y_Educacion.txt.txt",
    "166_Topologia_1.txt.txt",
    "148_Calculo_para_Ciencias_1.txt.txt",
    "31_Organizacion_de_Computadores.txt.txt",
    "46_Matematica_Empresarial_+Ej.txt.txt",
    "39_Proyectos__Gerencia_y_Analisis.txt.txt",
    "27_Estructuras_de_Datos.txt.txt",
    "41_Coloquios_ING.txt.txt",
    "111_Proyectos_Empresariales.txt.txt",
    "157_Analisis_Numerico.txt.txt",
    "104_Finanzas_Corporativas.txt.txt",
    "29_Teoria_de_la_Computacion.txt.txt",
    "92_Proyecto_Integrador_EDU.txt.txt",
    "40_Practica_Pre-Profesional_PASEM.txt.txt",
    "70_Gerencia_Financiera_HSP.txt.txt",
    "136_Laboratorio_de_Creacion_3.txt.txt",
    "5_Cosmos.txt.txt",
    "144_Programacion_Para_DiseÃ±o_2.txt.txt",
    "159_Combinatoria_y_Grafos.txt.txt",
    "103_Coloquios_adm.txt.txt",
    "120_Laboratorio_de_Creacion_1.txt.txt",
    "175_Lenguaje_del_Cine.txt.txt",
    "133_Taller_de_Arte_2.txt.txt",
    "37_Redes_+Lab.txt.txt",
    "146_Juegos_y_Narrativa.txt.txt",
    "167_Topologia_2.txt.txt",
    "139_Taller_de_Arte_3.txt.txt",
    "141_Temas_en_Comunicacion_y_Arte.txt.txt",
    "124_Teoria_Critica_1__Arte_&_Media.txt.txt"
] # Replace with the actual document names

# Calculate similarity for selected documents
similarity_results = calculate_similarity_for_selected_documents(document_data, query_vector, selected_documents)

print(similarity_results)

In [None]:
# Example: Use the first query vector
# Example: Use the first query vector
query_vector = query_data[['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10',
                                 '11', '12', '13', '14', '15', '16', '17', '18', '19']].iloc[3].values

# Select specific document names
selected_documents = [
    "185_Ingenieria_de_la_Calidad_+_Lab.txt.txt",
    "24_Aprendizaje_y_Servicio_PASEC.txt.txt",
    "11_Ser_y_Cosmos.txt.txt",
    "155_Logica_y_Teoria_de_Conjuntos.txt.txt",
    "171_Barra_para_danza_contemporanea.txt.txt",
    "129_Coloquios_ART.txt.txt",
    "10_Autoconocimiento.txt.txt",
    "35_Base_de_Datos.txt.txt",
    "130_Arte_y_Contexto_Social.txt.txt",
    "44_Aplicaciones_Distribuidas.txt.txt",
    "62_Pasteleria.txt.txt",
    "151_Ecuaciones_Diferenciales.txt.txt",
    "125_Nuevos_Medios.txt.txt",
    "100_Principios_de_Seguros.txt.txt",
    "72_Fundamentos_de_la_Educacion.txt.txt",
    "102_Principios_de_Finanzas.txt.txt",
    "181_Inv._de_Operaciones_1_+Lab.txt.txt",
    "4_Quimica_General_1_+Lab_Ej.txt.txt",
    "138_Enfasis_3.txt.txt",
    "135_Gestion_y_Produccion_Cultural.txt.txt",
    "99_Gerencia_de_Costos.txt.txt",
    "17_Fisica_para_Ingenieria_1_+Lab_Ej.txt.txt",
    "127_Laboratorio_de_Creacion_2.txt.txt",
    "176_Lenguaje_Visual_y_Montaje.txt.txt",
    "80_Ensenanza_de_Lenguaje.txt.txt",
    "66_Reposteria_y_Chocolateria.txt.txt",
    "30_Programacion_Avanzada_de_Apps.txt.txt",
    "53_Conceptos_y_Tecnicas_2_+PRA.txt.txt",
    "50_Nutricion_Humana_+Lab.txt.txt",
    "182_Control_de_Produccion.txt.txt",
    "118_Fundamentos_de_las_Artes.txt.txt",
    "117_Herramientas_Digitales_1.txt.txt",
    "165_Algebra_Abstracta_2.txt.txt",
    "15_Matematicas_Discretas.txt.txt",
    "115_Tributacion_y_Entorno_Legal.txt.txt",
    "2_Taller_de_Ing._Cs._Computacion.txt.txt",
    "12_Ingles_Nivel_3.txt.txt",
    "126_Arte_Contemporaneo.txt.txt",
    "21_Programacion_de_Apps.txt.txt",
    "14_Programacion_Avanzada_en_C++.txt.txt",
    "154_Fundamentos_de_Geometria.txt.txt",
    "149_Teoria_de_Grupos.txt.txt",
    "152_Calculo_para_Ciencias_2.txt.txt",
    "16_Introduccion_a_la_Economia.txt.txt",
    "68_Enologia_y_Cocteleria.txt.txt",
    "143_Programacion_Para_DiseÃ±o_1.txt.txt",
    "108_Economia_y_Negocios.txt.txt",
    "98_Estadistica_Empresarial_+Lab.txt.txt",
    "121_Dibujo_para_Arte_y_Diseno.txt.txt",
    "45_Proyecto_Integrador_CMP.txt.txt",
    "131_Taller_de_Investigacion.txt.txt",
    "9_Calculo_Integral_+_Ej.txt.txt",
    "23_Fisica_para_Ingenieria_2_+Lab_Ej.txt.txt",
    "107_Operaciones_+Lab.txt.txt",
    "145_Programacion_Para_DiseÃ±o_3.txt.txt",
    "94_Zoologia_+Lab.txt.txt",
    "112_Gestion_del_Talento.txt.txt",
    "77_Neurociencia_y_Educacion.txt.txt",
    "166_Topologia_1.txt.txt",
    "148_Calculo_para_Ciencias_1.txt.txt",
    "31_Organizacion_de_Computadores.txt.txt",
    "46_Matematica_Empresarial_+Ej.txt.txt",
    "39_Proyectos__Gerencia_y_Analisis.txt.txt",
    "27_Estructuras_de_Datos.txt.txt",
    "41_Coloquios_ING.txt.txt",
    "111_Proyectos_Empresariales.txt.txt",
    "157_Analisis_Numerico.txt.txt",
    "104_Finanzas_Corporativas.txt.txt",
    "29_Teoria_de_la_Computacion.txt.txt",
    "92_Proyecto_Integrador_EDU.txt.txt",
    "40_Practica_Pre-Profesional_PASEM.txt.txt",
    "70_Gerencia_Financiera_HSP.txt.txt",
    "136_Laboratorio_de_Creacion_3.txt.txt",
    "5_Cosmos.txt.txt",
    "144_Programacion_Para_DiseÃ±o_2.txt.txt",
    "159_Combinatoria_y_Grafos.txt.txt",
    "103_Coloquios_adm.txt.txt",
    "120_Laboratorio_de_Creacion_1.txt.txt",
    "175_Lenguaje_del_Cine.txt.txt",
    "133_Taller_de_Arte_2.txt.txt",
    "37_Redes_+Lab.txt.txt",
    "146_Juegos_y_Narrativa.txt.txt",
    "167_Topologia_2.txt.txt",
    "139_Taller_de_Arte_3.txt.txt",
    "141_Temas_en_Comunicacion_y_Arte.txt.txt",
    "124_Teoria_Critica_1__Arte_&_Media.txt.txt"
]  # Replace with the actual document names

# Calculate similarity for selected documents
similarity_results = calculate_similarity_for_selected_documents(document_data, query_vector, selected_documents)

print(similarity_results)

In [None]:
# Example: Use the first query vector
query_vector = query_data[['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10',
                                 '11', '12', '13', '14', '15', '16', '17', '18', '19']].iloc[4].values

# Select specific document names
selected_documents = [
    "185_Ingenieria_de_la_Calidad_+_Lab.txt.txt",
    "24_Aprendizaje_y_Servicio_PASEC.txt.txt",
    "11_Ser_y_Cosmos.txt.txt",
    "155_Logica_y_Teoria_de_Conjuntos.txt.txt",
    "171_Barra_para_danza_contemporanea.txt.txt",
    "129_Coloquios_ART.txt.txt",
    "10_Autoconocimiento.txt.txt",
    "35_Base_de_Datos.txt.txt",
    "130_Arte_y_Contexto_Social.txt.txt",
    "44_Aplicaciones_Distribuidas.txt.txt",
    "62_Pasteleria.txt.txt",
    "151_Ecuaciones_Diferenciales.txt.txt",
    "125_Nuevos_Medios.txt.txt",
    "100_Principios_de_Seguros.txt.txt",
    "72_Fundamentos_de_la_Educacion.txt.txt",
    "102_Principios_de_Finanzas.txt.txt",
    "181_Inv._de_Operaciones_1_+Lab.txt.txt",
    "4_Quimica_General_1_+Lab_Ej.txt.txt",
    "138_Enfasis_3.txt.txt",
    "135_Gestion_y_Produccion_Cultural.txt.txt",
    "99_Gerencia_de_Costos.txt.txt",
    "17_Fisica_para_Ingenieria_1_+Lab_Ej.txt.txt",
    "127_Laboratorio_de_Creacion_2.txt.txt",
    "176_Lenguaje_Visual_y_Montaje.txt.txt",
    "80_Ensenanza_de_Lenguaje.txt.txt",
    "66_Reposteria_y_Chocolateria.txt.txt",
    "30_Programacion_Avanzada_de_Apps.txt.txt",
    "53_Conceptos_y_Tecnicas_2_+PRA.txt.txt",
    "50_Nutricion_Humana_+Lab.txt.txt",
    "182_Control_de_Produccion.txt.txt",
    "118_Fundamentos_de_las_Artes.txt.txt",
    "117_Herramientas_Digitales_1.txt.txt",
    "165_Algebra_Abstracta_2.txt.txt",
    "15_Matematicas_Discretas.txt.txt",
    "115_Tributacion_y_Entorno_Legal.txt.txt",
    "2_Taller_de_Ing._Cs._Computacion.txt.txt",
    "12_Ingles_Nivel_3.txt.txt",
    "126_Arte_Contemporaneo.txt.txt",
    "21_Programacion_de_Apps.txt.txt",
    "14_Programacion_Avanzada_en_C++.txt.txt",
    "154_Fundamentos_de_Geometria.txt.txt",
    "149_Teoria_de_Grupos.txt.txt",
    "152_Calculo_para_Ciencias_2.txt.txt",
    "16_Introduccion_a_la_Economia.txt.txt",
    "68_Enologia_y_Cocteleria.txt.txt",
    "143_Programacion_Para_DiseÃ±o_1.txt.txt",
    "108_Economia_y_Negocios.txt.txt",
    "98_Estadistica_Empresarial_+Lab.txt.txt",
    "121_Dibujo_para_Arte_y_Diseno.txt.txt",
    "45_Proyecto_Integrador_CMP.txt.txt",
    "131_Taller_de_Investigacion.txt.txt",
    "9_Calculo_Integral_+_Ej.txt.txt",
    "23_Fisica_para_Ingenieria_2_+Lab_Ej.txt.txt",
    "107_Operaciones_+Lab.txt.txt",
    "145_Programacion_Para_DiseÃ±o_3.txt.txt",
    "94_Zoologia_+Lab.txt.txt",
    "112_Gestion_del_Talento.txt.txt",
    "77_Neurociencia_y_Educacion.txt.txt",
    "166_Topologia_1.txt.txt",
    "148_Calculo_para_Ciencias_1.txt.txt",
    "31_Organizacion_de_Computadores.txt.txt",
    "46_Matematica_Empresarial_+Ej.txt.txt",
    "39_Proyectos__Gerencia_y_Analisis.txt.txt",
    "27_Estructuras_de_Datos.txt.txt",
    "41_Coloquios_ING.txt.txt",
    "111_Proyectos_Empresariales.txt.txt",
    "157_Analisis_Numerico.txt.txt",
    "104_Finanzas_Corporativas.txt.txt",
    "29_Teoria_de_la_Computacion.txt.txt",
    "92_Proyecto_Integrador_EDU.txt.txt",
    "40_Practica_Pre-Profesional_PASEM.txt.txt",
    "70_Gerencia_Financiera_HSP.txt.txt",
    "136_Laboratorio_de_Creacion_3.txt.txt",
    "5_Cosmos.txt.txt",
    "144_Programacion_Para_DiseÃ±o_2.txt.txt",
    "159_Combinatoria_y_Grafos.txt.txt",
    "103_Coloquios_adm.txt.txt",
    "120_Laboratorio_de_Creacion_1.txt.txt",
    "175_Lenguaje_del_Cine.txt.txt",
    "133_Taller_de_Arte_2.txt.txt",
    "37_Redes_+Lab.txt.txt",
    "146_Juegos_y_Narrativa.txt.txt",
    "167_Topologia_2.txt.txt",
    "139_Taller_de_Arte_3.txt.txt",
    "141_Temas_en_Comunicacion_y_Arte.txt.txt",
    "124_Teoria_Critica_1__Arte_&_Media.txt.txt"
]  # Replace with the actual document names

# Calculate similarity for selected documents
similarity_results = calculate_similarity_for_selected_documents(document_data, query_vector, selected_documents)

print(similarity_results)

### Manhattan ###

In [None]:
# Example usage
document_csv = "/content/Isomap_Results/isomap_ncomp20_nneigh40.csv"
query_csv = "/content/Isomap_Results/isomap_query_ncomp20_nneigh40.csv"

# Load data
document_data = pd.read_csv(document_csv)
query_data = pd.read_csv(query_csv)

# Example: Use the first query vector
query_vector = query_data[['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10',
                                 '11', '12', '13', '14', '15', '16', '17', '18', '19']].iloc[0].values

# Select specific document names
selected_documents = [
    "185_Ingenieria_de_la_Calidad_+_Lab.txt.txt",
    "24_Aprendizaje_y_Servicio_PASEC.txt.txt",
    "11_Ser_y_Cosmos.txt.txt",
    "155_Logica_y_Teoria_de_Conjuntos.txt.txt",
    "171_Barra_para_danza_contemporanea.txt.txt",
    "129_Coloquios_ART.txt.txt",
    "10_Autoconocimiento.txt.txt",
    "35_Base_de_Datos.txt.txt",
    "130_Arte_y_Contexto_Social.txt.txt",
    "44_Aplicaciones_Distribuidas.txt.txt",
    "62_Pasteleria.txt.txt",
    "151_Ecuaciones_Diferenciales.txt.txt",
    "125_Nuevos_Medios.txt.txt",
    "100_Principios_de_Seguros.txt.txt",
    "72_Fundamentos_de_la_Educacion.txt.txt",
    "102_Principios_de_Finanzas.txt.txt",
    "181_Inv._de_Operaciones_1_+Lab.txt.txt",
    "4_Quimica_General_1_+Lab_Ej.txt.txt",
    "138_Enfasis_3.txt.txt",
    "135_Gestion_y_Produccion_Cultural.txt.txt",
    "99_Gerencia_de_Costos.txt.txt",
    "17_Fisica_para_Ingenieria_1_+Lab_Ej.txt.txt",
    "127_Laboratorio_de_Creacion_2.txt.txt",
    "176_Lenguaje_Visual_y_Montaje.txt.txt",
    "80_Ensenanza_de_Lenguaje.txt.txt",
    "66_Reposteria_y_Chocolateria.txt.txt",
    "30_Programacion_Avanzada_de_Apps.txt.txt",
    "53_Conceptos_y_Tecnicas_2_+PRA.txt.txt",
    "50_Nutricion_Humana_+Lab.txt.txt",
    "182_Control_de_Produccion.txt.txt",
    "118_Fundamentos_de_las_Artes.txt.txt",
    "117_Herramientas_Digitales_1.txt.txt",
    "165_Algebra_Abstracta_2.txt.txt",
    "15_Matematicas_Discretas.txt.txt",
    "115_Tributacion_y_Entorno_Legal.txt.txt",
    "2_Taller_de_Ing._Cs._Computacion.txt.txt",
    "12_Ingles_Nivel_3.txt.txt",
    "126_Arte_Contemporaneo.txt.txt",
    "21_Programacion_de_Apps.txt.txt",
    "14_Programacion_Avanzada_en_C++.txt.txt",
    "154_Fundamentos_de_Geometria.txt.txt",
    "149_Teoria_de_Grupos.txt.txt",
    "152_Calculo_para_Ciencias_2.txt.txt",
    "16_Introduccion_a_la_Economia.txt.txt",
    "68_Enologia_y_Cocteleria.txt.txt",
    "143_Programacion_Para_DiseÃ±o_1.txt.txt",
    "108_Economia_y_Negocios.txt.txt",
    "98_Estadistica_Empresarial_+Lab.txt.txt",
    "121_Dibujo_para_Arte_y_Diseno.txt.txt",
    "45_Proyecto_Integrador_CMP.txt.txt",
    "131_Taller_de_Investigacion.txt.txt",
    "9_Calculo_Integral_+_Ej.txt.txt",
    "23_Fisica_para_Ingenieria_2_+Lab_Ej.txt.txt",
    "107_Operaciones_+Lab.txt.txt",
    "145_Programacion_Para_DiseÃ±o_3.txt.txt",
    "94_Zoologia_+Lab.txt.txt",
    "112_Gestion_del_Talento.txt.txt",
    "77_Neurociencia_y_Educacion.txt.txt",
    "166_Topologia_1.txt.txt",
    "148_Calculo_para_Ciencias_1.txt.txt",
    "31_Organizacion_de_Computadores.txt.txt",
    "46_Matematica_Empresarial_+Ej.txt.txt",
    "39_Proyectos__Gerencia_y_Analisis.txt.txt",
    "27_Estructuras_de_Datos.txt.txt",
    "41_Coloquios_ING.txt.txt",
    "111_Proyectos_Empresariales.txt.txt",
    "157_Analisis_Numerico.txt.txt",
    "104_Finanzas_Corporativas.txt.txt",
    "29_Teoria_de_la_Computacion.txt.txt",
    "92_Proyecto_Integrador_EDU.txt.txt",
    "40_Practica_Pre-Profesional_PASEM.txt.txt",
    "70_Gerencia_Financiera_HSP.txt.txt",
    "136_Laboratorio_de_Creacion_3.txt.txt",
    "5_Cosmos.txt.txt",
    "144_Programacion_Para_DiseÃ±o_2.txt.txt",
    "159_Combinatoria_y_Grafos.txt.txt",
    "103_Coloquios_adm.txt.txt",
    "120_Laboratorio_de_Creacion_1.txt.txt",
    "175_Lenguaje_del_Cine.txt.txt",
    "133_Taller_de_Arte_2.txt.txt",
    "37_Redes_+Lab.txt.txt",
    "146_Juegos_y_Narrativa.txt.txt",
    "167_Topologia_2.txt.txt",
    "139_Taller_de_Arte_3.txt.txt",
    "141_Temas_en_Comunicacion_y_Arte.txt.txt",
    "124_Teoria_Critica_1__Arte_&_Media.txt.txt"
]
  # Replace with the actual document names

# Calculate similarity for selected documents
similarity_results = get_similar_documents_manhattan(query_vector, document_data, selected_documents)

for i in range(len(similarity_results)):
    print(f"Document: {similarity_results[i][0]}, Distance: {similarity_results[i][1]}")

In [None]:
# Example: Use the first query vector
query_vector = query_data[['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10',
                                 '11', '12', '13', '14', '15', '16', '17', '18', '19']].iloc[1].values

# Select specific document names
selected_documents = [
    "185_Ingenieria_de_la_Calidad_+_Lab.txt.txt",
    "24_Aprendizaje_y_Servicio_PASEC.txt.txt",
    "11_Ser_y_Cosmos.txt.txt",
    "155_Logica_y_Teoria_de_Conjuntos.txt.txt",
    "171_Barra_para_danza_contemporanea.txt.txt",
    "129_Coloquios_ART.txt.txt",
    "10_Autoconocimiento.txt.txt",
    "35_Base_de_Datos.txt.txt",
    "130_Arte_y_Contexto_Social.txt.txt",
    "44_Aplicaciones_Distribuidas.txt.txt",
    "62_Pasteleria.txt.txt",
    "151_Ecuaciones_Diferenciales.txt.txt",
    "125_Nuevos_Medios.txt.txt",
    "100_Principios_de_Seguros.txt.txt",
    "72_Fundamentos_de_la_Educacion.txt.txt",
    "102_Principios_de_Finanzas.txt.txt",
    "181_Inv._de_Operaciones_1_+Lab.txt.txt",
    "4_Quimica_General_1_+Lab_Ej.txt.txt",
    "138_Enfasis_3.txt.txt",
    "135_Gestion_y_Produccion_Cultural.txt.txt",
    "99_Gerencia_de_Costos.txt.txt",
    "17_Fisica_para_Ingenieria_1_+Lab_Ej.txt.txt",
    "127_Laboratorio_de_Creacion_2.txt.txt",
    "176_Lenguaje_Visual_y_Montaje.txt.txt",
    "80_Ensenanza_de_Lenguaje.txt.txt",
    "66_Reposteria_y_Chocolateria.txt.txt",
    "30_Programacion_Avanzada_de_Apps.txt.txt",
    "53_Conceptos_y_Tecnicas_2_+PRA.txt.txt",
    "50_Nutricion_Humana_+Lab.txt.txt",
    "182_Control_de_Produccion.txt.txt",
    "118_Fundamentos_de_las_Artes.txt.txt",
    "117_Herramientas_Digitales_1.txt.txt",
    "165_Algebra_Abstracta_2.txt.txt",
    "15_Matematicas_Discretas.txt.txt",
    "115_Tributacion_y_Entorno_Legal.txt.txt",
    "2_Taller_de_Ing._Cs._Computacion.txt.txt",
    "12_Ingles_Nivel_3.txt.txt",
    "126_Arte_Contemporaneo.txt.txt",
    "21_Programacion_de_Apps.txt.txt",
    "14_Programacion_Avanzada_en_C++.txt.txt",
    "154_Fundamentos_de_Geometria.txt.txt",
    "149_Teoria_de_Grupos.txt.txt",
    "152_Calculo_para_Ciencias_2.txt.txt",
    "16_Introduccion_a_la_Economia.txt.txt",
    "68_Enologia_y_Cocteleria.txt.txt",
    "143_Programacion_Para_DiseÃ±o_1.txt.txt",
    "108_Economia_y_Negocios.txt.txt",
    "98_Estadistica_Empresarial_+Lab.txt.txt",
    "121_Dibujo_para_Arte_y_Diseno.txt.txt",
    "45_Proyecto_Integrador_CMP.txt.txt",
    "131_Taller_de_Investigacion.txt.txt",
    "9_Calculo_Integral_+_Ej.txt.txt",
    "23_Fisica_para_Ingenieria_2_+Lab_Ej.txt.txt",
    "107_Operaciones_+Lab.txt.txt",
    "145_Programacion_Para_DiseÃ±o_3.txt.txt",
    "94_Zoologia_+Lab.txt.txt",
    "112_Gestion_del_Talento.txt.txt",
    "77_Neurociencia_y_Educacion.txt.txt",
    "166_Topologia_1.txt.txt",
    "148_Calculo_para_Ciencias_1.txt.txt",
    "31_Organizacion_de_Computadores.txt.txt",
    "46_Matematica_Empresarial_+Ej.txt.txt",
    "39_Proyectos__Gerencia_y_Analisis.txt.txt",
    "27_Estructuras_de_Datos.txt.txt",
    "41_Coloquios_ING.txt.txt",
    "111_Proyectos_Empresariales.txt.txt",
    "157_Analisis_Numerico.txt.txt",
    "104_Finanzas_Corporativas.txt.txt",
    "29_Teoria_de_la_Computacion.txt.txt",
    "92_Proyecto_Integrador_EDU.txt.txt",
    "40_Practica_Pre-Profesional_PASEM.txt.txt",
    "70_Gerencia_Financiera_HSP.txt.txt",
    "136_Laboratorio_de_Creacion_3.txt.txt",
    "5_Cosmos.txt.txt",
    "144_Programacion_Para_DiseÃ±o_2.txt.txt",
    "159_Combinatoria_y_Grafos.txt.txt",
    "103_Coloquios_adm.txt.txt",
    "120_Laboratorio_de_Creacion_1.txt.txt",
    "175_Lenguaje_del_Cine.txt.txt",
    "133_Taller_de_Arte_2.txt.txt",
    "37_Redes_+Lab.txt.txt",
    "146_Juegos_y_Narrativa.txt.txt",
    "167_Topologia_2.txt.txt",
    "139_Taller_de_Arte_3.txt.txt",
    "141_Temas_en_Comunicacion_y_Arte.txt.txt",
    "124_Teoria_Critica_1__Arte_&_Media.txt.txt"
]  # Replace with the actual document names

# Calculate similarity for selected documents
similarity_results = get_similar_documents_manhattan(query_vector, document_data, selected_documents)

for i in range(len(similarity_results)):
    print(f"Document: {similarity_results[i][0]}, Distance: {similarity_results[i][1]}")

In [None]:
# Example: Use the first query vector
query_vector = query_data[['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10',
                                 '11', '12', '13', '14', '15', '16', '17', '18', '19']].iloc[2].values

# Select specific document names
selected_documents = [
    "185_Ingenieria_de_la_Calidad_+_Lab.txt.txt",
    "24_Aprendizaje_y_Servicio_PASEC.txt.txt",
    "11_Ser_y_Cosmos.txt.txt",
    "155_Logica_y_Teoria_de_Conjuntos.txt.txt",
    "171_Barra_para_danza_contemporanea.txt.txt",
    "129_Coloquios_ART.txt.txt",
    "10_Autoconocimiento.txt.txt",
    "35_Base_de_Datos.txt.txt",
    "130_Arte_y_Contexto_Social.txt.txt",
    "44_Aplicaciones_Distribuidas.txt.txt",
    "62_Pasteleria.txt.txt",
    "151_Ecuaciones_Diferenciales.txt.txt",
    "125_Nuevos_Medios.txt.txt",
    "100_Principios_de_Seguros.txt.txt",
    "72_Fundamentos_de_la_Educacion.txt.txt",
    "102_Principios_de_Finanzas.txt.txt",
    "181_Inv._de_Operaciones_1_+Lab.txt.txt",
    "4_Quimica_General_1_+Lab_Ej.txt.txt",
    "138_Enfasis_3.txt.txt",
    "135_Gestion_y_Produccion_Cultural.txt.txt",
    "99_Gerencia_de_Costos.txt.txt",
    "17_Fisica_para_Ingenieria_1_+Lab_Ej.txt.txt",
    "127_Laboratorio_de_Creacion_2.txt.txt",
    "176_Lenguaje_Visual_y_Montaje.txt.txt",
    "80_Ensenanza_de_Lenguaje.txt.txt",
    "66_Reposteria_y_Chocolateria.txt.txt",
    "30_Programacion_Avanzada_de_Apps.txt.txt",
    "53_Conceptos_y_Tecnicas_2_+PRA.txt.txt",
    "50_Nutricion_Humana_+Lab.txt.txt",
    "182_Control_de_Produccion.txt.txt",
    "118_Fundamentos_de_las_Artes.txt.txt",
    "117_Herramientas_Digitales_1.txt.txt",
    "165_Algebra_Abstracta_2.txt.txt",
    "15_Matematicas_Discretas.txt.txt",
    "115_Tributacion_y_Entorno_Legal.txt.txt",
    "2_Taller_de_Ing._Cs._Computacion.txt.txt",
    "12_Ingles_Nivel_3.txt.txt",
    "126_Arte_Contemporaneo.txt.txt",
    "21_Programacion_de_Apps.txt.txt",
    "14_Programacion_Avanzada_en_C++.txt.txt",
    "154_Fundamentos_de_Geometria.txt.txt",
    "149_Teoria_de_Grupos.txt.txt",
    "152_Calculo_para_Ciencias_2.txt.txt",
    "16_Introduccion_a_la_Economia.txt.txt",
    "68_Enologia_y_Cocteleria.txt.txt",
    "143_Programacion_Para_DiseÃ±o_1.txt.txt",
    "108_Economia_y_Negocios.txt.txt",
    "98_Estadistica_Empresarial_+Lab.txt.txt",
    "121_Dibujo_para_Arte_y_Diseno.txt.txt",
    "45_Proyecto_Integrador_CMP.txt.txt",
    "131_Taller_de_Investigacion.txt.txt",
    "9_Calculo_Integral_+_Ej.txt.txt",
    "23_Fisica_para_Ingenieria_2_+Lab_Ej.txt.txt",
    "107_Operaciones_+Lab.txt.txt",
    "145_Programacion_Para_DiseÃ±o_3.txt.txt",
    "94_Zoologia_+Lab.txt.txt",
    "112_Gestion_del_Talento.txt.txt",
    "77_Neurociencia_y_Educacion.txt.txt",
    "166_Topologia_1.txt.txt",
    "148_Calculo_para_Ciencias_1.txt.txt",
    "31_Organizacion_de_Computadores.txt.txt",
    "46_Matematica_Empresarial_+Ej.txt.txt",
    "39_Proyectos__Gerencia_y_Analisis.txt.txt",
    "27_Estructuras_de_Datos.txt.txt",
    "41_Coloquios_ING.txt.txt",
    "111_Proyectos_Empresariales.txt.txt",
    "157_Analisis_Numerico.txt.txt",
    "104_Finanzas_Corporativas.txt.txt",
    "29_Teoria_de_la_Computacion.txt.txt",
    "92_Proyecto_Integrador_EDU.txt.txt",
    "40_Practica_Pre-Profesional_PASEM.txt.txt",
    "70_Gerencia_Financiera_HSP.txt.txt",
    "136_Laboratorio_de_Creacion_3.txt.txt",
    "5_Cosmos.txt.txt",
    "144_Programacion_Para_DiseÃ±o_2.txt.txt",
    "159_Combinatoria_y_Grafos.txt.txt",
    "103_Coloquios_adm.txt.txt",
    "120_Laboratorio_de_Creacion_1.txt.txt",
    "175_Lenguaje_del_Cine.txt.txt",
    "133_Taller_de_Arte_2.txt.txt",
    "37_Redes_+Lab.txt.txt",
    "146_Juegos_y_Narrativa.txt.txt",
    "167_Topologia_2.txt.txt",
    "139_Taller_de_Arte_3.txt.txt",
    "141_Temas_en_Comunicacion_y_Arte.txt.txt",
    "124_Teoria_Critica_1__Arte_&_Media.txt.txt"
]  # Replace with the actual document names

# Calculate similarity for selected documents
similarity_results = get_similar_documents_manhattan(query_vector, document_data, selected_documents)

for i in range(len(similarity_results)):
    print(f"Document: {similarity_results[i][0]}, Distance: {similarity_results[i][1]}")

In [None]:
# Example: Use the first query vector
query_vector = query_data[['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10',
                                 '11', '12', '13', '14', '15', '16', '17', '18', '19']].iloc[3].values

# Select specific document names
selected_documents = [
    "185_Ingenieria_de_la_Calidad_+_Lab.txt.txt",
    "24_Aprendizaje_y_Servicio_PASEC.txt.txt",
    "11_Ser_y_Cosmos.txt.txt",
    "155_Logica_y_Teoria_de_Conjuntos.txt.txt",
    "171_Barra_para_danza_contemporanea.txt.txt",
    "129_Coloquios_ART.txt.txt",
    "10_Autoconocimiento.txt.txt",
    "35_Base_de_Datos.txt.txt",
    "130_Arte_y_Contexto_Social.txt.txt",
    "44_Aplicaciones_Distribuidas.txt.txt",
    "62_Pasteleria.txt.txt",
    "151_Ecuaciones_Diferenciales.txt.txt",
    "125_Nuevos_Medios.txt.txt",
    "100_Principios_de_Seguros.txt.txt",
    "72_Fundamentos_de_la_Educacion.txt.txt",
    "102_Principios_de_Finanzas.txt.txt",
    "181_Inv._de_Operaciones_1_+Lab.txt.txt",
    "4_Quimica_General_1_+Lab_Ej.txt.txt",
    "138_Enfasis_3.txt.txt",
    "135_Gestion_y_Produccion_Cultural.txt.txt",
    "99_Gerencia_de_Costos.txt.txt",
    "17_Fisica_para_Ingenieria_1_+Lab_Ej.txt.txt",
    "127_Laboratorio_de_Creacion_2.txt.txt",
    "176_Lenguaje_Visual_y_Montaje.txt.txt",
    "80_Ensenanza_de_Lenguaje.txt.txt",
    "66_Reposteria_y_Chocolateria.txt.txt",
    "30_Programacion_Avanzada_de_Apps.txt.txt",
    "53_Conceptos_y_Tecnicas_2_+PRA.txt.txt",
    "50_Nutricion_Humana_+Lab.txt.txt",
    "182_Control_de_Produccion.txt.txt",
    "118_Fundamentos_de_las_Artes.txt.txt",
    "117_Herramientas_Digitales_1.txt.txt",
    "165_Algebra_Abstracta_2.txt.txt",
    "15_Matematicas_Discretas.txt.txt",
    "115_Tributacion_y_Entorno_Legal.txt.txt",
    "2_Taller_de_Ing._Cs._Computacion.txt.txt",
    "12_Ingles_Nivel_3.txt.txt",
    "126_Arte_Contemporaneo.txt.txt",
    "21_Programacion_de_Apps.txt.txt",
    "14_Programacion_Avanzada_en_C++.txt.txt",
    "154_Fundamentos_de_Geometria.txt.txt",
    "149_Teoria_de_Grupos.txt.txt",
    "152_Calculo_para_Ciencias_2.txt.txt",
    "16_Introduccion_a_la_Economia.txt.txt",
    "68_Enologia_y_Cocteleria.txt.txt",
    "143_Programacion_Para_DiseÃ±o_1.txt.txt",
    "108_Economia_y_Negocios.txt.txt",
    "98_Estadistica_Empresarial_+Lab.txt.txt",
    "121_Dibujo_para_Arte_y_Diseno.txt.txt",
    "45_Proyecto_Integrador_CMP.txt.txt",
    "131_Taller_de_Investigacion.txt.txt",
    "9_Calculo_Integral_+_Ej.txt.txt",
    "23_Fisica_para_Ingenieria_2_+Lab_Ej.txt.txt",
    "107_Operaciones_+Lab.txt.txt",
    "145_Programacion_Para_DiseÃ±o_3.txt.txt",
    "94_Zoologia_+Lab.txt.txt",
    "112_Gestion_del_Talento.txt.txt",
    "77_Neurociencia_y_Educacion.txt.txt",
    "166_Topologia_1.txt.txt",
    "148_Calculo_para_Ciencias_1.txt.txt",
    "31_Organizacion_de_Computadores.txt.txt",
    "46_Matematica_Empresarial_+Ej.txt.txt",
    "39_Proyectos__Gerencia_y_Analisis.txt.txt",
    "27_Estructuras_de_Datos.txt.txt",
    "41_Coloquios_ING.txt.txt",
    "111_Proyectos_Empresariales.txt.txt",
    "157_Analisis_Numerico.txt.txt",
    "104_Finanzas_Corporativas.txt.txt",
    "29_Teoria_de_la_Computacion.txt.txt",
    "92_Proyecto_Integrador_EDU.txt.txt",
    "40_Practica_Pre-Profesional_PASEM.txt.txt",
    "70_Gerencia_Financiera_HSP.txt.txt",
    "136_Laboratorio_de_Creacion_3.txt.txt",
    "5_Cosmos.txt.txt",
    "144_Programacion_Para_DiseÃ±o_2.txt.txt",
    "159_Combinatoria_y_Grafos.txt.txt",
    "103_Coloquios_adm.txt.txt",
    "120_Laboratorio_de_Creacion_1.txt.txt",
    "175_Lenguaje_del_Cine.txt.txt",
    "133_Taller_de_Arte_2.txt.txt",
    "37_Redes_+Lab.txt.txt",
    "146_Juegos_y_Narrativa.txt.txt",
    "167_Topologia_2.txt.txt",
    "139_Taller_de_Arte_3.txt.txt",
    "141_Temas_en_Comunicacion_y_Arte.txt.txt",
    "124_Teoria_Critica_1__Arte_&_Media.txt.txt"
]  # Replace with the actual document names

# Calculate similarity for selected documents
similarity_results = get_similar_documents_manhattan(query_vector, document_data, selected_documents)

for i in range(len(similarity_results)):
    print(f"Document: {similarity_results[i][0]}, Distance: {similarity_results[i][1]}")

In [None]:
# Example: Use the first query vector
query_vector = query_data[['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10',
                                 '11', '12', '13', '14', '15', '16', '17', '18', '19']].iloc[4].values

# Select specific document names
selected_documents = [
    "185_Ingenieria_de_la_Calidad_+_Lab.txt.txt",
    "24_Aprendizaje_y_Servicio_PASEC.txt.txt",
    "11_Ser_y_Cosmos.txt.txt",
    "155_Logica_y_Teoria_de_Conjuntos.txt.txt",
    "171_Barra_para_danza_contemporanea.txt.txt",
    "129_Coloquios_ART.txt.txt",
    "10_Autoconocimiento.txt.txt",
    "35_Base_de_Datos.txt.txt",
    "130_Arte_y_Contexto_Social.txt.txt",
    "44_Aplicaciones_Distribuidas.txt.txt",
    "62_Pasteleria.txt.txt",
    "151_Ecuaciones_Diferenciales.txt.txt",
    "125_Nuevos_Medios.txt.txt",
    "100_Principios_de_Seguros.txt.txt",
    "72_Fundamentos_de_la_Educacion.txt.txt",
    "102_Principios_de_Finanzas.txt.txt",
    "181_Inv._de_Operaciones_1_+Lab.txt.txt",
    "4_Quimica_General_1_+Lab_Ej.txt.txt",
    "138_Enfasis_3.txt.txt",
    "135_Gestion_y_Produccion_Cultural.txt.txt",
    "99_Gerencia_de_Costos.txt.txt",
    "17_Fisica_para_Ingenieria_1_+Lab_Ej.txt.txt",
    "127_Laboratorio_de_Creacion_2.txt.txt",
    "176_Lenguaje_Visual_y_Montaje.txt.txt",
    "80_Ensenanza_de_Lenguaje.txt.txt",
    "66_Reposteria_y_Chocolateria.txt.txt",
    "30_Programacion_Avanzada_de_Apps.txt.txt",
    "53_Conceptos_y_Tecnicas_2_+PRA.txt.txt",
    "50_Nutricion_Humana_+Lab.txt.txt",
    "182_Control_de_Produccion.txt.txt",
    "118_Fundamentos_de_las_Artes.txt.txt",
    "117_Herramientas_Digitales_1.txt.txt",
    "165_Algebra_Abstracta_2.txt.txt",
    "15_Matematicas_Discretas.txt.txt",
    "115_Tributacion_y_Entorno_Legal.txt.txt",
    "2_Taller_de_Ing._Cs._Computacion.txt.txt",
    "12_Ingles_Nivel_3.txt.txt",
    "126_Arte_Contemporaneo.txt.txt",
    "21_Programacion_de_Apps.txt.txt",
    "14_Programacion_Avanzada_en_C++.txt.txt",
    "154_Fundamentos_de_Geometria.txt.txt",
    "149_Teoria_de_Grupos.txt.txt",
    "152_Calculo_para_Ciencias_2.txt.txt",
    "16_Introduccion_a_la_Economia.txt.txt",
    "68_Enologia_y_Cocteleria.txt.txt",
    "143_Programacion_Para_DiseÃ±o_1.txt.txt",
    "108_Economia_y_Negocios.txt.txt",
    "98_Estadistica_Empresarial_+Lab.txt.txt",
    "121_Dibujo_para_Arte_y_Diseno.txt.txt",
    "45_Proyecto_Integrador_CMP.txt.txt",
    "131_Taller_de_Investigacion.txt.txt",
    "9_Calculo_Integral_+_Ej.txt.txt",
    "23_Fisica_para_Ingenieria_2_+Lab_Ej.txt.txt",
    "107_Operaciones_+Lab.txt.txt",
    "145_Programacion_Para_DiseÃ±o_3.txt.txt",
    "94_Zoologia_+Lab.txt.txt",
    "112_Gestion_del_Talento.txt.txt",
    "77_Neurociencia_y_Educacion.txt.txt",
    "166_Topologia_1.txt.txt",
    "148_Calculo_para_Ciencias_1.txt.txt",
    "31_Organizacion_de_Computadores.txt.txt",
    "46_Matematica_Empresarial_+Ej.txt.txt",
    "39_Proyectos__Gerencia_y_Analisis.txt.txt",
    "27_Estructuras_de_Datos.txt.txt",
    "41_Coloquios_ING.txt.txt",
    "111_Proyectos_Empresariales.txt.txt",
    "157_Analisis_Numerico.txt.txt",
    "104_Finanzas_Corporativas.txt.txt",
    "29_Teoria_de_la_Computacion.txt.txt",
    "92_Proyecto_Integrador_EDU.txt.txt",
    "40_Practica_Pre-Profesional_PASEM.txt.txt",
    "70_Gerencia_Financiera_HSP.txt.txt",
    "136_Laboratorio_de_Creacion_3.txt.txt",
    "5_Cosmos.txt.txt",
    "144_Programacion_Para_DiseÃ±o_2.txt.txt",
    "159_Combinatoria_y_Grafos.txt.txt",
    "103_Coloquios_adm.txt.txt",
    "120_Laboratorio_de_Creacion_1.txt.txt",
    "175_Lenguaje_del_Cine.txt.txt",
    "133_Taller_de_Arte_2.txt.txt",
    "37_Redes_+Lab.txt.txt",
    "146_Juegos_y_Narrativa.txt.txt",
    "167_Topologia_2.txt.txt",
    "139_Taller_de_Arte_3.txt.txt",
    "141_Temas_en_Comunicacion_y_Arte.txt.txt",
    "124_Teoria_Critica_1__Arte_&_Media.txt.txt"
]  # Replace with the actual document names

# Calculate similarity for selected documents
similarity_results = get_similar_documents_manhattan(query_vector, document_data, selected_documents)

for i in range(len(similarity_results)):
    print(f"Document: {similarity_results[i][0]}, Distance: {similarity_results[i][1]}")

## Tf_idf_2 ##

In [None]:
file_path_documents = 'tf_idf_documentos_2.csv'
X_doc, first_column_doc = read_and_prepare_data(file_path_documents, 'standard')

file_path_queries = 'queries_vector_2.csv'
X_queries, first_column_queries = read_and_prepare_data(file_path_queries, 'standard')

In [None]:
param_grid = {
    'n_components': [30],
    'n_neighbors': [60]
}

run_isomap_with_tsne(X_doc, X_queries, param_grid, first_column_doc, first_column_queries)

In [None]:
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.metrics import pairwise_distances_argmin_min
from sklearn.preprocessing import StandardScaler

def cluster_documents_and_assign_queries(doc_csv, query_csv, n_clusters, output_csv):
    """
    Clusters documents using K-Means and assigns queries to the closest cluster.

    Parameters:
    - doc_csv: Path to the CSV file containing reduced dimensionality of documents.
    - query_csv: Path to the CSV file containing reduced dimensionality of queries.
    - n_clusters: Number of clusters for K-Means.
    - output_csv: File path to save query-to-cluster assignment results.
    """
    # Load the reduced dimensions for documents and queries
    doc_data = pd.read_csv(doc_csv)
    query_data = pd.read_csv(query_csv)

    # Extract feature vectors and document/query names
    document_vectors = doc_data[['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10',
                                 '11', '12', '13', '14', '15', '16', '17', '18', '19',
                                '20', '21', '22', '23', '24', '25', '26', '27', '28', '29']].values

    document_names = doc_data['original_column'].values  # Document names

    query_vectors = query_data[['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10',
                                 '11', '12', '13', '14', '15', '16', '17', '18', '19',
                                '20', '21', '22', '23', '24', '25', '26', '27', '28', '29']].values
    query_names = query_data['original_column'].values  # Query names

    # Apply K-Means clustering to documents
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    kmeans.fit(document_vectors)

    # Get the cluster assignments for documents
    doc_clusters = kmeans.labels_

    # Assign queries to the closest cluster
    query_cluster_indices, _ = pairwise_distances_argmin_min(query_vectors, kmeans.cluster_centers_)

    # Prepare query-cluster matches
    query_matches = []
    for query_name, query_cluster_idx in zip(query_names, query_cluster_indices):
        # Get documents belonging to the same cluster
        matched_docs = [
            document_names[idx] for idx, cluster in enumerate(doc_clusters) if cluster == query_cluster_idx
        ]
        query_matches.append({
            'Query': query_name,
            'Assigned_Cluster': query_cluster_idx,
            'Matched_Documents': ', '.join(matched_docs)  # Join matched document names
        })

    # Save results to a CSV
    matches_df = pd.DataFrame(query_matches)
    matches_df.to_csv(output_csv, index=False)
    print(f"Query-to-cluster assignments saved to {output_csv}")


In [None]:
# Example usage
doc_csv = "/content/Isomap_Results/isomap_ncomp30_nneigh60.csv"   # CSV with reduced dimensions of documents
query_csv = "/content/Isomap_Results/isomap_query_ncomp30_nneigh60.csv"   # CSV with reduced dimensions of queries
output_csv = "query_cluster_matches_2.csv"  # Output CSV with matches
n_clusters = 22  # Number of neighbors for KNN

cluster_documents_and_assign_queries(doc_csv, query_csv, n_clusters, output_csv)

### Cosine ###

In [None]:
import numpy as np

def get_similar_documents_manhattan(query_vector, document_data, selected_documents):

    # Filter the document data to include only the selected documents
    filtered_data = document_data[document_data['original_column'].isin(selected_documents)]

    # Extract the document names and their corresponding vectors
    document_names = filtered_data['original_column'].values
    document_vectors = filtered_data[['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10',
                                 '11', '12', '13', '14', '15', '16', '17', '18', '19',
                                '20', '21', '22', '23', '24', '25', '26', '27', '28', '29']].values  # Adjust if there are more dimensions

    # Compute Manhattan distances between the query vector and all selected document vectors
    distances = np.sum(np.abs(document_vectors - query_vector), axis=1)

    # Combine document names with their distances and sort by distance
    results = list(zip(document_names, distances))
    results.sort(key=lambda x: x[1])  # Sort by distance (ascending)

    return results


In [None]:
# Example usage
document_csv = "/content/Isomap_Results/isomap_ncomp30_nneigh60.csv"
query_csv = "/content/Isomap_Results/isomap_query_ncomp30_nneigh60.csv"

# Load data
document_data = pd.read_csv(document_csv)
query_data = pd.read_csv(query_csv)

In [None]:
# Example: Use the first query vector
query_vector = query_data[['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10',
                                 '11', '12', '13', '14', '15', '16', '17', '18', '19',
                                '20', '21', '22', '23', '24', '25', '26', '27', '28', '29']].iloc[0].values

selected_documents = [
    "184_Ergonomia.txt.txt", "105_Marketing_Digital.txt.txt", "32_Emprendimiento.txt.txt",
    "185_Ingenieria_de_la_Calidad_+_Lab.txt.txt", "24_Aprendizaje_y_Servicio_PASEC.txt.txt",
    "3_Calculo_Diferencial_+_Ej.txt.txt", "11_Ser_y_Cosmos.txt.txt",
    "89_Ensenanza_Integrada_de_CITIAM.txt.txt", "155_Logica_y_Teoria_de_Conjuntos.txt.txt",
    "26_Electronica_Basica_+Lab.txt.txt", "7_Ingles_Nivel_2.txt.txt",
    "65_Alta_Cocina_Mundial.txt.txt", "110_Creatividad_Empresarial.txt.txt",
    "161_Algebra_Lineal_2.txt.txt", "183_Procesos,_Metodos_y_Estandares.txt.txt",
    "129_Coloquios_ART.txt.txt", "116_Analisis_Estrategico_ADM.txt.txt",
    "10_Autoconocimiento.txt.txt", "47_Conceptos_y_Tecnicas_1.txt.txt", "35_Base_de_Datos.txt.txt",
    "119_Composicion_Visual_1.txt.txt", "79_Coloquios_EDU.txt.txt",
    "130_Arte_y_Contexto_Social.txt.txt", "44_Aplicaciones_Distribuidas.txt.txt",
    "62_Pasteleria.txt.txt", "151_Ecuaciones_Diferenciales.txt.txt", "18_Calculo_Vectorial.txt.txt",
    "125_Nuevos_Medios.txt.txt", "100_Principios_de_Seguros.txt.txt",
    "72_Fundamentos_de_la_Educacion.txt.txt", "58_Introduccion_a_la_Biologia_+Ej.txt.txt",
    "102_Principios_de_Finanzas.txt.txt", "181_Inv._de_Operaciones_1_+Lab.txt.txt",
    "186_Sistemas_Lean.txt.txt", "4_Quimica_General_1_+Lab_Ej.txt.txt", "138_Enfasis_3.txt.txt",
    "135_Gestion_y_Produccion_Cultural.txt.txt", "162_Algebra_Abstracta_1.txt.txt",
    "99_Gerencia_de_Costos.txt.txt", "127_Laboratorio_de_Creacion_2.txt.txt",
    "176_Lenguaje_Visual_y_Montaje.txt.txt", "66_Reposteria_y_Chocolateria.txt.txt",
    "30_Programacion_Avanzada_de_Apps.txt.txt", "53_Conceptos_y_Tecnicas_2_+PRA.txt.txt",
    "50_Nutricion_Humana_+Lab.txt.txt", "182_Control_de_Produccion.txt.txt",
    "118_Fundamentos_de_las_Artes.txt.txt", "164_Analisis_Funcional.txt.txt",
    "153_Introduccion_a_Probabilidades.txt.txt", "117_Herramientas_Digitales_1.txt.txt",
    "150_Variable_Compleja.txt.txt", "83_Ensenanza_de_Matematicas.txt.txt",
    "165_Algebra_Abstracta_2.txt.txt", "140_Arte_y_Educacion___Curaduria.txt.txt",
    "1_Escritura_Academica.txt.txt", "15_Matematicas_Discretas.txt.txt",
    "115_Tributacion_y_Entorno_Legal.txt.txt", "163_Geometria_Diferencial.txt.txt",
    "22_Algebra_Lineal_1_+Ej.txt.txt", "123_Fundamentos_de_Escultura.txt.txt",
    "126_Arte_Contemporaneo.txt.txt", "86_Bilingualism.txt.txt",
    "180_DiseÃ±o_de_Produccion.txt.txt", "97_Principios_de_Marketing.txt.txt",
    "109_Investigacion_de_Mercados.txt.txt", "21_Programacion_de_Apps.txt.txt",
    "137_Matematicas_Cotidianas.txt.txt", "14_Programacion_Avanzada_en_C++.txt.txt",
    "154_Fundamentos_de_Geometria.txt.txt", "13_Ingles_Nivel_4.txt.txt",
    "149_Teoria_de_Grupos.txt.txt", "142_Produccion_&_Exhibicion.txt.txt",
    "152_Calculo_para_Ciencias_2.txt.txt", "81_Practica_1.txt.txt",
    "16_Introduccion_a_la_Economia.txt.txt", "75_Planificacion_y_Evaluacion_1.txt.txt",
    "8_Programacion_en_C++_+Ej.txt.txt", "143_Programacion_Para_DiseÃ±o_1.txt.txt",
    "108_Economia_y_Negocios.txt.txt", "98_Estadistica_Empresarial_+Lab.txt.txt",
    "82_Metodologias_de_Ensenanza.txt.txt", "160_Ecuaciones_Diferenciales_Parciales.txt.txt",
    "121_Dibujo_para_Arte_y_Diseno.txt.txt", "131_Taller_de_Investigacion.txt.txt",
    "9_Calculo_Integral_+_Ej.txt.txt", "132_Enfasis_1.txt.txt", "107_Operaciones_+Lab.txt.txt",
    "145_Programacion_Para_DiseÃ±o_3.txt.txt", "113_Negocios_Internacionales.txt.txt",
    "112_Gestion_del_Talento.txt.txt", "77_Neurociencia_y_Educacion.txt.txt",
    "166_Topologia_1.txt.txt", "134_Enfasis_2.txt.txt", "148_Calculo_para_Ciencias_1.txt.txt",
    "46_Matematica_Empresarial_+Ej.txt.txt", "147_Modelado_3D_1.txt.txt",
    "114_Innovacion_y_Sustentabilidad.txt.txt", "39_Proyectos__Gerencia_y_Analisis.txt.txt",
    "27_Estructuras_de_Datos.txt.txt", "174_Proyecto_final_en_Danza.txt.txt",
    "173_Performance.txt.txt", "111_Proyectos_Empresariales.txt.txt", "157_Analisis_Numerico.txt.txt",
    "29_Teoria_de_la_Computacion.txt.txt", "168_Analisis_Real.txt.txt",
    "92_Proyecto_Integrador_EDU.txt.txt", "38_Sistemas_Operativos.txt.txt",
    "40_Practica_Pre-Profesional_PASEM.txt.txt", "136_Laboratorio_de_Creacion_3.txt.txt",
    "5_Cosmos.txt.txt", "128_Taller_de_Arte_1.txt.txt", "49_Panaderia.txt.txt",
    "144_Programacion_Para_DiseÃ±o_2.txt.txt", "103_Coloquios_adm.txt.txt",
    "120_Laboratorio_de_Creacion_1.txt.txt", "175_Lenguaje_del_Cine.txt.txt",
    "133_Taller_de_Arte_2.txt.txt", "146_Juegos_y_Narrativa.txt.txt",
    "167_Topologia_2.txt.txt", "139_Taller_de_Arte_3.txt.txt",
    "33_DiseÃ±o_de_Sistemas.txt.txt", "141_Temas_en_Comunicacion_y_Arte.txt.txt",
    "124_Teoria_Critica_1__Arte_&_Media.txt.txt", "158_Teoria_de_Numeros.txt.txt"
]
 # Replace with the actual document names

# Calculate similarity for selected documents
similarity_results = calculate_similarity_for_selected_documents(document_data, query_vector, selected_documents)

print(similarity_results)

In [None]:
# Example: Use the first query vector
query_vector = query_data[['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10',
                                 '11', '12', '13', '14', '15', '16', '17', '18', '19',
                                '20', '21', '22', '23', '24', '25', '26', '27', '28', '29']].iloc[1].values

selected_documents = [
    "184_Ergonomia.txt.txt", "105_Marketing_Digital.txt.txt", "32_Emprendimiento.txt.txt",
    "185_Ingenieria_de_la_Calidad_+_Lab.txt.txt", "24_Aprendizaje_y_Servicio_PASEC.txt.txt",
    "3_Calculo_Diferencial_+_Ej.txt.txt", "11_Ser_y_Cosmos.txt.txt",
    "89_Ensenanza_Integrada_de_CITIAM.txt.txt", "155_Logica_y_Teoria_de_Conjuntos.txt.txt",
    "26_Electronica_Basica_+Lab.txt.txt", "7_Ingles_Nivel_2.txt.txt",
    "65_Alta_Cocina_Mundial.txt.txt", "110_Creatividad_Empresarial.txt.txt",
    "161_Algebra_Lineal_2.txt.txt", "183_Procesos,_Metodos_y_Estandares.txt.txt",
    "129_Coloquios_ART.txt.txt", "116_Analisis_Estrategico_ADM.txt.txt",
    "10_Autoconocimiento.txt.txt", "47_Conceptos_y_Tecnicas_1.txt.txt", "35_Base_de_Datos.txt.txt",
    "119_Composicion_Visual_1.txt.txt", "79_Coloquios_EDU.txt.txt",
    "130_Arte_y_Contexto_Social.txt.txt", "44_Aplicaciones_Distribuidas.txt.txt",
    "62_Pasteleria.txt.txt", "151_Ecuaciones_Diferenciales.txt.txt", "18_Calculo_Vectorial.txt.txt",
    "125_Nuevos_Medios.txt.txt", "100_Principios_de_Seguros.txt.txt",
    "72_Fundamentos_de_la_Educacion.txt.txt", "58_Introduccion_a_la_Biologia_+Ej.txt.txt",
    "102_Principios_de_Finanzas.txt.txt", "181_Inv._de_Operaciones_1_+Lab.txt.txt",
    "186_Sistemas_Lean.txt.txt", "4_Quimica_General_1_+Lab_Ej.txt.txt", "138_Enfasis_3.txt.txt",
    "135_Gestion_y_Produccion_Cultural.txt.txt", "162_Algebra_Abstracta_1.txt.txt",
    "99_Gerencia_de_Costos.txt.txt", "127_Laboratorio_de_Creacion_2.txt.txt",
    "176_Lenguaje_Visual_y_Montaje.txt.txt", "66_Reposteria_y_Chocolateria.txt.txt",
    "30_Programacion_Avanzada_de_Apps.txt.txt", "53_Conceptos_y_Tecnicas_2_+PRA.txt.txt",
    "50_Nutricion_Humana_+Lab.txt.txt", "182_Control_de_Produccion.txt.txt",
    "118_Fundamentos_de_las_Artes.txt.txt", "164_Analisis_Funcional.txt.txt",
    "153_Introduccion_a_Probabilidades.txt.txt", "117_Herramientas_Digitales_1.txt.txt",
    "150_Variable_Compleja.txt.txt", "83_Ensenanza_de_Matematicas.txt.txt",
    "165_Algebra_Abstracta_2.txt.txt", "140_Arte_y_Educacion___Curaduria.txt.txt",
    "1_Escritura_Academica.txt.txt", "15_Matematicas_Discretas.txt.txt",
    "115_Tributacion_y_Entorno_Legal.txt.txt", "163_Geometria_Diferencial.txt.txt",
    "22_Algebra_Lineal_1_+Ej.txt.txt", "123_Fundamentos_de_Escultura.txt.txt",
    "126_Arte_Contemporaneo.txt.txt", "86_Bilingualism.txt.txt",
    "180_DiseÃ±o_de_Produccion.txt.txt", "97_Principios_de_Marketing.txt.txt",
    "109_Investigacion_de_Mercados.txt.txt", "21_Programacion_de_Apps.txt.txt",
    "137_Matematicas_Cotidianas.txt.txt", "14_Programacion_Avanzada_en_C++.txt.txt",
    "154_Fundamentos_de_Geometria.txt.txt", "13_Ingles_Nivel_4.txt.txt",
    "149_Teoria_de_Grupos.txt.txt", "142_Produccion_&_Exhibicion.txt.txt",
    "152_Calculo_para_Ciencias_2.txt.txt", "81_Practica_1.txt.txt",
    "16_Introduccion_a_la_Economia.txt.txt", "75_Planificacion_y_Evaluacion_1.txt.txt",
    "8_Programacion_en_C++_+Ej.txt.txt", "143_Programacion_Para_DiseÃ±o_1.txt.txt",
    "108_Economia_y_Negocios.txt.txt", "98_Estadistica_Empresarial_+Lab.txt.txt",
    "82_Metodologias_de_Ensenanza.txt.txt", "160_Ecuaciones_Diferenciales_Parciales.txt.txt",
    "121_Dibujo_para_Arte_y_Diseno.txt.txt", "131_Taller_de_Investigacion.txt.txt",
    "9_Calculo_Integral_+_Ej.txt.txt", "132_Enfasis_1.txt.txt", "107_Operaciones_+Lab.txt.txt",
    "145_Programacion_Para_DiseÃ±o_3.txt.txt", "113_Negocios_Internacionales.txt.txt",
    "112_Gestion_del_Talento.txt.txt", "77_Neurociencia_y_Educacion.txt.txt",
    "166_Topologia_1.txt.txt", "134_Enfasis_2.txt.txt", "148_Calculo_para_Ciencias_1.txt.txt",
    "46_Matematica_Empresarial_+Ej.txt.txt", "147_Modelado_3D_1.txt.txt",
    "114_Innovacion_y_Sustentabilidad.txt.txt", "39_Proyectos__Gerencia_y_Analisis.txt.txt",
    "27_Estructuras_de_Datos.txt.txt", "174_Proyecto_final_en_Danza.txt.txt",
    "173_Performance.txt.txt", "111_Proyectos_Empresariales.txt.txt", "157_Analisis_Numerico.txt.txt",
    "29_Teoria_de_la_Computacion.txt.txt", "168_Analisis_Real.txt.txt",
    "92_Proyecto_Integrador_EDU.txt.txt", "38_Sistemas_Operativos.txt.txt",
    "40_Practica_Pre-Profesional_PASEM.txt.txt", "136_Laboratorio_de_Creacion_3.txt.txt",
    "5_Cosmos.txt.txt", "128_Taller_de_Arte_1.txt.txt", "49_Panaderia.txt.txt",
    "144_Programacion_Para_DiseÃ±o_2.txt.txt", "103_Coloquios_adm.txt.txt",
    "120_Laboratorio_de_Creacion_1.txt.txt", "175_Lenguaje_del_Cine.txt.txt",
    "133_Taller_de_Arte_2.txt.txt", "146_Juegos_y_Narrativa.txt.txt",
    "167_Topologia_2.txt.txt", "139_Taller_de_Arte_3.txt.txt",
    "33_DiseÃ±o_de_Sistemas.txt.txt", "141_Temas_en_Comunicacion_y_Arte.txt.txt",
    "124_Teoria_Critica_1__Arte_&_Media.txt.txt", "158_Teoria_de_Numeros.txt.txt"
]
 # Replace with the actual document names

# Calculate similarity for selected documents
similarity_results = calculate_similarity_for_selected_documents(document_data, query_vector, selected_documents)

print(similarity_results)

In [None]:
# Example: Use the first query vector
query_vector = query_data[['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10',
                                 '11', '12', '13', '14', '15', '16', '17', '18', '19',
                                '20', '21', '22', '23', '24', '25', '26', '27', '28', '29']].iloc[2].values

selected_documents = [
    "184_Ergonomia.txt.txt", "105_Marketing_Digital.txt.txt", "32_Emprendimiento.txt.txt",
    "185_Ingenieria_de_la_Calidad_+_Lab.txt.txt", "24_Aprendizaje_y_Servicio_PASEC.txt.txt",
    "3_Calculo_Diferencial_+_Ej.txt.txt", "11_Ser_y_Cosmos.txt.txt",
    "89_Ensenanza_Integrada_de_CITIAM.txt.txt", "155_Logica_y_Teoria_de_Conjuntos.txt.txt",
    "26_Electronica_Basica_+Lab.txt.txt", "7_Ingles_Nivel_2.txt.txt",
    "65_Alta_Cocina_Mundial.txt.txt", "110_Creatividad_Empresarial.txt.txt",
    "161_Algebra_Lineal_2.txt.txt", "183_Procesos,_Metodos_y_Estandares.txt.txt",
    "129_Coloquios_ART.txt.txt", "116_Analisis_Estrategico_ADM.txt.txt",
    "10_Autoconocimiento.txt.txt", "47_Conceptos_y_Tecnicas_1.txt.txt", "35_Base_de_Datos.txt.txt",
    "119_Composicion_Visual_1.txt.txt", "79_Coloquios_EDU.txt.txt",
    "130_Arte_y_Contexto_Social.txt.txt", "44_Aplicaciones_Distribuidas.txt.txt",
    "62_Pasteleria.txt.txt", "151_Ecuaciones_Diferenciales.txt.txt", "18_Calculo_Vectorial.txt.txt",
    "125_Nuevos_Medios.txt.txt", "100_Principios_de_Seguros.txt.txt",
    "72_Fundamentos_de_la_Educacion.txt.txt", "58_Introduccion_a_la_Biologia_+Ej.txt.txt",
    "102_Principios_de_Finanzas.txt.txt", "181_Inv._de_Operaciones_1_+Lab.txt.txt",
    "186_Sistemas_Lean.txt.txt", "4_Quimica_General_1_+Lab_Ej.txt.txt", "138_Enfasis_3.txt.txt",
    "135_Gestion_y_Produccion_Cultural.txt.txt", "162_Algebra_Abstracta_1.txt.txt",
    "99_Gerencia_de_Costos.txt.txt", "127_Laboratorio_de_Creacion_2.txt.txt",
    "176_Lenguaje_Visual_y_Montaje.txt.txt", "66_Reposteria_y_Chocolateria.txt.txt",
    "30_Programacion_Avanzada_de_Apps.txt.txt", "53_Conceptos_y_Tecnicas_2_+PRA.txt.txt",
    "50_Nutricion_Humana_+Lab.txt.txt", "182_Control_de_Produccion.txt.txt",
    "118_Fundamentos_de_las_Artes.txt.txt", "164_Analisis_Funcional.txt.txt",
    "153_Introduccion_a_Probabilidades.txt.txt", "117_Herramientas_Digitales_1.txt.txt",
    "150_Variable_Compleja.txt.txt", "83_Ensenanza_de_Matematicas.txt.txt",
    "165_Algebra_Abstracta_2.txt.txt", "140_Arte_y_Educacion___Curaduria.txt.txt",
    "1_Escritura_Academica.txt.txt", "15_Matematicas_Discretas.txt.txt",
    "115_Tributacion_y_Entorno_Legal.txt.txt", "163_Geometria_Diferencial.txt.txt",
    "22_Algebra_Lineal_1_+Ej.txt.txt", "123_Fundamentos_de_Escultura.txt.txt",
    "126_Arte_Contemporaneo.txt.txt", "86_Bilingualism.txt.txt",
    "180_DiseÃ±o_de_Produccion.txt.txt", "97_Principios_de_Marketing.txt.txt",
    "109_Investigacion_de_Mercados.txt.txt", "21_Programacion_de_Apps.txt.txt",
    "137_Matematicas_Cotidianas.txt.txt", "14_Programacion_Avanzada_en_C++.txt.txt",
    "154_Fundamentos_de_Geometria.txt.txt", "13_Ingles_Nivel_4.txt.txt",
    "149_Teoria_de_Grupos.txt.txt", "142_Produccion_&_Exhibicion.txt.txt",
    "152_Calculo_para_Ciencias_2.txt.txt", "81_Practica_1.txt.txt",
    "16_Introduccion_a_la_Economia.txt.txt", "75_Planificacion_y_Evaluacion_1.txt.txt",
    "8_Programacion_en_C++_+Ej.txt.txt", "143_Programacion_Para_DiseÃ±o_1.txt.txt",
    "108_Economia_y_Negocios.txt.txt", "98_Estadistica_Empresarial_+Lab.txt.txt",
    "82_Metodologias_de_Ensenanza.txt.txt", "160_Ecuaciones_Diferenciales_Parciales.txt.txt",
    "121_Dibujo_para_Arte_y_Diseno.txt.txt", "131_Taller_de_Investigacion.txt.txt",
    "9_Calculo_Integral_+_Ej.txt.txt", "132_Enfasis_1.txt.txt", "107_Operaciones_+Lab.txt.txt",
    "145_Programacion_Para_DiseÃ±o_3.txt.txt", "113_Negocios_Internacionales.txt.txt",
    "112_Gestion_del_Talento.txt.txt", "77_Neurociencia_y_Educacion.txt.txt",
    "166_Topologia_1.txt.txt", "134_Enfasis_2.txt.txt", "148_Calculo_para_Ciencias_1.txt.txt",
    "46_Matematica_Empresarial_+Ej.txt.txt", "147_Modelado_3D_1.txt.txt",
    "114_Innovacion_y_Sustentabilidad.txt.txt", "39_Proyectos__Gerencia_y_Analisis.txt.txt",
    "27_Estructuras_de_Datos.txt.txt", "174_Proyecto_final_en_Danza.txt.txt",
    "173_Performance.txt.txt", "111_Proyectos_Empresariales.txt.txt", "157_Analisis_Numerico.txt.txt",
    "29_Teoria_de_la_Computacion.txt.txt", "168_Analisis_Real.txt.txt",
    "92_Proyecto_Integrador_EDU.txt.txt", "38_Sistemas_Operativos.txt.txt",
    "40_Practica_Pre-Profesional_PASEM.txt.txt", "136_Laboratorio_de_Creacion_3.txt.txt",
    "5_Cosmos.txt.txt", "128_Taller_de_Arte_1.txt.txt", "49_Panaderia.txt.txt",
    "144_Programacion_Para_DiseÃ±o_2.txt.txt", "103_Coloquios_adm.txt.txt",
    "120_Laboratorio_de_Creacion_1.txt.txt", "175_Lenguaje_del_Cine.txt.txt",
    "133_Taller_de_Arte_2.txt.txt", "146_Juegos_y_Narrativa.txt.txt",
    "167_Topologia_2.txt.txt", "139_Taller_de_Arte_3.txt.txt",
    "33_DiseÃ±o_de_Sistemas.txt.txt", "141_Temas_en_Comunicacion_y_Arte.txt.txt",
    "124_Teoria_Critica_1__Arte_&_Media.txt.txt", "158_Teoria_de_Numeros.txt.txt"
]
 # Replace with the actual document names

# Calculate similarity for selected documents
similarity_results = calculate_similarity_for_selected_documents(document_data, query_vector, selected_documents)

print(similarity_results)

In [None]:
# Example: Use the first query vector
query_vector = query_data[['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10',
                                 '11', '12', '13', '14', '15', '16', '17', '18', '19',
                                '20', '21', '22', '23', '24', '25', '26', '27', '28', '29']].iloc[3].values

selected_documents = [
    "184_Ergonomia.txt.txt", "105_Marketing_Digital.txt.txt", "32_Emprendimiento.txt.txt",
    "185_Ingenieria_de_la_Calidad_+_Lab.txt.txt", "24_Aprendizaje_y_Servicio_PASEC.txt.txt",
    "3_Calculo_Diferencial_+_Ej.txt.txt", "11_Ser_y_Cosmos.txt.txt",
    "89_Ensenanza_Integrada_de_CITIAM.txt.txt", "155_Logica_y_Teoria_de_Conjuntos.txt.txt",
    "26_Electronica_Basica_+Lab.txt.txt", "7_Ingles_Nivel_2.txt.txt",
    "65_Alta_Cocina_Mundial.txt.txt", "110_Creatividad_Empresarial.txt.txt",
    "161_Algebra_Lineal_2.txt.txt", "183_Procesos,_Metodos_y_Estandares.txt.txt",
    "129_Coloquios_ART.txt.txt", "116_Analisis_Estrategico_ADM.txt.txt",
    "10_Autoconocimiento.txt.txt", "47_Conceptos_y_Tecnicas_1.txt.txt", "35_Base_de_Datos.txt.txt",
    "119_Composicion_Visual_1.txt.txt", "79_Coloquios_EDU.txt.txt",
    "130_Arte_y_Contexto_Social.txt.txt", "44_Aplicaciones_Distribuidas.txt.txt",
    "62_Pasteleria.txt.txt", "151_Ecuaciones_Diferenciales.txt.txt", "18_Calculo_Vectorial.txt.txt",
    "125_Nuevos_Medios.txt.txt", "100_Principios_de_Seguros.txt.txt",
    "72_Fundamentos_de_la_Educacion.txt.txt", "58_Introduccion_a_la_Biologia_+Ej.txt.txt",
    "102_Principios_de_Finanzas.txt.txt", "181_Inv._de_Operaciones_1_+Lab.txt.txt",
    "186_Sistemas_Lean.txt.txt", "4_Quimica_General_1_+Lab_Ej.txt.txt", "138_Enfasis_3.txt.txt",
    "135_Gestion_y_Produccion_Cultural.txt.txt", "162_Algebra_Abstracta_1.txt.txt",
    "99_Gerencia_de_Costos.txt.txt", "127_Laboratorio_de_Creacion_2.txt.txt",
    "176_Lenguaje_Visual_y_Montaje.txt.txt", "66_Reposteria_y_Chocolateria.txt.txt",
    "30_Programacion_Avanzada_de_Apps.txt.txt", "53_Conceptos_y_Tecnicas_2_+PRA.txt.txt",
    "50_Nutricion_Humana_+Lab.txt.txt", "182_Control_de_Produccion.txt.txt",
    "118_Fundamentos_de_las_Artes.txt.txt", "164_Analisis_Funcional.txt.txt",
    "153_Introduccion_a_Probabilidades.txt.txt", "117_Herramientas_Digitales_1.txt.txt",
    "150_Variable_Compleja.txt.txt", "83_Ensenanza_de_Matematicas.txt.txt",
    "165_Algebra_Abstracta_2.txt.txt", "140_Arte_y_Educacion___Curaduria.txt.txt",
    "1_Escritura_Academica.txt.txt", "15_Matematicas_Discretas.txt.txt",
    "115_Tributacion_y_Entorno_Legal.txt.txt", "163_Geometria_Diferencial.txt.txt",
    "22_Algebra_Lineal_1_+Ej.txt.txt", "123_Fundamentos_de_Escultura.txt.txt",
    "126_Arte_Contemporaneo.txt.txt", "86_Bilingualism.txt.txt",
    "180_DiseÃ±o_de_Produccion.txt.txt", "97_Principios_de_Marketing.txt.txt",
    "109_Investigacion_de_Mercados.txt.txt", "21_Programacion_de_Apps.txt.txt",
    "137_Matematicas_Cotidianas.txt.txt", "14_Programacion_Avanzada_en_C++.txt.txt",
    "154_Fundamentos_de_Geometria.txt.txt", "13_Ingles_Nivel_4.txt.txt",
    "149_Teoria_de_Grupos.txt.txt", "142_Produccion_&_Exhibicion.txt.txt",
    "152_Calculo_para_Ciencias_2.txt.txt", "81_Practica_1.txt.txt",
    "16_Introduccion_a_la_Economia.txt.txt", "75_Planificacion_y_Evaluacion_1.txt.txt",
    "8_Programacion_en_C++_+Ej.txt.txt", "143_Programacion_Para_DiseÃ±o_1.txt.txt",
    "108_Economia_y_Negocios.txt.txt", "98_Estadistica_Empresarial_+Lab.txt.txt",
    "82_Metodologias_de_Ensenanza.txt.txt", "160_Ecuaciones_Diferenciales_Parciales.txt.txt",
    "121_Dibujo_para_Arte_y_Diseno.txt.txt", "131_Taller_de_Investigacion.txt.txt",
    "9_Calculo_Integral_+_Ej.txt.txt", "132_Enfasis_1.txt.txt", "107_Operaciones_+Lab.txt.txt",
    "145_Programacion_Para_DiseÃ±o_3.txt.txt", "113_Negocios_Internacionales.txt.txt",
    "112_Gestion_del_Talento.txt.txt", "77_Neurociencia_y_Educacion.txt.txt",
    "166_Topologia_1.txt.txt", "134_Enfasis_2.txt.txt", "148_Calculo_para_Ciencias_1.txt.txt",
    "46_Matematica_Empresarial_+Ej.txt.txt", "147_Modelado_3D_1.txt.txt",
    "114_Innovacion_y_Sustentabilidad.txt.txt", "39_Proyectos__Gerencia_y_Analisis.txt.txt",
    "27_Estructuras_de_Datos.txt.txt", "174_Proyecto_final_en_Danza.txt.txt",
    "173_Performance.txt.txt", "111_Proyectos_Empresariales.txt.txt", "157_Analisis_Numerico.txt.txt",
    "29_Teoria_de_la_Computacion.txt.txt", "168_Analisis_Real.txt.txt",
    "92_Proyecto_Integrador_EDU.txt.txt", "38_Sistemas_Operativos.txt.txt",
    "40_Practica_Pre-Profesional_PASEM.txt.txt", "136_Laboratorio_de_Creacion_3.txt.txt",
    "5_Cosmos.txt.txt", "128_Taller_de_Arte_1.txt.txt", "49_Panaderia.txt.txt",
    "144_Programacion_Para_DiseÃ±o_2.txt.txt", "103_Coloquios_adm.txt.txt",
    "120_Laboratorio_de_Creacion_1.txt.txt", "175_Lenguaje_del_Cine.txt.txt",
    "133_Taller_de_Arte_2.txt.txt", "146_Juegos_y_Narrativa.txt.txt",
    "167_Topologia_2.txt.txt", "139_Taller_de_Arte_3.txt.txt",
    "33_DiseÃ±o_de_Sistemas.txt.txt", "141_Temas_en_Comunicacion_y_Arte.txt.txt",
    "124_Teoria_Critica_1__Arte_&_Media.txt.txt", "158_Teoria_de_Numeros.txt.txt"
]
 # Replace with the actual document names

# Calculate similarity for selected documents
similarity_results = calculate_similarity_for_selected_documents(document_data, query_vector, selected_documents)

print(similarity_results)

In [None]:
# Example: Use the first query vector
query_vector = query_data[['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10',
                                 '11', '12', '13', '14', '15', '16', '17', '18', '19',
                                '20', '21', '22', '23', '24', '25', '26', '27', '28', '29']].iloc[4].values

selected_documents = [
    "184_Ergonomia.txt.txt", "105_Marketing_Digital.txt.txt", "32_Emprendimiento.txt.txt",
    "185_Ingenieria_de_la_Calidad_+_Lab.txt.txt", "24_Aprendizaje_y_Servicio_PASEC.txt.txt",
    "3_Calculo_Diferencial_+_Ej.txt.txt", "11_Ser_y_Cosmos.txt.txt",
    "89_Ensenanza_Integrada_de_CITIAM.txt.txt", "155_Logica_y_Teoria_de_Conjuntos.txt.txt",
    "26_Electronica_Basica_+Lab.txt.txt", "7_Ingles_Nivel_2.txt.txt",
    "65_Alta_Cocina_Mundial.txt.txt", "110_Creatividad_Empresarial.txt.txt",
    "161_Algebra_Lineal_2.txt.txt", "183_Procesos,_Metodos_y_Estandares.txt.txt",
    "129_Coloquios_ART.txt.txt", "116_Analisis_Estrategico_ADM.txt.txt",
    "10_Autoconocimiento.txt.txt", "47_Conceptos_y_Tecnicas_1.txt.txt", "35_Base_de_Datos.txt.txt",
    "119_Composicion_Visual_1.txt.txt", "79_Coloquios_EDU.txt.txt",
    "130_Arte_y_Contexto_Social.txt.txt", "44_Aplicaciones_Distribuidas.txt.txt",
    "62_Pasteleria.txt.txt", "151_Ecuaciones_Diferenciales.txt.txt", "18_Calculo_Vectorial.txt.txt",
    "125_Nuevos_Medios.txt.txt", "100_Principios_de_Seguros.txt.txt",
    "72_Fundamentos_de_la_Educacion.txt.txt", "58_Introduccion_a_la_Biologia_+Ej.txt.txt",
    "102_Principios_de_Finanzas.txt.txt", "181_Inv._de_Operaciones_1_+Lab.txt.txt",
    "186_Sistemas_Lean.txt.txt", "4_Quimica_General_1_+Lab_Ej.txt.txt", "138_Enfasis_3.txt.txt",
    "135_Gestion_y_Produccion_Cultural.txt.txt", "162_Algebra_Abstracta_1.txt.txt",
    "99_Gerencia_de_Costos.txt.txt", "127_Laboratorio_de_Creacion_2.txt.txt",
    "176_Lenguaje_Visual_y_Montaje.txt.txt", "66_Reposteria_y_Chocolateria.txt.txt",
    "30_Programacion_Avanzada_de_Apps.txt.txt", "53_Conceptos_y_Tecnicas_2_+PRA.txt.txt",
    "50_Nutricion_Humana_+Lab.txt.txt", "182_Control_de_Produccion.txt.txt",
    "118_Fundamentos_de_las_Artes.txt.txt", "164_Analisis_Funcional.txt.txt",
    "153_Introduccion_a_Probabilidades.txt.txt", "117_Herramientas_Digitales_1.txt.txt",
    "150_Variable_Compleja.txt.txt", "83_Ensenanza_de_Matematicas.txt.txt",
    "165_Algebra_Abstracta_2.txt.txt", "140_Arte_y_Educacion___Curaduria.txt.txt",
    "1_Escritura_Academica.txt.txt", "15_Matematicas_Discretas.txt.txt",
    "115_Tributacion_y_Entorno_Legal.txt.txt", "163_Geometria_Diferencial.txt.txt",
    "22_Algebra_Lineal_1_+Ej.txt.txt", "123_Fundamentos_de_Escultura.txt.txt",
    "126_Arte_Contemporaneo.txt.txt", "86_Bilingualism.txt.txt",
    "180_DiseÃ±o_de_Produccion.txt.txt", "97_Principios_de_Marketing.txt.txt",
    "109_Investigacion_de_Mercados.txt.txt", "21_Programacion_de_Apps.txt.txt",
    "137_Matematicas_Cotidianas.txt.txt", "14_Programacion_Avanzada_en_C++.txt.txt",
    "154_Fundamentos_de_Geometria.txt.txt", "13_Ingles_Nivel_4.txt.txt",
    "149_Teoria_de_Grupos.txt.txt", "142_Produccion_&_Exhibicion.txt.txt",
    "152_Calculo_para_Ciencias_2.txt.txt", "81_Practica_1.txt.txt",
    "16_Introduccion_a_la_Economia.txt.txt", "75_Planificacion_y_Evaluacion_1.txt.txt",
    "8_Programacion_en_C++_+Ej.txt.txt", "143_Programacion_Para_DiseÃ±o_1.txt.txt",
    "108_Economia_y_Negocios.txt.txt", "98_Estadistica_Empresarial_+Lab.txt.txt",
    "82_Metodologias_de_Ensenanza.txt.txt", "160_Ecuaciones_Diferenciales_Parciales.txt.txt",
    "121_Dibujo_para_Arte_y_Diseno.txt.txt", "131_Taller_de_Investigacion.txt.txt",
    "9_Calculo_Integral_+_Ej.txt.txt", "132_Enfasis_1.txt.txt", "107_Operaciones_+Lab.txt.txt",
    "145_Programacion_Para_DiseÃ±o_3.txt.txt", "113_Negocios_Internacionales.txt.txt",
    "112_Gestion_del_Talento.txt.txt", "77_Neurociencia_y_Educacion.txt.txt",
    "166_Topologia_1.txt.txt", "134_Enfasis_2.txt.txt", "148_Calculo_para_Ciencias_1.txt.txt",
    "46_Matematica_Empresarial_+Ej.txt.txt", "147_Modelado_3D_1.txt.txt",
    "114_Innovacion_y_Sustentabilidad.txt.txt", "39_Proyectos__Gerencia_y_Analisis.txt.txt",
    "27_Estructuras_de_Datos.txt.txt", "174_Proyecto_final_en_Danza.txt.txt",
    "173_Performance.txt.txt", "111_Proyectos_Empresariales.txt.txt", "157_Analisis_Numerico.txt.txt",
    "29_Teoria_de_la_Computacion.txt.txt", "168_Analisis_Real.txt.txt",
    "92_Proyecto_Integrador_EDU.txt.txt", "38_Sistemas_Operativos.txt.txt",
    "40_Practica_Pre-Profesional_PASEM.txt.txt", "136_Laboratorio_de_Creacion_3.txt.txt",
    "5_Cosmos.txt.txt", "128_Taller_de_Arte_1.txt.txt", "49_Panaderia.txt.txt",
    "144_Programacion_Para_DiseÃ±o_2.txt.txt", "103_Coloquios_adm.txt.txt",
    "120_Laboratorio_de_Creacion_1.txt.txt", "175_Lenguaje_del_Cine.txt.txt",
    "133_Taller_de_Arte_2.txt.txt", "146_Juegos_y_Narrativa.txt.txt",
    "167_Topologia_2.txt.txt", "139_Taller_de_Arte_3.txt.txt",
    "33_DiseÃ±o_de_Sistemas.txt.txt", "141_Temas_en_Comunicacion_y_Arte.txt.txt",
    "124_Teoria_Critica_1__Arte_&_Media.txt.txt", "158_Teoria_de_Numeros.txt.txt"
]
 # Replace with the actual document names

# Calculate similarity for selected documents
similarity_results = calculate_similarity_for_selected_documents(document_data, query_vector, selected_documents)

print(similarity_results)

### Manhattan ###

In [None]:
# Example usage
document_csv = "/content/Isomap_Results/isomap_ncomp30_nneigh60.csv"
query_csv = "/content/Isomap_Results/isomap_query_ncomp30_nneigh60.csv"

# Load data
document_data = pd.read_csv(document_csv)
query_data = pd.read_csv(query_csv)

In [None]:
# Example: Use the first query vector
query_vector = query_data[['0', '1']].iloc[0].values

# Select specific document names
selected_documents = [
                      "119_Composicion_Visual_1.txt.txt",
                      "178_Sonido.txt.txt",
                      "176_Lenguaje_Visual_y_Montaje.txt.txt",
                      "117_Herramientas_Digitales_1.txt.txt",
                      "123_Fundamentos_de_Escultura.txt.txt",
                      "180_DiseÃ±o_de_Produccion.txt.txt",
                      "122_Fotografia_1.txt.txt",
                      "175_Lenguaje_del_Cine.txt.txt",
                      "177_Cinematografia.txt.txt",
                      "179_Storytelling.txt.txt",
                      "146_Juegos_y_Narrativa.txt.txt"
                  ]   # Replace with the actual document names

# Calculate similarity for selected documents
similarity_results = get_similar_documents_manhattan(query_vector, document_data, selected_documents)

for i in range(len(similarity_results)):
    print(f"Document: {similarity_results[i][0]}, Distance: {similarity_results[i][1]}")

In [None]:
query_vector = query_data[['0', '1']].iloc[1].values

# Select specific document names
selected_documents = [
    "184_Ergonomia.txt.txt",
    "105_Marketing_Digital.txt.txt",
    "10_Autoconocimiento.txt.txt",
    "102_Principios_de_Finanzas.txt.txt",
    "135_Gestion_y_Produccion_Cultural.txt.txt",
    "60_Coloquios_Gastr.txt.txt",
    "91_Inclusion_y_Diversidad.txt.txt",
    "2_Taller_de_Ing._Cs._Computacion.txt.txt",
    "93_Practica_4.txt.txt",
    "12_Ingles_Nivel_3.txt.txt",
    "45_Proyecto_Integrador_CMP.txt.txt",
    "94_Zoologia_+Lab.txt.txt",
    "95_Fisiologia_+Lab.txt.txt",
    "174_Proyecto_final_en_Danza.txt.txt",
    "29_Teoria_de_la_Computacion.txt.txt",
    "74_Desarrollo__NiÃ±o_y_Adolescente.txt.txt",
    "92_Proyecto_Integrador_EDU.txt.txt",
    "40_Practica_Pre-Profesional_PASEM.txt.txt",
    "103_Coloquios_adm.txt.txt"
]# Replace with the actual document names

# Calculate similarity for selected documents
similarity_results = get_similar_documents_manhattan(query_vector, document_data, selected_documents)

for i in range(len(similarity_results)):
    print(f"Document: {similarity_results[i][0]}, Distance: {similarity_results[i][1]}")

In [None]:
query_vector = query_data[['0', '1']].iloc[2].values

# Select specific document names
selected_documents = [
    "65_Alta_Cocina_Mundial.txt.txt",
    "110_Creatividad_Empresarial.txt.txt",
    "67_Innovacion_Culinaria.txt.txt",
    "47_Conceptos_y_Tecnicas_1.txt.txt",
    "62_Pasteleria.txt.txt",
    "25_Cultura_Gastronomica.txt.txt",
    "64_Practica_Culinaria_2.txt.txt",
    "66_Reposteria_y_Chocolateria.txt.txt",
    "53_Conceptos_y_Tecnicas_2_+PRA.txt.txt",
    "54_Practica_Culinaria_1.txt.txt",
    "115_Tributacion_y_Entorno_Legal.txt.txt",
    "61_Alta_Cocina_Francesa.txt.txt",
    "57_Alta_Cocina_Ecuatoriana.txt.txt",
    "68_Enologia_y_Cocteleria.txt.txt",
    "113_Negocios_Internacionales.txt.txt",
    "114_Innovacion_y_Sustentabilidad.txt.txt",
    "104_Finanzas_Corporativas.txt.txt",
    "49_Panaderia.txt.txt",
    "71_Identidad_Culinaria.txt.txt"
]
  # Replace with the actual document names

# Calculate similarity for selected documents
similarity_results = get_similar_documents_manhattan(query_vector, document_data, selected_documents)

for i in range(len(similarity_results)):
    print(f"Document: {similarity_results[i][0]}, Distance: {similarity_results[i][1]}")

In [None]:
query_vector = query_data[['0', '1']].iloc[3].values

# Select specific document names
selected_documents = [
    "171_Barra_para_danza_contemporanea.txt.txt",
    "129_Coloquios_ART.txt.txt",
    "125_Nuevos_Medios.txt.txt",
    "138_Enfasis_3.txt.txt",
    "126_Arte_Contemporaneo.txt.txt",
    "169_Improvisacion.txt.txt",
    "132_Enfasis_1.txt.txt",
    "134_Enfasis_2.txt.txt",
    "173_Performance.txt.txt",
    "172_Composicion.txt.txt",
    "170_Danza_Moderna_2.txt.txt",
    "141_Temas_en_Comunicacion_y_Arte.txt.txt",
    "124_Teoria_Critica_1__Arte_&_Media.txt.txt"
] # Replace with the actual document names

# Calculate similarity for selected documents
similarity_results = get_similar_documents_manhattan(query_vector, document_data, selected_documents)

for i in range(len(similarity_results)):
    print(f"Document: {similarity_results[i][0]}, Distance: {similarity_results[i][1]}")

In [None]:
query_vector = query_data[['0', '1']].iloc[4].values

# Select specific document names
selected_documents = [
    "3_Calculo_Diferencial_+_Ej.txt.txt",
    "155_Logica_y_Teoria_de_Conjuntos.txt.txt",
    "161_Algebra_Lineal_2.txt.txt",
    "18_Calculo_Vectorial.txt.txt",
    "162_Algebra_Abstracta_1.txt.txt",
    "164_Analisis_Funcional.txt.txt",
    "153_Introduccion_a_Probabilidades.txt.txt",
    "150_Variable_Compleja.txt.txt",
    "165_Algebra_Abstracta_2.txt.txt",
    "15_Matematicas_Discretas.txt.txt",
    "163_Geometria_Diferencial.txt.txt",
    "22_Algebra_Lineal_1_+Ej.txt.txt",
    "137_Matematicas_Cotidianas.txt.txt",
    "154_Fundamentos_de_Geometria.txt.txt",
    "149_Teoria_de_Grupos.txt.txt",
    "160_Ecuaciones_Diferenciales_Parciales.txt.txt",
    "9_Calculo_Integral_+_Ej.txt.txt",
    "166_Topologia_1.txt.txt",
    "168_Analisis_Real.txt.txt",
    "167_Topologia_2.txt.txt",
    "158_Teoria_de_Numeros.txt.txt"
] # Replace with the actual document names

# Calculate similarity for selected documents
similarity_results = get_similar_documents_manhattan(query_vector, document_data, selected_documents)

for i in range(len(similarity_results)):
    print(f"Document: {similarity_results[i][0]}, Distance: {similarity_results[i][1]}")

# Test for normal tf_idf #

## Tf_idf ##

In [None]:
# prompt: read and save two csv files as df

import pandas as pd

documentos = pd.read_csv("/content/tf_idf_documentos.csv") #replace file1.csv with your actual file name
queries = pd.read_csv("/content/queries_vector.csv") #replace file2.csv with your actual file name

### Cosine ###

In [None]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

def compute_cosine_similarity(query_vector, document_vectors):
    """
    Computes cosine similarity between a single query vector and all document vectors.

    Args:
        query_vector (list or array): A single query vector.
        document_vectors (array): Matrix of document vectors.

    Returns:
        list: Cosine similarity scores between the query and each document.
    """
    query_vector = query_vector.reshape(1, -1)  # Reshape for compatibility
    similarities = cosine_similarity(query_vector, document_vectors)
    return similarities[0]  # Extract the scores as a flat list

In [None]:
# Extract document vectors (numeric columns only)
document_vectors = documentos.iloc[:, 1:].values  # Adjust indices as needed

# Example: Compute similarity for one query row
query_row_index = 0  # Index of the query to process
query_vector = queries.iloc[query_row_index, 1:].values  # Extract a single query vector
similarity_scores = compute_cosine_similarity(query_vector, document_vectors)

# Map the results back to document identifiers
document_ids = documentos.iloc[:, 0]  # Assuming the first column contains document identifiers
results = pd.DataFrame({
    'Document ID': document_ids,
    'Similarity Score': similarity_scores
})

# Display the top 5 matches
top_matches = results.nlargest(5, 'Similarity Score')
print(top_matches)

In [None]:
# Example: Compute similarity for one query row
query_row_index = 1  # Index of the query to process
query_vector = queries.iloc[query_row_index, 1:].values  # Extract a single query vector
similarity_scores = compute_cosine_similarity(query_vector, document_vectors)

# Map the results back to document identifiers
document_ids = documentos.iloc[:, 0]  # Assuming the first column contains document identifiers
results = pd.DataFrame({
    'Document ID': document_ids,
    'Similarity Score': similarity_scores
})

# Display the top 5 matches
top_matches = results.nlargest(5, 'Similarity Score')
print(top_matches)

In [None]:
# Example: Compute similarity for one query row
query_row_index = 2  # Index of the query to process
query_vector = queries.iloc[query_row_index, 1:].values  # Extract a single query vector
similarity_scores = compute_cosine_similarity(query_vector, document_vectors)

# Map the results back to document identifiers
document_ids = documentos.iloc[:, 0]  # Assuming the first column contains document identifiers
results = pd.DataFrame({
    'Document ID': document_ids,
    'Similarity Score': similarity_scores
})

# Display the top 5 matches
top_matches = results.nlargest(5, 'Similarity Score')
print(top_matches)

In [None]:
# Example: Compute similarity for one query row
query_row_index = 3  # Index of the query to process
query_vector = queries.iloc[query_row_index, 1:].values  # Extract a single query vector
similarity_scores = compute_cosine_similarity(query_vector, document_vectors)

# Map the results back to document identifiers
document_ids = documentos.iloc[:, 0]  # Assuming the first column contains document identifiers
results = pd.DataFrame({
    'Document ID': document_ids,
    'Similarity Score': similarity_scores
})

# Display the top 5 matches
top_matches = results.nlargest(5, 'Similarity Score')
print(top_matches)

In [None]:
# Example: Compute similarity for one query row
query_row_index = 4  # Index of the query to process
query_vector = queries.iloc[query_row_index, 1:].values  # Extract a single query vector
similarity_scores = compute_cosine_similarity(query_vector, document_vectors)

# Map the results back to document identifiers
document_ids = documentos.iloc[:, 0]  # Assuming the first column contains document identifiers
results = pd.DataFrame({
    'Document ID': document_ids,
    'Similarity Score': similarity_scores
})

# Display the top 5 matches
top_matches = results.nlargest(5, 'Similarity Score')
print(top_matches)

### Manhattan ###

In [None]:
import pandas as pd
from sklearn.metrics.pairwise import manhattan_distances

def compute_manhattan_distance(query_vector, document_vectors):
    """
    Computes Manhattan distance between a single query vector and all document vectors.

    Args:
        query_vector (list or array): A single query vector.
        document_vectors (array): Matrix of document vectors.

    Returns:
        list: Manhattan distances between the query and each document.
    """
    query_vector = query_vector.reshape(1, -1)  # Reshape for compatibility
    distances = manhattan_distances(query_vector, document_vectors)
    return distances[0]  # Extract the distances as a flat list

In [None]:
# Example: Compute similarity for one query row
query_row_index = 0  # Index of the query to process
query_vector = queries.iloc[query_row_index, 1:].values  # Extract a single query vector
similarity_scores = compute_manhattan_distance(query_vector, document_vectors)

# Map the results back to document identifiers
document_ids = documentos.iloc[:, 0]  # Assuming the first column contains document identifiers
results = pd.DataFrame({
    'Document ID': document_ids,
    'Distance': similarity_scores
})

# Display the top 5 matches
top_matches = results.nsmallest(5, 'Distance')
print(top_matches)

In [None]:
# Example: Compute similarity for one query row
query_row_index = 1  # Index of the query to process
query_vector = queries.iloc[query_row_index, 1:].values  # Extract a single query vector
similarity_scores = compute_manhattan_distance(query_vector, document_vectors)

# Map the results back to document identifiers
document_ids = documentos.iloc[:, 0]  # Assuming the first column contains document identifiers
results = pd.DataFrame({
    'Document ID': document_ids,
    'Distance': similarity_scores
})

# Display the top 5 matches
top_matches = results.nsmallest(5, 'Distance')
print(top_matches)

In [None]:
# Example: Compute similarity for one query row
query_row_index = 2  # Index of the query to process
query_vector = queries.iloc[query_row_index, 1:].values  # Extract a single query vector
similarity_scores = compute_manhattan_distance(query_vector, document_vectors)

# Map the results back to document identifiers
document_ids = documentos.iloc[:, 0]  # Assuming the first column contains document identifiers
results = pd.DataFrame({
    'Document ID': document_ids,
    'Distance': similarity_scores
})

# Display the top 5 matches
top_matches = results.nsmallest(5, 'Distance')
print(top_matches)

In [None]:
# Example: Compute similarity for one query row
query_row_index = 3  # Index of the query to process
query_vector = queries.iloc[query_row_index, 1:].values  # Extract a single query vector
similarity_scores = compute_manhattan_distance(query_vector, document_vectors)

# Map the results back to document identifiers
document_ids = documentos.iloc[:, 0]  # Assuming the first column contains document identifiers
results = pd.DataFrame({
    'Document ID': document_ids,
    'Distance': similarity_scores
})

# Display the top 5 matches
top_matches = results.nsmallest(5, 'Distance')
print(top_matches)

In [None]:
# Example: Compute similarity for one query row
query_row_index = 4  # Index of the query to process
query_vector = queries.iloc[query_row_index, 1:].values  # Extract a single query vector
similarity_scores = compute_manhattan_distance(query_vector, document_vectors)

# Map the results back to document identifiers
document_ids = documentos.iloc[:, 0]  # Assuming the first column contains document identifiers
results = pd.DataFrame({
    'Document ID': document_ids,
    'Distance': similarity_scores
})

# Display the top 5 matches
top_matches = results.nsmallest(5, 'Distance')
print(top_matches)

## Tf_idf_2 ##

### Cosine ###

In [None]:
# prompt: read and save two csv files as df

import pandas as pd

documentos = pd.read_csv("/content/tf_idf_documentos_2.csv") #replace file1.csv with your actual file name
queries = pd.read_csv("/content/queries_vector_2.csv") #replace file2.csv with your actual file name

In [None]:
# Extract document vectors (numeric columns only)
document_vectors = documentos.iloc[:, 1:].values  # Adjust indices as needed

# Example: Compute similarity for one query row
query_row_index = 0  # Index of the query to process
query_vector = queries.iloc[query_row_index, 1:].values  # Extract a single query vector
similarity_scores = compute_cosine_similarity(query_vector, document_vectors)

# Map the results back to document identifiers
document_ids = documentos.iloc[:, 0]  # Assuming the first column contains document identifiers
results = pd.DataFrame({
    'Document ID': document_ids,
    'Similarity Score': similarity_scores
})

# Display the top 5 matches
top_matches = results.nlargest(5, 'Similarity Score')
print(top_matches)

In [None]:
# Example: Compute similarity for one query row
query_row_index = 1  # Index of the query to process
query_vector = queries.iloc[query_row_index, 1:].values  # Extract a single query vector
similarity_scores = compute_cosine_similarity(query_vector, document_vectors)

# Map the results back to document identifiers
document_ids = documentos.iloc[:, 0]  # Assuming the first column contains document identifiers
results = pd.DataFrame({
    'Document ID': document_ids,
    'Similarity Score': similarity_scores
})

# Display the top 5 matches
top_matches = results.nlargest(5, 'Similarity Score')
print(top_matches)

In [None]:
# Example: Compute similarity for one query row
query_row_index = 2  # Index of the query to process
query_vector = queries.iloc[query_row_index, 1:].values  # Extract a single query vector
similarity_scores = compute_cosine_similarity(query_vector, document_vectors)

# Map the results back to document identifiers
document_ids = documentos.iloc[:, 0]  # Assuming the first column contains document identifiers
results = pd.DataFrame({
    'Document ID': document_ids,
    'Similarity Score': similarity_scores
})

# Display the top 5 matches
top_matches = results.nlargest(5, 'Similarity Score')
print(top_matches)

In [None]:
# Example: Compute similarity for one query row
query_row_index = 3  # Index of the query to process
query_vector = queries.iloc[query_row_index, 1:].values  # Extract a single query vector
similarity_scores = compute_cosine_similarity(query_vector, document_vectors)

# Map the results back to document identifiers
document_ids = documentos.iloc[:, 0]  # Assuming the first column contains document identifiers
results = pd.DataFrame({
    'Document ID': document_ids,
    'Similarity Score': similarity_scores
})

# Display the top 5 matches
top_matches = results.nlargest(5, 'Similarity Score')
print(top_matches)

In [None]:
# Example: Compute similarity for one query row
query_row_index = 4  # Index of the query to process
query_vector = queries.iloc[query_row_index, 1:].values  # Extract a single query vector
similarity_scores = compute_cosine_similarity(query_vector, document_vectors)

# Map the results back to document identifiers
document_ids = documentos.iloc[:, 0]  # Assuming the first column contains document identifiers
results = pd.DataFrame({
    'Document ID': document_ids,
    'Similarity Score': similarity_scores
})

# Display the top 5 matches
top_matches = results.nlargest(5, 'Similarity Score')
print(top_matches)

### Manhattan ###

In [None]:
# Example: Compute similarity for one query row
query_row_index = 0  # Index of the query to process
query_vector = queries.iloc[query_row_index, 1:].values  # Extract a single query vector
similarity_scores = compute_manhattan_distance(query_vector, document_vectors)

# Map the results back to document identifiers
document_ids = documentos.iloc[:, 0]  # Assuming the first column contains document identifiers
results = pd.DataFrame({
    'Document ID': document_ids,
    'Distance': similarity_scores
})

# Display the top 5 matches
top_matches = results.nsmallest(5, 'Distance')
print(top_matches)

In [None]:
# Example: Compute similarity for one query row
query_row_index = 1  # Index of the query to process
query_vector = queries.iloc[query_row_index, 1:].values  # Extract a single query vector
similarity_scores = compute_manhattan_distance(query_vector, document_vectors)

# Map the results back to document identifiers
document_ids = documentos.iloc[:, 0]  # Assuming the first column contains document identifiers
results = pd.DataFrame({
    'Document ID': document_ids,
    'Distance': similarity_scores
})

# Display the top 5 matches
top_matches = results.nsmallest(5, 'Distance')
print(top_matches)

In [None]:
# Example: Compute similarity for one query row
query_row_index = 2  # Index of the query to process
query_vector = queries.iloc[query_row_index, 1:].values  # Extract a single query vector
similarity_scores = compute_manhattan_distance(query_vector, document_vectors)

# Map the results back to document identifiers
document_ids = documentos.iloc[:, 0]  # Assuming the first column contains document identifiers
results = pd.DataFrame({
    'Document ID': document_ids,
    'Distance': similarity_scores
})

# Display the top 5 matches
top_matches = results.nsmallest(5, 'Distance')
print(top_matches)

In [None]:
# Example: Compute similarity for one query row
query_row_index = 3  # Index of the query to process
query_vector = queries.iloc[query_row_index, 1:].values  # Extract a single query vector
similarity_scores = compute_manhattan_distance(query_vector, document_vectors)

# Map the results back to document identifiers
document_ids = documentos.iloc[:, 0]  # Assuming the first column contains document identifiers
results = pd.DataFrame({
    'Document ID': document_ids,
    'Distance': similarity_scores
})

# Display the top 5 matches
top_matches = results.nsmallest(5, 'Distance')
print(top_matches)

In [None]:
# Example: Compute similarity for one query row
query_row_index = 4  # Index of the query to process
query_vector = queries.iloc[query_row_index, 1:].values  # Extract a single query vector
similarity_scores = compute_manhattan_distance(query_vector, document_vectors)

# Map the results back to document identifiers
document_ids = documentos.iloc[:, 0]  # Assuming the first column contains document identifiers
results = pd.DataFrame({
    'Document ID': document_ids,
    'Distance': similarity_scores
})

# Display the top 5 matches
top_matches = results.nsmallest(5, 'Distance')
print(top_matches)