The first two cells provide an example of how we shuffle the data.

In [5]:
import numpy as np
import pandas as pd

# Load the semantic-model correlation matrix from GloVe_sim.csv
semantic_model_df = pd.read_csv('GloVe_sim.csv')

# Get the number of words
num_words = len(semantic_model_df.columns) - 1  # Assuming the first column is not a word but an identifier like 'term'

# Pull the numeric block out as a plain array so the shuffle below is purely
# positional. Shuffling a labelled DataFrame with .iloc drags the row/column
# labels along with the values, so any later lookup by word would return the
# unshuffled data and the permutation would be a no-op.
similarity_values = semantic_model_df.iloc[:, 1:].to_numpy()
if similarity_values.shape[0] != num_words:
    raise ValueError(
        f"Expected a square word-by-word matrix, got {similarity_values.shape[0]} rows "
        f"for {num_words} word columns"
    )

# Create a shuffled vector of word indices (seeded so the example is reproducible)
demo_rng = np.random.default_rng(0)
word_indices = demo_rng.permutation(num_words)

# Reorder the rows and columns of the semantic-model correlation matrix with
# the same permutation; np.ix_ builds the (row, column) open mesh for that.
shuffled_values = similarity_values[np.ix_(word_indices, word_indices)]

# Re-attach the ORIGINAL word labels: word i now carries the similarity
# profile that belonged to word word_indices[i].
shuffled_semantic_model_df = pd.DataFrame(
    shuffled_values,
    index=semantic_model_df.index,
    columns=semantic_model_df.columns[1:],
)

# Add the 'term' column back to maintain the structure
shuffled_semantic_model_df.insert(0, 'term', semantic_model_df['term'])

Now we create a big loop to estimate our permutation p-value.

In [9]:
import pandas as pd
import numpy as np
from itertools import combinations
from scipy.stats import pearsonr
import time

start_time = time.time()

# Seed the permutation generator so that Restart & Run All reproduces the same
# null distribution; change the seed to draw a different set of shuffles.
PERMUTATION_SEED = 0
rng = np.random.default_rng(PERMUTATION_SEED)


def decode_word_pairs(neural_values, semantic_values, words):
    """Run the pairwise decoding test for every unique pair of words.

    For each pair, the neural similarity column of each word is correlated
    with the semantic similarity column of the same word and of the other
    word; decoding counts as correct when the same-word correlation is the
    larger of the two.

    Parameters
    ----------
    neural_values, semantic_values : 2-D float arrays whose columns follow the
        order of ``words`` and whose rows are compared position by position.
    words : sequence of column labels, one per column of the arrays.

    Returns
    -------
    pandas.DataFrame with one row per word pair: the two words, the four
    Pearson correlations, and the two boolean decoding outcomes.
    """
    decoding_results = []

    for first, second in combinations(range(len(words)), 2):
        # Extract vectors
        word1_neural_vector = neural_values[:, first]
        word2_neural_vector = neural_values[:, second]
        word1_semantic_vector = semantic_values[:, first]
        word2_semantic_vector = semantic_values[:, second]

        # Remove rows where any of the four vectors is exactly 1.0 (presumably
        # the self-similarity entries - confirm against the input files). The
        # mask is positional, so the same rows are dropped from every vector.
        perfect_corr_rows = ((word1_neural_vector == 1.0) | (word2_neural_vector == 1.0) |
                             (word1_semantic_vector == 1.0) | (word2_semantic_vector == 1.0))
        keep_rows = ~perfect_corr_rows
        word1_neural_filtered = word1_neural_vector[keep_rows]
        word2_neural_filtered = word2_neural_vector[keep_rows]
        word1_semantic_filtered = word1_semantic_vector[keep_rows]
        word2_semantic_filtered = word2_semantic_vector[keep_rows]

        # Calculate correlations
        corr_word1_neural_semantic = pearsonr(word1_neural_filtered, word1_semantic_filtered)[0]
        corr_word1_neural_word2_semantic = pearsonr(word1_neural_filtered, word2_semantic_filtered)[0]
        corr_word2_neural_semantic = pearsonr(word2_neural_filtered, word2_semantic_filtered)[0]
        corr_word2_neural_word1_semantic = pearsonr(word2_neural_filtered, word1_semantic_filtered)[0]

        # Append results
        decoding_results.append({
            'word1': words[first],
            'word2': words[second],
            'corr_word1_neural_semantic': corr_word1_neural_semantic,
            'corr_word1_neural_word2_semantic': corr_word1_neural_word2_semantic,
            'corr_word2_neural_semantic': corr_word2_neural_semantic,
            'corr_word2_neural_word1_semantic': corr_word2_neural_word1_semantic,
            'decode_accuracy_word1': corr_word1_neural_semantic > corr_word1_neural_word2_semantic,
            'decode_accuracy_word2': corr_word2_neural_semantic > corr_word2_neural_word1_semantic
        })

    return pd.DataFrame(decoding_results)


# Load data
word2vec_sim_df = pd.read_csv('GloVe_sim.csv')
neural_sim_df = pd.read_csv('p1_sim.csv')

# Align the columns and reorder the neural similarity matrix (if necessary)
neural_sim_df_aligned = neural_sim_df[word2vec_sim_df.columns]
neural_sim_df_reordered = neural_sim_df_aligned.set_index('term').reindex(word2vec_sim_df['term']).reset_index()

# List of words (excluding the 'term' column)
words = word2vec_sim_df.columns[1:]

# Loop-invariant numeric views of both matrices, columns in `words` order.
# Working on plain arrays keeps every later operation positional; with pandas
# objects, boolean masks are aligned by index label and column lookups by
# name, which silently cancels a shuffle applied with .iloc.
semantic_values = word2vec_sim_df.iloc[:, 1:].to_numpy(dtype=float)
neural_values = neural_sim_df_reordered[words].to_numpy(dtype=float)

if semantic_values.shape[0] != semantic_values.shape[1]:
    raise ValueError(f"Semantic matrix must be square, got shape {semantic_values.shape}")
if neural_values.shape != semantic_values.shape:
    raise ValueError(
        f"Neural matrix shape {neural_values.shape} does not match "
        f"semantic matrix shape {semantic_values.shape}"
    )

# Initialize a list to store decoding accuracies
decoding_accuracies = []

# Number of iterations for the permutation test
num_iterations = 100  # Adjust this number as needed

for iteration in range(num_iterations):
    # Shuffle the semantic-model correlation matrix: apply one random
    # permutation to rows and columns together while the word labels stay
    # fixed, so word i receives the similarity profile of word word_indices[i].
    word_indices = rng.permutation(len(words))
    shuffled_values = semantic_values[np.ix_(word_indices, word_indices)]

    # Labelled copy of the shuffled matrix, kept for inspection after the loop
    shuffled_semantic_model_df = pd.DataFrame(shuffled_values, index=word2vec_sim_df.index, columns=words)
    shuffled_semantic_model_df.insert(0, 'term', word2vec_sim_df['term'])

    # Decode every unique pair of words against the shuffled semantic model
    decoding_results_df = decode_word_pairs(neural_values, shuffled_values, words)

    # Calculate decoding accuracy for this iteration
    total_accuracy = (decoding_results_df['decode_accuracy_word1'].sum() +
                      decoding_results_df['decode_accuracy_word2'].sum())
    decoding_accuracy = total_accuracy / (2 * len(decoding_results_df))

    # Append the result to the list
    decoding_accuracies.append(decoding_accuracy)

# Convert the list to a DataFrame
decoding_accuracies_df = pd.DataFrame(decoding_accuracies, columns=['Decoding Accuracy'])

# save csv (the file name hard-codes 100; rename it by hand if num_iterations changes)
decoding_accuracies_df.to_csv('perm_p1_GloVE_100.csv')
end_time = time.time()

duration = end_time - start_time
print(f"Total execution time: {duration} seconds")

Total execution time: 381.8034279346466 seconds
