<a href="https://colab.research.google.com/github/Diya910/NLP/blob/main/text_embeddings.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import re
import random
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
import gensim.downloader as api
from sklearn.metrics.pairwise import cosine_similarity
import nltk
from nltk.corpus import stopwords
from sklearn.metrics.pairwise import cosine_similarity
# Fetch (or load from the local gensim-data cache) the pretrained
# 'word2vec-google-news-300' vectors; later cells look tokens up in `wv`.
wv = api.load('word2vec-google-news-300')



In [None]:
# Download stopwords
# Fetches the NLTK stopword corpus; NLTK reports "already up-to-date" when it
# is present locally.
nltk.download('stopwords')

# Set (rather than list) so the per-token membership test in clean() is cheap.
stopwords_set = set(stopwords.words('english'))

def clean(text):
    """Tokenise *text* into lowercase word tokens, dropping stopwords.

    Tokens are the ``\\w+`` runs found in the text; any token whose lowercase
    form is in the module-level ``stopwords_set`` is discarded.  Non-string
    input (for example NaN from an empty cell) yields an empty list.
    """
    if not isinstance(text, str):
        return []
    lowered = (word.lower() for word in re.findall(r'\b\w+\b', text))
    return [word for word in lowered if word not in stopwords_set]

def prepare_data(df):
    """Build the question / chart-text pair table used for retrieval.

    Adds ``question`` and ``concatenated_text`` columns to *df* (as the
    original code did) and returns an independent copy of just those two.

    Args:
        df: DataFrame with the columns ``question_string``, ``Title``,
            ``X-label``, ``Y-label`` and ``legend``.

    Returns:
        A new two-column DataFrame (``question``, ``concatenated_text``) that
        shares the index of *df*.  ``concatenated_text`` is title, axis labels
        and legend joined by single spaces, with missing fields treated as ''.
    """
    df['question'] = df['question_string'].fillna('')

    # Fill missing values per column *before* joining: NaN + str is NaN, so
    # filling only the joined result would blank the whole text as soon as a
    # single field is missing.
    text_columns = ['Title', 'X-label', 'Y-label', 'legend']
    parts = [df[column].fillna('').astype(str) for column in text_columns]
    joined = parts[0]
    for part in parts[1:]:
        joined = joined + ' ' + part
    df['concatenated_text'] = joined

    # Copy so callers can add columns to the result without writing into a
    # view of the input frame (SettingWithCopyWarning).
    return df[['question', 'concatenated_text']].copy()

# Load the evaluation CSV and reduce it to the question / chart-text columns.
df = pd.read_csv("test_3000_14_data.csv", encoding='ISO-8859-1')
df = prepare_data(df)

# Clean the text data
df['question_tokens'] = df['question'].apply(clean)
df['concatenated_text_tokens'] = df['concatenated_text'].apply(clean)
# One list holding every token list (questions first, then chart texts).
# NOTE(review): not referenced again in this cell — confirm it is still needed.
all_tokens = df['question_tokens'].tolist() + df['concatenated_text_tokens'].tolist()

def get_word2vec_embedding(model, tokens):
    """Return the mean vector of the tokens that *model* knows.

    Tokens missing from *model* are ignored.  When none of the tokens are
    known (or *tokens* is empty) a zero vector of ``model.vector_size``
    elements is returned instead.
    """
    known_vectors = []
    for token in tokens:
        if token in model:
            known_vectors.append(model[token])
    if not known_vectors:
        return np.zeros(model.vector_size)
    return np.mean(known_vectors, axis=0)


def calculate_recall_at_k(df, model, k):
    """Estimate Recall@1, Recall@5 and Recall@10 for question-to-text retrieval.

    For every row, the row's own ``concatenated_text_vector`` is ranked
    against up to 199 texts sampled at random from the other rows, using
    cosine similarity to the row's ``question_vector``.  A row counts as a hit
    at cutoff *c* when fewer than *c* candidates score strictly higher than
    the true text, so ties count in favour of the true text.

    Sampling uses the global ``random`` state, so results vary between runs
    unless the caller seeds it.

    Args:
        df: DataFrame with ``question_vector`` and ``concatenated_text_vector``
            columns holding equal-length 1-D vectors.
        model: Unused; kept so existing call sites keep working.
        k: Unused; kept so existing call sites keep working.  The reported
            cutoffs are always 1, 5 and 10.

    Returns:
        Tuple ``(recall_at_1, recall_at_5, recall_at_10)`` of floats in
        [0, 1].  An empty *df* yields ``(0.0, 0.0, 0.0)``.
    """
    num_negatives = 199
    cutoffs = (1, 5, 10)
    hits = {cutoff: 0 for cutoff in cutoffs}
    n = len(df)
    if n == 0:
        return 0.0, 0.0, 0.0

    question_vectors = df['question_vector']
    text_vectors = df['concatenated_text_vector']
    all_indices = list(df.index)

    for i in tqdm(df.index):
        # Distractors: every other row, down-sampled to at most 199.
        other_indices = [j for j in all_indices if j != i]
        if len(other_indices) > num_negatives:
            other_indices = random.sample(other_indices, num_negatives)

        # The true text goes last so its score is similarities[-1].
        candidate_vectors = [text_vectors.loc[j] for j in other_indices]
        candidate_vectors.append(text_vectors.loc[i])

        similarities = cosine_similarity([question_vectors.loc[i]], candidate_vectors).flatten()
        true_similarity = similarities[-1]

        # Rank by counting strictly better candidates.  This replaces reading
        # the last element of an np.partition slice, which is NOT the maximum
        # (the top partition is unordered), and it also works when there are
        # fewer candidates than the cutoff.
        better = int(np.sum(similarities > true_similarity))
        for cutoff in cutoffs:
            if better < cutoff:
                hits[cutoff] += 1

    return hits[1] / n, hits[5] / n, hits[10] / n

# Prepare vectors
# Each row gets one averaged word2vec vector for its question and one for its
# chart text (zero vector when no token is in the model's vocabulary).
df['question_vector'] = df['question_tokens'].apply(lambda tokens: get_word2vec_embedding(wv, tokens))
df['concatenated_text_vector'] = df['concatenated_text_tokens'].apply(lambda tokens: get_word2vec_embedding(wv, tokens))

# Calculate recall for test data
recall_at_1, recall_at_5, recall_at_10 = calculate_recall_at_k(df,wv, 10)

# Report the three recall figures with four decimal places.
print(f"Recall@1: {recall_at_1:.4f}")
print(f"Recall@5: {recall_at_5:.4f}")
print(f"Recall@10: {recall_at_10:.4f}")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


  0%|          | 0/3000 [00:00<?, ?it/s]

Recall@1: 0.7340
Recall@5: 0.7950
Recall@10: 0.8707


In [None]:
import re
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
import nltk
from nltk.corpus import stopwords
from sklearn.metrics.pairwise import cosine_similarity
from gensim.models import KeyedVectors
from gensim.downloader import load

# Download stopwords
# Fetches the NLTK stopword corpus; NLTK reports "already up-to-date" when it
# is present locally.
nltk.download('stopwords')

# Set (rather than list) so the per-token membership test in clean() is cheap.
stopwords_set = set(stopwords.words('english'))

# Function to clean text
def clean(text):
    """Return the lowercase, stopword-free word tokens of *text*.

    Words are the ``\\w+`` runs in the string.  Anything that is not a ``str``
    (such as a NaN cell) produces an empty list.
    """
    if not isinstance(text, str):
        return []
    kept = []
    for raw in re.findall(r'\b\w+\b', text):
        word = raw.lower()
        if word not in stopwords_set:
            kept.append(word)
    return kept

# Function to prepare data
def prepare_data(df):
    """Build the question / chart-text pair table used for retrieval.

    Adds ``question`` and ``concatenated_text`` columns to *df* (as the
    original code did) and returns an independent copy of just those two.

    Args:
        df: DataFrame with the columns ``question_string``, ``Title``,
            ``X-label``, ``Y-label`` and ``legend``.

    Returns:
        A new two-column DataFrame (``question``, ``concatenated_text``) that
        shares the index of *df*.  ``concatenated_text`` is title, axis labels
        and legend joined by single spaces, with missing fields treated as ''.
    """
    df['question'] = df['question_string'].fillna('')

    # Fill missing values per column *before* joining: NaN + str is NaN, so
    # filling only the joined result would blank the whole text as soon as a
    # single field is missing.
    text_columns = ['Title', 'X-label', 'Y-label', 'legend']
    parts = [df[column].fillna('').astype(str) for column in text_columns]
    joined = parts[0]
    for part in parts[1:]:
        joined = joined + ' ' + part
    df['concatenated_text'] = joined

    # Copy so callers can add columns to the result without writing into a
    # view of the input frame (SettingWithCopyWarning).
    return df[['question', 'concatenated_text']].copy()

# Function to load GloVe embeddings
def load_glove_model():
    """Return the pretrained 'glove-wiki-gigaword-100' vectors from gensim's downloader."""
    return load('glove-wiki-gigaword-100')

def get_glove_embedding(model, tokens):
    """Average the GloVe vectors of the tokens present in *model*.

    A token is considered present when it is a key of ``model.key_to_index``.
    If no token is present, a zero vector with ``model.vector_size`` elements
    is returned.
    """
    found = []
    for token in tokens:
        if token in model.key_to_index:
            found.append(model.get_vector(token))
    if not found:
        return np.zeros(model.vector_size)
    return np.mean(found, axis=0)

# Function to calculate recall at k
def calculate_recall_at_k(df, model, k):
    """Estimate Recall@1, Recall@5 and Recall@10 for question-to-text retrieval.

    For every row, the row's own ``concatenated_text_vector`` is ranked
    against up to 199 texts sampled at random from the other rows, using
    cosine similarity to the row's ``question_vector``.  A row counts as a hit
    at cutoff *c* when fewer than *c* candidates score strictly higher than
    the true text, so ties count in favour of the true text.

    Sampling uses the global ``random`` state, so results vary between runs
    unless the caller seeds it.

    Args:
        df: DataFrame with ``question_vector`` and ``concatenated_text_vector``
            columns holding equal-length 1-D vectors.
        model: Unused; kept so existing call sites keep working.
        k: Unused; kept so existing call sites keep working.  The reported
            cutoffs are always 1, 5 and 10.

    Returns:
        Tuple ``(recall_at_1, recall_at_5, recall_at_10)`` of floats in
        [0, 1].  An empty *df* yields ``(0.0, 0.0, 0.0)``.
    """
    num_negatives = 199
    cutoffs = (1, 5, 10)
    hits = {cutoff: 0 for cutoff in cutoffs}
    n = len(df)
    if n == 0:
        return 0.0, 0.0, 0.0

    question_vectors = df['question_vector']
    text_vectors = df['concatenated_text_vector']
    all_indices = list(df.index)

    for i in tqdm(df.index):
        # Distractors: every other row, down-sampled to at most 199.
        other_indices = [j for j in all_indices if j != i]
        if len(other_indices) > num_negatives:
            other_indices = random.sample(other_indices, num_negatives)

        # The true text goes last so its score is similarities[-1].
        candidate_vectors = [text_vectors.loc[j] for j in other_indices]
        candidate_vectors.append(text_vectors.loc[i])

        similarities = cosine_similarity([question_vectors.loc[i]], candidate_vectors).flatten()
        true_similarity = similarities[-1]

        # Rank by counting strictly better candidates.  This replaces reading
        # the last element of an np.partition slice, which is NOT the maximum
        # (the top partition is unordered), and it also works when there are
        # fewer candidates than the cutoff.
        better = int(np.sum(similarities > true_similarity))
        for cutoff in cutoffs:
            if better < cutoff:
                hits[cutoff] += 1

    return hits[1] / n, hits[5] / n, hits[10] / n

# Load and prepare the data
df = pd.read_csv("test_3000_14_data.csv", encoding='ISO-8859-1')
df = prepare_data(df)

# Clean the text data
df['question_tokens'] = df['question'].apply(clean)
df['concatenated_text_tokens'] = df['concatenated_text'].apply(clean)

# Load GloVe model
glove_model = load_glove_model()

# Prepare vectors using GloVe embeddings
# Each row gets one averaged vector for its question and one for its chart
# text (zero vector when no token is in the model's vocabulary).
df['question_vector'] = df['question_tokens'].apply(lambda tokens: get_glove_embedding(glove_model, tokens))
df['concatenated_text_vector'] = df['concatenated_text_tokens'].apply(lambda tokens: get_glove_embedding(glove_model, tokens))

# Calculate recall for the dataset
recall_at_1, recall_at_5, recall_at_10 = calculate_recall_at_k(df, glove_model, 10)

# Report the three recall figures with four decimal places.
print(f"Recall@1: {recall_at_1:.4f}")
print(f"Recall@5: {recall_at_5:.4f}")
print(f"Recall@10: {recall_at_10:.4f}")


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


  0%|          | 0/3000 [00:00<?, ?it/s]

Recall@1: 0.6367
Recall@5: 0.6897
Recall@10: 0.7780


In [None]:
import re
import random
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
import gensim.downloader as api
from sklearn.metrics.pairwise import cosine_similarity
import nltk
from nltk.corpus import stopwords
from sklearn.metrics.pairwise import cosine_similarity
# Fetch (or load from the local gensim-data cache) the pretrained
# 'fasttext-wiki-news-subwords-300' vectors; later cells look tokens up in `ft`.
ft = api.load('fasttext-wiki-news-subwords-300')



In [None]:
# Download stopwords
# Fetches the NLTK stopword corpus (skipped by NLTK when already up to date).
nltk.download('stopwords')

# Set (rather than list) so the per-token membership test in clean() is cheap.
stopwords_set = set(stopwords.words('english'))

def clean(text):
    """Tokenise *text* into lowercase word tokens, dropping stopwords.

    Tokens are the ``\\w+`` runs found in the text; any token whose lowercase
    form is in the module-level ``stopwords_set`` is discarded.  Non-string
    input (for example NaN from an empty cell) yields an empty list.
    """
    if not isinstance(text, str):
        return []
    lowered = (word.lower() for word in re.findall(r'\b\w+\b', text))
    return [word for word in lowered if word not in stopwords_set]

def prepare_data(df):
    """Build the question / chart-text pair table used for retrieval.

    Adds ``question`` and ``concatenated_text`` columns to *df* (as the
    original code did) and returns an independent copy of just those two.

    Args:
        df: DataFrame with the columns ``question_string``, ``Title``,
            ``X-label``, ``Y-label`` and ``legend``.

    Returns:
        A new two-column DataFrame (``question``, ``concatenated_text``) that
        shares the index of *df*.  ``concatenated_text`` is title, axis labels
        and legend joined by single spaces, with missing fields treated as ''.
    """
    df['question'] = df['question_string'].fillna('')

    # Fill missing values per column *before* joining: NaN + str is NaN, so
    # filling only the joined result would blank the whole text as soon as a
    # single field is missing.
    text_columns = ['Title', 'X-label', 'Y-label', 'legend']
    parts = [df[column].fillna('').astype(str) for column in text_columns]
    joined = parts[0]
    for part in parts[1:]:
        joined = joined + ' ' + part
    df['concatenated_text'] = joined

    # Copy so callers can add columns to the result without writing into a
    # view of the input frame (SettingWithCopyWarning).
    return df[['question', 'concatenated_text']].copy()

# Load the evaluation CSV and reduce it to the question / chart-text columns.
df = pd.read_csv("test_3000_14_data.csv", encoding='ISO-8859-1')
df = prepare_data(df)

# Clean the text data
df['question_tokens'] = df['question'].apply(clean)
df['concatenated_text_tokens'] = df['concatenated_text'].apply(clean)
# One list holding every token list (questions first, then chart texts).
# NOTE(review): not referenced again in this cell — confirm it is still needed.
all_tokens = df['question_tokens'].tolist() + df['concatenated_text_tokens'].tolist()

def get_fasttext_embedding(model, tokens):
    """Return the mean fastText vector over the tokens found in *model*.

    Tokens for which ``token in model`` is false are skipped.  With no usable
    token the result is a zero vector of ``model.vector_size`` elements.
    """
    in_vocab = [token for token in tokens if token in model]
    if not in_vocab:
        return np.zeros(model.vector_size)
    return np.mean([model[token] for token in in_vocab], axis=0)


def calculate_recall_at_k(df, model, k):
    """Estimate Recall@1, Recall@5 and Recall@10 for question-to-text retrieval.

    For every row, the row's own ``concatenated_text_vector`` is ranked
    against up to 199 texts sampled at random from the other rows, using
    cosine similarity to the row's ``question_vector``.  A row counts as a hit
    at cutoff *c* when fewer than *c* candidates score strictly higher than
    the true text, so ties count in favour of the true text.

    Sampling uses the global ``random`` state, so results vary between runs
    unless the caller seeds it.

    Args:
        df: DataFrame with ``question_vector`` and ``concatenated_text_vector``
            columns holding equal-length 1-D vectors.
        model: Unused; kept so existing call sites keep working.
        k: Unused; kept so existing call sites keep working.  The reported
            cutoffs are always 1, 5 and 10.

    Returns:
        Tuple ``(recall_at_1, recall_at_5, recall_at_10)`` of floats in
        [0, 1].  An empty *df* yields ``(0.0, 0.0, 0.0)``.
    """
    num_negatives = 199
    cutoffs = (1, 5, 10)
    hits = {cutoff: 0 for cutoff in cutoffs}
    n = len(df)
    if n == 0:
        return 0.0, 0.0, 0.0

    question_vectors = df['question_vector']
    text_vectors = df['concatenated_text_vector']
    all_indices = list(df.index)

    for i in tqdm(df.index):
        # Distractors: every other row, down-sampled to at most 199.
        other_indices = [j for j in all_indices if j != i]
        if len(other_indices) > num_negatives:
            other_indices = random.sample(other_indices, num_negatives)

        # The true text goes last so its score is similarities[-1].
        candidate_vectors = [text_vectors.loc[j] for j in other_indices]
        candidate_vectors.append(text_vectors.loc[i])

        similarities = cosine_similarity([question_vectors.loc[i]], candidate_vectors).flatten()
        true_similarity = similarities[-1]

        # Rank by counting strictly better candidates.  This replaces reading
        # the last element of an np.partition slice, which is NOT the maximum
        # (the top partition is unordered), and it also works when there are
        # fewer candidates than the cutoff.
        better = int(np.sum(similarities > true_similarity))
        for cutoff in cutoffs:
            if better < cutoff:
                hits[cutoff] += 1

    return hits[1] / n, hits[5] / n, hits[10] / n

# Prepare vectors
# Each row gets one averaged fastText vector for its question and one for its
# chart text (zero vector when no token is in the model's vocabulary).
df['question_vector'] = df['question_tokens'].apply(lambda tokens: get_fasttext_embedding(ft, tokens))
df['concatenated_text_vector'] = df['concatenated_text_tokens'].apply(lambda tokens: get_fasttext_embedding(ft, tokens))

# Calculate recall for test data
# Pass the fastText model (`ft`) that produced the vectors above.  The previous
# `wv` argument is the word2vec model from a different cell and raises
# NameError when that cell has not been run in the current session.
recall_at_1, recall_at_5, recall_at_10 = calculate_recall_at_k(df, ft, 10)

# Report the three recall figures with four decimal places.
print(f"Recall@1: {recall_at_1:.4f}")
print(f"Recall@5: {recall_at_5:.4f}")
print(f"Recall@10: {recall_at_10:.4f}")

In [None]:
import numpy as np
import pandas as pd
from collections import defaultdict, Counter
from itertools import product

In [None]:
# Corpus used to train the custom GloVe embeddings in the cells below.
df = pd.read_csv("data_file_20k.csv")

In [None]:
df.head()

Unnamed: 0,index,question_string,image_index,label(True or False,Title,X-label,Y-label,X-tick,Y-tick,Legend,qid,template,answer,type,question_id
0,0,Is the sum of the percentage of females employ...,26010,1,Percentage of female workers employed in Indus...,Employment (as % of female employment),Country,0 10 20 30 40 50,France Hong Kong Israel,1980 1981,CD10,compound,No,hbar,1101082
1,1,"In how many years, is the ppp conversion facto...",26011,1,PPP conversion factor for GDP and private cons...,PPP conversion factor (LCU per international $),Year,0 50 100 150 200,2000 2001 2002 2003 2004 2005,GDP Private consumption,C4,comparison,6,hbar,1101085
2,2,What is the ratio of the employment in public ...,26012,1,Total employment in public sector,Employment (as % of total employment),Country,0 5 10 15 20 25 30 35,Philippines Portugal Romania,1980 1981,C5,comparison,0.9272237989409738,hbar,1101094
3,3,Is the net bilateral aid flow in Ghana in 2009...,26013,1,Net bilateral aid flow in an economy from Greece,Aid flow (current US$),Years,0 20000000 40000000 60000000 80000000 10000000...,2007 2008 2009 2010,Europe(all income levels) Ghana,C6,comparison,No,hbar,1101103
4,4,Is the sum of the merchandise exports in Bulga...,26014,1,Trade statistics with developing economies of ...,Trade with economies of Sub-Saharan Africa(%),Country,0 5 10 15 20 25,Brazil Bulgaria Burkina Faso Burundi Cabo Verde,Merchandise exports Merchandise imports,CD10,compound,Yes,hbar,1101112


In [None]:
# Join the question with the chart metadata into one training string per row.
# Missing fields are replaced by '' first: NaN + str is NaN, so a single empty
# cell would otherwise turn the whole row into NaN and make the spaCy
# preprocessing step fail on a non-string value.
text_columns = ['question_string', 'Title', 'X-label', 'Y-label', 'Legend']
filled = df[text_columns].fillna('').astype(str)
joined = filled[text_columns[0]]
for column in text_columns[1:]:
    joined = joined + ' ' + filled[column]
df['concatenaed_string'] = joined

In [None]:
import spacy
nlp = spacy.load('en_core_web_sm')
def preprocess(text):
    """Lemmatise *text* with the module-level spaCy pipeline ``nlp``.

    Stop words and punctuation tokens are dropped; the lemmas of the
    remaining tokens are returned joined by single spaces.
    """
    lemmas = []
    for token in nlp(text):
        if token.is_stop or token.is_punct:
            continue
        lemmas.append(token.lemma_)
    return " ".join(lemmas)
df['concatenaed_string'] = df['concatenaed_string'].apply(preprocess)



In [None]:
# Independent single-column copy so later cells can add columns without
# touching `df`.
new_df = df[['concatenaed_string']].copy()

In [None]:
new_df.head()

Unnamed: 0,concatenaed_string
0,sum percentage female employ industrial sector...
1,year ppp conversion factor private consumption...
2,ratio employment public sector 1981 Philippine...
3,net bilateral aid flow Ghana 2009 2010 net bil...
4,sum merchandise export Bulgaria Burkina Faso g...


In [None]:
from nltk.tokenize import word_tokenize
from collections import Counter
import nltk

# Tokenizer data required by word_tokenize.
nltk.download('punkt')

# Tokenize and create a list of words
# Reads from `df` and writes to `new_df`; assignment aligns on the index.
new_df['tokens'] = df['concatenaed_string'].apply(word_tokenize)

# Flatten the per-row token lists into one list of words
all_tokens = [word for sublist in new_df['tokens'].tolist() for word in sublist]

# Count word frequencies
word_counts = Counter(all_tokens)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
from collections import Counter

from scipy.sparse import coo_matrix
import numpy as np

# Define a context window size
window_size = 5

# Create a mapping from words to indices
vocab = list(word_counts.keys())
word_to_index = {word: i for i, word in enumerate(vocab)}

# Count co-occurrences sparsely, keyed by (token_index, context_index).
# A dense len(vocab) x len(vocab) float64 array grows quadratically with the
# vocabulary and is almost entirely zeros, so it is never materialised.
pair_counts = Counter()
for tokens in new_df['tokens']:
    for i, token in enumerate(tokens):
        token_index = word_to_index[token]
        # Up to `window_size` tokens on each side of position i.
        window = tokens[max(0, i - window_size): i + window_size + 1]
        for context_word in window:
            # Skip the token itself (and any repeat of the same word).
            if context_word != token:
                pair_counts[token_index, word_to_index[context_word]] += 1

# Build the sparse matrix directly.  Sorting the (row, col) keys reproduces
# the row-major entry order obtained when converting a dense array.
sorted_pairs = sorted(pair_counts)
rows = [row for row, _ in sorted_pairs]
cols = [col for _, col in sorted_pairs]
data = [pair_counts[pair] for pair in sorted_pairs]
co_occurrence = coo_matrix(
    (data, (rows, cols)),
    shape=(len(vocab), len(vocab)),
    dtype=np.float64,
)


In [None]:
!pip install glove-python3

Collecting glove-python3
  Downloading glove_python3-0.1.0.tar.gz (326 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m327.0/327.0 kB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: glove-python3
  Building wheel for glove-python3 (setup.py) ... [?25l[?25hdone
  Created wheel for glove-python3: filename=glove_python3-0.1.0-cp310-cp310-linux_x86_64.whl size=1064151 sha256=f227ed8abb6fdb3441e6ba2554379b954740c1b2ff5c3bc451bbdc716d441456
  Stored in directory: /root/.cache/pip/wheels/fe/2f/79/34314d44a0907e90e323c8c182ec23f126eb460829e02d98cf
Successfully built glove-python3
Installing collected packages: glove-python3
Successfully installed glove-python3-0.1.0


In [None]:
from glove import Glove

# Train the GloVe model
# 100 components per word, fitted on the sparse `co_occurrence` matrix.
glove_model = Glove(no_components=100, learning_rate=0.05)
glove_model.fit(co_occurrence, epochs=25, no_threads=4, verbose=True)
# Attach the word -> row-index mapping used to build the matrix.
glove_model.add_dictionary(word_to_index)


Performing 25 training epochs with 4 threads
Epoch 0
Epoch 1
Epoch 2
Epoch 3
Epoch 4
Epoch 5
Epoch 6
Epoch 7
Epoch 8
Epoch 9
Epoch 10
Epoch 11
Epoch 12
Epoch 13
Epoch 14
Epoch 15
Epoch 16
Epoch 17
Epoch 18
Epoch 19
Epoch 20
Epoch 21
Epoch 22
Epoch 23
Epoch 24


In [None]:
import numpy as np

# Hyperparameters
vector_size = 100   # embedding dimensionality
alpha = 0.75        # exponent of the co-occurrence weighting function
x_max = 100         # counts at or above this get the full weight of 1.0
learning_rate = 0.05
epochs = 25

# Initialize word vectors and biases
# Small zero-centred values keep the initial dot products near 0.  Sampling
# every component from [0, 1) makes a 100-dimensional dot product start around
# 25, so the first updates overshoot and the vectors overflow to inf/NaN.
word_vectors = (np.random.rand(len(vocab), vector_size) - 0.5) / vector_size
word_biases = np.zeros(len(vocab))

# Training loop: plain SGD on the weighted least-squares GloVe objective,
# using one shared vector per word for both the word and context roles.
for epoch in range(epochs):
    for i, j, count in zip(co_occurrence.row, co_occurrence.col, co_occurrence.data):
        weight = min(1.0, (count / x_max) ** alpha)
        cost = np.dot(word_vectors[i], word_vectors[j]) + word_biases[i] + word_biases[j] - np.log(count)
        step = learning_rate * weight * cost
        # Compute both gradients from the pre-update vectors; updating i first
        # and then reusing it for j would apply a stale/incorrect gradient.
        grad_i = step * word_vectors[j]
        grad_j = step * word_vectors[i]
        word_vectors[i] -= grad_i
        word_vectors[j] -= grad_j
        word_biases[i] -= step
        word_biases[j] -= step
    print(f'Epoch {epoch + 1} completed')

# Normalize vectors to unit length; rows with zero norm are left unchanged
# instead of being divided by zero.
norms = np.linalg.norm(word_vectors, axis=1, keepdims=True)
norms[norms == 0] = 1.0
word_vectors /= norms


  word_vectors[j] -= learning_rate * weight * cost * word_vectors[i]


Epoch 1 completed
Epoch 2 completed
Epoch 3 completed
Epoch 4 completed
Epoch 5 completed
Epoch 6 completed
Epoch 7 completed
Epoch 8 completed
Epoch 9 completed
Epoch 10 completed
Epoch 11 completed
Epoch 12 completed
Epoch 13 completed
Epoch 14 completed
Epoch 15 completed
Epoch 16 completed
Epoch 17 completed
Epoch 18 completed
Epoch 19 completed
Epoch 20 completed
Epoch 21 completed
Epoch 22 completed
Epoch 23 completed
Epoch 24 completed
Epoch 25 completed


In [None]:
import pickle

# Load the pickle file
# SECURITY: unpickling executes arbitrary code embedded in the file; only load
# 'glove_model.pkl' when it comes from a trusted source.
with open('glove_model.pkl', 'rb') as f:
    word_embeddings = pickle.load(f)

# Check the type of the data
print(type(word_embeddings))

# If it's a list or other iterable, inspect the first few elements
if isinstance(word_embeddings, list):
    print(word_embeddings[:5])
elif isinstance(word_embeddings, dict):
    print(list(word_embeddings.items())[:5])
else:
    # Any other object (the recorded run printed a glove.glove.Glove instance).
    print(word_embeddings)


<class 'glove.glove.Glove'>
<glove.glove.Glove object at 0x7d130c577ac0>


In [None]:
# `words` and `vectors` were used below without ever being defined.  Take them
# from the Glove object unpickled into `word_embeddings`: `dictionary` maps
# token -> row index and `word_vectors` holds one embedding per row.
# NOTE(review): confirm `word_embeddings` (not `glove_model`) is the intended source.
words = word_embeddings.dictionary
vectors = word_embeddings.word_vectors

# Open a file to write
# Written as UTF-8 because load_glove_format() reads the file back as UTF-8.
with open('glove_model.txt', 'w', encoding='utf-8') as f:
    for word, index in words.items():
        # Get the vector for the word
        vector = vectors[index]
        # Convert the vector to a space-separated string
        vector_str = ' '.join(map(str, vector))
        # Write the word and vector to the file
        f.write(f"{word} {vector_str}\n")

from gensim.models.keyedvectors import KeyedVectors

def load_glove_format(file_path):
    """Read a GloVe-style text file into a gensim ``KeyedVectors`` instance.

    Every non-blank line must hold a token followed by its
    whitespace-separated float components.  When a token occurs more than
    once, the last occurrence wins.

    Args:
        file_path: Path to the UTF-8 encoded text file.

    Returns:
        A ``KeyedVectors`` holding every token found in the file.

    Raises:
        ValueError: If the file contains no vectors, a line has a token but
            no components, or a line's dimensionality differs from the
            first vector's.
    """
    vectors = {}
    vector_size = None
    with open(file_path, 'r', encoding='utf-8') as f:
        for line_number, line in enumerate(f, start=1):
            parts = line.split()
            if not parts:
                # Blank lines (e.g. a trailing newline) carry no vector.
                continue
            word = parts[0]
            vector = [float(value) for value in parts[1:]]
            if not vector:
                raise ValueError(
                    f"{file_path}:{line_number}: token {word!r} has no vector components"
                )
            if vector_size is None:
                # The first vector fixes the dimensionality for the file.
                vector_size = len(vector)
            elif len(vector) != vector_size:
                raise ValueError(
                    f"{file_path}:{line_number}: expected {vector_size} "
                    f"components, found {len(vector)}"
                )
            vectors[word] = vector

    if not vectors:
        raise ValueError(f"no word vectors found in {file_path!r}")

    # Create a KeyedVectors instance and add all words in one call
    glove_vectors = KeyedVectors(vector_size=vector_size)
    glove_vectors.add_vectors(list(vectors.keys()), list(vectors.values()))

    return glove_vectors

# Load GloVe vectors
# Rebinds `glove_model` to a gensim KeyedVectors built from the exported text
# file, so later cells can use `token in glove_model` / `glove_model[token]`.
glove_model = load_glove_format('glove_model.txt')


In [None]:
import re
import pickle
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
import nltk
from nltk.corpus import stopwords
from sklearn.metrics.pairwise import cosine_similarity
from gensim.models import KeyedVectors
from gensim.downloader import load
import random

# Download stopwords
# Fetches the NLTK stopword corpus; NLTK reports "already up-to-date" when it
# is present locally.
nltk.download('stopwords')

# Set (rather than list) so the per-token membership test in clean() is cheap.
stopwords_set = set(stopwords.words('english'))
# Function to clean text
def clean(text):
    """Return the lowercase, stopword-free word tokens of *text*.

    Words are the ``\\w+`` runs in the string.  Anything that is not a ``str``
    (such as a NaN cell) produces an empty list.
    """
    if not isinstance(text, str):
        return []
    kept = []
    for raw in re.findall(r'\b\w+\b', text):
        word = raw.lower()
        if word not in stopwords_set:
            kept.append(word)
    return kept
# Function to get embedding for a word
def get_embedding(tokens, glove_model):
    """Average the vectors of the tokens that *glove_model* contains.

    Tokens for which ``token in glove_model`` is false are skipped.  If no
    token qualifies, a zero vector of ``glove_model.vector_size`` elements is
    returned.
    """
    embeddings = [glove_model[token] for token in tokens if token in glove_model]
    if not embeddings:
        return np.zeros(glove_model.vector_size)
    return np.mean(embeddings, axis=0)  # element-wise average

# Function to prepare data
def prepare_data(df):
    """Build the question / chart-text pair table used for retrieval.

    Adds ``question`` and ``concatenated_text`` columns to *df* (as the
    original code did) and returns an independent copy of just those two.

    Args:
        df: DataFrame with the columns ``question_string``, ``Title``,
            ``X-label``, ``Y-label`` and ``legend``.

    Returns:
        A new two-column DataFrame (``question``, ``concatenated_text``) that
        shares the index of *df*.  ``concatenated_text`` is title, axis labels
        and legend joined by single spaces, with missing fields treated as ''.
    """
    df['question'] = df['question_string'].fillna('')

    # Fill missing values per column *before* joining: NaN + str is NaN, so
    # filling only the joined result would blank the whole text as soon as a
    # single field is missing.
    text_columns = ['Title', 'X-label', 'Y-label', 'legend']
    parts = [df[column].fillna('').astype(str) for column in text_columns]
    joined = parts[0]
    for part in parts[1:]:
        joined = joined + ' ' + part
    df['concatenated_text'] = joined

    # Copy so callers can add columns to the result without writing into a
    # view of the input frame (SettingWithCopyWarning).
    return df[['question', 'concatenated_text']].copy()


# Function to calculate recall at k
def calculate_recall_at_k(df, model, k):
    """Estimate Recall@1, Recall@5 and Recall@10 for question-to-text retrieval.

    For every row, the row's own ``concatenated_text_vector`` is ranked
    against up to 199 texts sampled at random from the other rows, using
    cosine similarity to the row's ``question_vector``.  A row counts as a hit
    at cutoff *c* when fewer than *c* candidates score strictly higher than
    the true text, so ties count in favour of the true text.

    Sampling uses the global ``random`` state, so results vary between runs
    unless the caller seeds it.

    Args:
        df: DataFrame with ``question_vector`` and ``concatenated_text_vector``
            columns holding equal-length 1-D vectors.
        model: Unused; kept so existing call sites keep working.
        k: Unused; kept so existing call sites keep working.  The reported
            cutoffs are always 1, 5 and 10.

    Returns:
        Tuple ``(recall_at_1, recall_at_5, recall_at_10)`` of floats in
        [0, 1].  An empty *df* yields ``(0.0, 0.0, 0.0)``.
    """
    num_negatives = 199
    cutoffs = (1, 5, 10)
    hits = {cutoff: 0 for cutoff in cutoffs}
    n = len(df)
    if n == 0:
        return 0.0, 0.0, 0.0

    question_vectors = df['question_vector']
    text_vectors = df['concatenated_text_vector']
    all_indices = list(df.index)

    for i in tqdm(df.index):
        # Distractors: every other row, down-sampled to at most 199.
        other_indices = [j for j in all_indices if j != i]
        if len(other_indices) > num_negatives:
            other_indices = random.sample(other_indices, num_negatives)

        # The true text goes last so its score is similarities[-1].
        candidate_vectors = [text_vectors.loc[j] for j in other_indices]
        candidate_vectors.append(text_vectors.loc[i])

        similarities = cosine_similarity([question_vectors.loc[i]], candidate_vectors).flatten()
        true_similarity = similarities[-1]

        # Rank by counting strictly better candidates.  This replaces reading
        # the last element of an np.partition slice, which is NOT the maximum
        # (the top partition is unordered), and it also works when there are
        # fewer candidates than the cutoff.
        better = int(np.sum(similarities > true_similarity))
        for cutoff in cutoffs:
            if better < cutoff:
                hits[cutoff] += 1

    return hits[1] / n, hits[5] / n, hits[10] / n

# Load and prepare the data
df = pd.read_csv("test_3000_14_data.csv", encoding='ISO-8859-1')
df = prepare_data(df)

# Clean the text data
df['question_tokens'] = df['question'].apply(clean)
df['concatenated_text_tokens'] = df['concatenated_text'].apply(clean)


# Prepare vectors using GloVe embeddings
# `glove_model` is whatever an earlier cell bound to that name.
# NOTE(review): presumably the KeyedVectors loaded from 'glove_model.txt' — confirm.
df['question_vector'] = df['question_tokens'].apply(lambda tokens: get_embedding(tokens , glove_model))
df['concatenated_text_vector'] = df['concatenated_text_tokens'].apply(lambda tokens: get_embedding(tokens , glove_model))

# Calculate recall for the dataset
recall_at_1, recall_at_5, recall_at_10 = calculate_recall_at_k(df, glove_model, 10)

# Report the three recall figures with four decimal places.
print(f"Recall@1: {recall_at_1:.4f}")
print(f"Recall@5: {recall_at_5:.4f}")
print(f"Recall@10: {recall_at_10:.4f}")


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


  0%|          | 0/3000 [00:00<?, ?it/s]

Recall@1: 0.4420
Recall@5: 0.4850
Recall@10: 0.5930


In [None]:
import re
import pickle
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
import nltk
from nltk.corpus import stopwords
from sklearn.metrics.pairwise import cosine_similarity
from gensim.models import KeyedVectors
from gensim.downloader import load
import random

# Download stopwords
# Fetches the NLTK stopword corpus (skipped by NLTK when already up to date).
nltk.download('stopwords')

# Set (rather than list) so the per-token membership test in clean() is cheap.
stopwords_set = set(stopwords.words('english'))
# Function to clean text
def clean(text):
    """Tokenise *text* into lowercase word tokens, dropping stopwords.

    Tokens are the ``\\w+`` runs found in the text; any token whose lowercase
    form is in the module-level ``stopwords_set`` is discarded.  Non-string
    input (for example NaN from an empty cell) yields an empty list.
    """
    if not isinstance(text, str):
        return []
    lowered = (word.lower() for word in re.findall(r'\b\w+\b', text))
    return [word for word in lowered if word not in stopwords_set]
# Function to get embedding for a word
def get_embedding(tokens, glove_model):
    """Return the mean embedding of the tokens known to *glove_model*.

    Unknown tokens are ignored.  When nothing is known (or *tokens* is empty)
    the result is a zero vector with ``glove_model.vector_size`` elements.
    """
    known = [token for token in tokens if token in glove_model]
    if known:
        return np.mean([glove_model[token] for token in known], axis=0)
    return np.zeros(glove_model.vector_size)

# Function to prepare data
def prepare_data(df):
    """Build the question / chart-text pair table used for retrieval.

    Adds ``question`` and ``concatenated_text`` columns to *df* (as the
    original code did) and returns an independent copy of just those two.

    Args:
        df: DataFrame with the columns ``question_string``, ``Title``,
            ``X-label``, ``Y-label`` and ``legend``.

    Returns:
        A new two-column DataFrame (``question``, ``concatenated_text``) that
        shares the index of *df*.  ``concatenated_text`` is title, axis labels
        and legend joined by single spaces, with missing fields treated as ''.
    """
    df['question'] = df['question_string'].fillna('')

    # Fill missing values per column *before* joining: NaN + str is NaN, so
    # filling only the joined result would blank the whole text as soon as a
    # single field is missing.
    text_columns = ['Title', 'X-label', 'Y-label', 'legend']
    parts = [df[column].fillna('').astype(str) for column in text_columns]
    joined = parts[0]
    for part in parts[1:]:
        joined = joined + ' ' + part
    df['concatenated_text'] = joined

    # Copy so callers can add columns to the result without writing into a
    # view of the input frame (SettingWithCopyWarning).
    return df[['question', 'concatenated_text']].copy()


# Function to calculate recall at k
def calculate_recall_at_k(df, model, k):
    """Estimate Recall@1, Recall@5 and Recall@10 for question-to-text retrieval.

    For every row, the row's own ``concatenated_text_vector`` is ranked
    against up to 199 texts sampled at random from the other rows, using
    cosine similarity to the row's ``question_vector``.  A row counts as a hit
    at cutoff *c* when fewer than *c* candidates score strictly higher than
    the true text, so ties count in favour of the true text.

    Sampling uses the global ``random`` state, so results vary between runs
    unless the caller seeds it.

    Args:
        df: DataFrame with ``question_vector`` and ``concatenated_text_vector``
            columns holding equal-length 1-D vectors.
        model: Unused; kept so existing call sites keep working.
        k: Unused; kept so existing call sites keep working.  The reported
            cutoffs are always 1, 5 and 10.

    Returns:
        Tuple ``(recall_at_1, recall_at_5, recall_at_10)`` of floats in
        [0, 1].  An empty *df* yields ``(0.0, 0.0, 0.0)``.
    """
    # One constant drives both the "enough rows?" check and the sample size.
    # The earlier code compared against 799 while sampling 199, so frames with
    # 200-799 other rows silently used every row as a distractor.
    num_negatives = 199
    cutoffs = (1, 5, 10)
    hits = {cutoff: 0 for cutoff in cutoffs}
    n = len(df)
    if n == 0:
        return 0.0, 0.0, 0.0

    question_vectors = df['question_vector']
    text_vectors = df['concatenated_text_vector']
    all_indices = list(df.index)

    for i in tqdm(df.index):
        # Distractors: every other row, down-sampled to at most 199.
        other_indices = [j for j in all_indices if j != i]
        if len(other_indices) > num_negatives:
            other_indices = random.sample(other_indices, num_negatives)

        # The true text goes last so its score is similarities[-1].
        candidate_vectors = [text_vectors.loc[j] for j in other_indices]
        candidate_vectors.append(text_vectors.loc[i])

        similarities = cosine_similarity([question_vectors.loc[i]], candidate_vectors).flatten()
        true_similarity = similarities[-1]

        # Rank by counting strictly better candidates.  This replaces reading
        # the last element of an np.partition slice, which is NOT the maximum
        # (the top partition is unordered), and it also works when there are
        # fewer candidates than the cutoff.
        better = int(np.sum(similarities > true_similarity))
        for cutoff in cutoffs:
            if better < cutoff:
                hits[cutoff] += 1

    return hits[1] / n, hits[5] / n, hits[10] / n

# Load and prepare the data
df = pd.read_csv("test_3000_14_data.csv", encoding='ISO-8859-1')
df = prepare_data(df)

# Clean the text data
df['question_tokens'] = df['question'].apply(clean)
df['concatenated_text_tokens'] = df['concatenated_text'].apply(clean)


# Prepare vectors using GloVe embeddings
# `glove_model` is whatever an earlier cell bound to that name.
# NOTE(review): presumably the KeyedVectors loaded from 'glove_model.txt' — confirm.
df['question_vector'] = df['question_tokens'].apply(lambda tokens: get_embedding(tokens , glove_model))
df['concatenated_text_vector'] = df['concatenated_text_tokens'].apply(lambda tokens: get_embedding(tokens , glove_model))

# Calculate recall for the dataset
recall_at_1, recall_at_5, recall_at_10 = calculate_recall_at_k(df, glove_model, 10)

# Report the three recall figures with four decimal places.
print(f"Recall@1: {recall_at_1:.4f}")
print(f"Recall@5: {recall_at_5:.4f}")
print(f"Recall@10: {recall_at_10:.4f}")


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


  0%|          | 0/3000 [00:00<?, ?it/s]

Recall@1: 0.4387
Recall@5: 0.4847
Recall@10: 0.5880
