DistilBERT(model_1)

In [None]:
import pandas as pd
from transformers import DistilBertTokenizer, DistilBertModel
from sklearn.metrics.pairwise import cosine_similarity
import torch

# Load the dataset containing the text documents from 'news_articles.csv'
dataset = pd.read_csv('news_articles.csv')

# Name of the column whose text is embedded and compared.
# NOTE(review): this selects 'category'; the recorded run of this cell printed
# the same score for every document, which suggests this column holds a short
# label rather than the article body -- confirm the intended column.
text_column_name = 'category'

# Fail fast with a clear message if the chosen column is missing
if text_column_name not in dataset.columns:
    raise KeyError(f"Column '{text_column_name}' not found in the dataset.")

# Load the pretrained 'distilbert-base-uncased' tokenizer and encoder; both are
# read as module-level globals by get_distilbert_embeddings
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
distilbert_model = DistilBertModel.from_pretrained('distilbert-base-uncased')

# Function to tokenize and get DistilBERT embeddings for text
def get_distilbert_embeddings(text):
    """Return mean-pooled DistilBERT embeddings for ``text``.

    Args:
        text: A string, or a list of strings to embed as one padded batch.

    Returns:
        A ``torch.Tensor`` with one row per input text, obtained by averaging
        ``last_hidden_state`` over the token axis (dim=1).

    The average is weighted by the tokenizer's attention mask, so padding
    positions added by ``padding=True`` do not dilute the embedding of the
    shorter texts in a batch. For a single text the mask is all ones and the
    result is the plain mean.
    """
    tokens = tokenizer(text, return_tensors='pt', padding=True, truncation=True)
    with torch.no_grad():  # inference only, no autograd graph needed
        hidden = distilbert_model(**tokens).last_hidden_state
        mask = tokens['attention_mask'].unsqueeze(-1).to(hidden.dtype)
        summed = (hidden * mask).sum(dim=1)
        # clamp guards against division by zero for an all-masked row
        token_counts = mask.sum(dim=1).clamp(min=1)
        embeddings = summed / token_counts
    return embeddings

# Read the row position of the query document from the user
source_doc_index = int(input("Enter the index of the source document: "))

# Embed the query document once and keep a flat copy for the comparisons
source_text = dataset.iloc[source_doc_index][text_column_name]
source_embeddings = get_distilbert_embeddings(source_text).numpy()
query_vector = source_embeddings.flatten()

# Score every row of the dataset (the source row included) against the query
similarities = []
for doc_text in dataset[text_column_name]:
    doc_vector = get_distilbert_embeddings(doc_text).numpy().flatten()
    score_matrix = cosine_similarity([query_vector], [doc_vector])
    similarities.append(score_matrix[0][0])

# Row positions ordered from highest to lowest score; the sort is stable, so
# equal scores keep their dataset order
ranked_indices = sorted(
    range(len(similarities)),
    key=similarities.__getitem__,
    reverse=True,
)

# Report each position with its rank and score
print("Similarity Ranking (Descending Order):")
for rank, idx in enumerate(ranked_indices):
    print(f"Rank {rank+1}: Document Index {idx}, Similarity Score {similarities[idx]}")


Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_transform.weight', 'vocab_transform.bias', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Enter the index of the source document: 2
Similarity Ranking (Descending Order):
Rank 1: Document Index 0, Similarity Score 1.000000238418579
Rank 2: Document Index 1, Similarity Score 1.000000238418579
Rank 3: Document Index 2, Similarity Score 1.000000238418579
Rank 4: Document Index 3, Similarity Score 1.000000238418579
Rank 5: Document Index 4, Similarity Score 1.000000238418579
Rank 6: Document Index 5, Similarity Score 1.000000238418579
Rank 7: Document Index 6, Similarity Score 1.000000238418579
Rank 8: Document Index 7, Similarity Score 1.000000238418579
Rank 9: Document Index 8, Similarity Score 1.000000238418579
Rank 10: Document Index 9, Similarity Score 1.000000238418579
Rank 11: Document Index 10, Similarity Score 1.000000238418579
Rank 12: Document Index 11, Similarity Score 1.000000238418579
Rank 13: Document Index 12, Similarity Score 1.000000238418579
Rank 14: Document Index 13, Similarity Score 1.000000238418579
Rank 15: Document Index 14, Similarity Score 1.000000238

RoBERTa (model_2)

In [None]:
import pandas as pd
from transformers import RobertaTokenizer, RobertaModel
from sklearn.metrics.pairwise import cosine_similarity
import torch

# Load the dataset containing the text documents from 'news_articles.csv'
dataset = pd.read_csv('news_articles.csv')

# Name of the column whose text is embedded and compared.
# NOTE(review): this selects 'category'; the recorded run of this cell printed
# the same score for every document, which suggests this column holds a short
# label rather than the article body -- confirm the intended column.
text_column_name = 'category'

# Fail fast with a clear message if the chosen column is missing
if text_column_name not in dataset.columns:
    raise KeyError(f"Column '{text_column_name}' not found in the dataset.")

# Load the pretrained 'roberta-base' tokenizer and encoder; both are read as
# module-level globals by get_roberta_embeddings
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
roberta_model = RobertaModel.from_pretrained('roberta-base')

# Function to tokenize and get RoBERTa embeddings for text
def get_roberta_embeddings(text):
    """Return mean-pooled RoBERTa embeddings for ``text``.

    Args:
        text: A string, or a list of strings to embed as one padded batch.

    Returns:
        A ``torch.Tensor`` with one row per input text, obtained by averaging
        ``last_hidden_state`` over the token axis (dim=1).

    The average is weighted by the tokenizer's attention mask, so padding
    positions added by ``padding=True`` are excluded. For a single text the
    mask is all ones and the result is the plain mean.
    """
    tokens = tokenizer(text, return_tensors='pt', padding=True, truncation=True)
    with torch.no_grad():  # inference only, no autograd graph needed
        hidden = roberta_model(**tokens).last_hidden_state
        mask = tokens['attention_mask'].unsqueeze(-1).to(hidden.dtype)
        summed = (hidden * mask).sum(dim=1)
        # clamp guards against division by zero for an all-masked row
        token_counts = mask.sum(dim=1).clamp(min=1)
        embeddings = summed / token_counts
    return embeddings

# Ask the user which row acts as the query document
source_doc_index = int(input("Enter the index of the source document: "))

# Embed the query document once
source_text = dataset.iloc[source_doc_index][text_column_name]
source_embeddings = get_roberta_embeddings(source_text).numpy()
query_vector = source_embeddings.flatten()


def _score_against_query(doc_text):
    """Cosine similarity between the query vector and one document's embedding."""
    doc_vector = get_roberta_embeddings(doc_text).numpy().flatten()
    return cosine_similarity([query_vector], [doc_vector])[0][0]


# One score per dataset row, in row order (the source row is included)
similarities = [_score_against_query(doc_text) for doc_text in dataset[text_column_name]]

# Sort (position, score) pairs by score; stable, so ties keep row order
scored_positions = sorted(enumerate(similarities), key=lambda pair: pair[1], reverse=True)
ranked_indices = [position for position, _ in scored_positions]

# Report each position with its rank and score
print("Similarity Ranking (Descending Order):")
for rank, idx in enumerate(ranked_indices):
    print(f"Rank {rank+1}: Document Index {idx}, Similarity Score {similarities[idx]}")


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Enter the index of the source document: 2
Similarity Ranking (Descending Order):
Rank 1: Document Index 0, Similarity Score 1.0000001192092896
Rank 2: Document Index 1, Similarity Score 1.0000001192092896
Rank 3: Document Index 2, Similarity Score 1.0000001192092896
Rank 4: Document Index 3, Similarity Score 1.0000001192092896
Rank 5: Document Index 4, Similarity Score 1.0000001192092896
Rank 6: Document Index 5, Similarity Score 1.0000001192092896
Rank 7: Document Index 6, Similarity Score 1.0000001192092896
Rank 8: Document Index 7, Similarity Score 1.0000001192092896
Rank 9: Document Index 8, Similarity Score 1.0000001192092896
Rank 10: Document Index 9, Similarity Score 1.0000001192092896
Rank 11: Document Index 10, Similarity Score 1.0000001192092896
Rank 12: Document Index 11, Similarity Score 1.0000001192092896
Rank 13: Document Index 12, Similarity Score 1.0000001192092896
Rank 14: Document Index 13, Similarity Score 1.0000001192092896
Rank 15: Document Index 14, Similarity Sco

FastText (model_3)

In [None]:
# FastText
import pandas as pd
from gensim.models import FastText
import numpy as np  # Import numpy for numerical operations

# Load the dataset containing the text documents from 'news_articles.csv'
dataset = pd.read_csv('news_articles.csv')

# Name of the column whose text is embedded and compared.
# NOTE(review): this selects 'category'; the recorded run of this cell printed
# the same score for every document, which suggests this column holds a short
# label rather than the article body -- confirm the intended column.
text_column_name = 'category'

# Fail fast with a clear message if the chosen column is missing
if text_column_name not in dataset.columns:
    raise KeyError(f"Column '{text_column_name}' not found in the dataset.")

# Tokenise each document on whitespace. A plain list is used so the corpus can
# be iterated repeatedly (vocabulary scan, then once per training epoch).
sentences = dataset[text_column_name].apply(lambda x: x.split()).tolist()
model = FastText(sg=1,  # Use skip-gram (sg=1) or CBOW (sg=0)
                  min_count=5,  # Minimum word count
                  window=5,  # Context window size
                  vector_size=100)  # Embedding dimension
model.build_vocab(sentences)  # Build vocabulary before training

# Train the fastText model on the dataset
try:
  # train() requires total_examples (or total_words); build_vocab() cached the
  # corpus size in model.corpus_count. The corpus is passed positionally
  # because the keyword name differs between gensim releases
  # (`sentences` vs `corpus_iterable`).
  model.train(sentences, total_examples=model.corpus_count, epochs=10)
except Exception as e:
  print("An error occurred during training:", e)
  # Re-raise so the rest of the cell cannot run on an untrained model; the
  # recorded output shows exit() let execution continue past this point.
  raise

# Normalise raw text before it is split into fastText tokens
def preprocess_text(text):
  """Lowercase ``text`` and drop every character that is neither alphanumeric nor whitespace."""
  lowered = text.lower()
  kept_chars = [ch for ch in lowered if ch.isalnum() or ch.isspace()]
  return ''.join(kept_chars)

# Function to get fastText embeddings for text
def get_fasttext_embeddings(text):
  """Return the mean fastText vector of the tokens in ``text``.

  The text is normalised with ``preprocess_text``, split on whitespace, and
  the vectors of all tokens found in ``model.wv`` are averaged.

  Args:
    text: Raw document text.

  Returns:
    A 1-D numpy array of length ``model.wv.vector_size``. When no token
    yields a vector (for example empty or punctuation-only text) a zero
    vector is returned, so callers always receive the same shape.
  """
  tokens = preprocess_text(text).split()
  vectors = [model.wv[token] for token in tokens if token in model.wv]
  if not vectors:
    # np.mean([]) would emit a RuntimeWarning and return a NaN scalar, which
    # breaks the dot product and the sort in the ranking code.
    return np.zeros(model.wv.vector_size, dtype=np.float32)
  return np.mean(vectors, axis=0)

# Get the source document index from the user
source_doc_index = int(input("Enter the index of the source document: "))

# Get the source document text
source_text = dataset.iloc[source_doc_index][text_column_name]

# Calculate fastText embeddings for the source document (norm computed once)
source_embeddings = get_fasttext_embeddings(source_text)
source_norm = np.linalg.norm(source_embeddings)

# Cosine similarity of the source document with every row (source included)
similarities = []
for _, row in dataset.iterrows():
  text = row[text_column_name]
  embeddings = get_fasttext_embeddings(text)
  denominator = source_norm * np.linalg.norm(embeddings)
  if np.isfinite(denominator) and denominator > 0:
    similarity = np.dot(source_embeddings, embeddings) / denominator
  else:
    # A zero or NaN norm means one side has no usable vector; score it 0.0
    # instead of letting NaN reach the sort, where ordering is undefined.
    similarity = 0.0
  similarities.append(similarity)

# Rank the documents based on similarity (descending order)
ranked_indices = sorted(range(len(similarities)), key=lambda i: similarities[i], reverse=True)

# Print the ranked similarity scores and corresponding document indices
print("Similarity Ranking (Descending Order):")
for rank, idx in enumerate(ranked_indices):
  print(f"Rank {rank+1}: Document Index {idx}, Similarity Score {similarities[idx]}")


An error occurred during training: You must specify either total_examples or total_words, for proper learning-rate and progress calculations. If you've just built the vocabulary using the same corpus, using the count cached in the model is sufficient: total_examples=model.corpus_count.
Enter the index of the source document: 2
Similarity Ranking (Descending Order):
Rank 1: Document Index 0, Similarity Score 1.0000001192092896
Rank 2: Document Index 1, Similarity Score 1.0000001192092896
Rank 3: Document Index 2, Similarity Score 1.0000001192092896
Rank 4: Document Index 3, Similarity Score 1.0000001192092896
Rank 5: Document Index 4, Similarity Score 1.0000001192092896
Rank 6: Document Index 5, Similarity Score 1.0000001192092896
Rank 7: Document Index 6, Similarity Score 1.0000001192092896
Rank 8: Document Index 7, Similarity Score 1.0000001192092896
Rank 9: Document Index 8, Similarity Score 1.0000001192092896
Rank 10: Document Index 9, Similarity Score 1.0000001192092896
Rank 11: Do

XLNet(model_4)

In [None]:
import pandas as pd
from transformers import XLNetTokenizer, XLNetModel
import torch
import numpy as np

# Read the CSV file, reporting the failure before propagating it
try:
    dataset = pd.read_csv('news_articles.csv')
except Exception as e:
    print("An error occurred while reading the CSV file:", e)
    # Re-raise rather than calling exit(): every later statement needs
    # `dataset`, and exit() is not a reliable way to stop a notebook cell.
    raise

# Name of the column whose text is embedded and compared.
# NOTE(review): this selects 'category'; the recorded run of this cell printed
# the same score for every document -- confirm the intended column.
text_column_name = 'category'

# Fail fast with a clear message if the chosen column is missing (the same
# check the other model cells perform)
if text_column_name not in dataset.columns:
    raise KeyError(f"Column '{text_column_name}' not found in the dataset.")

# Normalise raw text before it is handed to the XLNet tokenizer
def preprocess_text(text):
    """Return ``text`` lowercased, keeping only alphanumeric and whitespace characters."""
    def _is_kept(ch):
        return ch.isalnum() or ch.isspace()

    return ''.join(filter(_is_kept, text.lower()))

# Load the pretrained 'xlnet-base-cased' tokenizer and encoder; both are read
# as module-level globals by get_xlnet_embeddings.
# NOTE(review): `model` is also the name the FastText cell binds its model to,
# so in a shared notebook session this assignment replaces that binding.
tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')
model = XLNetModel.from_pretrained('xlnet-base-cased')

# Function to get XLNet embeddings for text
def get_xlnet_embeddings(text):
    """Return the mean-pooled XLNet embedding of ``text`` as a numpy array.

    The text is normalised with ``preprocess_text``, encoded with special
    tokens added, and passed through ``model`` as a batch of one.

    Args:
        text: Raw document text.

    Returns:
        A numpy array holding ``last_hidden_state`` averaged over dim=1, with
        the leading batch axis of size 1 retained.
    """
    preprocessed_text = preprocess_text(text)
    token_ids = tokenizer.encode(preprocessed_text, add_special_tokens=True)
    input_ids = torch.tensor(token_ids).unsqueeze(0)  # add the batch axis
    # Inference only: skip autograd bookkeeping so no graph is retained for
    # each of the many documents embedded by the ranking loop.
    with torch.no_grad():
        outputs = model(input_ids)
        embeddings = torch.mean(outputs.last_hidden_state, dim=1)
    return embeddings.numpy()

# Ask the user which row acts as the query document
source_doc_index = int(input("Enter the index of the source document: "))

# Look up and embed the query document
source_text = dataset.iloc[source_doc_index][text_column_name]
source_embeddings = get_xlnet_embeddings(source_text)

# Cosine similarity of the query against every row (the source row included)
similarities = []
for doc_text in dataset[text_column_name]:
    doc_embeddings = get_xlnet_embeddings(doc_text)
    dot_product = np.dot(source_embeddings.flatten(), doc_embeddings.flatten())
    norm_product = np.linalg.norm(source_embeddings) * np.linalg.norm(doc_embeddings)
    similarities.append(dot_product / norm_product)

# Row positions ordered from highest to lowest score (stable for ties)
ranked_indices = sorted(
    range(len(similarities)),
    key=similarities.__getitem__,
    reverse=True,
)

# Report each position with its rank and score
print("Similarity Ranking (Descending Order):")
for rank, idx in enumerate(ranked_indices):
    print(f"Rank {rank+1}: Document Index {idx}, Similarity Score {similarities[idx]}")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


spiece.model:   0%|          | 0.00/798k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.38M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/760 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/467M [00:00<?, ?B/s]

Enter the index of the source document: 2
Similarity Ranking (Descending Order):
Rank 1: Document Index 0, Similarity Score 0.9999998807907104
Rank 2: Document Index 1, Similarity Score 0.9999998807907104
Rank 3: Document Index 2, Similarity Score 0.9999998807907104
Rank 4: Document Index 3, Similarity Score 0.9999998807907104
Rank 5: Document Index 4, Similarity Score 0.9999998807907104
Rank 6: Document Index 5, Similarity Score 0.9999998807907104
Rank 7: Document Index 6, Similarity Score 0.9999998807907104
Rank 8: Document Index 7, Similarity Score 0.9999998807907104
Rank 9: Document Index 8, Similarity Score 0.9999998807907104
Rank 10: Document Index 9, Similarity Score 0.9999998807907104
Rank 11: Document Index 10, Similarity Score 0.9999998807907104
Rank 12: Document Index 11, Similarity Score 0.9999998807907104
Rank 13: Document Index 12, Similarity Score 0.9999998807907104
Rank 14: Document Index 13, Similarity Score 0.9999998807907104
Rank 15: Document Index 14, Similarity Sco

In [None]:
pip install gensim




In [None]:
pip install gensim nltk




Word2Vec

In [None]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
import nltk
import csv
import logging

# Set up logging
logging.basicConfig(level=logging.INFO)

# Load nltk data
nltk.download('punkt')

# Parse the CSV by hand so rows with an unexpected column count are logged and
# skipped instead of aborting the load. newline='' leaves line-ending handling
# to the csv module, which it needs to read quoted fields containing newlines.
rows = []
with open('news_articles.csv', 'r', encoding='utf-8', newline='') as file:
    for row in csv.reader(file):
        if len(row) == 3:  # Keep only rows with the expected number of columns
            rows.append(row)
        else:
            logging.warning(f"Skipped row: {row} - Number of columns: {len(row)}")

# Without at least a header row there is nothing to build a DataFrame from
if not rows:
    raise ValueError("No rows with the expected 3 columns were found in 'news_articles.csv'.")

# The first kept row supplies the column names; the rest are the records
header, *records = rows
dataset = pd.DataFrame(records, columns=header)

text_column_name = 'category'

if text_column_name not in dataset.columns:
    raise KeyError(f"Column '{text_column_name}' not found in the dataset.")

# Tokenise text for Word2Vec
def preprocess_text(text):
    """Lowercase ``text``, tokenise it with ``word_tokenize`` and keep only alphanumeric tokens."""
    lowered = text.lower()
    return list(filter(str.isalnum, word_tokenize(lowered)))

# Store the token list of every document in a new 'clean_text' column
dataset['clean_text'] = dataset[text_column_name].apply(preprocess_text)

# Train a Word2Vec model on those token lists
word2vec_model = Word2Vec(
    dataset['clean_text'],
    vector_size=300,
    window=5,
    min_count=1,
    workers=4,
)

# Function to get Word2Vec embeddings for text
def get_word2vec_embeddings(text):
    """Return the average Word2Vec vector of the tokens in ``text``.

    Tokens come from ``preprocess_text``; tokens missing from
    ``word2vec_model.wv`` are ignored. Returns ``None`` when no token has a
    vector.
    """
    vectors = [
        word2vec_model.wv[token]
        for token in preprocess_text(text)
        if token in word2vec_model.wv
    ]
    if not vectors:
        return None
    return sum(vectors) / len(vectors)

# Ask the user which row acts as the query document
source_doc_index = int(input("Enter the index of the source document: "))

# Embed the query document; it must contain at least one known token
source_text = dataset.iloc[source_doc_index][text_column_name]
source_embeddings = get_word2vec_embeddings(source_text)
if source_embeddings is None:
    raise ValueError("No valid tokens found in the source document.")

# Compare the query against the first `num_instances` rows
similarities = []
num_instances = 1676  # Number of leading rows to consider
processed_instances = 0  # Rows actually visited by the loop

first_rows = dataset.head(num_instances)
for idx, text in first_rows[text_column_name].items():
    processed_instances += 1
    embeddings = get_word2vec_embeddings(text)
    if embeddings is None:
        continue  # no known tokens, so the row gets no score
    similarity = cosine_similarity([source_embeddings], [embeddings])[0][0]
    similarities.append((idx, similarity))

# (index, score) pairs ordered from highest to lowest score
ranked_indices = sorted(similarities, key=lambda pair: pair[1], reverse=True)

# Report each document with its rank and score
print("Similarity Ranking (Descending Order):")
for rank, (idx, similarity) in enumerate(ranked_indices):
    print(f"Rank {rank+1}: Document Index {idx}, Similarity Score {similarity}")

# Fail if the dataset supplied fewer rows than requested
if processed_instances != num_instances:
    raise ValueError(f"Expected to process {num_instances} instances, but processed {processed_instances} instances.")


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Enter the index of the source document: 2
Similarity Ranking (Descending Order):
Rank 1: Document Index 0, Similarity Score 0.9999999403953552
Rank 2: Document Index 1, Similarity Score 0.9999999403953552
Rank 3: Document Index 2, Similarity Score 0.9999999403953552
Rank 4: Document Index 3, Similarity Score 0.9999999403953552
Rank 5: Document Index 4, Similarity Score 0.9999999403953552
Rank 6: Document Index 5, Similarity Score 0.9999999403953552
Rank 7: Document Index 6, Similarity Score 0.9999999403953552
Rank 8: Document Index 7, Similarity Score 0.9999999403953552
Rank 9: Document Index 8, Similarity Score 0.9999999403953552
Rank 10: Document Index 9, Similarity Score 0.9999999403953552
Rank 11: Document Index 10, Similarity Score 0.9999999403953552
Rank 12: Document Index 11, Similarity Score 0.9999999403953552
Rank 13: Document Index 12, Similarity Score 0.9999999403953552
Rank 14: Document Index 13, Similarity Score 0.9999999403953552
Rank 15: Document Index 14, Similarity Sco

In [None]:


glove_file = 'glove.6B.300d.txt'


In [None]:
glove_file = '/full/path/to/glove.6B.300d.txt'


GloVe

In [None]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
import gensim.downloader as api
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt')
import csv

# Parse the CSV by hand so rows with an unexpected column count are reported
# and skipped instead of aborting the load. newline='' leaves line-ending
# handling to the csv module, which it needs to read quoted fields containing
# newlines.
rows = []
with open('news_articles.csv', 'r', encoding='utf-8', newline='') as file:
    for row in csv.reader(file):
        if len(row) == 3:  # Keep only rows with the expected number of columns
            rows.append(row)
        else:
            print(f"Skipped row: {row} - Number of columns: {len(row)}")

# Without at least a header row there is nothing to build a DataFrame from
if not rows:
    raise ValueError("No rows with the expected 3 columns were found in 'news_articles.csv'.")

# The first kept row supplies the column names; the rest are the records
header, *records = rows
dataset = pd.DataFrame(records, columns=header)

text_column_name = 'category'

if text_column_name not in dataset.columns:
    raise KeyError(f"Column '{text_column_name}' not found in the dataset.")

# Tokenise text for the GloVe lookup
def preprocess_text(text):
    """Lowercase ``text``, tokenise it with ``word_tokenize`` and return the alphanumeric tokens."""
    kept = []
    for candidate in word_tokenize(text.lower()):
        if candidate.isalnum():
            kept.append(candidate)
    return kept

# Store the token list of every document in a new 'clean_text' column.
# NOTE(review): nothing later in this cell reads 'clean_text';
# get_glove_embeddings re-tokenises the raw text itself.
dataset['clean_text'] = dataset[text_column_name].apply(preprocess_text)

# Load the pre-trained "glove-wiki-gigaword-300" vectors through gensim's
# downloader API; read as a module-level global by get_glove_embeddings
glove_model = api.load("glove-wiki-gigaword-300")

# Function to get GloVe embeddings for text
def get_glove_embeddings(text):
    """Return the average GloVe vector of the tokens in ``text``.

    Tokens come from ``preprocess_text``; tokens absent from ``glove_model``
    are skipped. Returns ``None`` when no token has a vector.
    """
    known_tokens = [token for token in preprocess_text(text) if token in glove_model]
    if not known_tokens:
        return None
    vectors = [glove_model[token] for token in known_tokens]
    return sum(vectors) / len(vectors)

# Ask the user which row acts as the query document
source_doc_index = int(input("Enter the index of the source document: "))

# Embed the query document; it must contain at least one known token
source_text = dataset.iloc[source_doc_index][text_column_name]
source_embeddings = get_glove_embeddings(source_text)
if source_embeddings is None:
    raise ValueError("No valid tokens found in the source document.")

# Compare the query against the first 1676 rows, skipping rows with no vector
similarities = []
for idx, text in dataset.head(1676)[text_column_name].items():
    embeddings = get_glove_embeddings(text)
    if embeddings is None:
        continue
    score_matrix = cosine_similarity([source_embeddings], [embeddings])
    similarities.append((idx, score_matrix[0][0]))

# (index, score) pairs ordered from highest to lowest score
ranked_indices = sorted(similarities, key=lambda pair: pair[1], reverse=True)

# Report each document with its rank and score
print("Similarity Ranking (Descending Order):")
for rank, (idx, similarity) in enumerate(ranked_indices):
    print(f"Rank {rank+1}: Document Index {idx}, Similarity Score {similarity}")


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Skipped row: ['-known version of the sistine chapel: a war memorial building designed to house sir stanley spencer’s magnificent series of 19 vast murals inspired by his first world war experiences as a medical orderly.  spencer devoted six years adorning the sandham memorial chapel', ' in burghclere', ' with his quirky', ' detailed scenes of everyday life to honour the forgotten dead. now owned by the national trust', ' the working chapel sits in gardens with views of the hampshire downs (nationaltrust.org.uk).  the value of spencer’s work has soared since his death in 1959 – in 2011', ' a painting sold for £5.4 million', ' making the collection one of britain’s most valuable art treasures.  check in: the pheasant', ' hungerford', ' berkshire. feast on venison and mash at this inn', ' then head upstairs to chic country bedrooms with balconies overlooking fields. b&b rooms cost from £115 a night (thepheasant-inn.co.uk).  a father’s life in his young son’s hands  liverpool’s grand walke

SBERT

In [None]:
pip install sentence-transformers


Collecting sentence-transformers
  Downloading sentence_transformers-2.7.0-py3-none-any.whl (171 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m171.5/171.5 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch>=1.11.0->sentence-transform

In [None]:
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Load the dataset containing the text documents from 'news_articles.csv'
dataset = pd.read_csv('news_articles.csv')

# Name of the column whose text is embedded and compared.
# NOTE(review): this selects 'category'; the recorded run of this cell printed
# a score of 1.0 for every document, which suggests this column holds a short
# label rather than the article body -- confirm the intended column.
text_column_name = 'category'

# Fail fast with a clear message if the chosen column is missing
if text_column_name not in dataset.columns:
    raise KeyError(f"Column '{text_column_name}' not found in the dataset.")

# Load the pretrained 'paraphrase-MiniLM-L6-v2' sentence-transformers model;
# read as a module-level global by get_sbert_embeddings
sbert_model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

# Function to get SBERT embeddings for text
def get_sbert_embeddings(text):
    """Encode ``text`` with ``sbert_model`` and return the result as a tensor."""
    return sbert_model.encode(text, convert_to_tensor=True)

# Ask the user which row acts as the query document
source_doc_index = int(input("Enter the index of the source document: "))

# Embed the query document and move the result to a numpy array on the CPU
source_text = dataset.iloc[source_doc_index][text_column_name]
source_embeddings = get_sbert_embeddings(source_text).cpu().numpy()

# Score every row of the dataset (the source row included) against the query
similarities = []
for doc_text in dataset[text_column_name]:
    doc_vector = get_sbert_embeddings(doc_text).cpu().numpy()
    score_matrix = cosine_similarity([source_embeddings], [doc_vector])
    similarities.append(score_matrix[0][0])

# Row positions ordered from highest to lowest score (stable for ties)
ranked_indices = sorted(
    range(len(similarities)),
    key=similarities.__getitem__,
    reverse=True,
)

# Report each position with its rank and score
print("Similarity Ranking (Descending Order):")
for rank, idx in enumerate(ranked_indices):
    print(f"Rank {rank+1}: Document Index {idx}, Similarity Score {similarities[idx]}")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.73k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/314 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Enter the index of the source document: 2
Similarity Ranking (Descending Order):
Rank 1: Document Index 0, Similarity Score 1.0
Rank 2: Document Index 1, Similarity Score 1.0
Rank 3: Document Index 2, Similarity Score 1.0
Rank 4: Document Index 3, Similarity Score 1.0
Rank 5: Document Index 4, Similarity Score 1.0
Rank 6: Document Index 5, Similarity Score 1.0
Rank 7: Document Index 6, Similarity Score 1.0
Rank 8: Document Index 7, Similarity Score 1.0
Rank 9: Document Index 8, Similarity Score 1.0
Rank 10: Document Index 9, Similarity Score 1.0
Rank 11: Document Index 10, Similarity Score 1.0
Rank 12: Document Index 11, Similarity Score 1.0
Rank 13: Document Index 12, Similarity Score 1.0
Rank 14: Document Index 13, Similarity Score 1.0
Rank 15: Document Index 14, Similarity Score 1.0
Rank 16: Document Index 15, Similarity Score 1.0
Rank 17: Document Index 16, Similarity Score 1.0
Rank 18: Document Index 17, Similarity Score 1.0
Rank 19: Document Index 18, Similarity Score 1.0
Rank 20

ELMo

In [None]:
pip install allennlp allennlp-models


Collecting allennlp
  Downloading allennlp-2.10.1-py3-none-any.whl (730 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m730.2/730.2 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting allennlp-models
  Downloading allennlp_models-2.10.1-py3-none-any.whl (464 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m464.5/464.5 kB[0m [31m27.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting torch<1.13.0,>=1.10.0 (from allennlp)
  Downloading torch-1.12.1-cp310-cp310-manylinux1_x86_64.whl (776.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m776.3/776.3 MB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting torchvision<0.14.0,>=0.8.1 (from allennlp)
  Downloading torchvision-0.13.1-cp310-cp310-manylinux1_x86_64.whl (19.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.1/19.1 MB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting cached-path<1.2.0,>=1.1.3 (from allennlp)
  Downloading cach

In [None]:
import pandas as pd
from allennlp.modules.elmo import Elmo, batch_to_ids
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Load the dataset containing the text documents from 'news_articles.csv'
dataset = pd.read_csv('news_articles.csv')

# Name of the column whose text is embedded and compared.
# NOTE(review): this selects 'category', while the comments in this notebook
# refer to a 'content' column -- confirm the intended column.
text_column_name = 'category'

# Fail fast with a clear message if the chosen column is missing
if text_column_name not in dataset.columns:
    raise KeyError(f"Column '{text_column_name}' not found in the dataset.")

# Build the ELMo module from the hosted options and weights files, requesting
# one output representation and disabling dropout; read as a module-level
# global by get_elmo_embeddings
options_file = "https://allennlp.s3.amazonaws.com/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_options.json"
weight_file = "https://allennlp.s3.amazonaws.com/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_weights.hdf5"
elmo = Elmo(options_file, weight_file, 1, dropout=0)

# Function to get ELMo embeddings for text
def get_elmo_embeddings(text):
    """Return the mean-pooled ELMo vector for ``text`` as a NumPy array.

    The text is split on whitespace, converted to character ids and passed
    through the module-level ``elmo`` model. The first ELMo representation is
    averaged over dim 1, squeezed, detached and converted to NumPy.
    """
    words = text.split()
    char_ids = batch_to_ids([words])
    elmo_output = elmo(char_ids)
    token_vectors = elmo_output['elmo_representations'][0]
    pooled = token_vectors.mean(dim=1)
    return pooled.squeeze().detach().numpy()

# Get the source document index from the user
source_doc_index = int(input("Enter the index of the source document: "))

# Validate the position before using it with iloc: a negative value would
# otherwise silently count from the end of the dataset.
if source_doc_index < 0 or source_doc_index >= len(dataset):
    raise IndexError(f"Invalid index. Please provide a valid index between 0 and {len(dataset) - 1}.")

# Calculate ELMo embeddings for the source document
source_text = dataset.iloc[source_doc_index][text_column_name]
source_embeddings = get_elmo_embeddings(source_text)

# Calculate cosine similarity of the source document with every document in
# the dataset, in dataset order. The source document itself is included.
# NOTE(review): the recorded output of this cell shows the source document
# scoring below 1.0 against itself, so embeddings appear to differ between
# calls -- presumably ELMo carries internal state across calls; confirm
# before relying on these scores.
similarities = [
    cosine_similarity([source_embeddings], [get_elmo_embeddings(text)])[0][0]
    for text in dataset[text_column_name]
]

# Rank the document positions by similarity (descending order)
ranked_indices = sorted(range(len(similarities)), key=similarities.__getitem__, reverse=True)

# Print the ranked similarity scores and corresponding document indices
print("Similarity Ranking (Descending Order):")
for rank, idx in enumerate(ranked_indices, start=1):
    print(f"Rank {rank}: Document Index {idx}, Similarity Score {similarities[idx]}")


Output()

Output()

Enter the index of the source document: 2
Similarity Ranking (Descending Order):
Rank 1: Document Index 0, Similarity Score 0.9811131358146667
Rank 2: Document Index 1, Similarity Score 0.9794154167175293
Rank 3: Document Index 2, Similarity Score 0.9788295030593872
Rank 4: Document Index 3, Similarity Score 0.9785739183425903
Rank 5: Document Index 4, Similarity Score 0.9784493446350098
Rank 6: Document Index 5, Similarity Score 0.9784117341041565
Rank 7: Document Index 6, Similarity Score 0.9783874154090881
Rank 8: Document Index 7, Similarity Score 0.978371798992157
Rank 9: Document Index 8, Similarity Score 0.9783621430397034
Rank 10: Document Index 9, Similarity Score 0.9783560633659363
Rank 11: Document Index 10, Similarity Score 0.9783520698547363
Rank 12: Document Index 11, Similarity Score 0.9783496260643005
Rank 13: Document Index 12, Similarity Score 0.9783481359481812
Rank 14: Document Index 13, Similarity Score 0.9783471822738647
Rank 15: Document Index 14, Similarity Scor

In [None]:
!pip install sentence-transformers


Collecting sentence-transformers
  Downloading sentence_transformers-2.7.0-py3-none-any.whl (171 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/171.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━[0m [32m112.6/171.5 kB[0m [31m3.1 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m171.5/171.5 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (

In [None]:
!pip install allennlp


Collecting allennlp
  Downloading allennlp-2.10.1-py3-none-any.whl (730 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m730.2/730.2 kB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting torch<1.13.0,>=1.10.0 (from allennlp)
  Downloading torch-1.12.1-cp310-cp310-manylinux1_x86_64.whl (776.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m776.3/776.3 MB[0m [31m780.3 kB/s[0m eta [36m0:00:00[0m
[?25hCollecting torchvision<0.14.0,>=0.8.1 (from allennlp)
  Downloading torchvision-0.13.1-cp310-cp310-manylinux1_x86_64.whl (19.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.1/19.1 MB[0m [31m58.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting cached-path<1.2.0,>=1.1.3 (from allennlp)
  Downloading cached_path-1.1.6-py3-none-any.whl (26 kB)
Collecting fairscale==0.4.6 (from allennlp)
  Downloading fairscale-0.4.6.tar.gz (248 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m248.2/248.2 kB[0m [31

In [None]:
!pip install --upgrade transformers


Collecting transformers
  Downloading transformers-4.40.2-py3-none-any.whl (9.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.0/9.0 MB[0m [31m17.2 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.19.3 (from transformers)
  Downloading huggingface_hub-0.23.0-py3-none-any.whl (401 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m401.2/401.2 kB[0m [31m32.1 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.20,>=0.19 (from transformers)
  Downloading tokenizers-0.19.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.6/3.6 MB[0m [31m40.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: huggingface-hub, tokenizers, transformers
  Attempting uninstall: huggingface-hub
    Found existing installation: huggingface-hub 0.10.1
    Uninstalling huggingface-hub-0.10.1:
      Successfully uninstalled huggingface-hub-0.10.1
  Attempting u

In [None]:
pip install sentence-transformers


Collecting sentence-transformers
  Downloading sentence_transformers-2.7.0-py3-none-any.whl (171 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m171.5/171.5 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch>=1.11.0->sentence-transform

In [None]:
pip install allennlp


Collecting allennlp
  Downloading allennlp-2.10.1-py3-none-any.whl (730 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m730.2/730.2 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting torch<1.13.0,>=1.10.0 (from allennlp)
  Downloading torch-1.12.1-cp310-cp310-manylinux1_x86_64.whl (776.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m776.3/776.3 MB[0m [31m954.2 kB/s[0m eta [36m0:00:00[0m
[?25hCollecting torchvision<0.14.0,>=0.8.1 (from allennlp)
  Downloading torchvision-0.13.1-cp310-cp310-manylinux1_x86_64.whl (19.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.1/19.1 MB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting cached-path<1.2.0,>=1.1.3 (from allennlp)
  Downloading cached_path-1.1.6-py3-none-any.whl (26 kB)
Collecting fairscale==0.4.6 (from allennlp)
  Downloading fairscale-0.4.6.tar.gz (248 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m248.2/248.2 kB[0m [31m

In [None]:
pip install --upgrade transformers


Collecting transformers
  Downloading transformers-4.40.2-py3-none-any.whl (9.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.0/9.0 MB[0m [31m18.4 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.19.3 (from transformers)
  Downloading huggingface_hub-0.23.0-py3-none-any.whl (401 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m401.2/401.2 kB[0m [31m27.2 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.20,>=0.19 (from transformers)
  Downloading tokenizers-0.19.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.6/3.6 MB[0m [31m35.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: huggingface-hub, tokenizers, transformers
  Attempting uninstall: huggingface-hub
    Found existing installation: huggingface-hub 0.10.1
    Uninstalling huggingface-hub-0.10.1:
      Successfully uninstalled huggingface-hub-0.10.1
  Attempting u

In [None]:
pip install --upgrade sentence-transformers




In [None]:
!pip install sentence-transformers


Collecting sentence-transformers
  Downloading sentence_transformers-2.7.0-py3-none-any.whl (171 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/171.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━[0m [32m163.8/171.5 kB[0m [31m5.0 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m171.5/171.5 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (

In [None]:
!pip install allennlp


Collecting allennlp
  Downloading allennlp-2.10.1-py3-none-any.whl (730 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m730.2/730.2 kB[0m [31m10.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting torch<1.13.0,>=1.10.0 (from allennlp)
  Downloading torch-1.12.1-cp310-cp310-manylinux1_x86_64.whl (776.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m776.3/776.3 MB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting torchvision<0.14.0,>=0.8.1 (from allennlp)
  Downloading torchvision-0.13.1-cp310-cp310-manylinux1_x86_64.whl (19.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.1/19.1 MB[0m [31m51.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting cached-path<1.2.0,>=1.1.3 (from allennlp)
  Downloading cached_path-1.1.6-py3-none-any.whl (26 kB)
Collecting fairscale==0.4.6 (from allennlp)
  Downloading fairscale-0.4.6.tar.gz (248 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m248.2/248.2 kB[0m [31m

In [None]:
!pip install --upgrade sentence-transformers


Collecting transformers<5.0.0,>=4.34.0 (from sentence-transformers)
  Downloading transformers-4.41.0-py3-none-any.whl (9.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.1/9.1 MB[0m [31m22.9 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub>=0.15.1 (from sentence-transformers)
  Downloading huggingface_hub-0.23.0-py3-none-any.whl (401 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m401.2/401.2 kB[0m [31m21.9 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.20,>=0.19 (from transformers<5.0.0,>=4.34.0->sentence-transformers)
  Downloading tokenizers-0.19.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.6/3.6 MB[0m [31m49.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: huggingface-hub, tokenizers, transformers
  Attempting uninstall: huggingface-hub
    Found existing installation: huggingface-hub 0.10.1
    Uninstalling huggin

In [None]:
!pip install sentence-transformers==2.1.0


Collecting sentence-transformers==2.1.0
  Downloading sentence-transformers-2.1.0.tar.gz (78 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.5/78.5 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: sentence-transformers
  Building wheel for sentence-transformers (setup.py) ... [?25l[?25hdone
  Created wheel for sentence-transformers: filename=sentence_transformers-2.1.0-py3-none-any.whl size=120986 sha256=d67a0234dc32f0f71c85d426dc965dbd9aa89091da61ea1f3e02df6b14833d67
  Stored in directory: /root/.cache/pip/wheels/7b/ed/fd/16b8222e673f5eaa48ed71a0a2a6b66767b5e31bea0e5d3895
Successfully built sentence-transformers
Installing collected packages: sentence-transformers
  Attempting uninstall: sentence-transformers
    Found existing installation: sentence-transformers 2.7.0
    Uninstalling sentence-transformers-2.7.0:
      Successfully uninstalled sentence-transformers

In [None]:
!pip install --upgrade sentence-transformers transformers


Collecting sentence-transformers
  Using cached sentence_transformers-2.7.0-py3-none-any.whl (171 kB)
Installing collected packages: sentence-transformers
  Attempting uninstall: sentence-transformers
    Found existing installation: sentence-transformers 2.1.0
    Uninstalling sentence-transformers-2.1.0:
      Successfully uninstalled sentence-transformers-2.1.0
Successfully installed sentence-transformers-2.7.0


In [None]:
!pip install sentence-transformers==2.0.0 transformers==4.12.0


Collecting sentence-transformers==2.0.0
  Downloading sentence-transformers-2.0.0.tar.gz (85 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/85.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━[0m [32m81.9/85.5 kB[0m [31m2.2 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.5/85.5 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting transformers==4.12.0
  Downloading transformers-4.12.0-py3-none-any.whl (3.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m30.8 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.11,>=0.10.1 (from transformers==4.12.0)
  Downloading tokenizers-0.10.3.tar.gz (212 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m212.7/212.7 kB[0m [31m18.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build d

Printing the similarity scores for each model given a source document Index

In [None]:
import pandas as pd
import torch
import numpy as np
from transformers import DistilBertTokenizer, DistilBertModel, RobertaTokenizer, RobertaModel, XLNetTokenizer, XLNetModel
from sklearn.metrics.pairwise import cosine_similarity
from gensim.models import FastText, Word2Vec
import gensim.downloader as api
from nltk.tokenize import word_tokenize
from sentence_transformers import SentenceTransformer
from allennlp.modules.elmo import Elmo, batch_to_ids
import nltk

# Download the nltk 'punkt' data (presumably required by word_tokenize,
# which get_glove_embeddings uses -- confirm against the nltk docs)
nltk.download('punkt')

# Load the dataset of text documents from 'news_articles.csv'
dataset = pd.read_csv('news_articles.csv')

# Column whose text is embedded and compared by every model below.
# Replace 'category' with the correct column name if it's different.
text_column_name = 'category'

# Fail early with a clear message if the chosen column is missing
if text_column_name not in dataset.columns:
    raise KeyError(f"Column '{text_column_name}' not found in the dataset.")

# Preprocess text data for tokenization and removing non-alphanumeric characters
def preprocess_text(text):
    """Lower-case ``text`` and keep only alphanumeric and whitespace characters."""
    lowered = text.lower()
    return ''.join(ch for ch in lowered if ch.isalnum() or ch.isspace())

# Initialize tokenizers and models

# DistilBERT tokenizer and encoder from the 'distilbert-base-uncased' checkpoint
distilbert_tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
distilbert_model = DistilBertModel.from_pretrained('distilbert-base-uncased')

# RoBERTa tokenizer and encoder from the 'roberta-base' checkpoint
roberta_tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
roberta_model = RobertaModel.from_pretrained('roberta-base')

# XLNet tokenizer and encoder from the 'xlnet-base-cased' checkpoint
xlnet_tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')
xlnet_model = XLNetModel.from_pretrained('xlnet-base-cased')

# Sentence-BERT model used by get_sbert_embeddings
sbert_model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

# Pre-trained GloVe vectors fetched through gensim's downloader API
glove_model = api.load("glove-wiki-gigaword-300")

# ELMo model built from the hosted options/weights files. The third
# positional argument (1) is presumably the number of output representations
# (only index 0 of 'elmo_representations' is read); dropout is set to 0.
elmo_options_file = "https://allennlp.s3.amazonaws.com/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_options.json"
elmo_weight_file = "https://allennlp.s3.amazonaws.com/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_weights.hdf5"
elmo_model = Elmo(elmo_options_file, elmo_weight_file, 1, dropout=0)

# Function to tokenize and get embeddings for DistilBERT
def get_distilbert_embeddings(text):
    """Return the mean-pooled DistilBERT embedding of ``text`` as a NumPy array.

    The text is tokenized with truncation enabled, encoded without gradient
    tracking, and the last hidden state is averaged over dim 1.
    """
    encoded = distilbert_tokenizer(text, return_tensors='pt', padding=True, truncation=True)
    with torch.no_grad():
        model_output = distilbert_model(**encoded)
    pooled = model_output.last_hidden_state.mean(dim=1)
    return pooled.numpy()

# Function to tokenize and get embeddings for RoBERTa
def get_roberta_embeddings(text):
    """Return the mean-pooled RoBERTa embedding of ``text`` as a NumPy array.

    Uses the module-level ``roberta_tokenizer`` / ``roberta_model`` and
    averages the last hidden state over dim 1.
    """
    model_inputs = roberta_tokenizer(text, return_tensors='pt', padding=True, truncation=True)
    with torch.no_grad():
        result = roberta_model(**model_inputs)
    hidden_states = result.last_hidden_state
    return hidden_states.mean(dim=1).numpy()

# Function to tokenize and get embeddings for XLNet
def get_xlnet_embeddings(text):
    """Return the mean-pooled XLNet embedding of ``text`` as a NumPy array.

    The text is encoded to token ids (with special tokens), wrapped in a
    leading batch dimension, and the model's last hidden state is averaged
    over dim 1.
    """
    token_ids = xlnet_tokenizer.encode(text, add_special_tokens=True)
    batch = torch.tensor(token_ids).unsqueeze(0)
    with torch.no_grad():
        result = xlnet_model(batch)
    pooled = result.last_hidden_state.mean(dim=1)
    return pooled.detach().numpy()

# Function to get embeddings for SBERT
def get_sbert_embeddings(text):
    """Encode ``text`` with the module-level SBERT model and return it as a NumPy array."""
    encoded = sbert_model.encode(text, convert_to_tensor=True)
    on_cpu = encoded.cpu()
    return on_cpu.numpy()

# Function to get embeddings for GloVe
def get_glove_embeddings(text):
    """Return the mean GloVe vector of the tokens of ``text`` found in ``glove_model``.

    The text is normalised with ``preprocess_text`` and tokenized with
    ``word_tokenize``. Returns ``None`` when no token is present in the model.
    """
    word_vectors = []
    for word in word_tokenize(preprocess_text(text)):
        if word in glove_model:
            word_vectors.append(glove_model[word])
    if not word_vectors:
        return None
    return np.mean(word_vectors, axis=0)

# Function to get ELMo embeddings for text
def get_elmo_embeddings(text):
    """Return the mean-pooled ELMo vector for ``text`` as a NumPy array.

    The text is split on whitespace, converted to character ids and passed
    through the module-level ``elmo_model``. The first ELMo representation is
    averaged over dim 1, squeezed, detached and converted to NumPy.
    """
    words = text.split()
    char_ids = batch_to_ids([words])
    representations = elmo_model(char_ids)['elmo_representations']
    pooled = representations[0].mean(dim=1)
    return pooled.squeeze().detach().numpy()

# Function to get embeddings for FastText
def get_fasttext_embeddings(text, model):
    """Return the mean vector of the tokens of ``text`` according to ``model.wv``.

    The text is normalised with ``preprocess_text`` and split on whitespace;
    only tokens reported as present in ``model.wv`` contribute. Returns
    ``None`` when no token qualifies.
    """
    found = []
    for token in preprocess_text(text).split():
        if token in model.wv:
            found.append(model.wv[token])
    return np.mean(found, axis=0) if found else None

# Function to get embeddings for Word2Vec
def get_word2vec_embeddings(text, model):
    """Return the mean Word2Vec vector of the in-vocabulary tokens of ``text``.

    Tokens come from ``preprocess_text(text).split()``; tokens missing from
    ``model.wv`` are ignored. Returns ``None`` if none of them is known.
    """
    words = preprocess_text(text).split()
    known_vectors = [model.wv[word] for word in words if word in model.wv]
    if not known_vectors:
        return None
    return np.mean(known_vectors, axis=0)

# Train FastText and Word2Vec models on the dataset.
# The training sentences are tokenized with preprocess_text(...).split(),
# exactly as get_fasttext_embeddings / get_word2vec_embeddings tokenize their
# input. Training on raw x.split() tokens would leave case and punctuation in
# the vocabulary, so the lower-cased, cleaned lookup tokens would miss it.
sentences = dataset[text_column_name].apply(lambda x: preprocess_text(x).split())
fasttext_model = FastText(sentences, vector_size=100, window=5, min_count=1, sg=1, epochs=10)
word2vec_model = Word2Vec(sentences, vector_size=300, window=5, min_count=1, workers=4)

# Ask the user which document (by position) acts as the source
source_doc_index = int(input("Enter the index of the source document: "))  # Change this to any valid index from your dataset

# Only positions inside the dataset are accepted
if not 0 <= source_doc_index < len(dataset):
    raise IndexError(f"Invalid index. Please provide a valid index between 0 and {len(dataset) - 1}.")

# Embed the source document once with every model, keyed by model name.
# The transformer/SBERT outputs are flattened; the GloVe, FastText and
# Word2Vec helpers may return None.
source_text = dataset.iloc[source_doc_index][text_column_name]
source_embeddings = {}
source_embeddings["DistilBERT"] = get_distilbert_embeddings(source_text).flatten()
source_embeddings["RoBERTa"] = get_roberta_embeddings(source_text).flatten()
source_embeddings["XLNet"] = get_xlnet_embeddings(source_text).flatten()
source_embeddings["SBERT"] = get_sbert_embeddings(source_text).flatten()
source_embeddings["GloVe"] = get_glove_embeddings(source_text)
source_embeddings["ELMo"] = get_elmo_embeddings(source_text)
source_embeddings["FastText"] = get_fasttext_embeddings(source_text, fasttext_model)
source_embeddings["Word2Vec"] = get_word2vec_embeddings(source_text, word2vec_model)

# Calculate cosine similarity of the source document with the first 10
# documents of the dataset, separately for each model.

# Maps each model name to the function that embeds one text with that model
# (same calls as used to build source_embeddings).
embedders = {
    "DistilBERT": lambda t: get_distilbert_embeddings(t).flatten(),
    "RoBERTa": lambda t: get_roberta_embeddings(t).flatten(),
    "XLNet": lambda t: get_xlnet_embeddings(t).flatten(),
    "SBERT": lambda t: get_sbert_embeddings(t).flatten(),
    "GloVe": get_glove_embeddings,
    "ELMo": get_elmo_embeddings,
    "FastText": lambda t: get_fasttext_embeddings(t, fasttext_model),
    "Word2Vec": lambda t: get_word2vec_embeddings(t, word2vec_model),
}

# One list of (document index label, similarity) pairs per model
similarities = {model: [] for model in source_embeddings}

for idx, row in dataset.head(10).iterrows():
    text = row[text_column_name]
    for model_name, source_emb in source_embeddings.items():
        # The GloVe/FastText/Word2Vec helpers return None when no token is
        # known; without a source vector there is nothing to compare, and
        # passing None to cosine_similarity would raise.
        if source_emb is None:
            continue

        target_emb = embedders[model_name](text)
        if target_emb is None:
            continue

        similarity = cosine_similarity([source_emb], [target_emb])[0][0]
        similarities[model_name].append((idx, similarity))

# Keep the per-model (index, similarity) lists in their computed order;
# no sorting is applied here.
ranked_indices = dict(similarities)

# Report every model's scores, one document per line
for model_name in ranked_indices:
    print(f"\nSimilarity Scores for {model_name}:")
    ranks = ranked_indices[model_name]
    for pair in ranks:
        idx, similarity = pair
        print(f"Document Index {idx}, Similarity Score {similarity}")


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Enter the index of the source document: 1000

Similarity Scores for DistilBERT:
Document Index 0, Similarity Score 0.9234899282455444
Document Index 1, Similarity Score 0.9234899282455444
Document Index 2, Similarity Score 0.9234899282455444
Document Index 3, Similarity Score 0.9234899282455444
Document Index 4, Similarity Score 0.9234899282455444
Document Index 5, Similarity Score 0.9234899282455444
Document Index 6, Similarity Score 0.9234899282455444
Document Index 7, Similarity Score 0.9234899282455444
Document Index 8, Similarity Score 0.9234899282455444
Document Index 9, Similarity Score 0.9234899282455444

Similarity Scores for RoBERTa:
Document Index 0, Similarity Score 0.9928135275840759
Document Index 1, Similarity Score 0.9928135275840759
Document Index 2, Similarity Score 0.9928135275840759
Document Index 3, Similarity Score 0.9928135275840759
Document Index 4, Similarity Score 0.9928135275840759
Document Index 5, Similarity Score 0.9928135275840759
Document Index 6, Simila

In [None]:
!pip install sentence-transformers allennlp


Collecting sentence-transformers
  Downloading sentence_transformers-2.7.0-py3-none-any.whl (171 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m171.5/171.5 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting allennlp
  Downloading allennlp-2.10.1-py3-none-any.whl (730 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m730.2/730.2 kB[0m [31m18.4 MB/s[0m eta [36m0:00:00[0m
Collecting torch>=1.11.0 (from sentence-transformers)
  Downloading torch-1.12.1-cp310-cp310-manylinux1_x86_64.whl (776.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m776.3/776.3 MB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting torchvision<0.14.0,>=0.8.1 (from allennlp)
  Downloading torchvision-0.13.1-cp310-cp310-manylinux1_x86_64.whl (19.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.1/19.1 MB[0m [31m22.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting cached-path<1.2.0,>=1.1.3 (from allennlp)
  Downl

In [None]:
!pip install pandas torch numpy transformers gensim nltk sentence-transformers allennlp allennlp-models scikit-learn


Collecting sentence-transformers
  Using cached sentence_transformers-2.7.0-py3-none-any.whl (171 kB)
Collecting allennlp
  Using cached allennlp-2.10.1-py3-none-any.whl (730 kB)
Collecting allennlp-models
  Using cached allennlp_models-2.10.1-py3-none-any.whl (464 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl (410.6 MB)
Collecting nvidia-

In [None]:
!pip install sentence-transformers


Collecting sentence-transformers
  Using cached sentence_transformers-2.7.0-py3-none-any.whl (171 kB)
Collecting transformers<5.0.0,>=4.34.0 (from sentence-transformers)
  Downloading transformers-4.41.0-py3-none-any.whl (9.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.1/9.1 MB[0m [31m42.4 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub>=0.15.1 (from sentence-transformers)
  Downloading huggingface_hub-0.23.0-py3-none-any.whl (401 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m401.2/401.2 kB[0m [31m38.1 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.20,>=0.19 (from transformers<5.0.0,>=4.34.0->sentence-transformers)
  Downloading tokenizers-0.19.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.6/3.6 MB[0m [31m55.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: huggingface-hub, tokenizers, transformers, sentence-transfo

In [None]:
!pip install allennlp allennlp-models


Collecting allennlp
  Using cached allennlp-2.10.1-py3-none-any.whl (730 kB)
Collecting allennlp-models
  Using cached allennlp_models-2.10.1-py3-none-any.whl (464 kB)
Collecting torch<1.13.0,>=1.10.0 (from allennlp)
  Using cached torch-1.12.1-cp310-cp310-manylinux1_x86_64.whl (776.3 MB)
Collecting torchvision<0.14.0,>=0.8.1 (from allennlp)
  Using cached torchvision-0.13.1-cp310-cp310-manylinux1_x86_64.whl (19.1 MB)
Collecting cached-path<1.2.0,>=1.1.3 (from allennlp)
  Using cached cached_path-1.1.6-py3-none-any.whl (26 kB)
Collecting fairscale==0.4.6 (from allennlp)
  Using cached fairscale-0.4.6.tar.gz (248 kB)
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Installing backend dependencies ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting spacy<3.4,>=2.1.0 (from allennlp)
  Using cached spacy-3.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (6.3 MB)
Collecting

In [None]:
!pip install pandas torch numpy transformers gensim nltk scikit-learn


Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl (410.6 MB)
Collecting nvidia-cufft-cu12==11.0.2.54 (from torch)
  Using cached nvidia_cufft_cu12-11.0.2.54-py3-none-manylinux1_x86_64.whl (121.6 MB)
Collecting nvidia-curand-cu12==10.3.2.106 (from torch)
  Using cached nvidia_curand_cu12-10.3.2.106-py3-none-manylinux1_x86_64.whl (56.5 MB)
Collectin

Recommending Top 10 Documents given a source document index


In [None]:
import pandas as pd
import torch
from transformers import DistilBertTokenizer, DistilBertModel
from sklearn.metrics.pairwise import cosine_similarity
from nltk.tokenize import word_tokenize
from sentence_transformers import SentenceTransformer
import nltk

# Download the nltk 'punkt' data (presumably required by word_tokenize,
# which preprocess_text uses -- confirm against the nltk docs)
nltk.download('punkt')

# Load the dataset of text documents from 'news_articles.csv'
dataset = pd.read_csv('news_articles.csv')

# Fail early if the 'body' column (the document text) is missing.
# NOTE(review): the 'category' column is also read further down in this
# cell but is not checked here.
if 'body' not in dataset.columns:
    raise KeyError("Column 'body' not found in the dataset.")

# Initialize tokenizers and models:
# DistilBERT tokenizer and encoder from the 'distilbert-base-uncased' checkpoint
distilbert_tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
distilbert_model = DistilBertModel.from_pretrained('distilbert-base-uncased')

# Sentence-BERT model used by get_sbert_embeddings
sbert_model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

# Function to preprocess text data
def preprocess_text(text):
    """Lower-case ``text``, tokenize it with ``word_tokenize`` and re-join the tokens with single spaces."""
    lowered = text.lower()
    tokens = word_tokenize(lowered)
    return ' '.join(tokens)

# Function to tokenize and get embeddings for DistilBERT
def get_distilbert_embeddings(text):
    """Return the mean-pooled DistilBERT embedding of ``text`` as a squeezed NumPy array.

    The text is tokenized with truncation enabled, encoded without gradient
    tracking, and the last hidden state is averaged over dim 1. Size-1
    dimensions are squeezed away before conversion.
    """
    encoded = distilbert_tokenizer(text, return_tensors='pt', padding=True, truncation=True)
    with torch.no_grad():
        model_output = distilbert_model(**encoded)
    pooled = model_output.last_hidden_state.mean(dim=1)
    flattened = pooled.squeeze()
    return flattened.numpy()

# Function to get embeddings for SBERT
def get_sbert_embeddings(text):
    """Encode *text* with the SBERT model and return the result as a NumPy array."""
    encoded = sbert_model.encode(text, convert_to_tensor=True)
    # Bring the tensor to host memory first; .numpy() needs a CPU tensor.
    host_tensor = encoded.cpu()
    return host_tensor.numpy()

# Number of recommendations to print
top_k = 10

# Get the source document index from the user (a positional row index)
source_doc_index = int(input("Enter the index of the source document: "))

# Reject out-of-range positions up front with a clear message
# (negative positions count from the end, as with dataset.iloc).
num_documents = len(dataset)
if not -num_documents <= source_doc_index < num_documents:
    raise IndexError(
        f"Source document index {source_doc_index} is out of range "
        f"for a dataset of {num_documents} documents."
    )

source_row = dataset.iloc[source_doc_index]
if pd.isna(source_row['body']):
    raise ValueError(f"Source document {source_doc_index} has no 'body' text to compare.")

# Filter dataset by the category of the source document
source_category = source_row['category']
same_category = dataset['category'] == source_category
filtered_dataset = dataset[same_category]

# Candidates are the *other* documents of that category that have body text.
# The source row is masked out by position so it cannot take the top slot
# with a trivial self-similarity of 1.0.
candidate_mask = same_category & dataset['body'].notna()
candidate_mask.iloc[source_doc_index] = False
candidates = dataset[candidate_mask]

# Preprocess the source document
source_text = preprocess_text(source_row['body'])

# Calculate embeddings for the source document
source_distilbert_emb = get_distilbert_embeddings(source_text)
source_sbert_emb = get_sbert_embeddings(source_text)

# Calculate cosine similarity of the source document with every candidate
similarities = []

for idx, body in candidates['body'].items():
    text = preprocess_text(body)
    distilbert_emb = get_distilbert_embeddings(text)
    sbert_emb = get_sbert_embeddings(text)

    distilbert_sim = cosine_similarity([source_distilbert_emb], [distilbert_emb])[0][0]
    sbert_sim = cosine_similarity([source_sbert_emb], [sbert_emb])[0][0]

    # Combine similarity scores from both models with an unweighted average
    # (you can customize this)
    combined_sim = (distilbert_sim + sbert_sim) / 2

    similarities.append((idx, combined_sim))

# Rank the documents based on similarity in descending order and keep the best top_k
ranked_documents = sorted(similarities, key=lambda x: x[1], reverse=True)[:top_k]

# Print the ranked documents
print(f"\nTop {top_k} Recommended Documents for Source Document Index {source_doc_index}:")
for idx, similarity in ranked_documents:
    print(f"Document Index {idx}, Similarity Score {similarity}")


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Enter the index of the source document: 10

Top 10 Recommended Documents for Source Document Index 10:
Document Index 10, Similarity Score 1.0
Document Index 88, Similarity Score 0.8216865062713623
Document Index 35, Similarity Score 0.8016748428344727
Document Index 84, Similarity Score 0.7917804718017578
Document Index 74, Similarity Score 0.7847641706466675
Document Index 165, Similarity Score 0.769930362701416
Document Index 58, Similarity Score 0.740818977355957
Document Index 91, Similarity Score 0.7087909579277039
Document Index 27, Similarity Score 0.7056753039360046
Document Index 86, Similarity Score 0.6991989016532898
