<a href="https://colab.research.google.com/github/AmoghTantradi/CS182-final-project/blob/main/Token_Identification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import nltk
from nltk.corpus import gutenberg
from nltk.tokenize import word_tokenize
from collections import Counter
import gensim.downloader as api
from sklearn.metrics.pairwise import cosine_similarity

# Pretrained embedding loaded through gensim's downloader. get_similarities and
# find_tokens_two_categories use it for vocabulary membership tests (`in`) and
# for vector lookups (`model[word]`).
model = api.load("word2vec-google-news-300")



def get_rarities(tokenizer=None, fileid='bible-kjv.txt'):
    """Return every distinct corpus token paired with its relative frequency.

    Despite the function name, the score is a *frequency* (occurrences of the
    token divided by the total number of tokens), so a LOWER score means a
    RARER token.

    Args:
        tokenizer: Unused. Kept (and made optional) so existing calls that
            pass a positional argument keep working; tokenization is always
            done with NLTK's ``word_tokenize``.
        fileid: Name of the Gutenberg corpus file to analyse.

    Returns:
        A list of ``(token, relative_frequency)`` tuples sorted by descending
        frequency, i.e. from the most common token to the rarest.
    """
    # Fetch the corpus and the Punkt tokenizer data (no-ops once cached).
    nltk.download('gutenberg')
    nltk.download('punkt')
    # Recent NLTK releases make word_tokenize read the 'punkt_tab' resource
    # instead of 'punkt'; requesting both keeps old and new versions working.
    nltk.download('punkt_tab')

    corpus = gutenberg.raw(fileid)

    # Tokenize the entire corpus and count each distinct token.
    tokens = word_tokenize(corpus)
    token_frequencies = Counter(tokens)

    # Relative frequency of each token within the corpus.
    total_tokens = len(tokens)
    relative_rarity = {token: freq / total_tokens for token, freq in token_frequencies.items()}

    # Most common first. The order is deliberately left as it always was:
    # callers re-sort with a stable sort, so it decides how ties are broken.
    sorted_tokens_by_rarity = sorted(relative_rarity.items(), key=lambda x: x[1], reverse=True)

    return sorted_tokens_by_rarity


In [None]:
def get_similarities(tokens, category):
  """Score each token by its embedding similarity to ``category``.

  Every token receives ``1 + cosine_similarity(token, category)``, which lies
  in ``[0, 2]``. The scores are then divided by their sum so that they add up
  to 1.

  Args:
    tokens: Sequence of tuples whose first element is the token string
      (for example the output of ``get_rarities``).
    category: Word to compare every token against.

  Returns:
    A list of ``(token, normalized_score)`` tuples in the SAME order as
    ``tokens``, or ``None`` (after printing an error) when ``category`` is
    not in the embedding vocabulary.
  """
  if category not in model:
    print("ERROR: THE CATEGORY IS NOT IN THE VOCABULARY")
    return None

  # The category vector never changes, so look it up once.
  category_vector = [model[category]]

  # Tokens without an embedding must never look attractive to callers that
  # pick the LOWEST scores, so they get the largest value 1 + cosine can
  # take. (The previous value of 1 corresponds to a cosine of 0, which is
  # lower than the score of most real words and therefore favoured them.)
  max_score = 2.0

  similarities = []
  for token_pair in tokens:
    token = token_pair[0]
    if token not in model:
      similarities.append((token, max_score))
    else:
      cosine_sim = cosine_similarity([model[token]], category_vector)
      similarities.append((token, 1 + cosine_sim.item()))

  # Normalize so the scores sum to 1. A zero total (no tokens, or every
  # cosine exactly -1) leaves nothing to scale.
  total_sim = sum(score for _, score in similarities)
  if not total_sim:
    return similarities

  # Do not sort here: composite_token_ranking pairs these entries with the
  # rarity list by index, so the input order has to be preserved.
  return [(token, score / total_sim) for token, score in similarities]


In [None]:
def composite_token_ranking(token_rarities, token_similarities):
  """Combine per-token rarity and similarity scores into one composite score.

  The two lists are matched by position: entry ``i`` of ``token_similarities``
  is taken to describe the same token as entry ``i`` of ``token_rarities``.
  The token text itself is read from ``token_rarities``.

  Args:
    token_rarities: Sequence of ``(token, rarity)`` entries.
    token_similarities: Sequence of ``(token, similarity)`` entries, at least
      as long as ``token_rarities``.

  Returns:
    A list of ``(token, score)`` tuples, in input order, where
    ``score = rarity + similarity + (rarity * similarity) ** 2``.
  """
  def _score(rarity, similarity):
    # Sum of both terms plus the squared product as an interaction term.
    return rarity + similarity + ((rarity*similarity)**2)

  return [
      (entry[0], _score(entry[1], token_similarities[position][1]))
      for position, entry in enumerate(token_rarities)
  ]


In [None]:
def find_tokens(category, num_tokens):
  """Return the ``num_tokens`` corpus tokens with the lowest composite score.

  The composite score adds a token's relative corpus frequency to its
  normalized similarity to ``category`` (see ``composite_token_ranking``), so
  the lowest scores belong to tokens that are both infrequent in the corpus
  and dissimilar to the category.

  Args:
    category: Word the candidate tokens are compared against.
    num_tokens: Maximum number of entries to return.

  Returns:
    A list of ``(token, composite_score)`` tuples sorted by ascending score.

  Raises:
    ValueError: If ``category`` is not in the embedding vocabulary.
  """
  # get_rarities never uses its argument. The previous code passed the
  # undefined name `tokenizer`, which raised NameError on every call.
  token_rarities = get_rarities(None)
  token_similarities = get_similarities(token_rarities, category)
  if token_similarities is None:
    # get_similarities returns None only when the category is unknown; fail
    # here with a clear message rather than with a TypeError further down.
    raise ValueError(
        "category {!r} is not in the embedding vocabulary".format(category))
  composite_ranking = composite_token_ranking(token_rarities, token_similarities)
  composite_ranking = sorted(composite_ranking, key=lambda x: x[1])
  return composite_ranking[:num_tokens]

In [None]:
def find_tokens_two_categories(category1, category2, num_tokens):
  """Return the least similar token pairs drawn from two categories' candidates.

  ``find_tokens`` is run once per category. Every candidate of the first
  category is then paired with every candidate of the second, and the pairs
  are ranked by the cosine similarity of their embeddings.

  Args:
    category1: Category used to pick the first token of each pair.
    category2: Category used to pick the second token of each pair.
    num_tokens: Number of candidates requested per category, and the maximum
      number of pairs returned.

  Returns:
    A list of ``(token1, token2, cosine_similarity)`` tuples sorted by
    ascending similarity. Pairs containing a token without an embedding are
    left out because no similarity can be computed for them.
  """
  tokens1 = find_tokens(category1, num_tokens)
  tokens2 = find_tokens(category2, num_tokens)
  two_token_ranking = []
  # Iterate over each list directly so that lists of different lengths can
  # neither raise IndexError nor have entries silently ignored.
  for entry1 in tokens1:
    token1 = entry1[0]
    if token1 not in model:
      continue  # indexing the model with an unknown word raises KeyError
    vector1 = [model[token1]]  # invariant for the whole inner loop
    for entry2 in tokens2:
      token2 = entry2[0]
      if token2 not in model:
        continue
      cosine_sim = cosine_similarity(vector1, [model[token2]]).item()
      two_token_ranking.append((token1, token2, cosine_sim))
  two_token_ranking = sorted(two_token_ranking, key=lambda x: x[2])
  return two_token_ranking[:num_tokens]

In [None]:
'''
How to use: replace "bear" with the class of the object that Stable Diffusion is being trained to generate.
'''
# Class whose candidate tokens are ranked below.
class_name = "bear"
# Request the 10 entries with the lowest composite score for that class.
find_tokens(class_name, 10)

In [None]:
'''
How to use: replace the arguments with the two categories. If both categories are the same, pass the same word twice.
'''

# Request up to 10 (token1, token2, cosine similarity) triples for the two categories.
find_tokens_two_categories("cat", "dog", 10)