### Importing Libraries

In [38]:
import re
import nltk
import string
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords,twitter_samples
from nltk.stem import PorterStemmer
import gensim
import numpy as np
from gensim.models import KeyedVectors

In [39]:
# Fetch the NLTK resources this notebook reads: the stopword lists used by
# process_tweet and the twitter_samples corpus the tweets are loaded from.
nltk.download('stopwords')
nltk.download('twitter_samples')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package twitter_samples to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package twitter_samples is already up-to-date!


True

### Loading Embeddings

## The data

Download
* English embeddings from Google code archive word2vec
[look for GoogleNews-vectors-negative300.bin.gz](https://code.google.com/archive/p/word2vec/)
    * You'll need to unzip the file first.
* and the French embeddings from
[cross_lingual_text_classification](https://github.com/vjstark/crosslingual_text_classification).
    * in the terminal, type (in one line)
    `curl -o ./wiki.multi.fr.vec https://dl.fbaipublicfiles.com/arrival/vectors/wiki.multi.fr.vec`

After downloading (and unzipping the English archive), the two files should be named:
* `GoogleNews-vectors-negative300.bin`
* `wiki.multi.fr.vec`

These files are loaded by the code cell below.

In [40]:
# English vectors: the .bin file is read in binary word2vec format (binary=True).
en_embeddings = KeyedVectors.load_word2vec_format('./GoogleNews-vectors-negative300.bin',binary=True)
# French vectors: the .vec file is read with the loader's default settings.
# NOTE(review): fr_embeddings is not referenced by any other cell visible in
# this notebook - confirm it is still needed before paying the load cost.
fr_embeddings = KeyedVectors.load_word2vec_format('./wiki.multi.fr.vec')

### Loading Tweets

In [41]:
# Read the raw tweet texts from the two NLTK sample files and concatenate them
# (positive tweets first, then negative). A tweet's position in all_tweets is
# what the later cells use as its document id.
all_positive_tweets = twitter_samples.strings('positive_tweets.json')
all_negative_tweets = twitter_samples.strings('negative_tweets.json')
all_tweets = all_positive_tweets + all_negative_tweets

In [42]:
def process_tweet(tweet):
    '''
    Clean, tokenize and stem a single tweet.

    Input:
        tweet: a string containing a tweet
    Output:
        tweets_clean: a list of stemmed tokens; stock tickers, a leading "RT",
            hyperlinks and '#' signs are stripped from the text first, and
            English stopwords and punctuation tokens are dropped afterwards

    '''
    stemmer = PorterStemmer()
    # A set gives constant-time membership tests; the corpus reader returns a
    # list, which would otherwise be scanned once per token.
    stopwords_english = set(stopwords.words('english'))

    # remove stock market tickers like $GE
    tweet = re.sub(r'\$\w*', '', tweet)
    # remove old style retweet text "RT" (only at the very start of the tweet)
    tweet = re.sub(r'^RT[\s]+', '', tweet)
    # remove hyperlinks; `.*` also removes whatever follows the URL on that line
    tweet = re.sub(r'https?:\/\/.*[\r\n]*', '', tweet)
    # keep hashtag words, dropping only the '#' sign itself
    tweet = re.sub(r'#', '', tweet)

    # tokenize with the tweet-aware tokenizer (case not preserved, @handles
    # stripped, long character runs shortened)
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True,
                               reduce_len=True)
    tweet_tokens = tokenizer.tokenize(tweet)

    # `in string.punctuation` is a substring test against the punctuation
    # string, exactly as in the original filter, so a multi-character token is
    # dropped only when it appears contiguously in that string.
    tweets_clean = [
        stemmer.stem(word)
        for word in tweet_tokens
        if word not in stopwords_english and word not in string.punctuation
    ]

    return tweets_clean

In [43]:
def get_document_embedding(document:str,en_embeddings):
    """
    Calculates the embedding vector for a given document.

    The document is tokenized with process_tweet and the vectors of all tokens
    known to the model are summed. Tokens the model does not know (KeyError)
    contribute nothing, so a document with no known tokens yields an all-zero
    vector.

    Args:
        document (str): The input document.
        en_embeddings: The word embeddings model; must provide
            `get_vector(word)` raising KeyError for unknown words. Its
            `vector_size` attribute, when present, sets the output length.

    Returns:
        numpy.ndarray: The document embedding vector (sum of token vectors).
    """
    processed_doc = process_tweet(document)
    # Size the accumulator from the model instead of assuming 300 dimensions;
    # 300 remains the fallback for objects that do not expose vector_size.
    embedding_dim = getattr(en_embeddings, "vector_size", 300)
    document_embedding = np.zeros(embedding_dim)
    # NOTE(review): process_tweet stems its tokens, so stemmed forms that are
    # not vocabulary keys are skipped here - confirm that is intended.
    for word in processed_doc:
        try:
            document_embedding += en_embeddings.get_vector(word)
        except KeyError:
            # Out-of-vocabulary token: deliberately ignored.
            continue
    return document_embedding

In [44]:
def get_document_vecs(all_docs, en_embeddings):
    '''
    Embed every document and return the embeddings in two forms.

    Input:
        - all_docs: iterable of strings - all tweets in our dataset.
        - en_embeddings: word embeddings model, passed through unchanged to
          get_document_embedding.
    Output:
        - document_matrix: matrix with one row per document, in input order.
        - ind2Doc_dict: dictionary mapping each document's index to its
          embedding vector.
    '''
    # Compute each embedding exactly once, keeping input order.
    embeddings = [get_document_embedding(doc, en_embeddings) for doc in all_docs]
    # Index -> embedding lookup built from the same vectors.
    ind2Doc_dict = dict(enumerate(embeddings))
    # Stack the per-document vectors row-wise into a single matrix.
    document_matrix = np.vstack(embeddings)
    return document_matrix, ind2Doc_dict

### Embeddings of each tweet

In [45]:
# Embed every tweet once up front: document_vecs has one row per tweet and
# ind2Tweet maps each tweet's index to its embedding vector.
document_vecs, ind2Tweet = get_document_vecs(all_tweets, en_embeddings)

### Hyperparameters

In [46]:
# Number of random planes per universe; each plane contributes one hash bit,
# so every hash table has 2**N_PLANES buckets.
N_PLANES = 10
# Number of independent sets of planes (one hash table is built per set).
N_UNIVERSES = 25
# Length of the vectors the planes are drawn for (passed as the first
# dimension of every plane matrix).
N_DIMS = 300

### Initializing planes

In [47]:
# Seed NumPy's global random state so the planes drawn below are repeatable.
np.random.seed(0)

def generate_planes(N_DIMS, N_PLANES, N_UNIVERSES):
    """
    Draw the random plane matrices used for hashing, one matrix per universe.

    Parameters:
    - N_DIMS (int): Number of rows of each matrix (vector dimensionality).
    - N_PLANES (int): Number of columns of each matrix (planes per universe).
    - N_UNIVERSES (int): Number of matrices to draw.

    Returns:
    - planes_l (list): N_UNIVERSES arrays of shape (N_DIMS, N_PLANES) sampled
      from a standard normal distribution via NumPy's global random state.
    """
    plane_shape = (N_DIMS, N_PLANES)
    planes_l = []
    # Draw the universes one after another so the sequence of random draws
    # follows universe order.
    for _ in range(N_UNIVERSES):
        planes_l.append(np.random.normal(size=plane_shape))
    return planes_l

# One (N_DIMS, N_PLANES) plane matrix per universe, indexed by universe id.
planes_l = generate_planes(N_DIMS, N_PLANES, N_UNIVERSES)


In [48]:
def hash_value_of_vector(v, planes):
    """Create a hash for a vector from the side of each plane it falls on.

    Plane i contributes bit i (value 2**i) to the hash when the dot product of
    the vector with that plane's column is non-negative, so the result lies in
    [0, 2**planes.shape[1] - 1].

    Input:
        - v: vector of a tweet, given either as shape (N_DIMS,) or (1, N_DIMS)
        - planes: matrix of dimension (N_DIMS, N_PLANES) - the set of planes that divide up the region
    Output:
        - res: a Python int which is used as a hash for your vector
    """
    # Project onto every plane at once. The number of planes is taken from the
    # matrix itself, so the function does not depend on a notebook-level
    # constant matching the matrix width.
    projections = np.dot(np.ravel(v), planes)
    # Accumulate with Python ints: no fixed-width overflow for many planes,
    # and the result is a plain int even when there are zero planes.
    return sum(1 << i for i, projection in enumerate(projections) if projection >= 0)

In [49]:
def make_hash_table(vecs, planes):
    """
    Distribute vectors into hash buckets for one universe of planes.

    Input:
        - vecs: list of vectors to be hashed.
        - planes: the matrix of planes in a single "universe", with shape (embedding dimensions, number of planes).
    Output:
        - hash_table: dictionary - keys are hashes, values are lists of vectors (hash buckets)
        - id_table: dictionary - keys are hashes, values are lists of vector ids,
                            i.e. positions in `vecs`
                            (it's used to know which tweet corresponds to the hashed vector)
    """
    # Size the tables from the plane matrix itself (one bit per plane) rather
    # than from a notebook-level constant that might not match this matrix.
    buckets = 2 ** planes.shape[1]
    # Every possible bucket is pre-created, so lookups of empty buckets
    # return [] instead of raising KeyError.
    hash_table = {bucket: [] for bucket in range(buckets)}
    id_table = {bucket: [] for bucket in range(buckets)}
    for i, v in enumerate(vecs):
        h = hash_value_of_vector(v, planes)
        # The two tables are filled in lockstep: position j of a bucket in
        # hash_table and in id_table always refer to the same vector.
        hash_table[h].append(v)
        id_table[h].append(i)
    return hash_table, id_table

In [50]:
### Creating the hashtables
# hash_tables[u] / id_tables[u] hold the buckets built with planes_l[u]; both
# lists are read as module-level state by approximate_knn.
hash_tables = []
id_tables = []
for universe_id in range(N_UNIVERSES):  # one table pair per universe
    print('working on hash universe #:', universe_id)
    planes = planes_l[universe_id]
    hash_table, id_table = make_hash_table(document_vecs, planes)
    hash_tables.append(hash_table)
    id_tables.append(id_table)

working on hash universe #: 0
working on hash universe #: 1
working on hash universe #: 2
working on hash universe #: 3
working on hash universe #: 4
working on hash universe #: 5
working on hash universe #: 6
working on hash universe #: 7
working on hash universe #: 8
working on hash universe #: 9
working on hash universe #: 10
working on hash universe #: 11
working on hash universe #: 12
working on hash universe #: 13
working on hash universe #: 14
working on hash universe #: 15
working on hash universe #: 16
working on hash universe #: 17
working on hash universe #: 18
working on hash universe #: 19
working on hash universe #: 20
working on hash universe #: 21
working on hash universe #: 22
working on hash universe #: 23
working on hash universe #: 24


In [51]:
def k_nearest_neighbours(v, candidates: list, k=1):
    """
    Finds the k nearest neighbours to a given vector v from a list of candidate vectors.

    Nearness is cosine similarity. When the similarity is undefined because v
    or a candidate has zero norm, that candidate is ranked as the least
    similar instead of producing NaN (which argsort would place last, i.e.
    treat as the *most* similar).

    Parameters:
    v (array-like): The vector for which nearest neighbours need to be found.
    candidates (list): A list of candidate vectors.
    k (int): The number of nearest neighbours to be returned. Default is 1.

    Returns:
    numpy.ndarray: Indices into `candidates` of the (at most) k nearest
        neighbours, ordered from least to most similar, so the single best
        match is the last element. Empty when k <= 0 or there are no
        candidates.
    """
    # `[-k:]` with k == 0 would slice the whole array, so handle it up front.
    if k <= 0 or len(candidates) == 0:
        return np.array([], dtype=np.intp)

    query = np.asarray(v, dtype=float).ravel()
    candidate_matrix = np.asarray(candidates, dtype=float).reshape(len(candidates), -1)

    dots = candidate_matrix @ query
    norms = np.linalg.norm(candidate_matrix, axis=1) * np.linalg.norm(query)

    # Divide only where the denominator is positive; everything else keeps the
    # -inf default and therefore sorts to the front (least similar).
    similarity_score = np.full(len(candidates), -np.inf)
    np.divide(dots, norms, out=similarity_score, where=norms > 0)

    sorted_ids = np.argsort(similarity_score)
    return sorted_ids[-k:]

In [52]:
def approximate_knn(doc_id, v, planes_l, k=1, num_universes_to_use=N_UNIVERSES):
    """Search for k-NN using hashes.

    For each of the first `num_universes_to_use` universes, the vector is
    hashed with that universe's planes and the documents stored in the
    matching bucket of the module-level `hash_tables` / `id_tables` become
    candidates. Candidates are de-duplicated across universes and `doc_id`
    itself is excluded. The candidates are then ranked with
    k_nearest_neighbours.

    Parameters:
        doc_id: id of the query document, skipped when collecting candidates.
        v: vector of the query document.
        planes_l: list of plane matrices, indexed by universe.
        k: number of neighbours to return.
        num_universes_to_use: how many universes (from index 0) to probe.

    Returns:
        list: document ids of the selected neighbours, in the order produced
        by k_nearest_neighbours.
    """
    seen_ids = set()
    candidate_ids = []
    candidate_vecs = []

    for universe in range(num_universes_to_use):
        bucket = hash_value_of_vector(v, planes_l[universe])
        bucket_vecs = hash_tables[universe][bucket]
        bucket_ids = id_tables[universe][bucket]
        for position, candidate_id in enumerate(bucket_ids):
            # Skip the query document and anything already collected from an
            # earlier universe.
            if candidate_id == doc_id or candidate_id in seen_ids:
                continue
            seen_ids.add(candidate_id)
            candidate_ids.append(candidate_id)
            # Same position in both tables refers to the same document.
            candidate_vecs.append(bucket_vecs[position])

    print("Fast considering %d vecs" % len(candidate_vecs))
    # Positions returned here index into the candidate lists, not the corpus.
    nearest_positions = k_nearest_neighbours(v, candidate_vecs, k=k)
    print(nearest_positions)
    print(candidate_ids)
    # Translate candidate-list positions back into document ids.
    return [candidate_ids[position] for position in nearest_positions]

In [53]:
# Pick the query document: its id is its position in all_tweets, and the same
# index selects its embedding row in document_vecs.
doc_id = 0
doc_to_search = all_tweets[doc_id]
vec_to_search = document_vecs[doc_id]

In [54]:
# Ask for 3 neighbours while probing only the first 5 universes.
nearest_neighbor_ids = approximate_knn(doc_id, vec_to_search, planes_l, k=3, num_universes_to_use=5)

Fast considering 939 vecs
[16  7 35]
[3, 5, 7, 26, 28, 36, 44, 51, 66, 68, 71, 76, 79, 83, 91, 97, 105, 112, 117, 125, 126, 131, 135, 146, 152, 154, 156, 168, 184, 195, 210, 214, 220, 232, 233, 253, 254, 277, 285, 286, 292, 299, 319, 332, 350, 371, 373, 375, 404, 427, 430, 432, 466, 467, 469, 476, 478, 479, 491, 511, 521, 531, 538, 539, 563, 579, 591, 594, 615, 618, 619, 642, 647, 661, 671, 673, 674, 675, 681, 701, 705, 724, 727, 738, 743, 757, 762, 767, 770, 773, 780, 794, 810, 822, 824, 826, 833, 835, 842, 847, 850, 855, 859, 874, 884, 887, 898, 909, 920, 930, 938, 943, 958, 959, 962, 993, 995, 1005, 1012, 1023, 1033, 1039, 1040, 1058, 1065, 1069, 1073, 1075, 1081, 1088, 1107, 1113, 1117, 1142, 1147, 1154, 1172, 1176, 1180, 1185, 1189, 1202, 1211, 1212, 1219, 1220, 1228, 1248, 1249, 1270, 1278, 1280, 1287, 1296, 1304, 1309, 1324, 1328, 1352, 1358, 1383, 1386, 1411, 1415, 1419, 1420, 1423, 1427, 1444, 1451, 1454, 1461, 1462, 1470, 1473, 1477, 1484, 1485, 1488, 1490, 1512, 1513, 1520, 

In [55]:
# Show the query tweet followed by each neighbour that was found.
print(f"Nearest neighbors for document {doc_id}")
print(f"Document contents: {doc_to_search}")
print("")

# The ids arrive in the order k_nearest_neighbours produced them (tail of an
# ascending argsort), so the most similar tweet is printed last.
for neighbor_id in nearest_neighbor_ids:
    print(f"Nearest neighbor at document id {neighbor_id}")
    print(f"document contents: {all_tweets[neighbor_id]}")

Nearest neighbors for document 0
Document contents: #FollowFriday @France_Inte @PKuchly57 @Milipol_Paris for being top engaged members in my community this week :)

Nearest neighbor at document id 105
document contents: #FollowFriday @straz_das @DCarsonCPA @GH813600 for being top engaged members in my community this week :)
Nearest neighbor at document id 51
document contents: #FollowFriday @France_Espana @reglisse_menthe @CCI_inter for being top engaged members in my community this week :)
Nearest neighbor at document id 253
document contents: #FollowFriday @CCIFCcanada @AdamEvnmnt @boxcalf1 for being top engaged members in my community this week :)
