In [1]:
import time
import os
import numpy as np
import google.colab as colab
import random
import json
%matplotlib inline
import matplotlib.pyplot as plt
from multiprocessing import Pool
import shutil
from pprint import pprint
import pickle
from random import randint
import pandas as pd

import re
import inspect
import torch
from torch import optim
from torch.autograd import Variable
import torch.nn as nn

import nltk
nltk.download('punkt')

import warnings
warnings.filterwarnings('ignore')

%load_ext autoreload
%autoreload 2
%matplotlib inline

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


# Mount Google Drive

In [2]:
def mount_google_drive():
	'''
	# Functionality
		Mount google drive. Since colab does not save files, we want to make it easier to directly access files in google drive.
	# Arguments
		Nothing
	# Returns
		drive_root: the working directory mounted
	'''
	mount_directory = "/content/gdrive"
	drive = colab.drive
	drive.mount(mount_directory, force_remount=True)
	drive_root = mount_directory + "/" + list(filter(lambda x: x[0] != '.', os.listdir(mount_directory)))[0]
	return drive_root

In [3]:
# Please Set up mounted directories here. Notice whether you want to balance dataset
ROOT_DIR =  mount_google_drive() + "/05839-Final-Project/code/"

DATASET_PATH = ROOT_DIR + "quora.csv"

NLI_NET_DIR = ROOT_DIR + "models/NliNetUtils/"

CHECKPOINT_DIR = ROOT_DIR + "checkpoints/"

Mounted at /content/gdrive


In [4]:
# Migrate utils from drive to current dir so that we don't need to upload a folder from local every time
shutil.rmtree('utils/', ignore_errors=True)
_ = shutil.copytree(ROOT_DIR +"/utils/", "utils/")

In [5]:
# Load custimizable utils here
from utils.file_utils import *
from utils.image_utils import *
from utils.generator_utils import *
from utils.tqdm_utils import *
from utils.keras_utils import *

In [6]:
# Load infersent model related files
shutil.rmtree('models.py', ignore_errors=True)
shutil.copy(NLI_NET_DIR + "models.py", "models.py")

shutil.rmtree('data.py', ignore_errors=True)
shutil.copy(NLI_NET_DIR + "data.py", "data.py")

shutil.rmtree('mutils.py', ignore_errors=True)
shutil.copy(NLI_NET_DIR + "mutils.py", "mutils.py")


'mutils.py'

In [7]:
# shutil.rmtree('fastText/', ignore_errors=True)
# shutil.copytree(ROOT_DIR + "fastText/", "fastText/")

In [8]:
from data import get_nli, get_batch, build_vocab
from mutils import get_optimizer
from models import NLINet

In [9]:
def get_optimizer(s):
    """
    Parse optimizer parameters.
    Input should be of the form:
        - "sgd,lr=0.01"
        - "adagrad,lr=0.1,lr_decay=0.05"
    """
    if "," in s:
        method = s[:s.find(',')]
        optim_params = {}
        for x in s[s.find(',') + 1:].split(','):
            split = x.split('=')
            assert len(split) == 2
            assert re.match("^[+-]?(\d+(\.\d*)?|\.\d+)$", split[1]) is not None
            optim_params[split[0]] = float(split[1])
    else:
        method = s
        optim_params = {}

    if method == 'adadelta':
        optim_fn = optim.Adadelta
    elif method == 'adagrad':
        optim_fn = optim.Adagrad
    elif method == 'adam':
        optim_fn = optim.Adam
    elif method == 'adamax':
        optim_fn = optim.Adamax
    elif method == 'asgd':
        optim_fn = optim.ASGD
    elif method == 'rmsprop':
        optim_fn = optim.RMSprop
    elif method == 'rprop':
        optim_fn = optim.Rprop
    elif method == 'sgd':
        optim_fn = optim.SGD
        assert 'lr' in optim_params
    else:
        raise Exception('Unknown optimization method: "%s"' % method)

    # check that we give good parameters to the optimizer
    expected_args = inspect.getargspec(optim_fn.__init__)[0]
    assert expected_args[:2] == ['self', 'params']
    if not all(k in expected_args[2:] for k in optim_params.keys()):
        raise Exception('Unexpected parameters: expected "%s", got "%s"' % (
            str(expected_args[2:]), str(optim_params.keys())))

    return optim_fn, optim_params

In [10]:
torch.cuda.is_available()

True

In [11]:
class InferSent(nn.Module):
    def __init__(self, config):
        super(InferSent, self).__init__()
        self.bsize = config['bsize']
        self.word_emb_dim = config['word_emb_dim']
        self.enc_lstm_dim = config['enc_lstm_dim']
        self.pool_type = config['pool_type']
        self.dpout_model = config['dpout_model']
        self.version = 1 if 'version' not in config else config['version']

        self.enc_lstm = nn.LSTM(self.word_emb_dim, self.enc_lstm_dim, 1,
                                bidirectional=True, dropout=self.dpout_model)

        assert self.version in [1, 2]
        if self.version == 1:
            self.bos = '<s>'
            self.eos = '</s>'
            self.max_pad = True
            self.moses_tok = False
        elif self.version == 2:
            self.bos = '<p>'
            self.eos = '</p>'
            self.max_pad = False
            self.moses_tok = True

    def is_cuda(self):
        # either all weights are on cpu or they are on gpu
        return self.enc_lstm.bias_hh_l0.data.is_cuda

    def forward(self, sent_tuple):
        # sent_len: [max_len, ..., min_len] (bsize)
        # sent: (seqlen x bsize x worddim)
        sent, sent_len = sent_tuple

        # Sort by length (keep idx)
        sent_len_sorted, idx_sort = np.sort(sent_len)[::-1], np.argsort(-sent_len)
        sent_len_sorted = sent_len_sorted.copy()
        idx_unsort = np.argsort(idx_sort)

        idx_sort = torch.from_numpy(idx_sort).cuda() if self.is_cuda() \
            else torch.from_numpy(idx_sort)
        sent = sent.index_select(1, idx_sort)

        # Handling padding in Recurrent Networks
        sent_packed = nn.utils.rnn.pack_padded_sequence(sent, sent_len_sorted)
        sent_output = self.enc_lstm(sent_packed)[0]  # seqlen x batch x 2*nhid
        sent_output = nn.utils.rnn.pad_packed_sequence(sent_output)[0]

        # Un-sort by length
        idx_unsort = torch.from_numpy(idx_unsort).cuda() if self.is_cuda() \
            else torch.from_numpy(idx_unsort)
        sent_output = sent_output.index_select(1, idx_unsort)

        # Pooling
        if self.pool_type == "mean":
            sent_len = torch.FloatTensor(sent_len.copy()).unsqueeze(1).cuda()
            emb = torch.sum(sent_output, 0).squeeze(0)
            emb = emb / sent_len.expand_as(emb)
        elif self.pool_type == "max":
            if not self.max_pad:
                sent_output[sent_output == 0] = -1e9
            emb = torch.max(sent_output, 0)[0]
            if emb.ndimension() == 3:
                emb = emb.squeeze(0)
                assert emb.ndimension() == 2

        return emb

    def set_w2v_path(self, w2v_path):
        self.w2v_path = w2v_path

    def get_word_dict(self, sentences, tokenize=True):
        # create vocab of words
        word_dict = {}
        sentences = [s.split() if not tokenize else self.tokenize(s) for s in sentences]
        for sent in sentences:
            for word in sent:
                if word not in word_dict:
                    word_dict[word] = ''
        word_dict[self.bos] = ''
        word_dict[self.eos] = ''
        return word_dict

    def get_w2v(self, word_dict):
        assert hasattr(self, 'w2v_path'), 'w2v path not set'
        # create word_vec with w2v vectors
        word_vec = {}
        with open(self.w2v_path, encoding='utf-8') as f:
            for line in f:
                word, vec = line.split(' ', 1)
                if word in word_dict:
                    word_vec[word] = np.fromstring(vec, sep=' ')
        print('Found %s(/%s) words with w2v vectors' % (len(word_vec), len(word_dict)))
        return word_vec

    def get_w2v_k(self, K):
        assert hasattr(self, 'w2v_path'), 'w2v path not set'
        # create word_vec with k first w2v vectors
        k = 0
        word_vec = {}
        with open(self.w2v_path, encoding='utf-8') as f:
            for line in f:
                word, vec = line.split(' ', 1)
                if k <= K:
                    word_vec[word] = np.fromstring(vec, sep=' ')
                    k += 1
                if k > K:
                    if word in [self.bos, self.eos]:
                        word_vec[word] = np.fromstring(vec, sep=' ')

                if k > K and all([w in word_vec for w in [self.bos, self.eos]]):
                    break
        return word_vec

    def build_vocab(self, sentences, tokenize=True):
        assert hasattr(self, 'w2v_path'), 'w2v path not set'
        word_dict = self.get_word_dict(sentences, tokenize)
        self.word_vec = self.get_w2v(word_dict)
        print('Vocab size : %s' % (len(self.word_vec)))

    # build w2v vocab with k most frequent words
    def build_vocab_k_words(self, K):
        assert hasattr(self, 'w2v_path'), 'w2v path not set'
        self.word_vec = self.get_w2v_k(K)
        print('Vocab size : %s' % (K))

    def update_vocab(self, sentences, tokenize=True):
        assert hasattr(self, 'w2v_path'), 'warning : w2v path not set'
        assert hasattr(self, 'word_vec'), 'build_vocab before updating it'
        word_dict = self.get_word_dict(sentences, tokenize)

        # keep only new words
        for word in self.word_vec:
            if word in word_dict:
                del word_dict[word]

        # udpate vocabulary
        if word_dict:
            new_word_vec = self.get_w2v(word_dict)
            self.word_vec.update(new_word_vec)
        else:
            new_word_vec = []
        print('New vocab size : %s (added %s words)'% (len(self.word_vec), len(new_word_vec)))

    def get_batch(self, batch):
        # sent in batch in decreasing order of lengths
        # batch: (bsize, max_len, word_dim)
        embed = np.zeros((len(batch[0]), len(batch), self.word_emb_dim))

        for i in range(len(batch)):
            for j in range(len(batch[i])):
                embed[j, i, :] = self.word_vec[batch[i][j]]

        return torch.FloatTensor(embed)

    def tokenize(self, s):
        from nltk.tokenize import word_tokenize
        if self.moses_tok:
            s = ' '.join(word_tokenize(s))
            s = s.replace(" n't ", "n 't ")  # HACK to get ~MOSES tokenization
            return s.split()
        else:
            return word_tokenize(s)

    def prepare_samples(self, sentences, bsize, tokenize, verbose):
        sentences = [[self.bos] + s.split() + [self.eos] if not tokenize else
                     [self.bos] + self.tokenize(s) + [self.eos] for s in sentences]
        n_w = np.sum([len(x) for x in sentences])

        # filters words without w2v vectors
        for i in range(len(sentences)):
            s_f = [word for word in sentences[i] if word in self.word_vec]
            if not s_f:
                import warnings
                warnings.warn('No words in "%s" (idx=%s) have w2v vectors. \
                               Replacing by "</s>"..' % (sentences[i], i))
                s_f = [self.eos]
            sentences[i] = s_f

        lengths = np.array([len(s) for s in sentences])
        n_wk = np.sum(lengths)
        if verbose:
            print('Nb words kept : %s/%s (%.1f%s)' % (
                        n_wk, n_w, 100.0 * n_wk / n_w, '%'))

        # sort by decreasing length
        lengths, idx_sort = np.sort(lengths)[::-1], np.argsort(-lengths)
        sentences = np.array(sentences)[idx_sort]

        return sentences, lengths, idx_sort

    def encode(self, sentences, bsize=64, tokenize=True, verbose=False):
        tic = time.time()
        sentences, lengths, idx_sort = self.prepare_samples(
                        sentences, bsize, tokenize, verbose)

        embeddings = []
        for stidx in range(0, len(sentences), bsize):
            batch = self.get_batch(sentences[stidx:stidx + bsize])
            if self.is_cuda():
                batch = batch.cuda()
            with torch.no_grad():
                batch = self.forward((batch, lengths[stidx:stidx + bsize])).data.cpu().numpy()
            embeddings.append(batch)
        embeddings = np.vstack(embeddings)

        # unsort
        idx_unsort = np.argsort(idx_sort)
        embeddings = embeddings[idx_unsort]

        if verbose:
            print('Speed : %.1f sentences/s (%s mode, bsize=%s)' % (
                    len(embeddings)/(time.time()-tic),
                    'gpu' if self.is_cuda() else 'cpu', bsize))
        return embeddings

    def visualize(self, sent, tokenize=True):

        sent = sent.split() if not tokenize else self.tokenize(sent)
        sent = [[self.bos] + [word for word in sent if word in self.word_vec] + [self.eos]]

        if ' '.join(sent[0]) == '%s %s' % (self.bos, self.eos):
            import warnings
            warnings.warn('No words in "%s" have w2v vectors. Replacing \
                           by "%s %s"..' % (sent, self.bos, self.eos))
        batch = self.get_batch(sent)

        if self.is_cuda():
            batch = batch.cuda()
        output = self.enc_lstm(batch)[0]
        output, idxs = torch.max(output, 0)
        # output, idxs = output.squeeze(), idxs.squeeze()
        idxs = idxs.data.cpu().numpy()
        argmaxs = [np.sum((idxs == k)) for k in range(len(sent[0]))]

        # visualize model
        import matplotlib.pyplot as plt
        x = range(len(sent[0]))
        y = [100.0 * n / np.sum(argmaxs) for n in argmaxs]
        plt.xticks(x, sent[0], rotation=45)
        plt.bar(x, y)
        plt.ylabel('%')
        plt.title('Visualisation of words importance')
        plt.show()

        return output, idxs

In [None]:
# !mkdir fastText
# !curl -Lo fastText/crawl-300d-2M.vec.zip https://dl.fbaipublicfiles.com/fasttext/vectors-english/crawl-300d-2M.vec.zip
# !unzip fastText/crawl-300d-2M.vec.zip -d fastText/

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 1453M  100 1453M    0     0  22.9M      0  0:01:03  0:01:03 --:--:-- 22.8M
Archive:  fastText/crawl-300d-2M.vec.zip
  inflating: fastText/crawl-300d-2M.vec  


In [None]:
!mkdir encoder
!curl -Lo encoder/infersent2.pkl https://dl.fbaipublicfiles.com/infersent/infersent2.pkl

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  146M  100  146M    0     0  19.9M      0  0:00:07  0:00:07 --:--:-- 23.3M


In [None]:
def build_nli_net():
  V = 2
  MODEL_PATH = 'encoder/infersent%s.pkl' % V
  params_model = {'bsize': 64, 'word_emb_dim': 300, 'enc_lstm_dim': 2048,
                  'pool_type': 'max', 'dpout_model': 0.0, 'version': V}
  infersent = InferSent(params_model)
  infersent.load_state_dict(torch.load(MODEL_PATH))
  return infersent

In [None]:
infersent = build_nli_net()

In [None]:
W2V_PATH = 'fastText/crawl-300d-2M.vec'
infersent.set_w2v_path(W2V_PATH)

In [None]:
infersent.build_vocab_k_words(K=500000)

Vocab size : 500000


In [None]:
def text_prepare(text):
    REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;#]')
    BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')
    text = str(text)
    # text = " ".join([word for word in text.split(" ") if re.search('[a-zA-Z]', word)])
    # text = text.lower()
    # text = re.sub(REPLACE_BY_SPACE_RE, " ", text)
    # text = re.sub(BAD_SYMBOLS_RE, "", text)
    return text

def cosine(u, v):
  # compute the similarity between two embeddings
  # u and v are matrices!
    result = np.einsum('ij,ij->i', u, v) / ((np.linalg.norm(u, axis=1) * np.linalg.norm(v, axis=1)))
    return np.log(result) + 1

In [None]:
tweet_1 = "Since the start of the pandemic, a total 65 WHO staff stationed in Geneva - working from home and onsite - have tested positive for #COVID19. We have not yet established whether any transmission has occurred on campus, but are looking into the matter."
tweet_2 = "WHO staff who were confirmed positive with #COVID19 in Geneva have received the necessary medical attention. WHO carried out full contact tracing and related protocols. Enhanced cleaning protocols were implemented in relevant offices."
tweet_3 = "Any tweets only my own views. More Guns,Less Crime (Univ Chicago Press, 3rd ed);10 books, 100+academic articles. PhD Econ, Advisor for Research & Science #USDOJ"

print("The similarity score between premise and hypoetheis 1 is:")
print(cosine(infersent.encode([text_prepare(tweet_1)]), infersent.encode([text_prepare(tweet_2)])).tolist()[0])
print("The similarity score between premise and hypoetheis 2 is:")
print(cosine(infersent.encode([text_prepare(tweet_1)]), infersent.encode([text_prepare(tweet_3)])).tolist()[0])

The similarity score between premise and hypoetheis 1 is:
0.6036133766174316
The similarity score between premise and hypoetheis 2 is:
0.5492755174636841


## Look at twitter data

In [None]:
import pandas as pd

In [None]:
df = pd.read_csv("tweets.csv")

In [None]:
df.head()

Unnamed: 0,coordinates,created_at,hashtags,media,urls,favorite_count,id,in_reply_to_screen_name,in_reply_to_status_id,in_reply_to_user_id,lang,place,possibly_sensitive,retweet_count,retweet_id,retweet_screen_name,source,text,tweet_url,user_created_at,user_screen_name,user_default_profile_image,user_description,user_favourites_count,user_followers_count,user_friends_count,user_listed_count,user_location,user_name,user_screen_name.1,user_statuses_count,user_time_zone,user_urls,user_verified
0,,Sun Nov 01 03:52:22 +0000 2020,SootinClaimon,,http://sootinclaimon.com/2020/11/01/the-corona...,0,1322748291646476288,,,,en,,False,0,,,"<a href=""http://publicize.wp.com/"" rel=""nofoll...",The coronavirus keeps most London theaters dar...,https://twitter.com/SoilFertilizer/status/1322...,Mon Oct 12 06:58:23 +0000 2009,SoilFertilizer,False,"Twitter for Agriculture, Environment, Crops, S...",2,2142,4613,33,"Bangkok, Thailand",sootin claimon,SoilFertilizer,414636,,http://soclaimon.wordpress.com/,False
1,,Sun Nov 01 03:52:52 +0000 2020,,,,0,1322748414463979520,,,,en,,,0,,,"<a href=""http://twitter.com/download/iphone"" r...",COVID-19 = Plague,https://twitter.com/TheMJ_OfHustlin/status/132...,Wed Sep 22 03:00:31 +0000 2010,TheMJ_OfHustlin,False,#𝗠𝗿𝗦𝗵𝗼𝗽𝗪𝗶𝘁𝗵𝗠𝗲♦️𝗶𝗺 𝗼𝗻 𝘁𝘄𝗶𝘁𝘁𝗲𝗿 𝗯𝘂𝘁 𝗶𝗺 𝗻𝗼𝘁 𝗮 𝘁𝘄𝗶𝘁...,15194,2041,877,9,Dunder Mifflin,Dr. ShopWithMe™️ 🍭🆓,TheMJ_OfHustlin,89685,,https://soundcloud.com/fuck12-1,False
2,,Sun Nov 01 03:52:42 +0000 2020,,,https://twitter.com/wdunlap/status/13226760205...,0,1322748372055433216,,,,en,,False,0,,,"<a href=""http://twitter.com/download/iphone"" r...",Truly truly sad End the lockdowns re-elect Tru...,https://twitter.com/TCiffo/status/132274837205...,Sat Mar 19 12:44:11 +0000 2016,TCiffo,True,,24629,88,239,3,,Trisha L. Ciffo,TCiffo,30268,,,False
3,,Sun Nov 01 03:52:34 +0000 2020,MurdererInChief,https://twitter.com/LaylaFanucci/status/132274...,,1,1322748339872440320,realDonaldTrump,1.32272e+18,25073877.0,en,,True,0,,,"<a href=""http://twitter.com/download/iphone"" r...",@realDonaldTrump A group of Stanford Universit...,https://twitter.com/LaylaFanucci/status/132274...,Wed Nov 16 12:59:30 +0000 2011,LaylaFanucci,False,Layla Fanucci is an International Artist and a...,30547,385,551,8,"St Helena, California",Layla Fanucci,LaylaFanucci,95534,,http://laylafanucci.com,False
4,,Sun Nov 01 03:52:26 +0000 2020,COVID19,,,2,1322748305898573825,,,,en,,,0,,,"<a href=""https://mobile.twitter.com"" rel=""nofo...",A month or so ago around the 26k COV19 cases l...,https://twitter.com/ExcelYourself/status/13227...,Mon Oct 11 14:03:38 +0000 2010,ExcelYourself,False,"Author, Excel Expert, CPA. I write the Excel Y...",7045,1010,783,107,Western Australia,Neale Blackwood,ExcelYourself,34576,,http://a4accounting.com.au,False


In [None]:
tweets = df.text.tolist()

In [None]:
processed_tweets = list(map(text_prepare, tweets))

In [None]:
assert len(tweets) == len(df) == len(processed_tweets)

In [None]:
processed_tweets[:5]

['The coronavirus keeps most London theaters dark, while performers stock grocery shelves #SootinClaimon.Com https://t.co/QIClzfE6Zw',
 'COVID-19 = Plague',
 'Truly truly sad End the lockdowns re-elect Trump🇺🇸the choice is clear! https://t.co/uvz04Yg7A5',
 '@realDonaldTrump A group of Stanford University economists estimates that there have been at least 30,000 coronavirus infections and 700 deaths as a result of 18 campaign rallies President Trump held between June and September. #MurdererInChief https://t.co/JBnmASpmJH',
 'A month or so ago around the 26k COV19 cases level Australia + Austria were very close on both total cases + daily numbers. Australia has just had a zero day with 27k total cases + Austria has 104k total cases and 5k per day. Things can change rapidly with #COVID19']

In [None]:
infersent = infersent.cuda()

In [None]:
infersent.is_cuda()

True

In [None]:
all_tweets_emb = infersent.encode(processed_tweets)

In [None]:
all_tweets_emb.shape

(1177, 4096)

In [None]:
all_scores = np.zeros((len(tweets), len(tweets)))
for i in range(len(processed_tweets)):
  candidate_emb = infersent.encode([processed_tweets[i]])
  all_scores[i] = cosine(np.repeat(candidate_emb, len(processed_tweets), axis=0), all_tweets_emb)
all_scores[np.isnan(all_scores)] = -np.inf

  from ipykernel import kernelapp as app


In [None]:
all_scores

array([[ 0.99999964,        -inf, -0.12126875, ..., -0.83017373,
         0.17898244,  0.40660894],
       [       -inf,  1.00000012, -2.23927355, ..., -0.15725315,
        -0.871364  ,        -inf],
       [-0.12126875, -2.23927402,  1.        , ..., -0.16158199,
        -0.29706645,  0.08433688],
       ...,
       [-0.83017361, -0.15725315, -0.16158211, ...,  1.00000012,
        -0.17319942, -0.10724366],
       [ 0.17898244, -0.87136412, -0.29706645, ..., -0.1731993 ,
         0.9999997 ,  0.1917876 ],
       [ 0.40660882,        -inf,  0.08433682, ..., -0.10724354,
         0.1917876 ,  0.99999988]])

In [None]:
with open('adjacency_matrix.npy', 'wb') as f:
  np.save(f, all_scores)

In [None]:
all_scores

array([[ 0.99999964,        -inf, -0.12126875, ..., -0.83017373,
         0.17898244,  0.40660894],
       [       -inf,  1.00000012, -2.23927355, ..., -0.15725315,
        -0.871364  ,        -inf],
       [-0.12126875, -2.23927402,  1.        , ..., -0.16158199,
        -0.29706645,  0.08433688],
       ...,
       [-0.83017361, -0.15725315, -0.16158211, ...,  1.00000012,
        -0.17319942, -0.10724366],
       [ 0.17898244, -0.87136412, -0.29706645, ..., -0.1731993 ,
         0.9999997 ,  0.1917876 ],
       [ 0.40660882,        -inf,  0.08433682, ..., -0.10724354,
         0.1917876 ,  0.99999988]])

In [None]:
with open('adjacency_matrix.npy', 'rb') as f:
  all_scores = np.load(f)

In [None]:
[tweet0: for all tweet similartiy score]
[tweet1: for all tweet similartiy score]

In [None]:
all_scores.shape

(1177, 1177)

In [None]:
sorted_row_idx = np.argsort(all_scores, axis=1)[:,all_scores.shape[1]-6::]

In [None]:
sorted_row_idx

array([[1102,  996,  988,  155,  298,    0],
       [ 300,  396,  715,  595, 1104,    1],
       [  85,  215,  733, 1169,  145,    2],
       ...,
       [ 121,  774,  150,  170,  745, 1174],
       [ 361,  129,  280,  901, 1001, 1175],
       [  10,  933,  822,  855,  879, 1176]])

In [None]:
tweets[0]

'The coronavirus keeps most London theaters dark, while performers stock grocery shelves #SootinClaimon.Com https://t.co/QIClzfE6Zw'

In [None]:
tweets[298]

'Coronavirus updates LIVE: Victoria, NSW record zero new cases; Australia on track for internal travel bubble by Christmas; England enters national lockdown with retail, hospitality\xa0closed https://t.co/OTUnqcl3nR'