In [2]:
import time
import os
import numpy as np
import google.colab as colab
import random
import json
%matplotlib inline
import matplotlib.pyplot as plt
from multiprocessing import Pool
import shutil
from pprint import pprint
import pickle
from random import randint
import pandas as pd

import re
import inspect
import torch
from torch import optim
from torch.autograd import Variable
import torch.nn as nn

import nltk
nltk.download('punkt')

import warnings
warnings.filterwarnings('ignore')

%load_ext autoreload
%autoreload 2
%matplotlib inline

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Mount Google Drive

In [4]:
def mount_google_drive():
	'''
	# Functionality
		Mount Google Drive at a fixed location so project files can be read straight from Drive
		(Colab does not keep files between sessions).
	# Arguments
		Nothing
	# Returns
		drive_root: the mount directory joined with the first non-hidden entry found inside it
	'''
	mount_directory = "/content/gdrive"
	colab.drive.mount(mount_directory, force_remount=True)
	# Ignore dot-prefixed entries and keep the first remaining name returned by os.listdir.
	visible_entries = [entry for entry in os.listdir(mount_directory) if entry[0] != '.']
	return mount_directory + "/" + visible_entries[0]

In [5]:
# Project paths on the mounted Google Drive; adjust them to match your own Drive layout.
# (Original note: also decide whether you want to balance the dataset.)
# Evaluating ROOT_DIR calls mount_google_drive(), which (re)mounts Drive as a side effect.
ROOT_DIR =  mount_google_drive() + "/05839-Final-Project/code/"

# CSV dataset path (not read anywhere in the visible part of this notebook).
DATASET_PATH = ROOT_DIR + "quora.csv"

# Folder on Drive holding models.py, data.py and mutils.py, which are copied locally further down.
NLI_NET_DIR = ROOT_DIR + "models/NliNetUtils/"

# Folder on Drive holding saved model weights; used as 'outputdir' in the model config.
CHECKPOINT_DIR = ROOT_DIR + "checkpoints/"

Mounted at /content/gdrive


In [6]:
# Refresh the local copy of utils/ from Drive so the folder does not have to be uploaded by hand
# every session. Any existing local utils/ directory is removed first (errors such as "directory
# does not exist" are ignored) so that copytree does not fail on an existing destination.
shutil.rmtree('utils/', ignore_errors=True)
_ = shutil.copytree(ROOT_DIR +"/utils/", "utils/")

In [7]:
# Load customizable utils here
from utils.file_utils import *
from utils.image_utils import *
from utils.generator_utils import *
from utils.tqdm_utils import *
from utils.keras_utils import *

In [8]:
# Load infersent model related files
# Copy the helper modules from Drive into the working directory so that the following cell can
# import them as top-level modules (data, mutils, models).
# shutil.copy overwrites an existing destination file, so no cleanup is needed beforehand;
# shutil.rmtree only removes directories, and calling it on a regular file with
# ignore_errors=True silently does nothing.
for module_file in ("models.py", "data.py", "mutils.py"):
    shutil.copy(NLI_NET_DIR + module_file, module_file)


'mutils.py'

In [9]:
# shutil.rmtree('fastText/', ignore_errors=True)
# shutil.copytree(ROOT_DIR + "fastText/", "fastText/")

In [10]:
from data import get_nli, get_batch, build_vocab
from mutils import get_optimizer
from models import NLINet

In [11]:
def get_optimizer(s):
    """
    Parse an optimizer specification string.

    Input should be of the form:
        - "sgd,lr=0.01"
        - "adagrad,lr=0.1,lr_decay=0.05"

    Note: this notebook-level definition shadows the `get_optimizer` imported from `mutils`.

    # Arguments
        s: optimizer name, optionally followed by comma-separated "name=value" pairs.
    # Returns
        (optim_fn, optim_params): the torch.optim optimizer class and a dict mapping each
        parameter name to its value parsed as a float.
    # Raises
        AssertionError: a pair is not of the form "name=value", a value is not a plain decimal
            number, or "sgd" is requested without "lr".
        Exception: the optimizer name is unknown, or a parameter is not accepted by the
            optimizer's constructor.
    """
    if "," in s:
        method = s[:s.find(',')]
        optim_params = {}
        for x in s[s.find(',') + 1:].split(','):
            split = x.split('=')
            assert len(split) == 2
            # Raw string: the pattern contains backslash escapes (\d, \.).
            assert re.match(r"^[+-]?(\d+(\.\d*)?|\.\d+)$", split[1]) is not None
            optim_params[split[0]] = float(split[1])
    else:
        method = s
        optim_params = {}

    optimizers = {
        'adadelta': optim.Adadelta,
        'adagrad': optim.Adagrad,
        'adam': optim.Adam,
        'adamax': optim.Adamax,
        'asgd': optim.ASGD,
        'rmsprop': optim.RMSprop,
        'rprop': optim.Rprop,
        'sgd': optim.SGD,
    }
    if method not in optimizers:
        raise Exception('Unknown optimization method: "%s"' % method)
    optim_fn = optimizers[method]
    if method == 'sgd':
        # SGD is the one optimizer for which a learning rate must be given explicitly.
        assert 'lr' in optim_params

    # check that we give good parameters to the optimizer
    # inspect.signature replaces inspect.getargspec, which rejects signatures with keyword-only
    # arguments or annotations and no longer exists in Python 3.11+.
    expected_args = list(inspect.signature(optim_fn.__init__).parameters)
    assert expected_args[:2] == ['self', 'params']
    if not all(k in expected_args[2:] for k in optim_params.keys()):
        raise Exception('Unexpected parameters: expected "%s", got "%s"' % (
            str(expected_args[2:]), str(optim_params.keys())))

    return optim_fn, optim_params

In [12]:
torch.cuda.is_available()

True

In [22]:
# Configuration dict passed to NLINet(...). It also supplies the checkpoint location
# ('outputdir' / 'outputmodelname') used when the encoder weights are loaded.
# NOTE(review): 'optimizer', 'decay', 'max_norm', 'minlr', 'lrshrink' and 'n_epochs' look like
# training hyper-parameters; they are not read anywhere in the visible part of this notebook.
config_nli_model = {
    'n_words'        :  72764                 , # Number of distinct words in the wordvec
    'word_emb_dim'   :  300                   , # Dimension of word embeddings
    'dpout_model'    :  0.                    , # Dropout (presumably inside the encoder -- confirm in models.py)
    'enc_lstm_dim'   :  2048                  , # LSTM hidden size; the printed model shows LSTM(300, 2048, bidirectional=True)
    'dpout_fc'       :  0.5                   , # presumably dropout in the classifier layers -- confirm in models.py
    'fc_dim'         :  512                   ,
    'bsize'          :  64                    , # presumably the batch size -- confirm in models.py
    'n_classes'      :  2                     , # Classifier output size (out_features=2 in the printed model)
    'pool_type'      :  'max'                 ,
    'nonlinear_fc'   :  0                     , # With 0 the printed classifier is a single Linear layer
    'encoder_type'   :  'InferSent'           , # see list of encoders
    'use_cuda'       :  True                  ,
    'optimizer'      :  "adam"         ,
    'decay'          :  0.99                  ,
    'max_norm'       :  5.                    ,
    'minlr'          :  1e-5                  ,
    'outputdir'      :  CHECKPOINT_DIR        , # Directory holding the saved weights
    'outputmodelname':  'dmodel.pickle.encoder.pkl'     , # File name of the saved encoder state dict
    'lrshrink'       :  5                     ,
    'n_epochs'       :  10
}


In [23]:
nli_net = NLINet(config_nli_model)
print(nli_net)

NLINet(
  (encoder): InferSent(
    (enc_lstm): LSTM(300, 2048, bidirectional=True)
  )
  (classifier): Sequential(
    (0): Linear(in_features=16384, out_features=2, bias=True)
  )
)


In [33]:
infersent = nli_net.encoder

In [34]:
# Restore the saved encoder weights from <outputdir>/<outputmodelname> into the encoder.
# NOTE(review): torch.load is called without map_location, so this presumably needs the same
# device type the checkpoint was saved from -- confirm before running on a CPU-only runtime.
infersent.load_state_dict(torch.load(os.path.join(config_nli_model['outputdir'], config_nli_model['outputmodelname'])))

<All keys matched successfully>

In [36]:
# Point the encoder at the GloVe vectors file on Drive, then build its vocabulary with K=500000
# (the recorded cell output reports "Vocab size : 500000").
# NOTE(review): presumably these are the first/most frequent K words of the file -- confirm in models.py.
infersent.set_w2v_path(ROOT_DIR + "glove.840B.300d.txt")
infersent.build_vocab_k_words(K=500000)

Vocab size : 500000


In [37]:
def text_prepare(text):
    """Return `text` converted to `str`, otherwise unchanged.

    The earlier cleaning steps (keeping only words that contain letters, lower-casing,
    replacing punctuation with spaces, stripping other symbols) are disabled, so the text
    is passed through as-is. Non-string inputs are stringified rather than dropped; a
    missing value read by pandas as float NaN becomes the string 'nan'.
    """
    return str(text)

def cosine(u, v):
    """Row-wise similarity score between two equally-shaped embedding matrices.

    For every row i the cosine similarity of u[i] and v[i] is computed, and
    log(similarity) + 1 is returned, giving one value per row. A non-positive
    cosine therefore yields -inf or NaN.
    """
    row_dots = np.einsum('ij,ij->i', u, v)
    norm_products = np.linalg.norm(u, axis=1) * np.linalg.norm(v, axis=1)
    return np.log(row_dots / norm_products) + 1

In [38]:
# Sanity check of the encoder + cosine() score on three hand-picked tweets:
# tweet_1 is the premise, tweet_2 is on the same topic, tweet_3 is unrelated.
tweet_1 = "Since the start of the pandemic, a total 65 WHO staff stationed in Geneva - working from home and onsite - have tested positive for #COVID19. We have not yet established whether any transmission has occurred on campus, but are looking into the matter."
tweet_2 = "WHO staff who were confirmed positive with #COVID19 in Geneva have received the necessary medical attention. WHO carried out full contact tracing and related protocols. Enhanced cleaning protocols were implemented in relevant offices."
tweet_3 = "Any tweets only my own views. More Guns,Less Crime (Univ Chicago Press, 3rd ed);10 books, 100+academic articles. PhD Econ, Advisor for Research & Science #USDOJ"

# Encode the premise once and reuse the embedding for both comparisons.
premise_emb = infersent.encode([text_prepare(tweet_1)])

print("The similarity score between premise and hypoetheis 1 is:")
print(cosine(premise_emb, infersent.encode([text_prepare(tweet_2)])).tolist()[0])
print("The similarity score between premise and hypoetheis 2 is:")
print(cosine(premise_emb, infersent.encode([text_prepare(tweet_3)])).tolist()[0])

The similarity score between premise and hypoetheis 1 is:
0.7963457703590393
The similarity score between premise and hypoetheis 2 is:
0.7654701471328735


## Look at Twitter data

In [39]:
import pandas as pd

In [40]:
df = pd.read_csv(ROOT_DIR + "tweets_sample_100000.csv")

In [41]:
len(df)

86522

In [42]:
df.head()

Unnamed: 0,coordinates,created_at,hashtags,media,urls,favorite_count,id,in_reply_to_screen_name,in_reply_to_status_id,in_reply_to_user_id,lang,place,possibly_sensitive,retweet_count,retweet_id,retweet_screen_name,source,text,tweet_url,user_created_at,user_screen_name,user_default_profile_image,user_description,user_favourites_count,user_followers_count,user_friends_count,user_listed_count,user_location,user_name,user_screen_name.1,user_statuses_count,user_time_zone,user_urls,user_verified
0,,Mon Jan 27 17:09:17 +0000 2020,,https://twitter.com/AjnabhiiTweets/status/1221...,,0,1221842625680023555,,,,en,,False,0,,,"<a href=""https://mobile.twitter.com"" rel=""nofo...",When the news ticker reads ‘China fights coron...,https://twitter.com/AjnabhiiTweets/status/1221...,Sat Aug 20 11:17:27 +0000 2011,AjnabhiiTweets,False,SLEAS | Translator | Writer | Poet | Critique ...,3371,1317,5,28,"7°29′10″ N, 80°21′44″ E",Rimzan Amanullah,AjnabhiiTweets,42441,,http://spurrism-ajnabhii.blogspot.com,False
1,,Tue Jan 28 18:26:32 +0000 2020,,,https://www.jdsupra.com/legalnews/coronavirus-...,0,1222224454807277568,,,,en,,False,0,,,"<a href=""https://www.jdsupra.com"" rel=""nofollo...",Coronavirus Raises Privacy Concerns For Health...,https://twitter.com/BizAdvisories/status/12222...,Wed Oct 08 16:21:55 +0000 2008,BizAdvisories,False,A daily dose of news & insights for entreprene...,0,4318,2133,294,JDSupra.com,Business Advisories,BizAdvisories,379229,,http://www.jdsupra.com/,False
2,,Mon Jan 27 18:59:43 +0000 2020,,,https://goo.gl/fb/quCuCE,0,1221870418727927809,,,,en,,False,0,,,"<a href=""https://www.google.com/"" rel=""nofollo...","Coronavirus Disrupts League Of Legends, CS:GO ...",https://twitter.com/freemmogamer/status/122187...,Tue Mar 17 20:31:23 +0000 2009,freemmogamer,False,All about free to play MMORPG´s and MMO Games!...,1,1999,2441,49,,FreeMMOGamer.com,freemmogamer,12943,,,False
3,,Mon Jan 27 17:53:40 +0000 2020,coronavirus Stop5G,,,0,1221853795178745862,DEADLINE,1.221836e+18,586032653.0,en,,,0,,,"<a href=""https://mobile.twitter.com"" rel=""nofo...","@DEADLINE ""5G weakens the immune system. Wuhan...",https://twitter.com/npsgirl/status/12218537951...,Fri May 23 14:11:42 +0000 2008,npsgirl,False,No Police State Girls blog is about everything...,6358,1153,2331,38,New York,npsgirl,npsgirl,16838,,http://www.nopolicestate.blogspot.com,False
4,,Mon Jan 27 19:53:56 +0000 2020,,https://twitter.com/donkorgh/status/1221884062...,http://Donkorsblog.com http://bit.ly/2O327mA,0,1221884062954151942,,,,en,,False,0,,,"<a href=""https://mobile.twitter.com"" rel=""nofo...",https://t.co/vOTSz4VFzy: Ivory Coast tests fir...,https://twitter.com/donkorgh/status/1221884062...,Sun Jul 24 20:37:18 +0000 2011,donkorgh,False,"God first, Radio is my Passion, i love Radio. ...",14677,1217,1919,31,"Accra, Ghana",Emmanuel Donkor #donkorsblog,donkorgh,38100,,http://www.donkorsblog.com,False


In [43]:
tweets = df.text.tolist()

In [45]:
processed_tweets = list(map(text_prepare, tweets))

In [46]:
assert len(tweets) == len(df) == len(processed_tweets)

In [47]:
processed_tweets[:5]

['When the news ticker reads ‘China fights coronavirus outbreak’, our local Jet Li fanboys’ imagination be like: https://t.co/mIa5OMuY01',
 'Coronavirus Raises Privacy Concerns For Healthcare Providers And Their Workers https://t.co/oct7sgGsj3 | by @jacksonlewispc',
 'Coronavirus Disrupts League Of Legends, CS:GO Esports Events In Asia https://t.co/0SHNMXambY',
 '@DEADLINE "5G weakens the immune system. Wuhan, China is the test site for a new 5G highway system with driverless cars. It\'s one of the largest 5G test sites with 10,000 5G Base stations. Many predicted a risk of viral or bacterial outbreaks in such high-density EMF" #coronavirus #Stop5G',
 'https://t.co/vOTSz4VFzy: Ivory Coast tests first person in Africa for Coronavirus https://t.co/BM2frBj1PK https://t.co/kLBN3uVDiJ']

In [48]:
infersent = infersent.cuda()

In [49]:
infersent.is_cuda()

True

In [50]:
# Encode every tweet in a single call; the result has one row per tweet
# (shape (86522, 4096) in the recorded output of the next cell).
all_tweets_emb = infersent.encode(processed_tweets)

In [51]:
all_tweets_emb.shape

(86522, 4096)

In [None]:
# Pairwise score matrix: all_scores[i, j] = log(cosine_similarity(tweet_i, tweet_j)) + 1,
# i.e. the same score cosine() computes row-wise. NaN scores (negative cosine, zero-norm
# embedding) are replaced with -inf.
#
# The embeddings already stored in all_tweets_emb are reused instead of re-encoding every tweet,
# and rows are produced in blocks with one matrix product on unit-normalised embeddings instead
# of tiling a single embedding len(tweets) times per iteration.
# NOTE(review): this assumes encode() returns the same vector for a tweet whether it is encoded
# alone or as part of the full list -- confirm.
# NOTE: the matrix is stored as float32. At the 86522 tweets reported above the dense result still
# needs roughly 30 GB of RAM (float64 would need about 60 GB).
SCORE_ROW_BLOCK = 512  # rows computed per matrix product; bounds the size of the temporaries

tweet_emb = np.asarray(all_tweets_emb, dtype=np.float32)
with np.errstate(divide='ignore', invalid='ignore'):
    # A zero-norm embedding becomes NaN here and ends up as -inf in the final scores.
    unit_emb = tweet_emb / np.linalg.norm(tweet_emb, axis=1, keepdims=True)

n_tweets = len(unit_emb)
all_scores = np.empty((n_tweets, n_tweets), dtype=np.float32)
for row_start in range(0, n_tweets, SCORE_ROW_BLOCK):
    row_stop = min(row_start + SCORE_ROW_BLOCK, n_tweets)
    with np.errstate(divide='ignore', invalid='ignore'):
        score_block = np.log(unit_emb[row_start:row_stop] @ unit_emb.T) + 1
    # Patch NaNs per block so no full-size boolean mask is ever materialised.
    score_block[np.isnan(score_block)] = -np.inf
    all_scores[row_start:row_stop] = score_block

In [None]:
all_scores

In [None]:
# Persist the pairwise score matrix in NumPy's .npy format in the current working directory.
np.save('adjacency_matrix.npy', all_scores)