In [1]:
import html
import json
import pickle
import re

import fasttext.util
import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from tqdm.notebook import tqdm

# Load data

In [2]:
# Read the bz2-compressed pickle of tweets into a DataFrame and preview the first rows.
# NOTE: unpickling can execute arbitrary code -- only load files from a trusted source.
tweets = pd.read_pickle('data/tweets.pkl.bz2', compression='bz2')
tweets.head()

Unnamed: 0,id,created_at,source,possibly_sensitive,author_id,text,person_annotations,place_annotations,product_annotations,organization_annotations,...,like_count,quote_count,author_name,verified,protected,profile_image_url,author_followers_count,animated_gif_count,photo_count,video_count
0,1588682493833551872,2022-11-04 23:59:57,Twitter for iPhone,0,580633644,The business of businesses is climate-change a...,0,0,0,0,...,0,0,Robert Han @RobertHon0911,0,0,1,57,0,0,0
1,1588682474141339648,2022-11-04 23:59:52,Twitter Web App,1,2711676037,#howtowrite #howtopublish @authoraid \nA case ...,0,0,0,0,...,1,0,Krishna Koushik @SapiensMyopia,0,0,1,56,0,0,0
2,1588682395393622016,2022-11-04 23:59:34,Twitter for iPhone,0,1573331383984226304,"Hundreds of elephants, zebras die as Kenya wea...",0,0,0,0,...,1,0,Nature Chick @EarthBulletin,0,0,1,553,0,0,0
3,1588682374103334912,2022-11-04 23:59:28,Twitter for iPhone,0,1674604256,If the free market is the answer to our climat...,0,1,0,0,...,0,0,Nick Yates @nickyatesworld,0,0,1,190,0,0,0
4,1588682360253739008,2022-11-04 23:59:25,Twitter for iPhone,0,1144336826133442560,"Tuned in to the local Seattle fake, corrupt, w...",0,1,0,0,...,3,0,greg vanommeren @GregVanommeren,0,0,1,521,0,0,0


In [3]:
# Summarise the columns and dtypes of the loaded tweets frame.
tweets.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7581574 entries, 0 to 7581709
Data columns (total 30 columns):
 #   Column                    Dtype         
---  ------                    -----         
 0   id                        int64         
 1   created_at                datetime64[ns]
 2   source                    object        
 3   possibly_sensitive        int64         
 4   author_id                 int64         
 5   text                      object        
 6   person_annotations        int64         
 7   place_annotations         int64         
 8   product_annotations       int64         
 9   organization_annotations  int64         
 10  other_annotations         int64         
 11  urls_count                int64         
 12  url_image_count           int64         
 13  mentions_count            int64         
 14  mentions_verifications    int64         
 15  mentions_followers        int64         
 16  hashtags                  int64         
 17  cashtags

# Preprocess data

Before tokenizing the text, we remove mentions (`@user`), URLs, emojis, and every other character that is not a letter, a digit, or one of `? ! . ,`. For hashtags, we only remove the `#` and treat the rest as a regular word.

In [6]:
# Raw text of the first rows, before any cleaning is applied.
tweets['text'].head().values

array(['The business of businesses is climate-change adaptation https://t.co/q5jeTSdsOT from \u2066@TheEconomist\u2069  https://t.co/q5jeTSdsOT Title: The business of businesses is climate-change adaptation Desciption: Big ones are waking up to the fact',
       '#howtowrite #howtopublish @authoraid \nA case study: \non writing a peer-reviewed article on climate change https://t.co/JYyNE4F8Vo \n#phdvoice #postdocjobs #AcademicChatter Title: AuthorAID - A case study on writing a peer-reviewed journal article on climate change Desciption: This case study is about writing an original peer-reviewed journal article on climate change research by three authors from Bangladesh. This briefly describes the context of and motivation for the research, and strategies taken to collect data (as a part of research design) and to write the article. Peer-review process is a crucial part of publishing an article ― lessons from this article are describe as well. Open access experience and beyond publicati

In [5]:
# remove emojis

def find_emojis(text):
    """Locate the first run of emoji code points in ``text``.

    Returns the ``re.Match`` for the first consecutive run of characters that
    fall in the ranges below, or ``None`` when ``text`` contains none of them.
    """
    emoji_ranges = (
        "\U0001F600-\U0001F64F"  # emoticons
        "\U0001F300-\U0001F5FF"  # symbols & pictographs
        "\U0001F680-\U0001F6FF"  # transport & map symbols
        "\U0001F1E0-\U0001F1FF"  # flags (iOS)
    )
    return re.search("[" + emoji_ranges + "]+", text, flags=re.UNICODE)


#tweets.text[~tweets['text'].map(find_emojis).isna()]


40180      :droplet:  Water scarcity &amp; rising energy ...
92294      :droplet:  Water scarcity &amp; rising energy ...
120404     VOTE DOUG MASTRIANO\n#PA GOVENOR\n@Fighting4PA...
139908     1/3 Today, the office of the Chief Public Heal...
194635     :droplet:  Water scarcity &amp; rising energy ...
                                 ...                        
6654655    Ik engenes we've been experiencing this drough...
6658907    🇨 🇱 🇮 🇲 🇦 🇹 🇪 🇨 🇭 🇦 🇳 🇬 🇪 \n\n20% decline in g...
6927097    :high_voltage: #Hydrogen 🇭:keycap_2::\n\n:righ...
6979392    You work for a billion dollar company. I do wo...
7155838    :loudspeaker: Today, Wilton Park and @UKinNige...
Name: text, Length: 90, dtype: object

In [4]:
# The patterns are compiled once here rather than on every call, because
# clean_tweets is mapped over the whole tweet table.
_MENTION_RE = re.compile("@[A-Za-z0-9_]+")
_URL_RE = re.compile(r'(https|http)?:\/\/(\w|\.|\/|\?|\=|\&|\%)*\b', flags=re.MULTILINE)
_EMOJI_RE = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # flags (iOS)
    "]+",
    flags=re.UNICODE,
)
_DISALLOWED_RE = re.compile(r"[^a-zA-Z0-9?!.,]+")


def clean_tweets(x):
    """Normalise one tweet's text ahead of tokenization.

    Steps, in order:
      1. decode HTML entities (e.g. ``&amp;`` -> ``&``) so they do not survive
         as junk tokens such as ``amp``;
      2. lowercase and turn newlines into spaces;
      3. drop ``@mentions``;
      4. drop the ``#`` sign but keep the hashtag word;
      5. drop URLs;
      6. drop emojis in the ranges listed in ``_EMOJI_RE``;
      7. replace every run of characters other than letters, digits and
         ``? ! . ,`` with a single space.

    Parameters
    ----------
    x : str
        Raw tweet text.

    Returns
    -------
    str
        The cleaned, lowercased text.
    """
    # Entities are decoded before anything else so the later steps see real characters.
    x = html.unescape(x).lower().replace('\n', ' ')
    cleaned_string = _MENTION_RE.sub('', x)
    cleaned_string = cleaned_string.replace('#', '')
    cleaned_string = _URL_RE.sub('', cleaned_string)
    # Emojis are removed without leaving a space, unlike the final substitution.
    cleaned_string = _EMOJI_RE.sub('', cleaned_string)
    return _DISALLOWED_RE.sub(' ', cleaned_string)

# Enable the tqdm-backed `progress_map` on pandas objects, then clean every tweet's text.
tqdm.pandas() 
text_clean = tweets['text'].progress_map(clean_tweets)

  0%|          | 0/7581574 [00:00<?, ?it/s]

Let's check the result.

In [5]:
# Cleaned text of the first three tweets, for a visual sanity check.
text_clean.head(3).values

array(['the business of businesses is climate change adaptation from title the business of businesses is climate change adaptation desciption big ones are waking up to the fact',
       'howtowrite howtopublish a case study on writing a peer reviewed article on climate change phdvoice postdocjobs academicchatter title authoraid a case study on writing a peer reviewed journal article on climate change desciption this case study is about writing an original peer reviewed journal article on climate change research by three authors from bangladesh. this briefly describes the context of and motivation for the research, and strategies taken to collect data as a part of research design and to write the article. peer review process is a crucial part of publishing an article lessons from this article are describe as well. open access experience and beyond publication impact of this article are also touched upon.',
       'hundreds of elephants, zebras die as kenya weathers drought from title hu

In [9]:
print('Tokenizing...')
# Raw string: '\w' inside a normal string literal is an invalid escape sequence
# (a warning today, an error in a future Python version).
# The pattern needs one alphanumeric character followed by at least one more
# word character, so single-character tokens are dropped.
tokenizer = RegexpTokenizer(r'[a-zA-Z0-9]\w+')
tokens = [tokenizer.tokenize(tweet) for tweet in tqdm(text_clean)]

Tokenizing ..


  0%|          | 0/7581574 [00:00<?, ?it/s]

In [10]:
# Tokens produced for the tweet at position 1.
print(tokens[1])

['howtowrite', 'howtopublish', 'case', 'study', 'on', 'writing', 'peer', 'reviewed', 'article', 'on', 'climate', 'change', 'phdvoice', 'postdocjobs', 'academicchatter', 'title', 'authoraid', 'case', 'study', 'on', 'writing', 'peer', 'reviewed', 'journal', 'article', 'on', 'climate', 'change', 'desciption', 'this', 'case', 'study', 'is', 'about', 'writing', 'an', 'original', 'peer', 'reviewed', 'journal', 'article', 'on', 'climate', 'change', 'research', 'by', 'three', 'authors', 'from', 'bangladesh', 'this', 'briefly', 'describes', 'the', 'context', 'of', 'and', 'motivation', 'for', 'the', 'research', 'and', 'strategies', 'taken', 'to', 'collect', 'data', 'as', 'part', 'of', 'research', 'design', 'and', 'to', 'write', 'the', 'article', 'peer', 'review', 'process', 'is', 'crucial', 'part', 'of', 'publishing', 'an', 'article', 'lessons', 'from', 'this', 'article', 'are', 'describe', 'as', 'well', 'open', 'access', 'experience', 'and', 'beyond', 'publication', 'impact', 'of', 'this', 'art

# Word embedding

Let's load the model.

In [11]:
# Uncomment the next line to download the English model first
# (presumably skipped when the file already exists -- see `if_exists='ignore'`; confirm).
# fasttext.util.download_model('en', if_exists='ignore')  # English
# Load the model from 'cc.en.300.bin' in the working directory and show its vector dimension.
ft = fasttext.load_model('cc.en.300.bin')
ft.get_dimension()



300

Let's look at an example: the embedding of the word `hello`.

In [12]:
# Embedding vector the loaded model returns for a single word.
ft.get_word_vector('hello')

array([ 1.57576188e-01,  4.37820926e-02, -4.51271934e-03,  6.65931404e-02,
        7.70346820e-02,  4.85855248e-03,  8.19822028e-03,  6.52402919e-03,
        9.25899856e-03,  3.53899002e-02, -2.31395271e-02, -4.91807126e-02,
       -8.32642540e-02,  1.56014524e-02,  2.54856616e-01,  3.45423706e-02,
       -1.07451361e-02, -7.80188590e-02, -7.08099529e-02,  7.62385577e-02,
       -6.09613657e-02,  4.48625796e-02, -7.29744136e-02,  1.30583309e-02,
        3.14881057e-02, -3.10055036e-02,  1.66004002e-02,  1.74405202e-02,
       -7.35838860e-02,  1.18252613e-01, -1.21330231e-01, -4.09253240e-02,
        2.93969568e-02,  4.84445989e-02, -1.33816330e-02, -1.74765270e-02,
        7.51308873e-02,  9.97046307e-02, -4.00476977e-02,  4.05735290e-03,
       -7.21896589e-02, -4.43356819e-02, -1.22628408e-03,  7.56693557e-02,
        3.98401320e-02,  3.22643593e-02,  1.95914153e-02,  4.68016043e-02,
       -1.46228177e-02,  1.12967767e-01,  3.15065160e-02, -1.02312110e-01,
        1.58124104e-01, -

Now let's create the document embeddings (for each tweet, the average of the word embeddings of its tokens).

In [13]:
# token -> fastText vector; doubles as a cache so each distinct token is embedded once.
dic_wv = {}
# Tokens for which the embedding lookup raised. The old
# `word_out = word_out.append(w)` rebound this name to None, because
# list.append returns None.
word_out = []
# One averaged embedding per tweet, in the same order as `tokens`.
all_tweets_emb = []

emb_dim = ft.get_dimension()

for tweet in tqdm(tokens):
    tweet_emb = []
    for w in tweet:
        emb = dic_wv.get(w)
        if emb is None:
            try:
                emb = ft.get_word_vector(w)
            except Exception:
                # NOTE(review): fastText normally builds vectors for unseen words
                # from subwords, so this branch may never trigger -- confirm.
                word_out.append(w)
                continue
            dic_wv[w] = emb
        tweet_emb.append(emb)

    if tweet_emb:
        all_tweets_emb.append(np.mean(tweet_emb, axis=0))
    else:
        # A tweet with no tokens gets a NaN vector of the right length. Averaging
        # an empty list gave a scalar NaN plus RuntimeWarnings, which breaks any
        # later step that stacks the embeddings into a 2-D array.
        all_tweets_emb.append(np.full(emb_dim, np.nan, dtype=np.float32))

  0%|          | 0/7581574 [00:00<?, ?it/s]

  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)


Let's make sure we have the correct shape for all_tweets_emb.

In [14]:
# Number of tweet embeddings and the length of the first one.
print(f"Shape: ({len(all_tweets_emb)}, {len(all_tweets_emb[0])})")

Shape: (7581574, 300)


In [15]:
# Write the tweet id column to a CSV file in the working directory.
ids = tweets.id
ids.to_csv("tweets_id_emb.csv")

In [16]:
# Attach the per-tweet embeddings to `tweets` (assigned by position, one list element per row),
# then keep only the id and embedding columns for saving.
tweets['embedding'] = all_tweets_emb
tweets_save = tweets[['id','embedding']]
tweets_save.head()

Unnamed: 0,id,embedding
0,1588682493833551872,"[-0.046264533, -0.05307058, 0.0045004673, 0.02..."
1,1588682474141339648,"[-0.013356083, 0.036624912, 0.021634554, 0.034..."
2,1588682395393622016,"[0.0073349774, -0.020072784, -0.027865734, 0.0..."
3,1588682374103334912,"[-0.041744243, -0.034809932, -0.011789519, 0.0..."
4,1588682360253739008,"[-0.026468642, -0.016022408, 0.024429057, 0.05..."


In [17]:
# Persist the id/embedding table as a bz2-compressed pickle.
tweets_save.to_pickle('tweets_embd.pkl.bz2',compression='bz2')

In [18]:
# Summarise the frame that was just saved.
tweets_save.info() 

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7581574 entries, 0 to 7581709
Data columns (total 2 columns):
 #   Column     Dtype 
---  ------     ----- 
 0   id         int64 
 1   embedding  object
dtypes: int64(1), object(1)
memory usage: 173.5+ MB


# Add embeddings to pairs dataset

In [3]:
# Reload the saved id/embedding table so this section can start from the file on disk.
# NOTE: unpickling can execute arbitrary code -- only load files from a trusted source.
tweets_save = pd.read_pickle('tweets_embd.pkl.bz2',compression='bz2')
tweets_save.info() 

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7581574 entries, 0 to 7581709
Data columns (total 2 columns):
 #   Column     Dtype 
---  ------     ----- 
 0   id         int64 
 1   embedding  object
dtypes: int64(1), object(1)
memory usage: 173.5+ MB


In [5]:
# Read the bz2-compressed pickle of tweet pairs and preview the first rows.
# NOTE: unpickling can execute arbitrary code -- only load files from a trusted source.
pairs = pd.read_pickle('data/pairs.pkl.bz2',compression='bz2')
pairs.head()

Unnamed: 0,author,author_followers_count,verified,tweet_id1,urls_count1,url_image_count1,hashtags_count1,animated_gif_count1,photo_count1,video_count1,tweet_id2,urls_count2,url_image_count2,hashtags_count2,animated_gif_count2,photo_count2,video_count2,max_date,min_date,winner
17416113,1172657864118194176,4431,0,1344795571865989120,2,0,0,0,1,0,1344800219972726784,2,0,0,1,0,0,2021-01-01 00:18:52,2021-01-01 00:00:24,0
17416112,1172657864118194176,4431,0,1344803663647272960,2,0,0,0,1,0,1344795571865989120,2,0,0,0,1,0,2021-01-01 00:32:33,2021-01-01 00:00:24,0
17416111,1172657864118194176,4431,0,1344803663647272960,2,0,0,0,1,0,1344800219972726784,2,0,0,1,0,0,2021-01-01 00:32:33,2021-01-01 00:18:52,0
21264726,1337163419326279680,193,0,1344805720143888384,1,0,0,0,1,0,1344804446698672128,1,0,0,0,1,0,2021-01-01 00:40:43,2021-01-01 00:35:40,1
3279199,20207761,16032,0,1344806765590884352,2,0,8,0,1,0,1344806944675131392,2,0,8,0,1,0,2021-01-01 00:45:35,2021-01-01 00:44:52,0


In [6]:
# Summarise the columns and dtypes of the pairs frame.
pairs.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 23753614 entries, 17416113 to 23748673
Data columns (total 20 columns):
 #   Column                  Dtype         
---  ------                  -----         
 0   author                  int64         
 1   author_followers_count  int64         
 2   verified                int64         
 3   tweet_id1               int64         
 4   urls_count1             int64         
 5   url_image_count1        int64         
 6   hashtags_count1         int64         
 7   animated_gif_count1     int64         
 8   photo_count1            int64         
 9   video_count1            int64         
 10  tweet_id2               int64         
 11  urls_count2             int64         
 12  url_image_count2        int64         
 13  hashtags_count2         int64         
 14  animated_gif_count2     int64         
 15  photo_count2            int64         
 16  video_count2            int64         
 17  max_date                datetime64[ns

In [7]:
# drop columns we don't need for the BT model
pairs.drop(['author_followers_count', 'verified'], axis=1, inplace=True)
pairs.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 23753614 entries, 17416113 to 23748673
Data columns (total 18 columns):
 #   Column               Dtype         
---  ------               -----         
 0   author               int64         
 1   tweet_id1            int64         
 2   urls_count1          int64         
 3   url_image_count1     int64         
 4   hashtags_count1      int64         
 5   animated_gif_count1  int64         
 6   photo_count1         int64         
 7   video_count1         int64         
 8   tweet_id2            int64         
 9   urls_count2          int64         
 10  url_image_count2     int64         
 11  hashtags_count2      int64         
 12  animated_gif_count2  int64         
 13  photo_count2         int64         
 14  video_count2         int64         
 15  max_date             datetime64[ns]
 16  min_date             datetime64[ns]
 17  winner               int64         
dtypes: datetime64[ns](2), int64(16)
memory usage: 3.4 GB


Let's merge `url_image_count` and `photo_count` into a single feature, `image` (stored as `image1` and `image2` for the two tweets of each pair).

In [8]:
# For each tweet of the pair, add `url_image_count` and `photo_count` into one
# `image` column, then discard the two source columns.
for side in ('1', '2'):
    pairs['image' + side] = pairs['url_image_count' + side] + pairs['photo_count' + side]
pairs = pairs.drop(columns=['url_image_count1', 'url_image_count2', 'photo_count1', 'photo_count2'])
pairs.head()

Unnamed: 0,author,tweet_id1,urls_count1,hashtags_count1,animated_gif_count1,video_count1,tweet_id2,urls_count2,hashtags_count2,animated_gif_count2,video_count2,max_date,min_date,winner,image1,image2
17416113,1172657864118194176,1344795571865989120,2,0,0,0,1344800219972726784,2,0,1,0,2021-01-01 00:18:52,2021-01-01 00:00:24,0,1,0
17416112,1172657864118194176,1344803663647272960,2,0,0,0,1344795571865989120,2,0,0,0,2021-01-01 00:32:33,2021-01-01 00:00:24,0,1,1
17416111,1172657864118194176,1344803663647272960,2,0,0,0,1344800219972726784,2,0,1,0,2021-01-01 00:32:33,2021-01-01 00:18:52,0,1,0
21264726,1337163419326279680,1344805720143888384,1,0,0,0,1344804446698672128,1,0,0,0,2021-01-01 00:40:43,2021-01-01 00:35:40,1,1,1
3279199,20207761,1344806765590884352,2,8,0,0,1344806944675131392,2,8,0,0,2021-01-01 00:45:35,2021-01-01 00:44:52,0,1,1


Let's turn the count variables into binary flags (0 = absent, 1 = present) and then, for each pair, subtract the second tweet's flag from the first tweet's flag for every feature.

In [9]:
# Current column names, for reference before converting counts to flags.
pairs.columns

Index(['author', 'tweet_id1', 'urls_count1', 'hashtags_count1',
       'animated_gif_count1', 'video_count1', 'tweet_id2', 'urls_count2',
       'hashtags_count2', 'animated_gif_count2', 'video_count2', 'max_date',
       'min_date', 'winner', 'image1', 'image2'],
      dtype='object')

In [10]:
# Collapse every count to a 0/1 flag: 1 when the count is non-zero, else 0.
flag_bases = ('urls_count', 'video_count', 'hashtags_count', 'image', 'animated_gif_count')
for flag_col in [base + side for side in ('1', '2') for base in flag_bases]:
    pairs[flag_col] = pairs[flag_col].ne(0).astype(int)

pairs.head()

Unnamed: 0,author,tweet_id1,urls_count1,hashtags_count1,animated_gif_count1,video_count1,tweet_id2,urls_count2,hashtags_count2,animated_gif_count2,video_count2,max_date,min_date,winner,image1,image2
17416113,1172657864118194176,1344795571865989120,1,0,0,0,1344800219972726784,1,0,1,0,2021-01-01 00:18:52,2021-01-01 00:00:24,0,1,0
17416112,1172657864118194176,1344803663647272960,1,0,0,0,1344795571865989120,1,0,0,0,2021-01-01 00:32:33,2021-01-01 00:00:24,0,1,1
17416111,1172657864118194176,1344803663647272960,1,0,0,0,1344800219972726784,1,0,1,0,2021-01-01 00:32:33,2021-01-01 00:18:52,0,1,0
21264726,1337163419326279680,1344805720143888384,1,0,0,0,1344804446698672128,1,0,0,0,2021-01-01 00:40:43,2021-01-01 00:35:40,1,1,1
3279199,20207761,1344806765590884352,1,1,0,0,1344806944675131392,1,1,0,0,2021-01-01 00:45:35,2021-01-01 00:44:52,0,1,1


In [11]:
# Replace each <feature>1 / <feature>2 pair of flags with one column holding
# (tweet 1 flag) - (tweet 2 flag). Only `image` has no `_count` infix.
infix_by_feature = {
    'urls': '_count',
    'video': '_count',
    'hashtags': '_count',
    'animated_gif': '_count',
    'image': '',
}
for feature, infix in infix_by_feature.items():
    first, second = feature + infix + '1', feature + infix + '2'
    pairs[feature] = pairs[first] - pairs[second]
    pairs = pairs.drop(columns=[first, second])

pairs.head()

Unnamed: 0,author,tweet_id1,tweet_id2,max_date,min_date,winner,urls,video,hashtags,animated_gif,image
17416113,1172657864118194176,1344795571865989120,1344800219972726784,2021-01-01 00:18:52,2021-01-01 00:00:24,0,0,0,0,-1,1
17416112,1172657864118194176,1344803663647272960,1344795571865989120,2021-01-01 00:32:33,2021-01-01 00:00:24,0,0,0,0,0,0
17416111,1172657864118194176,1344803663647272960,1344800219972726784,2021-01-01 00:32:33,2021-01-01 00:18:52,0,0,0,0,-1,1
21264726,1337163419326279680,1344805720143888384,1344804446698672128,2021-01-01 00:40:43,2021-01-01 00:35:40,1,0,0,0,0,0
3279199,20207761,1344806765590884352,1344806944675131392,2021-01-01 00:45:35,2021-01-01 00:44:52,0,0,0,0,0,0


Let's add embeddings to pairs.

In [12]:
# Index the embedding table by tweet id so embeddings can be looked up by id.
tweets_save = tweets_save.set_index("id")
tweets_save.head()

Unnamed: 0_level_0,embedding
id,Unnamed: 1_level_1
1588682493833551872,"[-0.046264533, -0.05307058, 0.0045004673, 0.02..."
1588682474141339648,"[-0.013356083, 0.036624912, 0.021634554, 0.034..."
1588682395393622016,"[0.0073349774, -0.020072784, -0.027865734, 0.0..."
1588682374103334912,"[-0.041744243, -0.034809932, -0.011789519, 0.0..."
1588682360253739008,"[-0.026468642, -0.016022408, 0.024429057, 0.05..."


In [None]:
# Dict keyed by the index of `tweets_save` (the tweet id, once the set_index cell has run)
# with the embedding vector as value.
embeddings = tweets_save["embedding"].to_dict()
# Column names "0".."299", one per embedding dimension
# (300 matches the model dimension printed earlier in the notebook).
embedding_features = [str(i) for i in range(300)]
# Per pair: embedding of tweet_id1 minus embedding of tweet_id2, stacked into a
# 2-D array and spread over the 300 columns.
# NOTE(review): ids missing from `embeddings` map to NaN here -- confirm every
# pair id is present, otherwise the stack may see mismatched shapes.
pairs[embedding_features] = np.stack(np.subtract(
    pairs['tweet_id1'].map(embeddings), pairs['tweet_id2'].map(embeddings)), axis=0)
pairs.head()

In [16]:
# Check column count, dtypes and memory footprint after adding the embedding columns.
pairs.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 23753614 entries, 17416113 to 23748673
Columns: 311 entries, author to 299
dtypes: datetime64[ns](2), float32(300), int64(9)
memory usage: 28.7 GB


In [None]:
# The tweet ids are not used after this point; remove them.
pairs = pairs.drop(columns=["tweet_id1", "tweet_id2"])
pairs.info()

In [None]:
# dropna() returns a new frame; without the assignment no rows were actually
# removed from `pairs`. Show the resulting shape instead of the whole frame.
pairs = pairs.dropna()
pairs.shape

In [None]:
# `embeddings_diff` was not defined by any cell in this notebook, so this cell
# raised NameError on a fresh kernel. Rebuild it from the embedding-difference
# columns stored in `pairs` (one row per pair, one column per dimension).
# NOTE: this materialises a second copy of the embedding columns in memory.
embeddings_diff = pairs[embedding_features].to_numpy()
# Row positions whose difference vector contains at least one NaN.
NaN_list = np.flatnonzero(np.isnan(embeddings_diff).any(axis=1)).tolist()
print(NaN_list)

In [None]:
# Renumber the rows 0..n-1 so the positions in NaN_list are valid labels, then drop those rows.
pairs = pairs.reset_index(drop=True).drop(index=NaN_list)
pairs

In [None]:
# Remove the same positions (along axis 0) from the embedding-difference array.
# NOTE(review): `embeddings_diff` is not defined in any cell visible in this
# notebook -- confirm where it comes from.
embeddings_diff = np.delete(embeddings_diff, NaN_list, 0)
embeddings_diff.shape

In [None]:
# pandas rejects a 2-D array assigned to a single column; store one difference
# vector per row instead. list() also works if embeddings_diff is already a
# 1-D sequence of vectors.
assert len(embeddings_diff) == len(pairs), "embeddings_diff and pairs are out of sync"
pairs["embedding"] = list(embeddings_diff)