In [73]:
import os
from tqdm.notebook import tqdm
import gzip
import pandas as pd
import seaborn as sns
import timeit
import json
import matplotlib.pyplot as plt
import numpy as np
import emoji
import pickle

# Load data

In [3]:
tweets = pd.read_pickle('../data/tweets.pkl.bz2', compression='bz2')
tweets.head()

Unnamed: 0,id,created_at,source,possibly_sensitive,author_id,text,person_annotations,place_annotations,product_annotations,organization_annotations,...,like_count,quote_count,author_name,verified,protected,profile_image_url,author_followers_count,animated_gif_count,photo_count,video_count
0,1588682493833551872,2022-11-04 23:59:57,Twitter for iPhone,0,580633644,The business of businesses is climate-change a...,0,0,0,0,...,0,0,Robert Han @RobertHon0911,0,0,1,57,0,0,0
1,1588682474141339648,2022-11-04 23:59:52,Twitter Web App,1,2711676037,#howtowrite #howtopublish @authoraid \nA case ...,0,0,0,0,...,1,0,Krishna Koushik @SapiensMyopia,0,0,1,56,0,0,0
2,1588682395393622016,2022-11-04 23:59:34,Twitter for iPhone,0,1573331383984226304,"Hundreds of elephants, zebras die as Kenya wea...",0,0,0,0,...,1,0,Nature Chick @EarthBulletin,0,0,1,553,0,0,0
3,1588682374103334912,2022-11-04 23:59:28,Twitter for iPhone,0,1674604256,If the free market is the answer to our climat...,0,1,0,0,...,0,0,Nick Yates @nickyatesworld,0,0,1,190,0,0,0
4,1588682360253739008,2022-11-04 23:59:25,Twitter for iPhone,0,1144336826133442560,"Tuned in to the local Seattle fake, corrupt, w...",0,1,0,0,...,3,0,greg vanommeren @GregVanommeren,0,0,1,521,0,0,0


In [13]:
tweets.columns

Index(['id', 'author_id', 'urls_count', 'hashtags', 'animated_gif_count',
       'video_count', 'image'],
      dtype='object')

In [10]:
tweets.drop(['created_at', 'source', 'possibly_sensitive', 'text',
       'person_annotations', 'place_annotations', 'product_annotations',
       'organization_annotations', 'other_annotations',
       'mentions_count', 'mentions_verifications',
       'mentions_followers', 'cashtags', 'retweet_count',
       'reply_count', 'like_count', 'quote_count', 'author_name', 'verified', 'protected',
       'profile_image_url', 'author_followers_count'
], axis=1, inplace=True)
tweets

Unnamed: 0,id,author_id,urls_count,url_image_count,hashtags,animated_gif_count,photo_count,video_count
0,1588682493833551872,580633644,2,1,0,0,0,0
1,1588682474141339648,2711676037,1,1,5,0,0,0
2,1588682395393622016,1573331383984226304,1,0,0,0,0,0
3,1588682374103334912,1674604256,0,0,2,0,0,0
4,1588682360253739008,1144336826133442560,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...
7581705,1344795498180448256,217362857,0,0,0,0,0,0
7581706,1344795491540889600,1097385765263204352,1,0,0,0,1,0
7581707,1344795484188073984,1061820436143529984,2,1,0,0,1,0
7581708,1344795473844994048,487922319,1,0,2,0,1,0


In [12]:
tweets['image'] = tweets['photo_count'] + tweets['url_image_count']
tweets.drop(['photo_count', 'url_image_count'], axis=1, inplace=True)
tweets

Unnamed: 0,id,author_id,urls_count,hashtags,animated_gif_count,video_count,image
0,1588682493833551872,580633644,2,0,0,0,1
1,1588682474141339648,2711676037,1,5,0,0,1
2,1588682395393622016,1573331383984226304,1,0,0,0,0
3,1588682374103334912,1674604256,0,2,0,0,0
4,1588682360253739008,1144336826133442560,0,0,0,0,0
...,...,...,...,...,...,...,...
7581705,1344795498180448256,217362857,0,0,0,0,0
7581706,1344795491540889600,1097385765263204352,1,0,0,0,1
7581707,1344795484188073984,1061820436143529984,2,0,0,0,2
7581708,1344795473844994048,487922319,1,2,0,0,1


In [14]:
for column in ['urls_count', 'hashtags', 'animated_gif_count', 'video_count', 'image']:
    tweets[column] = tweets[column].astype(bool).astype(int)
tweets

Unnamed: 0,id,author_id,urls_count,hashtags,animated_gif_count,video_count,image
0,1588682493833551872,580633644,1,0,0,0,1
1,1588682474141339648,2711676037,1,1,0,0,1
2,1588682395393622016,1573331383984226304,1,0,0,0,0
3,1588682374103334912,1674604256,0,1,0,0,0
4,1588682360253739008,1144336826133442560,0,0,0,0,0
...,...,...,...,...,...,...,...
7581705,1344795498180448256,217362857,0,0,0,0,0
7581706,1344795491540889600,1097385765263204352,1,0,0,0,1
7581707,1344795484188073984,1061820436143529984,1,0,0,0,1
7581708,1344795473844994048,487922319,1,1,0,0,1


In [29]:
embeddings = pd.read_pickle('../data/tweets_embd.pkl.bz2',compression='bz2')
embeddings.head()

Unnamed: 0,id,embedding
0,1588682493833551872,"[-0.046264533, -0.05307058, 0.0045004673, 0.02..."
1,1588682474141339648,"[-0.013356083, 0.036624912, 0.021634554, 0.034..."
2,1588682395393622016,"[0.0073349774, -0.020072784, -0.027865734, 0.0..."
3,1588682374103334912,"[-0.041744243, -0.034809932, -0.011789519, 0.0..."
4,1588682360253739008,"[-0.026468642, -0.016022408, 0.024429057, 0.05..."


In [30]:
embeddings.set_index("id", inplace=True)
embeddings = embeddings['embedding'].to_dict()
len(embeddings)

7581574

In [31]:
NaN_list = [id for id in embeddings if np.isnan(embeddings[id]).any()]
print(NaN_list)

[1580845904281550848, 1578478861053853696, 1550918701519609856, 1526371520402030592, 1521895820374974464, 1508173255831654400, 1504874340197621760, 1498387708645093376, 1472568986588094464, 1471145465311813632, 1460186457742610432, 1455260273833660416, 1438118587495956480, 1437370062860525568, 1435785026864652288, 1435291047634997248, 1422954077705617408, 1417221417746108416, 1413661754257670144, 1411920145635856384, 1409568597685551104, 1406896802473799680, 1403217891110133760, 1403010350044286976, 1401009999275610112, 1396571754164850688, 1392849450033418240, 1386641168994390016, 1379341825010327552, 1372220560848138240, 1362052341059497984, 1360318536816721920, 1354901308470247424]


In [32]:
for id in NaN_list:
    del embeddings[id]
len(embeddings)

7581541

In [33]:
tweets.drop(tweets[tweets['id'].isin(NaN_list)].index, inplace=True)
tweets.reset_index(inplace=True, drop=True)
tweets.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7581541 entries, 0 to 7581540
Data columns (total 7 columns):
 #   Column              Dtype
---  ------              -----
 0   id                  int64
 1   author_id           int64
 2   urls_count          int64
 3   hashtags            int64
 4   animated_gif_count  int64
 5   video_count         int64
 6   image               int64
dtypes: int64(7)
memory usage: 404.9 MB


In [44]:
pairs = pd.read_pickle('../data/pairs.pkl.bz2',compression='bz2')
pairs.head()

Unnamed: 0,author,author_followers_count,verified,tweet_id1,urls_count1,url_image_count1,hashtags_count1,animated_gif_count1,photo_count1,video_count1,tweet_id2,urls_count2,url_image_count2,hashtags_count2,animated_gif_count2,photo_count2,video_count2,max_date,min_date,winner
17416113,1172657864118194176,4431,0,1344795571865989120,2,0,0,0,1,0,1344800219972726784,2,0,0,1,0,0,2021-01-01 00:18:52,2021-01-01 00:00:24,0
17416112,1172657864118194176,4431,0,1344803663647272960,2,0,0,0,1,0,1344795571865989120,2,0,0,0,1,0,2021-01-01 00:32:33,2021-01-01 00:00:24,0
17416111,1172657864118194176,4431,0,1344803663647272960,2,0,0,0,1,0,1344800219972726784,2,0,0,1,0,0,2021-01-01 00:32:33,2021-01-01 00:18:52,0
21264726,1337163419326279680,193,0,1344805720143888384,1,0,0,0,1,0,1344804446698672128,1,0,0,0,1,0,2021-01-01 00:40:43,2021-01-01 00:35:40,1
3279199,20207761,16032,0,1344806765590884352,2,0,8,0,1,0,1344806944675131392,2,0,8,0,1,0,2021-01-01 00:45:35,2021-01-01 00:44:52,0


In [53]:
used_authors = set(pairs['author'].unique())
len(used_authors)

209309

# Create author vector

In [34]:
authors = tweets.groupby("author_id") 
len(authors)

1752484

In [35]:
authors = authors.filter(lambda x : x.shape[0] > 1).groupby("author_id") 
authors.apply(lambda x : x.shape[0]).describe()

count    659349.000000
mean          9.840625
std          89.256731
min           2.000000
25%           2.000000
50%           3.000000
75%           6.000000
max       31630.000000
dtype: float64

In [54]:
authorW = {}
for author in tqdm(authors):
    if author[0] not in used_authors:
        continue
    
    tweet_emb = []
    for tweet in author[1]['id']:
        tweet_emb.append(embeddings[tweet])
    
    authorW[author[0]] = np.average(tweet_emb, axis=0)
    
len(authorW)

  0%|          | 0/659349 [00:00<?, ?it/s]

209309

In [55]:
authors = authors.mean()
authors.head()

Unnamed: 0_level_0,id,urls_count,hashtags,animated_gif_count,video_count,image
author_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
22,1.446121e+18,0.923077,0.0,0.0,0.0,0.384615
47,1.494877e+18,0.0,0.0,0.0,0.0,0.0
224,1.466583e+18,0.55,0.0,0.0,0.0,0.45
246,1.417842e+18,0.6,0.0,0.0,0.0,0.6
259,1.462918e+18,0.0,0.0,0.0,0.0,0.0


In [59]:
authors = authors[authors.index.isin(used_authors)]
authors

Unnamed: 0_level_0,id,urls_count,hashtags,animated_gif_count,video_count,image
author_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
22,1.446121e+18,0.923077,0.000000,0.0,0.0,0.384615
224,1.466583e+18,0.550000,0.000000,0.0,0.0,0.450000
246,1.417842e+18,0.600000,0.000000,0.0,0.0,0.600000
985,1.469586e+18,1.000000,0.888889,0.0,0.0,0.222222
989,1.455126e+18,1.000000,0.100000,0.0,0.0,0.400000
...,...,...,...,...,...,...
1587452313806970880,1.587454e+18,1.000000,0.000000,0.0,0.0,0.000000
1587460305604546560,1.587806e+18,0.000000,0.000000,0.0,0.0,0.000000
1587510806366789632,1.588149e+18,1.000000,0.000000,0.0,0.0,0.000000
1587528864930070528,1.587684e+18,0.500000,0.500000,0.0,0.0,0.500000


In [60]:
authors.index.is_monotonic_increasing

True

In [63]:
authors['embedding'] = authors.index.map(authorW)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  authors['embedding'] = authors.index.map(authorW)


In [67]:
authors.drop("id", axis=1, inplace=True)
authors.columns

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  authors.drop("id", axis=1, inplace=True)


Index(['urls_count', 'hashtags', 'animated_gif_count', 'video_count', 'image',
       'embedding'],
      dtype='object')

In [70]:
authors = authors[['embedding', 'urls_count', 'hashtags', 'animated_gif_count', 'video_count', 'image']]
authors

Unnamed: 0_level_0,embedding,urls_count,hashtags,animated_gif_count,video_count,image
author_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
22,"[-0.011831568, 8.823024e-05, 0.0044340463, 0.0...",0.923077,0.000000,0.0,0.0,0.384615
224,"[-0.026186835, -0.029001068, 0.015409039, 0.04...",0.550000,0.000000,0.0,0.0,0.450000
246,"[-0.021350076, -0.0067993426, 0.015069405, 0.0...",0.600000,0.000000,0.0,0.0,0.600000
985,"[-0.02677762, -0.007397843, -0.00015252017, 0....",1.000000,0.888889,0.0,0.0,0.222222
989,"[-0.008732015, -0.005051827, -0.0018818371, 0....",1.000000,0.100000,0.0,0.0,0.400000
...,...,...,...,...,...,...
1587452313806970880,"[-0.0003473599, -0.038943876, -0.02784068, 0.0...",1.000000,0.000000,0.0,0.0,0.000000
1587460305604546560,"[-0.036397807, 0.0075382213, 0.013312943, 0.03...",0.000000,0.000000,0.0,0.0,0.000000
1587510806366789632,"[0.0068783686, -9.928821e-06, 0.008889898, 0.0...",1.000000,0.000000,0.0,0.0,0.000000
1587528864930070528,"[-0.0045513655, -0.025294915, -0.0018953065, 0...",0.500000,0.500000,0.0,0.0,0.500000


In [71]:
authors.to_pickle('../data/authors_weights.pkl.bz2',compression='bz2')

In [75]:
authorW = {}
for index, row in tqdm(authors.iterrows()):
    weights = list(row['embedding'])
    weights.append(row['urls_count'])
    weights.append(row['hashtags'])
    weights.append(row['animated_gif_count'])
    weights.append(row['video_count'])
    weights.append(row['image'])
    
    authorW[index] = np.array(weights)

0it [00:00, ?it/s]

In [78]:
with open("../data/authors_weights.pickle", "wb") as file:
    pickle.dump(authorW, file, pickle.HIGHEST_PROTOCOL)