In [1]:
import scipy
import re

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from math import e

# from scipy.stats import entropy

In [2]:
# custom entropy
def pandas_entropy(column, base=2):
    """Shannon entropy of the empirical distribution of the values in ``column``.

    Parameters
    ----------
    column : array-like
        Observed values; wrapped in a ``pd.Series`` before counting. Missing
        values are not counted (``value_counts`` drops them by default).
    base : float or None, default 2
        Logarithm base. ``None`` selects the natural logarithm (base ``e``).

    Returns
    -------
    float
        Entropy expressed in units of ``base``. A column holding a single
        distinct value yields ``0.0``.
    """
    vc = pd.Series(column).value_counts(normalize=True, sort=False)
    base = e if base is None else base
    entropy = -(vc * np.log(vc)/np.log(base)).sum()
    # Negating a zero sum produces IEEE -0.0, which is rendered as "-0.0" in
    # tables and CSV files. Adding +0.0 maps -0.0 to 0.0 and leaves every
    # other value unchanged.
    return entropy + 0.0

In [3]:
# Load the cleaned user table; index_col=0 makes the first CSV column the index.
users = pd.read_csv("dataset/cleaned_users.csv", index_col=0)
# Preview the first rows.
users.head()

Unnamed: 0_level_0,name,lang,bot,created_at,statuses_count
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2353593986,Lamonica Raborn,en,1,2019-02-22 18:00:42,76
2358850842,Lourie Botton,en,0,2019-02-26 03:02:32,54
137959629,Dadan Syarifudin,en,1,2015-04-30 07:09:56,53
466124818,Carletto Focia,it,1,2017-01-18 02:49:18,50
2571493866,MBK Ebook,en,0,2019-06-18 19:30:21,7085


In [4]:
# Load the cleaned tweet table, using the first CSV column as the index.
# lineterminator='\n' makes the parser split rows on '\n' only.
# NOTE(review): presumably needed because tweet text contains other
# line-break characters (e.g. '\r') — confirm.
tweets = pd.read_csv("dataset/cleaned_tweets.csv", index_col = 0, lineterminator = '\n')

In [5]:
# Standard deviation of retweet_count over all tweets (pandas default, ddof=1).
tweets.retweet_count.std()

40.194537775592835

In [6]:
# Preview the first rows of the tweet table.
tweets.head()

Unnamed: 0_level_0,user_id,retweet_count,reply_count,favorite_count,num_hashtags,num_urls,num_mentions,created_at,text
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
509354017856950272,327746321,0,0,0,0,0,0,2019-09-11 14:53:55,"If man is a little lower than angels, then ang..."
583002592529121280,333722906,1,0,0,0,0,1,2020-04-01 20:27:04,"""@BestWSHHVids: how do you say these words wit..."
461498835362013185,2379755827,0,0,0,0,0,1,2019-05-02 13:34:31,@LOLatComedy awsome
528808127366692864,466226882,0,0,0,0,0,0,2019-11-04 07:17:37,Stephen Hawkins: i buchi neri non esistono se ...
575336690904006656,1355537995,114,0,0,1,0,1,2020-03-11 16:45:31,RT @tibbs_montris: So ready for Wednesday!


In [7]:
# NOTE(review): `users` was loaded with index_col=0, so `id` appears to be the
# index name, not a column. If so, this column rename matches nothing and is a
# no-op (later previews of `users` show no `user_id` column). Renaming the
# index would require `rename_axis` — confirm the intent before changing it,
# because the index name is also written to the saved CSV.
users.rename(columns = {'id':'user_id'}, inplace = True)

In [8]:
# Attach each tweet to its author's profile: tweets.user_id is matched against
# the index of `users`; tweets without a matching user are dropped (inner join).
joined_df = pd.merge(
    tweets,
    users,
    how="inner",
    left_on="user_id",
    right_index=True,
)

In [9]:
# Preview the merged tweet/user rows.
joined_df.head()

Unnamed: 0_level_0,user_id,retweet_count,reply_count,favorite_count,num_hashtags,num_urls,num_mentions,created_at_x,text,name,lang,bot,created_at_y,statuses_count
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
509354017856950272,327746321,0,0,0,0,0,0,2019-09-11 14:53:55,"If man is a little lower than angels, then ang...",The Amazing Rhythm,en,1,2016-07-03 02:37:46,53
523891871761039360,327746321,0,0,0,0,1,0,2019-10-21 17:42:10,Read the Biography of Don Henley http://t.co/...,The Amazing Rhythm,en,1,2016-07-03 02:37:46,53
528202577788559360,327746321,0,0,0,0,0,0,2019-11-02 15:11:22,Don't tell me where your priorities are. Show ...,The Amazing Rhythm,en,1,2016-07-03 02:37:46,53
530119198710964225,327746321,0,0,0,0,1,0,2019-11-07 22:07:20,Learn About the Great Music of Bill Justis ht...,The Amazing Rhythm,en,1,2016-07-03 02:37:46,53
520435124048248832,327746321,0,0,0,0,1,0,2019-10-12 04:46:17,Do you love James Bond? Â Check out these cool...,The Amazing Rhythm,en,1,2016-07-03 02:37:46,53


In [10]:
# (rows, columns) of the merged frame.
joined_df.shape

(11226977, 14)

## Average length of a tweet

In [11]:
# String-typed copy of the tweet text so that len() is defined for every row.
# NOTE(review): astype(str) turns a missing value into the literal "nan"
# (length 3) — confirm the cleaned dataset has no missing text.
text = joined_df['text'].astype(str)

In [12]:
# Character count of every tweet, averaged over the whole dataset.
mean_tweet_length = text.str.len().mean()
print('Average length of a tweet:', mean_tweet_length)

Average length of a tweet: 71.53352429598813


## Ratio between the number of likes and the number of tweets

In [13]:
# Total likes divided by the number of tweet rows.
total_favorites = joined_df['favorite_count'].sum()
tweet_rows = len(joined_df)
print('Ratio between favorite_count and number of tweets:', total_favorites/tweet_rows)

Ratio between favorite_count and number of tweets: 0.4137094963319155


## Average length of a tweet per user

In [14]:
# Store the string cast on the merged frame itself; the per-user text features
# computed below read joined_df["text"].
joined_df["text"] = joined_df['text'].astype(str)

In [15]:
# Measure every tweet once, then average the lengths within each user.
tweet_lengths = joined_df["text"].str.len()
avg_length = (
    tweet_lengths
    .groupby(joined_df["user_id"])
    .mean()
    .to_frame(name="avg_length")
)

In [16]:
# Add avg_length to the user table, aligning on the user-id index.
users = pd.merge(users, avg_length, how="inner", left_index=True, right_index=True)
users.head()

Unnamed: 0,name,lang,bot,created_at,statuses_count,avg_length
2353593986,Lamonica Raborn,en,1,2019-02-22 18:00:42,76,62.340909
2358850842,Lourie Botton,en,0,2019-02-26 03:02:32,54,69.082645
137959629,Dadan Syarifudin,en,1,2015-04-30 07:09:56,53,19.25
466124818,Carletto Focia,it,1,2017-01-18 02:49:18,50,86.944871
2571493866,MBK Ebook,en,0,2019-06-18 19:30:21,7085,72.311246


## Average number of special characters per tweet (per user)

In [17]:
def count_special_chars(tweet):
    """Count the characters of ``tweet`` that are not regex word characters.

    Word characters are those matched by the regex word class (letters, digits
    and the underscore), so whitespace, punctuation and symbols such as ``@``
    or ``#`` all count as special.

    Parameters
    ----------
    tweet : str
        Text to inspect.

    Returns
    -------
    int
        Number of non-word characters; ``0`` for an empty string.
    """
    # The pattern is a raw string: '\w' inside a plain literal is an invalid
    # escape sequence and triggers a warning on recent Python versions.
    return len(tweet) - len(re.findall(r'\w', tweet))

# Score every tweet once, then average the scores within each user.
special_chars_per_tweet = joined_df["text"].map(count_special_chars)
avg_special_char = (
    special_chars_per_tweet
    .groupby(joined_df["user_id"])
    .mean()
    .to_frame(name="avg_special_chars")
)

In [18]:
# Add avg_special_chars to the user table, aligning on the user-id index.
users = pd.merge(users, avg_special_char, how="inner", left_index=True, right_index=True)
users.head()

Unnamed: 0,name,lang,bot,created_at,statuses_count,avg_length,avg_special_chars
2353593986,Lamonica Raborn,en,1,2019-02-22 18:00:42,76,62.340909,14.015152
2358850842,Lourie Botton,en,0,2019-02-26 03:02:32,54,69.082645,15.041322
137959629,Dadan Syarifudin,en,1,2015-04-30 07:09:56,53,19.25,5.25
466124818,Carletto Focia,it,1,2017-01-18 02:49:18,50,86.944871,18.689463
2571493866,MBK Ebook,en,0,2019-06-18 19:30:21,7085,72.311246,14.582073


## Total number of tweets per user

In [19]:
# Number of tweet rows per user, as a one-column frame indexed by user_id.
num_tweets = (
    joined_df.groupby('user_id')
    .size()
    .rename("num_tweets")
    .to_frame()
)

In [20]:
# Display the per-user tweet counts (the notebook truncates the long frame).
num_tweets

Unnamed: 0_level_0,num_tweets
user_id,Unnamed: 1_level_1
678033,2374
722623,2021
755116,2144
755746,2017
785080,3436
...,...
3156622237,3701
3158349782,109
3159993463,125
3161171948,6


## Ratio between the number of urls and number of tweets (per user)

In [21]:
# Total number of URLs posted by each user.
num_urls = (
    joined_df.groupby('user_id')['num_urls']
    .sum()
    .to_frame(name="num_urls")
)
num_urls

Unnamed: 0_level_0,num_urls
user_id,Unnamed: 1_level_1
678033,793
722623,345
755116,317
755746,848
785080,234
...,...
3156622237,6
3158349782,1
3159993463,41
3161171948,1


In [22]:
# URLs per tweet for each user; the two series align on the user_id index.
urls_ratio = (
    num_urls["num_urls"]
    .div(num_tweets["num_tweets"])
    .to_frame(name="urls_ratio")
)

In [23]:
# Display the per-user URL ratio.
urls_ratio

Unnamed: 0_level_0,urls_ratio
user_id,Unnamed: 1_level_1
678033,0.334035
722623,0.170708
755116,0.147854
755746,0.420426
785080,0.068102
...,...
3156622237,0.001621
3158349782,0.009174
3159993463,0.328000
3161171948,0.166667


In [24]:
# Add urls_ratio to the user table, aligning on the user-id index.
users = pd.merge(users, urls_ratio, how="inner", left_index=True, right_index=True)
users.head()

Unnamed: 0,name,lang,bot,created_at,statuses_count,avg_length,avg_special_chars,urls_ratio
2353593986,Lamonica Raborn,en,1,2019-02-22 18:00:42,76,62.340909,14.015152,0.0
2358850842,Lourie Botton,en,0,2019-02-26 03:02:32,54,69.082645,15.041322,0.0
137959629,Dadan Syarifudin,en,1,2015-04-30 07:09:56,53,19.25,5.25,0.0
466124818,Carletto Focia,it,1,2017-01-18 02:49:18,50,86.944871,18.689463,0.022331
2571493866,MBK Ebook,en,0,2019-06-18 19:30:21,7085,72.311246,14.582073,0.000825


## Ratio between the number of mentions and number of tweets (per user)

In [25]:
# Total number of mentions made by each user.
num_mentions = (
    joined_df.groupby('user_id')['num_mentions']
    .sum()
    .to_frame(name="num_mentions")
)

In [26]:
# Mentions per tweet for each user; the two series align on the user_id index.
mentions_ratio = (
    num_mentions['num_mentions']
    .div(num_tweets['num_tweets'])
    .to_frame(name="mentions_ratio")
)
mentions_ratio

Unnamed: 0_level_0,mentions_ratio
user_id,Unnamed: 1_level_1
678033,1.093092
722623,0.664028
755116,0.959422
755746,0.727814
785080,0.761350
...,...
3156622237,0.035396
3158349782,0.798165
3159993463,0.272000
3161171948,0.333333


In [27]:
# Add mentions_ratio to the user table, aligning on the user-id index.
users = pd.merge(users, mentions_ratio, how="inner", left_index=True, right_index=True)
users.head()

Unnamed: 0,name,lang,bot,created_at,statuses_count,avg_length,avg_special_chars,urls_ratio,mentions_ratio
2353593986,Lamonica Raborn,en,1,2019-02-22 18:00:42,76,62.340909,14.015152,0.0,0.272727
2358850842,Lourie Botton,en,0,2019-02-26 03:02:32,54,69.082645,15.041322,0.0,0.338843
137959629,Dadan Syarifudin,en,1,2015-04-30 07:09:56,53,19.25,5.25,0.0,0.0
466124818,Carletto Focia,it,1,2017-01-18 02:49:18,50,86.944871,18.689463,0.022331,0.006281
2571493866,MBK Ebook,en,0,2019-06-18 19:30:21,7085,72.311246,14.582073,0.000825,0.506461


## Ratio between the number of hashtags and number of tweets (per user)

In [28]:
# Total number of hashtags used by each user. The frame name is spelled out
# to match the sibling cells; it equals the default taken from the series.
num_hashtags = (
    joined_df.groupby('user_id')['num_hashtags']
    .sum()
    .to_frame(name="num_hashtags")
)

In [29]:
# Hashtags per tweet for each user; the two series align on the user_id index.
hashtags_ratio = (
    num_hashtags['num_hashtags']
    .div(num_tweets['num_tweets'])
    .to_frame(name="hashtags_ratio")
)
hashtags_ratio

Unnamed: 0_level_0,hashtags_ratio
user_id,Unnamed: 1_level_1
678033,0.359730
722623,0.369619
755116,0.029851
755746,0.241448
785080,0.053260
...,...
3156622237,0.093218
3158349782,0.183486
3159993463,0.176000
3161171948,0.000000


In [30]:
# Add hashtags_ratio to the user table, aligning on the user-id index.
users = pd.merge(users, hashtags_ratio, how="inner", left_index=True, right_index=True)
users.head()

Unnamed: 0,name,lang,bot,created_at,statuses_count,avg_length,avg_special_chars,urls_ratio,mentions_ratio,hashtags_ratio
2353593986,Lamonica Raborn,en,1,2019-02-22 18:00:42,76,62.340909,14.015152,0.0,0.272727,0.098485
2358850842,Lourie Botton,en,0,2019-02-26 03:02:32,54,69.082645,15.041322,0.0,0.338843,0.024793
137959629,Dadan Syarifudin,en,1,2015-04-30 07:09:56,53,19.25,5.25,0.0,0.0,0.0
466124818,Carletto Focia,it,1,2017-01-18 02:49:18,50,86.944871,18.689463,0.022331,0.006281,0.072575
2571493866,MBK Ebook,en,0,2019-06-18 19:30:21,7085,72.311246,14.582073,0.000825,0.506461,0.118229


## Std | Avg | Entropy, for count values (per user)

In [31]:
# Per-user mean, standard deviation and entropy of reply_count.
# The grouping is built once and reused for all three statistics.
reply_by_user = joined_df.groupby(['user_id'])['reply_count']

reply_count_mean = reply_by_user.mean().to_frame(name='reply_count_mean')
users = pd.merge(users, reply_count_mean, how="inner", left_index=True, right_index=True)
print(reply_count_mean.shape)

# np.std is applied per group with its default ddof=0.
reply_count_std = reply_by_user.apply(np.std).to_frame(name='reply_count_std')
users = pd.merge(users, reply_count_std, how="inner", left_index=True, right_index=True)
print(reply_count_std.shape)

# Any undefined entropy value is replaced by 0 before merging.
reply_count_entropy = (
    reply_by_user.apply(pandas_entropy)
    .to_frame(name='reply_count_entropy')
    .fillna(0)
)
users = pd.merge(users, reply_count_entropy, how="inner", left_index=True, right_index=True)
print(reply_count_entropy.shape)

(11508, 1)
(11508, 1)
(11508, 1)


In [32]:
# Per-user mean, standard deviation and entropy of favorite_count.
# The grouping is built once and reused for all three statistics.
favorite_by_user = joined_df.groupby(['user_id'])['favorite_count']

favorite_count_mean = favorite_by_user.mean().to_frame(name='favorite_count_mean')
users = pd.merge(users, favorite_count_mean, how="inner", left_index=True, right_index=True)
print(favorite_count_mean.shape)

# np.std is applied per group with its default ddof=0.
favorite_count_std = favorite_by_user.apply(np.std).to_frame(name='favorite_count_std')
users = pd.merge(users, favorite_count_std, how="inner", left_index=True, right_index=True)
print(favorite_count_std.shape)

# Any undefined entropy value is replaced by 0 before merging.
favorite_count_entropy = (
    favorite_by_user.apply(pandas_entropy)
    .to_frame(name='favorite_count_entropy')
    .fillna(0)
)
users = pd.merge(users, favorite_count_entropy, how="inner", left_index=True, right_index=True)
print(favorite_count_entropy.shape)

(11508, 1)
(11508, 1)
(11508, 1)


In [33]:
# Per-user mean, standard deviation and entropy of retweet_count.
# The grouping is built once and reused for all three statistics.
retweet_by_user = joined_df.groupby(['user_id'])['retweet_count']

retweet_count_mean = retweet_by_user.mean().to_frame(name='retweet_count_mean')
users = pd.merge(users, retweet_count_mean, how="inner", left_index=True, right_index=True)
print(retweet_count_mean.shape)

# np.std is applied per group with its default ddof=0.
retweet_count_std = retweet_by_user.apply(np.std).to_frame(name='retweet_count_std')
users = pd.merge(users, retweet_count_std, how="inner", left_index=True, right_index=True)
print(retweet_count_std.shape)

# Any undefined entropy value is replaced by 0 before merging.
retweet_count_entropy = (
    retweet_by_user.apply(pandas_entropy)
    .to_frame(name='retweet_count_entropy')
    .fillna(0)
)
users = pd.merge(users, retweet_count_entropy, how="inner", left_index=True, right_index=True)
print(retweet_count_entropy.shape)

(11508, 1)
(11508, 1)
(11508, 1)


In [34]:
# Preview the user table with all engineered features attached.
users.head()

Unnamed: 0,name,lang,bot,created_at,statuses_count,avg_length,avg_special_chars,urls_ratio,mentions_ratio,hashtags_ratio,reply_count_mean,reply_count_std,reply_count_entropy,favorite_count_mean,favorite_count_std,favorite_count_entropy,retweet_count_mean,retweet_count_std,retweet_count_entropy
2353593986,Lamonica Raborn,en,1,2019-02-22 18:00:42,76,62.340909,14.015152,0.0,0.272727,0.098485,0.0,0.0,-0.0,0.037879,0.190903,0.232481,0.037879,0.190903,0.232481
2358850842,Lourie Botton,en,0,2019-02-26 03:02:32,54,69.082645,15.041322,0.0,0.338843,0.024793,0.0,0.0,-0.0,0.049587,0.21709,0.284639,0.024793,0.155495,0.167568
137959629,Dadan Syarifudin,en,1,2015-04-30 07:09:56,53,19.25,5.25,0.0,0.0,0.0,0.0,0.0,-0.0,0.0,0.0,-0.0,0.0,0.0,-0.0
466124818,Carletto Focia,it,1,2017-01-18 02:49:18,50,86.944871,18.689463,0.022331,0.006281,0.072575,0.0,0.0,-0.0,0.165387,0.530838,0.669155,0.826239,13.034008,0.39285
2571493866,MBK Ebook,en,0,2019-06-18 19:30:21,7085,72.311246,14.582073,0.000825,0.506461,0.118229,0.0,0.0,-0.0,0.056365,0.243387,0.317182,0.016772,0.142619,0.120737


In [35]:
# (rows, columns) of the final user table.
users.shape

(11508, 19)

In [36]:
# Column dtypes and non-null counts of the final user table.
users.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 11508 entries, 2353593986 to 933183398
Data columns (total 19 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   name                    11508 non-null  object 
 1   lang                    11508 non-null  object 
 2   bot                     11508 non-null  int64  
 3   created_at              11508 non-null  object 
 4   statuses_count          11508 non-null  int64  
 5   avg_length              11508 non-null  float64
 6   avg_special_chars       11508 non-null  float64
 7   urls_ratio              11508 non-null  float64
 8   mentions_ratio          11508 non-null  float64
 9   hashtags_ratio          11508 non-null  float64
 10  reply_count_mean        11508 non-null  float64
 11  reply_count_std         11508 non-null  float64
 12  reply_count_entropy     11508 non-null  float64
 13  favorite_count_mean     11508 non-null  float64
 14  favorite_count_std      1

### Saving file

In [37]:
# Write the enriched user table to the working directory; to_csv includes the
# index as the first column by default.
users.to_csv("user_profiles.csv")