In [82]:
import scipy

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from scipy.stats import entropy

In [83]:
users = pd.read_csv("dataset/cleaned_users.csv", index_col=0)
users.head()

Unnamed: 0_level_0,name,lang,bot,created_at,statuses_count
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2353593986,Lamonica Raborn,en,1,2019-02-22 18:00:42,76
2358850842,Lourie Botton,en,0,2019-02-26 03:02:32,4
137959629,Dadan Syarifudin,en,1,2015-04-30 07:09:56,7
466124818,Carletto Focia,it,1,2017-01-18 02:49:18,50
2571493866,MBK Ebook,en,0,2019-06-18 19:30:21,8


In [84]:
tweets = pd.read_csv("dataset/cleaned_tweets.csv", index_col = 0, lineterminator = '\n')

In [87]:
tweets.head()

Unnamed: 0_level_0,user_id,retweet_count,reply_count,favorite_count,num_hashtags,num_urls,num_mentions,created_at,text
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
509354017856950272,327746321,0,0,0,0,0,0,2019-09-11 14:53:55,"If man is a little lower than angels, then ang..."
583002592529121280,333722906,1,0,0,0,1,1,2020-04-01 20:27:04,"""@BestWSHHVids: how do you say these words wit..."
461498835362013185,2379755827,0,0,0,0,0,1,2019-05-02 13:34:31,@LOLatComedy awsome
528808127366692864,466226882,0,0,0,0,0,0,2019-11-04 07:17:37,Stephen Hawkins: i buchi neri non esistono se ...
575336690904006656,1355537995,20,0,0,0,0,1,2020-03-11 16:45:31,RT @tibbs_montris: So ready for Wednesday!


In [85]:
users.rename(columns = {'id':'user_id'}, inplace = True)

In [88]:
joined_df = tweets.merge(users, right_index=True, left_on="user_id")

In [89]:
joined_df.head()

Unnamed: 0_level_0,user_id,retweet_count,reply_count,favorite_count,num_hashtags,num_urls,num_mentions,created_at_x,text,name,lang,bot,created_at_y,statuses_count
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
509354017856950272,327746321,0,0,0,0,0,0,2019-09-11 14:53:55,"If man is a little lower than angels, then ang...",The Amazing Rhythm,en,1,2016-07-03 02:37:46,291
523891871761039360,327746321,0,0,0,0,0,0,2019-10-21 17:42:10,Read the Biography of Don Henley http://t.co/...,The Amazing Rhythm,en,1,2016-07-03 02:37:46,291
528202577788559360,327746321,0,0,0,0,0,0,2019-11-02 15:11:22,Don't tell me where your priorities are. Show ...,The Amazing Rhythm,en,1,2016-07-03 02:37:46,291
530119198710964225,327746321,0,0,0,0,0,0,2019-11-07 22:07:20,Learn About the Great Music of Bill Justis ht...,The Amazing Rhythm,en,1,2016-07-03 02:37:46,291
520435124048248832,327746321,0,0,0,0,0,0,2019-10-12 04:46:17,Do you love James Bond? Â Check out these cool...,The Amazing Rhythm,en,1,2016-07-03 02:37:46,291


In [91]:
joined_df.shape

(11226977, 14)

## Average length of a tweet

In [92]:
text = joined_df['text'].astype(str)

In [93]:
print('Average length of a tweet:', text.apply(len).mean())

Average length of a tweet: 71.53352429598813


## Ratio between the number of likes and the number of tweets

In [94]:
print('Ratio between favorite_count and number of tweets:', joined_df['favorite_count'].sum()/len(joined_df))

Ratio between favorite_count and number of tweets: 0.18481920823388165


## Total number of tweets per user

In [95]:
num_tweets = joined_df.groupby(['user_id']).size().to_frame(name="num_tweets")

In [96]:
num_tweets

Unnamed: 0_level_0,num_tweets
user_id,Unnamed: 1_level_1
678033,2374
722623,2021
755116,2144
755746,2017
785080,3436
...,...
3156622237,3701
3158349782,109
3159993463,125
3161171948,6


In [97]:
users = users.merge(num_tweets, how="inner", left_index=True, right_index=True)
users.head()

Unnamed: 0,name,lang,bot,created_at,statuses_count,num_tweets
2353593986,Lamonica Raborn,en,1,2019-02-22 18:00:42,76,132
2358850842,Lourie Botton,en,0,2019-02-26 03:02:32,4,121
137959629,Dadan Syarifudin,en,1,2015-04-30 07:09:56,7,4
466124818,Carletto Focia,it,1,2017-01-18 02:49:18,50,1433
2571493866,MBK Ebook,en,0,2019-06-18 19:30:21,8,3637


## Ratio between the number of urls and number of tweets (per user)

In [98]:
num_urls = joined_df.groupby(['user_id'])['num_urls'].sum().to_frame(name="num_urls")
num_urls

Unnamed: 0_level_0,num_urls
user_id,Unnamed: 1_level_1
678033,0
722623,0
755116,0
755746,0
785080,0
...,...
3156622237,0
3158349782,0
3159993463,0
3161171948,0


In [99]:
users = users.merge(num_urls, how="inner", left_index=True, right_index=True)
users.head()

Unnamed: 0,name,lang,bot,created_at,statuses_count,num_tweets,num_urls
2353593986,Lamonica Raborn,en,1,2019-02-22 18:00:42,76,132,0
2358850842,Lourie Botton,en,0,2019-02-26 03:02:32,4,121,0
137959629,Dadan Syarifudin,en,1,2015-04-30 07:09:56,7,4,0
466124818,Carletto Focia,it,1,2017-01-18 02:49:18,50,1433,0
2571493866,MBK Ebook,en,0,2019-06-18 19:30:21,8,3637,0


In [100]:
urls_ratio = num_urls["num_urls"]/num_tweets["num_tweets"]
urls_ratio = urls_ratio.to_frame(name="urls_ratio")

In [101]:
urls_ratio

Unnamed: 0_level_0,urls_ratio
user_id,Unnamed: 1_level_1
678033,0.0
722623,0.0
755116,0.0
755746,0.0
785080,0.0
...,...
3156622237,0.0
3158349782,0.0
3159993463,0.0
3161171948,0.0


In [102]:
users = users.merge(urls_ratio, how="inner", left_index=True, right_index=True)
users.head()

Unnamed: 0,name,lang,bot,created_at,statuses_count,num_tweets,num_urls,urls_ratio
2353593986,Lamonica Raborn,en,1,2019-02-22 18:00:42,76,132,0,0.0
2358850842,Lourie Botton,en,0,2019-02-26 03:02:32,4,121,0,0.0
137959629,Dadan Syarifudin,en,1,2015-04-30 07:09:56,7,4,0,0.0
466124818,Carletto Focia,it,1,2017-01-18 02:49:18,50,1433,0,0.0
2571493866,MBK Ebook,en,0,2019-06-18 19:30:21,8,3637,0,0.0


## Ratio between the number of mention and number of tweets (per user)

In [103]:
num_mentions = joined_df.groupby(['user_id'])['num_mentions'].sum().to_frame(name="num_mentions")

In [104]:
mentions_ratio = num_mentions['num_mentions']/num_tweets['num_tweets']
mentions_ratio = mentions_ratio.to_frame(name="mentions_ratio")
mentions_ratio

Unnamed: 0_level_0,mentions_ratio
user_id,Unnamed: 1_level_1
678033,0.748947
722623,0.536863
755116,0.610075
755746,0.615270
785080,0.587602
...,...
3156622237,0.031343
3158349782,0.688073
3159993463,0.208000
3161171948,0.166667


In [105]:
users = users.merge(mentions_ratio, how="inner", left_index=True, right_index=True)
users.head()

Unnamed: 0,name,lang,bot,created_at,statuses_count,num_tweets,num_urls,urls_ratio,mentions_ratio
2353593986,Lamonica Raborn,en,1,2019-02-22 18:00:42,76,132,0,0.0,0.272727
2358850842,Lourie Botton,en,0,2019-02-26 03:02:32,4,121,0,0.0,0.338843
137959629,Dadan Syarifudin,en,1,2015-04-30 07:09:56,7,4,0,0.0,0.0
466124818,Carletto Focia,it,1,2017-01-18 02:49:18,50,1433,0,0.0,0.004885
2571493866,MBK Ebook,en,0,2019-06-18 19:30:21,8,3637,0,0.0,0.448996


## Ratio between the number of hashtags and number of tweets (per user)

In [106]:
num_hashtags = joined_df.groupby(['user_id'])['num_hashtags'].sum().to_frame()

In [107]:
hashtags_ratio = num_hashtags['num_hashtags']/num_tweets['num_tweets']
hashtags_ratio = hashtags_ratio.to_frame(name="hashtags_ratio")
hashtags_ratio

Unnamed: 0_level_0,hashtags_ratio
user_id,Unnamed: 1_level_1
678033,0.000000
722623,0.000000
755116,0.000000
755746,0.000000
785080,0.000000
...,...
3156622237,0.001081
3158349782,0.000000
3159993463,0.000000
3161171948,0.000000


In [108]:
users = users.merge(hashtags_ratio, how="inner", left_index=True, right_index=True)
users.head()

Unnamed: 0,name,lang,bot,created_at,statuses_count,num_tweets,num_urls,urls_ratio,mentions_ratio,hashtags_ratio
2353593986,Lamonica Raborn,en,1,2019-02-22 18:00:42,76,132,0,0.0,0.272727,0.0
2358850842,Lourie Botton,en,0,2019-02-26 03:02:32,4,121,0,0.0,0.338843,0.0
137959629,Dadan Syarifudin,en,1,2015-04-30 07:09:56,7,4,0,0.0,0.0,0.0
466124818,Carletto Focia,it,1,2017-01-18 02:49:18,50,1433,0,0.0,0.004885,0.0
2571493866,MBK Ebook,en,0,2019-06-18 19:30:21,8,3637,0,0.0,0.448996,0.0


## Total | Std | Avg | Entropy, for every numerical features of the tweets and users ds (per user)

## Num_hashtags

In [109]:
num_hashtags_total = joined_df.groupby(['user_id'])['num_hashtags'].sum().to_frame(name='num_hashtags_total')
print(num_hashtags_total.shape)
users = users.merge(num_hashtags_total, how="inner", left_index=True, right_index=True)
num_hashtags_mean = joined_df.groupby(['user_id'])['num_hashtags'].mean().to_frame(name='num_hashtags_mean')
print(num_hashtags_mean.shape)
users = users.merge(num_hashtags_mean, how="inner", left_index=True, right_index=True)
num_hashtags_std = joined_df.groupby(['user_id'])['num_hashtags'].std().to_frame(name='num_hashtags_std')
print(num_hashtags_std.shape)
users = users.merge(num_hashtags_std, how="inner", left_index=True, right_index=True)
num_hashtags_entropy = joined_df.groupby(['user_id'])['num_hashtags'].apply(entropy).to_frame(name='num_hashtags_entropy')
num_hashtags_entropy.fillna(0, inplace=True)
users = users.merge(num_hashtags_entropy, how="inner", left_index=True, right_index=True)
print(num_hashtags_entropy.shape)
users.head()

(11508, 1)
(11508, 1)
(11508, 1)


  pk = 1.0*pk / np.sum(pk, axis=axis, keepdims=True)


(11508, 1)


Unnamed: 0,name,lang,bot,created_at,statuses_count,num_tweets,num_urls,urls_ratio,mentions_ratio,hashtags_ratio,num_hashtags_total,num_hashtags_mean,num_hashtags_std,num_hashtags_entropy
2353593986,Lamonica Raborn,en,1,2019-02-22 18:00:42,76,132,0,0.0,0.272727,0.0,0,0.0,0.0,0.0
2358850842,Lourie Botton,en,0,2019-02-26 03:02:32,4,121,0,0.0,0.338843,0.0,0,0.0,0.0,0.0
137959629,Dadan Syarifudin,en,1,2015-04-30 07:09:56,7,4,0,0.0,0.0,0.0,0,0.0,0.0,0.0
466124818,Carletto Focia,it,1,2017-01-18 02:49:18,50,1433,0,0.0,0.004885,0.0,0,0.0,0.0,0.0
2571493866,MBK Ebook,en,0,2019-06-18 19:30:21,8,3637,0,0.0,0.448996,0.0,0,0.0,0.0,0.0


In [110]:
num_urls_total = joined_df.groupby(['user_id'])['num_urls'].sum().to_frame(name='num_urls_total')
users = users.merge(num_urls_total, how="inner", left_index=True, right_index=True)
print(num_urls_total.shape)
num_urls_mean = joined_df.groupby(['user_id'])['num_urls'].mean().to_frame(name='num_urls_mean')
users = users.merge(num_urls_mean, how="inner", left_index=True, right_index=True)
print(num_urls_mean.shape)
num_urls_std = joined_df.groupby(['user_id'])['num_urls'].std().to_frame(name='num_urls_std')
users = users.merge(num_urls_std, how="inner", left_index=True, right_index=True)
print(num_urls_std.shape)
num_urls_entropy = joined_df.groupby(['user_id'])['num_urls'].apply(entropy).to_frame(name='num_urls_entropy')
num_urls_entropy.fillna(0, inplace=True)
users = users.merge(num_urls_entropy, how="inner", left_index=True, right_index=True)
print(num_urls_entropy.shape)

(11508, 1)
(11508, 1)
(11508, 1)


  pk = 1.0*pk / np.sum(pk, axis=axis, keepdims=True)


(11508, 1)


In [112]:
num_mentions_total = joined_df.groupby(['user_id'])['num_mentions'].sum().to_frame(name='num_mentions_total')
users = users.merge(num_mentions_total, how="inner", left_index=True, right_index=True)
print(num_mentions_total.shape)
num_mentions_mean = joined_df.groupby(['user_id'])['num_mentions'].mean().to_frame(name='num_mentions_mean')
users = users.merge(num_mentions_mean, how="inner", left_index=True, right_index=True)
print(num_mentions_mean.shape)
num_mentions_std = joined_df.groupby(['user_id'])['num_mentions'].std().to_frame(name='num_mentions_std')
users = users.merge(num_mentions_std, how="inner", left_index=True, right_index=True)
print(num_mentions_std.shape)
num_mentions_entropy = joined_df.groupby(['user_id'])['num_mentions'].apply(entropy).to_frame(name='num_mentions_entropy')
num_mentions_entropy.fillna(0, inplace=True)
users = users.merge(num_mentions_entropy, how="inner", left_index=True, right_index=True)
print(num_mentions_entropy.shape)

(11508, 1)
(11508, 1)
(11508, 1)


  pk = 1.0*pk / np.sum(pk, axis=axis, keepdims=True)


(11508, 1)


In [113]:
reply_count_total = joined_df.groupby(['user_id'])['reply_count'].sum().to_frame(name='reply_count_total')
users = users.merge(reply_count_total, how="inner", left_index=True, right_index=True)
print(reply_count_total.shape)
reply_count_mean = joined_df.groupby(['user_id'])['reply_count'].mean().to_frame(name='reply_count_mean')
users = users.merge(reply_count_mean, how="inner", left_index=True, right_index=True)
print(reply_count_mean.shape)
reply_count_std = joined_df.groupby(['user_id'])['reply_count'].std().to_frame(name='reply_count_std')
users = users.merge(reply_count_std, how="inner", left_index=True, right_index=True)
print(reply_count_std.shape)
reply_count_entropy = joined_df.groupby(['user_id'])['reply_count'].apply(entropy).to_frame(name='reply_count_entropy')
reply_count_entropy.fillna(0, inplace=True)
users = users.merge(reply_count_entropy, how="inner", left_index=True, right_index=True)
print(reply_count_entropy.shape)

(11508, 1)
(11508, 1)
(11508, 1)


  pk = 1.0*pk / np.sum(pk, axis=axis, keepdims=True)


(11508, 1)


In [114]:
favorite_count_total = joined_df.groupby(['user_id'])['favorite_count'].sum().to_frame(name='favorite_count_total')
users = users.merge(favorite_count_total, how="inner", left_index=True, right_index=True)
print(favorite_count_total.shape)
favorite_count_mean = joined_df.groupby(['user_id'])['favorite_count'].mean().to_frame(name='favorite_count_mean')
users = users.merge(favorite_count_mean, how="inner", left_index=True, right_index=True)
print(favorite_count_mean.shape)
favorite_count_std = joined_df.groupby(['user_id'])['favorite_count'].std().to_frame(name='favorite_count_std')
users = users.merge(favorite_count_std, how="inner", left_index=True, right_index=True)
print(favorite_count_std.shape)
favorite_count_entropy = joined_df.groupby(['user_id'])['favorite_count'].apply(entropy).to_frame(name='favorite_count_entropy')
favorite_count_entropy.fillna(0, inplace=True)
users = users.merge(favorite_count_entropy, how="inner", left_index=True, right_index=True)
print(favorite_count_entropy.shape)

(11508, 1)
(11508, 1)
(11508, 1)


  pk = 1.0*pk / np.sum(pk, axis=axis, keepdims=True)


(11508, 1)


In [115]:
retweet_count_total = joined_df.groupby(['user_id'])['retweet_count'].sum().to_frame(name='retweet_count_total')
users = users.merge(retweet_count_total, how="inner", left_index=True, right_index=True)
print(retweet_count_total.shape)
retweet_count_mean = joined_df.groupby(['user_id'])['retweet_count'].mean().to_frame(name='retweet_count_mean')
users = users.merge(retweet_count_mean, how="inner", left_index=True, right_index=True)
print(retweet_count_mean.shape)
retweet_count_std = joined_df.groupby(['user_id'])['retweet_count'].std().to_frame(name='retweet_count_std')
users = users.merge(retweet_count_std, how="inner", left_index=True, right_index=True)
print(retweet_count_std.shape)
retweet_count_entropy = joined_df.groupby(['user_id'])['retweet_count'].apply(entropy).to_frame(name='retweet_count_entropy')
retweet_count_entropy.fillna(0, inplace=True)
users = users.merge(retweet_count_entropy, how="inner", left_index=True, right_index=True)
print(retweet_count_entropy.shape)

(11508, 1)
(11508, 1)
(11508, 1)


  pk = 1.0*pk / np.sum(pk, axis=axis, keepdims=True)


(11508, 1)


In [116]:
users.head()

Unnamed: 0,name,lang,bot,created_at,statuses_count,num_tweets,num_urls,urls_ratio,mentions_ratio,hashtags_ratio,...,reply_count_std,reply_count_entropy,favorite_count_total,favorite_count_mean,favorite_count_std,favorite_count_entropy,retweet_count_total,retweet_count_mean,retweet_count_std,retweet_count_entropy
2353593986,Lamonica Raborn,en,1,2019-02-22 18:00:42,76,132,0,0.0,0.272727,0.0,...,0.0,0.0,5,0.037879,0.19163,1.609438,5,0.037879,0.19163,1.609438
2358850842,Lourie Botton,en,0,2019-02-26 03:02:32,4,121,0,0.0,0.338843,0.0,...,0.0,0.0,6,0.049587,0.217992,1.791759,3,0.024793,0.156141,1.098612
137959629,Dadan Syarifudin,en,1,2015-04-30 07:09:56,7,4,0,0.0,0.0,0.0,...,0.0,0.0,0,0.0,0.0,0.0,0,0.0,0.0,0.0
466124818,Carletto Focia,it,1,2017-01-18 02:49:18,50,1433,0,0.0,0.004885,0.0,...,0.0,0.0,187,0.130495,0.336965,5.231109,215,0.150035,1.326761,3.55472
2571493866,MBK Ebook,en,0,2019-06-18 19:30:21,8,3637,0,0.0,0.448996,0.0,...,0.0,0.0,194,0.053341,0.224743,5.267858,61,0.016772,0.142638,3.965939


In [117]:
users.shape

(11508, 34)