In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy

In [2]:
users = pd.read_csv("cleaned_users.csv")

In [3]:
tweets = pd.read_csv("cleaned_tweets.csv", index_col = 0, lineterminator = '\n')

In [4]:
users.rename(columns = {'id':'user_id'}, inplace = True)

In [5]:
joined_df = tweets.merge(users, on='user_id')

In [6]:
joined_df.head()

Unnamed: 0,user_id,retweet_count,reply_count,favorite_count,num_hashtags,num_urls,num_mentions,created_at_x,text,name,lang,bot,created_at_y,statuses_count
0,327746321,0,0,0,0,0,0,2019-09-11 14:53:55,"If man is a little lower than angels, then ang...",The Amazing Rhythm,en,1,2016-07-03 02:37:46,291.615157
1,327746321,0,0,0,0,0,0,2019-10-21 17:42:10,Read the Biography of Don Henley http://t.co/...,The Amazing Rhythm,en,1,2016-07-03 02:37:46,291.615157
2,327746321,0,0,0,0,0,0,2019-11-02 15:11:22,Don't tell me where your priorities are. Show ...,The Amazing Rhythm,en,1,2016-07-03 02:37:46,291.615157
3,327746321,0,0,0,0,0,0,2019-11-07 22:07:20,Learn About the Great Music of Bill Justis ht...,The Amazing Rhythm,en,1,2016-07-03 02:37:46,291.615157
4,327746321,0,0,0,0,0,0,2019-10-12 04:46:17,Do you love James Bond? Â Check out these cool...,The Amazing Rhythm,en,1,2016-07-03 02:37:46,291.615157


## Average length of a tweet

In [7]:
text = joined_df['text'].astype(str)

In [8]:
print('Average length of a tweet:', text.apply(len).mean())

Average length of a tweet: 71.53352429598813


## Ratio between the number of likes and the number of tweets

In [10]:
print('Ratio between favorite_count and number of tweets:', joined_df['favorite_count'].sum()/len(joined_df))

Ratio between favorite_count and number of tweets: 0.18481920823388165


## Total number of tweets per user

In [11]:
num_tweets = joined_df.groupby(['user_id'])['reply_count'].count().to_frame()
num_tweets.rename(columns = {'reply_count':'num_tweets'}, inplace = True)
num_tweets

Unnamed: 0_level_0,num_tweets
user_id,Unnamed: 1_level_1
678033,2374
722623,2021
755116,2144
755746,2017
785080,3436
...,...
3156622237,3701
3158349782,109
3159993463,125
3161171948,6


## Ratio between the number of urls and number of tweets (per user)

In [12]:
num_urls = joined_df.groupby(['user_id'])['num_urls'].sum().to_frame()
num_urls

Unnamed: 0_level_0,num_urls
user_id,Unnamed: 1_level_1
678033,0
722623,0
755116,0
755746,0
785080,0
...,...
3156622237,0
3158349782,0
3159993463,0
3161171948,0


In [13]:
joined_urls = num_tweets.merge(num_urls, on = 'user_id')
joined_urls

Unnamed: 0_level_0,num_tweets,num_urls
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1
678033,2374,0
722623,2021,0
755116,2144,0
755746,2017,0
785080,3436,0
...,...,...
3156622237,3701,0
3158349782,109,0
3159993463,125,0
3161171948,6,0


In [14]:
joined_df['urls_ratio'] = joined_urls['num_urls']/joined_urls['num_tweets']
joined_df['urls_ratio'].sort_values()

678033      0.0
10935572    0.0
10866252    0.0
10352582    0.0
9427822     0.0
           ... 
11226972    NaN
11226973    NaN
11226974    NaN
11226975    NaN
11226976    NaN
Name: urls_ratio, Length: 11226977, dtype: float64

## Ratio between the number of mention and number of tweets (per user)

In [15]:
num_mentions = joined_df.groupby(['user_id'])['num_mentions'].sum().to_frame()

In [16]:
joined_mentions = num_tweets.merge(num_mentions, on = 'user_id')

In [17]:
joined_df['mentions_ratio'] = joined_mentions['num_mentions']/joined_urls['num_tweets']
joined_df['mentions_ratio'].sort_values()

3873101     0.102338
8711952     0.162857
10935572    0.268562
11174562    0.313144
9272142     0.365563
              ...   
11226972         NaN
11226973         NaN
11226974         NaN
11226975         NaN
11226976         NaN
Name: mentions_ratio, Length: 11226977, dtype: float64

## Ratio between the number of hashtags and number of tweets (per user)

In [18]:
num_hashtags = joined_df.groupby(['user_id'])['num_hashtags'].sum().to_frame()

In [19]:
joined_hashtags = num_tweets.merge(num_hashtags, on = 'user_id')

In [21]:
joined_df['hashtags_ratio'] = joined_hashtags['num_hashtags']/joined_urls['num_tweets']
joined_df['hashtags_ratio'].sort_values()
joined_df

Unnamed: 0,user_id,retweet_count,reply_count,favorite_count,num_hashtags,num_urls,num_mentions,created_at_x,text,name,lang,bot,created_at_y,statuses_count,urls_ratio,mentions_ratio,hashtags_ratio
0,327746321,0,0,0,0,0,0,2019-09-11 14:53:55,"If man is a little lower than angels, then ang...",The Amazing Rhythm,en,1,2016-07-03 02:37:46,291.615157,,,
1,327746321,0,0,0,0,0,0,2019-10-21 17:42:10,Read the Biography of Don Henley http://t.co/...,The Amazing Rhythm,en,1,2016-07-03 02:37:46,291.615157,,,
2,327746321,0,0,0,0,0,0,2019-11-02 15:11:22,Don't tell me where your priorities are. Show ...,The Amazing Rhythm,en,1,2016-07-03 02:37:46,291.615157,,,
3,327746321,0,0,0,0,0,0,2019-11-07 22:07:20,Learn About the Great Music of Bill Justis ht...,The Amazing Rhythm,en,1,2016-07-03 02:37:46,291.615157,,,
4,327746321,0,0,0,0,0,0,2019-10-12 04:46:17,Do you love James Bond? Â Check out these cool...,The Amazing Rhythm,en,1,2016-07-03 02:37:46,291.615157,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11226972,67828913,0,0,0,0,0,0,2014-08-24 06:08:32,off to bed,John Garrett,en,0,2014-08-24 06:06:50,0.693147,,,
11226973,1381744988,0,0,0,0,0,0,2018-04-30 07:32:17,End of test :(Í¡à¹Ì¯Í¡à¹) (Í¡à¹Ì¯Í¡à¹) (Í¡...,Zina Cote,en,1,2018-04-28 11:55:55,7.909313,,,
11226974,97859473,0,0,0,0,0,0,2017-07-17 08:38:34,http://t.co/6j63gV95,Alp Yavuzeser,tr,0,2014-12-21 08:52:16,0.693147,,,
11226975,291362552,0,0,0,0,0,0,2016-05-02 22:19:48,Bueno ps me pase el the thing 1 y pues estubo ...,Luis Eduardo Ramirez,es,0,2016-05-02 21:48:58,0.693147,,,


## Total | Std | Avg | Entropy, for every numerical features of the tweets and users ds (per user)