In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy

In [2]:
# Load the user table from cleaned_users.csv in the working directory.
users = pd.read_csv(filepath_or_buffer="cleaned_users.csv")

In [3]:
# Load the tweet table: the first CSV column is used as the index and
# '\n' is the character that separates rows.
tweets = pd.read_csv(
    "cleaned_tweets.csv",
    lineterminator='\n',
    index_col=0,
)

In [4]:
# Rename the users' 'id' column to 'user_id', the key the tweets are merged on.
users = users.rename(columns={'id': 'user_id'})

In [5]:
# Attach the author's attributes to every tweet by joining on user_id
# (merge defaults to an inner join).
joined_df = pd.merge(tweets, users, on='user_id')

In [6]:
# Preview the first five rows of the merged frame.
joined_df.head(n=5)

Unnamed: 0,user_id,retweet_count,reply_count,favorite_count,num_hashtags,num_urls,num_mentions,created_at_x,text,name,lang,bot,created_at_y,statuses_count
0,327746321,0,0,0,0,0,0,2019-09-11 14:53:55,"If man is a little lower than angels, then ang...",The Amazing Rhythm,en,1,2016-07-03 02:37:46,291.615157
1,327746321,0,0,0,0,0,0,2019-10-21 17:42:10,Read the Biography of Don Henley http://t.co/...,The Amazing Rhythm,en,1,2016-07-03 02:37:46,291.615157
2,327746321,0,0,0,0,0,0,2019-11-02 15:11:22,Don't tell me where your priorities are. Show ...,The Amazing Rhythm,en,1,2016-07-03 02:37:46,291.615157
3,327746321,0,0,0,0,0,0,2019-11-07 22:07:20,Learn About the Great Music of Bill Justis ht...,The Amazing Rhythm,en,1,2016-07-03 02:37:46,291.615157
4,327746321,0,0,0,0,0,0,2019-10-12 04:46:17,Do you love James Bond? Â Check out these cool...,The Amazing Rhythm,en,1,2016-07-03 02:37:46,291.615157


## Average length of a tweet

In [7]:
# Text column of the merged frame, cast to str.
text = joined_df.loc[:, 'text'].astype(str)

In [8]:
# Mean number of characters per tweet.
avg_tweet_length = text.map(len).mean()
print('Average length of a tweet:', avg_tweet_length)

Average length of a tweet: 71.53352429598813


## Ratio between the number of likes and the number of tweets

In [9]:
# Sum of favorite_count over all tweets divided by the number of rows in the merged frame.
total_favorites = joined_df['favorite_count'].sum()
total_rows = len(joined_df)
print('Ratio between favorite_count and number of tweets:', total_favorites / total_rows)

Ratio between favorite_count and number of tweets: 0.18481920823388165


## Total number of tweets per user

In [10]:
# Tweets per user. size() counts every row in each user_id group, whereas
# count() on a value column (e.g. reply_count) skips rows where that column
# is missing and would undercount the user's tweets.
num_tweets = joined_df.groupby('user_id').size().to_frame(name='num_tweets')
# Show the per-user totals in ascending order.
num_tweets['num_tweets'].sort_values()

user_id
21706899         1
1385297372       1
1386172154       1
126216653        1
1208102538       1
              ... 
164940888     3903
157029836     3903
1693274954    3906
497404180     3919
491630583     3929
Name: num_tweets, Length: 11508, dtype: int64

## Ratio between the number of urls and number of tweets (per user)

In [11]:
# Total number of URLs over each user's tweets, as a one-column frame indexed by user_id.
num_urls = joined_df.groupby('user_id')[['num_urls']].sum()
num_urls

Unnamed: 0_level_0,num_urls
user_id,Unnamed: 1_level_1
678033,0
722623,0
755116,0
755746,0
785080,0
...,...
3156622237,0
3158349782,0
3159993463,0
3161171948,0


In [12]:
# Place each user's tweet count and URL total side by side, matched on user_id.
joined_urls = pd.merge(num_tweets, num_urls, how='inner', on='user_id')
joined_urls

Unnamed: 0_level_0,num_tweets,num_urls
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1
678033,2374,0
722623,2021,0
755116,2144,0
755746,2017,0
785080,3436,0
...,...,...
3156622237,3701,0
3158349782,109,0
3159993463,125,0
3161171948,6,0


In [20]:
# URLs per tweet for each user, stored as a new column on num_tweets.
num_tweets['urls_ratio'] = joined_urls['num_urls'].div(joined_urls['num_tweets'])
num_tweets['urls_ratio']

user_id
678033        0.0
722623        0.0
755116        0.0
755746        0.0
785080        0.0
             ... 
3156622237    0.0
3158349782    0.0
3159993463    0.0
3161171948    0.0
3164941860    0.0
Name: urls_ratio, Length: 11508, dtype: float64

## Ratio between the number of mentions and number of tweets (per user)

In [14]:
# Total number of mentions over each user's tweets, as a one-column frame indexed by user_id.
num_mentions = joined_df.groupby('user_id')[['num_mentions']].sum()

In [15]:
# Place each user's tweet count and mention total side by side, matched on user_id.
joined_mentions = pd.merge(num_tweets, num_mentions, how='inner', on='user_id')

In [22]:
# Mentions per tweet for each user. Numerator and denominator both come from
# joined_mentions, so this cell does not depend on the unrelated joined_urls frame.
num_tweets['mentions_ratio'] = joined_mentions['num_mentions'] / joined_mentions['num_tweets']
# Distinct ratio values observed across users.
num_tweets['mentions_ratio'].unique()

array([0.74894693, 0.53686294, 0.61007463, ..., 0.68807339, 0.208     ,
       0.84931507])

## Ratio between the number of hashtags and number of tweets (per user)

In [17]:
# Total number of hashtags over each user's tweets, as a one-column frame indexed by user_id.
num_hashtags = joined_df.groupby('user_id')[['num_hashtags']].sum()

In [18]:
# Place each user's tweet count and hashtag total side by side, matched on user_id.
joined_hashtags = pd.merge(num_tweets, num_hashtags, how='inner', on='user_id')

In [29]:
# Hashtags per tweet for each user. Numerator and denominator both come from
# joined_hashtags, so this cell does not depend on the unrelated joined_urls frame.
num_tweets['hashtags_ratio'] = joined_hashtags['num_hashtags'] / joined_hashtags['num_tweets']
# Distinct ratio values observed across users.
num_tweets['hashtags_ratio'].unique()

array([0.00000000e+00, 1.00000000e+00, 3.76081234e-04, 2.27790433e-03,
       2.59268862e-04, 2.57731959e-04, 3.13283208e-04, 5.97371565e-04,
       6.15384615e-04, 9.99070200e-01, 9.99456817e-01, 7.28407908e-03,
       2.71223217e-04, 5.21920668e-04, 6.78794461e-03, 4.14078675e-04,
       5.35905681e-04, 1.12317484e-03, 4.67289720e-04, 5.47345375e-04,
       9.97632576e-01, 9.99490056e-01, 5.37634409e-04, 9.98714653e-01,
       8.20568928e-04, 4.53720508e-04, 2.37404379e-03, 4.19287212e-04,
       4.61254613e-04, 1.73913043e-03, 4.27350427e-04, 5.43773790e-04,
       5.59284116e-04, 9.99640546e-01, 8.80281690e-04, 2.24215247e-03,
       9.99731760e-01, 3.37154417e-04, 4.16666667e-03, 4.76871722e-04,
       9.98335183e-01, 1.36736554e-03, 1.14351058e-03, 7.30460190e-04,
       9.99014293e-01, 5.45553737e-03, 4.59347726e-04, 8.00640512e-04,
       8.11688312e-04, 1.08577633e-03, 3.72578241e-04, 1.37362637e-03,
       8.53242321e-04, 3.25884544e-03, 4.99251123e-04, 5.80720093e-04,
      

## Total | Std | Avg | Entropy, for every numerical feature of the tweets and users datasets (per user)