In [1]:
# Load modules
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re, os, sys, json, pickle
from tqdm import tqdm
import plotly.express as px
from datetime import datetime
import ast
import spacy
from src.helpers import * 
import time

# Autoreload for src imports
%reload_ext autoreload
%autoreload 2

import warnings
warnings.filterwarnings("ignore")

## Convert JSONL files to CSV

In [2]:
# Worklist of raw JSONL dumps in data/ that do not have a CSV counterpart yet.
# The directory is listed once and kept as a set: the original re-listed data/
# for every candidate file, i.e. one extra directory scan per entry.
data_dir_entries = set(os.listdir('data/'))
files_to_convert = sorted(
    item
    for item in data_dir_entries
    # The CSV counterpart is "<text before the first dot>.csv".
    if item.endswith('.jsonl') and item.split('.')[0] + '.csv' not in data_dir_entries
)

## Analyse the data
* Nombre de tweets dans le temps
* Nombre de tweets de différentes langues dans le temps
* Distribution des tweets par langue
* Nombre de tweets « sensitive » dans le temps
* Hashtags les plus utilisés en anglais
* Nombre de in_reply_to_user_id dans le temps (le nombre de replies -> discussion)
* Nombre de tweets par utilisateurs verified / non verified dans le temps
* Nombre de tweets supprimés
* Extraction de pronoms

In [7]:
# Stems (text before the first dot) of every CSV found in data/, in sorted order.
filenames = [entry.split('.')[0] for entry in os.listdir('data/') if entry.endswith('.csv')]
filenames.sort()

In [8]:
# Per-file accumulators: each list receives exactly one entry per name in `filenames`.
# They are combined across files in a later cell and pickled at the end of this one.
number_of_tweets_in_time = []
number_of_tweets_per_lang_in_time = []
number_of_tweets_per_lang = []
number_of_sensitive_tweets_in_time = []
hashtag_count_in_english = []
number_of_tweets_by_verified_users_in_time = []
number_of_replies_in_time = []
number_of_deleted_tweets = []
top_retweeted_tweets = []
top_favorited_tweets = []
for filename in tqdm(filenames):
    
    # Wall-clock start, used only for the timing message printed after the summary extractions.
    start = datetime.now()

    # data/<filename>.csv holds the tweets; data/<filename>.txt is read as a headerless
    # single-column file whose column is named 'tweet_id'.
    metoo_df = pd.read_csv(f'data/{filename}.csv' , lineterminator='\n')
    metoo_df_tweet_ids = pd.read_csv(f'data/{filename}.txt', names=['tweet_id'])
        
    # apply_common_transformations and the extract_* helpers are not defined in this
    # notebook (presumably they come from the star import of src.helpers — TODO confirm).
    metoo_df = apply_common_transformations(metoo_df)
    
    # One summary object per metric for this file.
    number_of_tweets_in_time.append(extract_number_of_tweets_in_time(metoo_df))
    number_of_tweets_per_lang_in_time.append(extract_number_of_tweets_per_lang_in_time(metoo_df))
    number_of_tweets_per_lang.append(extract_number_of_tweets_per_lang(metoo_df))
    number_of_sensitive_tweets_in_time.append(extract_number_of_sensitive_tweets_in_time(metoo_df))
    hashtag_count_in_english.append(extract_hashtag_count_in_english(metoo_df))
    number_of_tweets_by_verified_users_in_time.append(extract_number_of_tweets_by_verified_users_in_time(metoo_df))
    number_of_replies_in_time.append(extract_number_of_replies_in_time(metoo_df))
    # This is the only extractor that also receives the tweet-id list read from the .txt file.
    number_of_deleted_tweets.append(extract_number_of_deleted_tweets(metoo_df, metoo_df_tweet_ids))
    
    # NOTE(review): print() inside a tqdm loop interleaves with the progress bar
    # (visible in the saved output); tqdm.write() would keep the bar on one line.
    print(f'First part finished: {datetime.now() - start}')
    
    # Top 1000 rows by retweet_count among English rows whose retweet_id is missing
    # (presumably original tweets rather than retweets — TODO confirm).
    top_retweeted_tweets_df = metoo_df[(metoo_df.lang == 'en') & (metoo_df.retweet_id.isna())]
    top_retweeted_tweets_df = top_retweeted_tweets_df.sort_values('retweet_count', ascending=False).head(1000)    
    
    # Top 1000 rows by favorite_count among English rows; unlike the retweet ranking,
    # no retweet_id filter is applied here.
    top_retweeted_tweets_df = extract_pronouns(top_retweeted_tweets_df)
    top_favorited_tweets_df = extract_pronouns(metoo_df[metoo_df.lang == 'en'].sort_values('favorite_count', ascending=False).head(1000))

    top_retweeted_tweets.append(top_retweeted_tweets_df)
    top_favorited_tweets.append(top_favorited_tweets_df)

    # Full-file pronoun extraction is skipped for files whose name ends in '01'.
    # NOTE(review): the reason for this exclusion is not visible in this notebook.
    if not filename.endswith('01'):
        metoo_df = extract_pronouns(metoo_df)
        metoo_df.to_csv(f'pronoun_data/{filename}_pronouns.csv', index=False)
    

# (accumulator, name) pairs handed to pickle_item below; each name repeats the
# variable name of its accumulator.
extracted_data = [(number_of_tweets_in_time, 'number_of_tweets_in_time'), (number_of_tweets_per_lang_in_time, 'number_of_tweets_per_lang_in_time'), 
                  (number_of_tweets_per_lang, 'number_of_tweets_per_lang'), (number_of_sensitive_tweets_in_time, 'number_of_sensitive_tweets_in_time'), 
                  (hashtag_count_in_english, 'hashtag_count_in_english'), (number_of_tweets_by_verified_users_in_time, 'number_of_tweets_by_verified_users_in_time'),
                  (number_of_replies_in_time, 'number_of_replies_in_time'), (number_of_deleted_tweets, 'number_of_deleted_tweets'),
                  (top_retweeted_tweets, 'top_retweeted_tweets'), (top_favorited_tweets, 'top_favorited_tweets')]


# pickle_item is another helper not defined here; presumably it serialises each
# accumulator under the given name — TODO confirm the target location.
for item, item_name in extracted_data:
    pickle_item(item, item_name)

  0%|          | 0/33 [00:00<?, ?it/s]

First part finished: 0:00:42.803181


  3%|▎         | 1/33 [00:51<27:17, 51.18s/it]

First part finished: 0:00:30.495450


  6%|▌         | 2/33 [01:29<22:31, 43.61s/it]

First part finished: 0:00:29.928500


  9%|▉         | 3/33 [02:07<20:27, 40.91s/it]

First part finished: 0:00:24.386688


 12%|█▏        | 4/33 [02:39<18:06, 37.46s/it]

First part finished: 0:00:29.662206


 15%|█▌        | 5/33 [03:16<17:28, 37.44s/it]

First part finished: 0:00:33.502534


 18%|█▊        | 6/33 [03:58<17:28, 38.82s/it]

First part finished: 0:00:46.257422


 21%|██        | 7/33 [04:52<19:00, 43.85s/it]

First part finished: 0:00:39.928378


 24%|██▍       | 8/33 [05:40<18:50, 45.20s/it]

First part finished: 0:00:24.067110


 27%|██▋       | 9/33 [06:12<16:26, 41.10s/it]

First part finished: 0:00:21.244829


 30%|███       | 10/33 [06:42<14:22, 37.49s/it]

First part finished: 0:00:40.108709


 33%|███▎      | 11/33 [07:30<14:55, 40.69s/it]

First part finished: 0:00:41.795597


 36%|███▋      | 12/33 [08:20<15:16, 43.66s/it]

First part finished: 0:00:18.598841


 39%|███▉      | 13/33 [08:46<12:48, 38.45s/it]

First part finished: 0:00:47.604049


 42%|████▏     | 14/33 [09:42<13:46, 43.52s/it]

First part finished: 0:00:45.847256


 45%|████▌     | 15/33 [10:35<13:55, 46.39s/it]

First part finished: 0:00:26.324717


 48%|████▊     | 16/33 [11:08<12:02, 42.52s/it]

First part finished: 0:00:30.587761


 52%|█████▏    | 17/33 [11:46<10:57, 41.12s/it]

First part finished: 0:00:39.851942


 55%|█████▍    | 18/33 [12:34<10:45, 43.03s/it]

First part finished: 0:00:50.717077


 58%|█████▊    | 19/33 [13:32<11:06, 47.61s/it]

First part finished: 0:00:51.205394


 61%|██████    | 20/33 [14:30<11:00, 50.81s/it]

First part finished: 0:00:26.668868


 64%|██████▎   | 21/33 [15:03<09:04, 45.41s/it]

First part finished: 0:00:25.377102


 67%|██████▋   | 22/33 [15:35<07:35, 41.38s/it]

First part finished: 0:00:41.870387


 70%|██████▉   | 23/33 [16:25<07:20, 44.06s/it]

First part finished: 0:00:50.647526


 73%|███████▎  | 24/33 [17:23<07:14, 48.27s/it]

First part finished: 0:00:40.570976


 76%|███████▌  | 25/33 [18:12<06:27, 48.43s/it]

First part finished: 0:00:35.423711


 79%|███████▉  | 26/33 [18:56<05:28, 46.98s/it]

First part finished: 0:00:02.634094


 82%|████████▏ | 27/33 [19:06<03:36, 36.08s/it]

First part finished: 0:00:40.737525


 85%|████████▍ | 28/33 [19:56<03:20, 40.12s/it]

First part finished: 0:00:40.964275


 88%|████████▊ | 29/33 [20:45<02:51, 42.85s/it]

First part finished: 0:00:44.518607


 91%|█████████ | 30/33 [21:38<02:17, 45.78s/it]

First part finished: 0:00:50.221392


 94%|█████████▍| 31/33 [22:36<01:38, 49.46s/it]

First part finished: 0:00:49.567588


 97%|█████████▋| 32/33 [23:33<00:51, 51.83s/it]

First part finished: 0:00:47.407841


100%|██████████| 33/33 [24:28<00:00, 44.52s/it]


In [37]:
# Stitch the per-file partial results into one object per metric.
# Targets on the left are listed in the same order as the accumulators on the right;
# every one of them is combined with a plain pd.concat.
(
    number_of_tweets_in_time_df,
    number_of_tweets_per_lang_in_time_df,
    number_of_tweets_per_lang_df,
    number_of_sensitive_tweets_in_time_df,
    hashtag_count_in_english_df,
    number_of_tweets_by_verified_users_in_time_df,
    number_of_replies_in_time_df,
    top_retweeted_tweets_df,
    top_favorited_tweets_df,
) = (
    pd.concat(per_file_parts)
    for per_file_parts in (
        number_of_tweets_in_time,
        number_of_tweets_per_lang_in_time,
        number_of_tweets_per_lang,
        number_of_sensitive_tweets_in_time,
        hashtag_count_in_english,
        number_of_tweets_by_verified_users_in_time,
        number_of_replies_in_time,
        top_retweeted_tweets,
        top_favorited_tweets,
    )
)

# The deleted-tweet accumulator is the one exception: it is handed to the
# DataFrame constructor (one row per file) instead of being concatenated.
number_of_deleted_tweets_df = pd.DataFrame(number_of_deleted_tweets)

In [13]:
# Persist the combined top-tweet tables for the pronoun analysis.
# reset_index() moves the concatenated index into a regular column, so it is still
# written even though index=False suppresses the fresh positional index.
top_favorited_tweets_df.reset_index().to_csv('pronoun_data/top_favorited_tweets.csv', index=False)

# Written under the '_v2_df' name that the pronoun-distribution cell reads back.
# The previous target ('pronoun_data/top_retweeted_tweets_df.csv') is not loaded by
# any cell visible in this notebook, so a fresh run could not find its own input.
# NOTE(review): confirm the existing v2 file was produced by this notebook before re-running.
top_retweeted_tweets_df.reset_index().to_csv('pronoun_data/top_retweeted_tweets_v2_df.csv', index=False)


In [25]:
# Collapse the concatenated per-file counts into one total per calendar day, then
# keep the 2017-01-01 .. 2021-01-01 window (label slice, both ends included).
number_of_tweets_in_time_df = (
    number_of_tweets_in_time_df
    .groupby(pd.Grouper(freq='D'))
    .sum()
    .loc['2017-01-01':'2021-01-01']
)

# Optional round-trip through export_summary/ (left disabled):
# number_of_tweets_in_time_df.to_csv('export_summary/number_of_tweets_in_time_df.csv')
# number_of_tweets_in_time_df = pd.read_csv('export_summary/number_of_tweets_in_time_df.csv', index_col=0)

# Title typo fixed ("tweeets" -> "tweets"); the figure is the cell's displayed output.
px.line(number_of_tweets_in_time_df, title='Number of MeToo related tweets in time')

In [13]:
# Total per language across all files, largest first, truncated to the 30 biggest.
number_of_tweets_per_lang_df = (
    number_of_tweets_per_lang_df
    .groupby('lang')
    .sum()
    .sort_values(ascending=False)
    .head(30)
)
# Language codes of those 30 rows, in ranking order; used to filter the per-language time series.
top_30_languages = number_of_tweets_per_lang_df.index.tolist()

# Optional round-trip through export_summary/ (left disabled):
# number_of_tweets_per_lang_df.to_csv('export_summary/number_of_tweets_per_lang.csv')
# number_of_tweets_per_lang_df = pd.read_csv('export_summary/number_of_tweets_per_lang.csv', index_col=0)

px.bar(number_of_tweets_per_lang_df, title='Tweets per language')

In [12]:
# Daily totals per language for the top-30 languages, reshaped to one column per language.
number_of_tweets_per_lang_in_time_df = (
    number_of_tweets_per_lang_in_time_df
    .reset_index()
    # Keep rows inside 2017-01-01 .. 2021-01-01 (both bounds included) whose
    # language is one of the 30 most frequent ones.
    .loc[lambda counts: counts.created_at.between('2017-01-01', '2021-01-01')
                        & counts.lang.isin(top_30_languages)]
    .set_index('created_at')
    # Re-aggregate to calendar days; the 'id' column is summed, so it presumably
    # already holds per-bucket tweet counts — TODO confirm against the extractor.
    .groupby([pd.Grouper(freq='D'), 'lang'])['id']
    .sum()
    .reset_index()
    .pivot_table(index='created_at', columns='lang', values='id')
)

# Optional round-trip through export_summary/ (left disabled):
# number_of_tweets_per_lang_in_time_df.to_csv('export_summary/number_of_tweets_per_lang_in_time.csv')
# number_of_tweets_per_lang_in_time_df = pd.read_csv('export_summary/number_of_tweets_per_lang_in_time.csv', index_col=0)

px.line(number_of_tweets_per_lang_in_time_df, title='Number of tweets in time per language')

In [17]:
# Daily totals split by the possibly_sensitive flag, one column per flag label.
number_of_sensitive_tweets_in_time_df = (
    number_of_sensitive_tweets_in_time_df
    .reset_index()
    # Keep rows inside 2017-01-01 .. 2021-01-01 (both bounds included).
    .loc[lambda counts: counts.created_at.between('2017-01-01', '2021-01-01')]
    # Map the flag to readable labels; both real booleans and their string
    # spellings are handled, and missing values become 'unknown'.
    .assign(possibly_sensitive=lambda counts: (
        counts.possibly_sensitive
        .replace(np.nan, 'unknown')
        .replace(False, 'not_sensitive')
        .replace(True, 'possibly_sensitive')
        .replace('False', 'not_sensitive')
        .replace('True', 'possibly_sensitive')
    ))
    .set_index('created_at')
    .groupby([pd.Grouper(freq='D'), 'possibly_sensitive'], dropna=False)['id']
    .sum()
    .reset_index()
    .pivot_table(index='created_at', columns='possibly_sensitive', values='id')
)

# Optional round-trip through export_summary/ (left disabled):
# number_of_sensitive_tweets_in_time_df.to_csv('export_summary/number_of_sensitive_tweets_in_time.csv')
# number_of_sensitive_tweets_in_time_df = pd.read_csv('export_summary/number_of_sensitive_tweets_in_time.csv', index_col=0)

# Same chart twice: linear y-axis first, then logarithmic.
for use_log_scale in (False, True):
    fig = px.line(number_of_sensitive_tweets_in_time_df, title='Number of sensitive tweets in time', log_y=use_log_scale)
    fig.show()

In [15]:
# Add up the per-file hashtag counts and rank hashtags from most to least used.
hashtag_totals = hashtag_count_in_english_df.reset_index().groupby('hashtags')['count'].sum()
hashtag_count_in_english_df = hashtag_totals.sort_values(ascending=False)

# Optional round-trip of the full ranking through export_summary/ (left disabled):
# hashtag_count_in_english_df.to_csv('export_summary/hashtag_count_in_english_df.csv')
# hashtag_count_in_english_df = pd.read_csv('export_summary/hashtag_count_in_english_df.csv', index_col=0)

# Only the 30 most used hashtags are plotted.
hashtag_count_in_english_df = hashtag_count_in_english_df.head(30)
px.bar(hashtag_count_in_english_df, title='Top 30 english hashtags for MeToo related tweets')

In [14]:
# Daily totals split by the author's verification status, one column per status label.
# (The stray bare-name expression that opened this cell displayed nothing and is dropped.)
number_of_tweets_by_verified_users_in_time_df = (
    number_of_tweets_by_verified_users_in_time_df
    .reset_index()
    # Keep rows inside 2017-01-01 .. 2021-01-01 (both bounds included).
    .loc[lambda counts: counts.created_at.between('2017-01-01', '2021-01-01')]
    # Map the flag to readable labels; both real booleans and their string
    # spellings are handled, and missing values become 'unknown'.
    .assign(user_verified=lambda counts: (
        counts.user_verified
        .replace(np.nan, 'unknown')
        .replace(False, 'not_verified')
        .replace(True, 'verified')
        .replace('False', 'not_verified')
        .replace('True', 'verified')
    ))
    .set_index('created_at')
    .groupby([pd.Grouper(freq='D'), 'user_verified'], dropna=False)['id']
    .sum()
    .reset_index()
    .pivot_table(index='created_at', columns='user_verified', values='id')
)

# Optional round-trip through export_summary/ (left disabled):
# number_of_tweets_by_verified_users_in_time_df.to_csv('export_summary/number_of_tweets_by_verified_users_in_time.csv')
# number_of_tweets_by_verified_users_in_time_df = pd.read_csv('export_summary/number_of_tweets_by_verified_users_in_time.csv', index_col=0)

# Same chart twice: linear y-axis first, then logarithmic.
for use_log_scale in (False, True):
    fig = px.line(number_of_tweets_by_verified_users_in_time_df, title='Number of tweets by user type in time', log_y=use_log_scale)
    fig.show()

In [31]:
# Add the per-file rows up into one total per column.
# The isinstance guard keeps the cell re-runnable: once the name holds the summed
# result, calling .sum() again would collapse it to a single scalar and break the
# bar chart that plots it.
if isinstance(number_of_deleted_tweets_df, pd.DataFrame):
    number_of_deleted_tweets_df = number_of_deleted_tweets_df.sum()
# number_of_deleted_tweets_df.to_csv('export_summary/number_of_deleted_tweets.csv')

In [44]:
# Bar chart of the summed deleted-tweet totals; the figure is the cell's displayed output.
px.bar(
    number_of_deleted_tweets_df,
    title='Deleted vs Remaining Tweets',
)

In [39]:
# Daily totals split by reply status, one column per status label.
number_of_replies_in_time_df = (
    number_of_replies_in_time_df
    .reset_index()
    # Keep rows inside 2017-01-01 .. 2021-01-01 (both bounds included).
    .loc[lambda counts: counts.created_at.between('2017-01-01', '2021-01-01')]
    # Map the flag to readable labels; both real booleans and their string
    # spellings are handled, and missing values become 'unknown'.
    .assign(is_reply=lambda counts: (
        counts.is_reply
        .replace(np.nan, 'unknown')
        .replace(False, 'not_reply')
        .replace(True, 'reply')
        .replace('False', 'not_reply')
        .replace('True', 'reply')
    ))
    .set_index('created_at')
    .groupby([pd.Grouper(freq='D'), 'is_reply'], dropna=False)['id']
    .sum()
    .reset_index()
    .pivot_table(index='created_at', columns='is_reply', values='id')
)

# Optional round-trip through export_summary/ (left disabled). The reload line used
# to name the sensitive-tweet frame and file (copy-paste slip); it now targets the
# replies export written by the line above it.
# number_of_replies_in_time_df.to_csv('export_summary/number_of_replies_in_time.csv')
# number_of_replies_in_time_df = pd.read_csv('export_summary/number_of_replies_in_time.csv', index_col=0)

# Same chart twice: linear y-axis first, then logarithmic.
for use_log_scale in (False, True):
    fig = px.line(number_of_replies_in_time_df, title='Number of replies in time', log_y=use_log_scale)
    fig.show()

## Distribution of pronouns

In [5]:
def _explode_pronoun_data(tweets_df):
    """Return ``tweets_df`` joined with one row per entry of its ``pronoun_data`` column.

    ``pronoun_data`` is read back from CSV as text, so each cell is first parsed
    with ``ast.literal_eval``. The parsed values are exploded to one element per
    row (the original row label is repeated), and every element is expanded into
    columns with ``pd.Series``. Joining on the index then repeats each tweet row
    once per element.

    The elements are presumably dicts carrying keys such as ``pronoun`` and
    ``explain`` (later cells group by those columns) — TODO confirm.

    Parameters
    ----------
    tweets_df : pandas.DataFrame
        Frame with a ``pronoun_data`` column of Python-literal strings.

    Returns
    -------
    pandas.DataFrame
        The input columns plus the columns expanded from ``pronoun_data``.
    """
    per_pronoun = (
        tweets_df.pronoun_data
        .apply(ast.literal_eval)
        .explode()
        .apply(pd.Series)
    )
    return pd.merge(tweets_df, per_pronoun, left_index=True, right_index=True)


# Both top-tweet tables go through the same parse/explode/join step.
df_favorites = _explode_pronoun_data(pd.read_csv('pronoun_data/top_favorited_tweets.csv'))
df_retweets = _explode_pronoun_data(pd.read_csv('pronoun_data/top_retweeted_tweets_v2_df.csv'))

  df_favorites = pd.read_csv('pronoun_data/top_favorited_tweets.csv')
  df_retweets = pd.read_csv('pronoun_data/top_retweeted_tweets_v2_df.csv')


In [14]:
# Row count per (pronoun, explain) pair among the top retweeted tweets;
# the 50 most frequent pairs are exported.
pronoun_pair_counts = df_retweets.groupby(['pronoun', 'explain']).created_at.count()
most_frequent_pairs = pronoun_pair_counts.sort_values(ascending=False).head(50)
most_frequent_pairs.to_csv('export_summary/pronoun_distribution_for_top_retweeted_tweets.csv')

In [None]:
# Keep every exploded row that belongs to the 1000 most-favorited tweets.
# The ranking is done on one row per tweet id. The previous version filtered on
# id.isin(id.unique()), which keeps every row, so head(1000) ranked exploded rows
# and a tweet with several pronoun rows used up several of the 1000 slots.
top_favorited_ids = (
    df_favorites
    .drop_duplicates(subset='id')
    .sort_values('favorite_count', ascending=False)
    .head(1000)
    .id
)
df_favorites = df_favorites[df_favorites.id.isin(top_favorited_ids)]

In [None]:
# Write both exploded tables to the working directory (index column included).
for exploded_frame, csv_path in (
    (df_favorites, 'df_favorites_exploded.csv'),
    (df_retweets, 'df_retweets_exploded.csv'),
):
    exploded_frame.to_csv(csv_path)
