In [1]:
import os
import pandas as pd
import json
import re
from deep_translator import GoogleTranslator
import swifter
import pre_processing

%load_ext autoreload
%autoreload 2

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/robinjaccard/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/robinjaccard/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/robinjaccard/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/robinjaccard/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/robinjaccard/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# Let displayed text columns show up to 500 characters before being truncated.
pd.set_option("display.max_colwidth", 500)

## load the dataset

In [3]:
# Gather every tweet dump whose file name contains 'withheldtweets.json',
# searching the current working directory recursively.
#
# NOTE(review): the rows displayed further down show id_str ending in different
# digits than id, which looks like precision loss during numeric inference in
# read_json -- consider passing dtype={'id_str': str}; confirm before relying
# on id_str.
dfs = []
for root, _subdirs, filenames in os.walk(os.getcwd()):
    for file in filenames:
        if 'withheldtweets.json' in file:
            # Build the path from the directory os.walk is currently visiting.
            # A hard-coded "./censored_tweets/" prefix fails (or silently reads
            # a different file) when the match lives in any other directory.
            # lines=True: every line of the file is parsed as one JSON record.
            dfs.append(pd.read_json(os.path.join(root, file), lines=True))

# Stack all files into one frame and keep only the tweets that carry a
# 'withheld_in_countries' value.
df_cen = pd.concat(dfs)
df_cen = df_cen.dropna(subset=['withheld_in_countries'])

In [4]:
# Display the column labels of the loaded frame.
df_cen.columns

Index(['created_at', 'id', 'id_str', 'text', 'source', 'truncated',
       'in_reply_to_status_id', 'in_reply_to_status_id_str',
       'in_reply_to_user_id', 'in_reply_to_user_id_str',
       'in_reply_to_screen_name', 'user', 'geo', 'coordinates', 'place',
       'contributors', 'quoted_status_id', 'quoted_status_id_str',
       'quoted_status', 'quoted_status_permalink', 'is_quote_status',
       'extended_tweet', 'quote_count', 'reply_count', 'retweet_count',
       'favorite_count', 'entities', 'favorited', 'retweeted', 'filter_level',
       'lang', 'timestamp_ms', 'linked', 'display_text_range',
       'withheld_in_countries', 'extended_entities', 'possibly_sensitive',
       'retweeted_status', 'withheld_copyright'],
      dtype='object')

In [5]:
# Use the tweet id as the row index.
# Rebinding (instead of inplace=True) together with the guard keeps this cell
# re-runnable: once 'id' is the index it is no longer a column, so a second
# unguarded set_index('id') would raise KeyError.
if df_cen.index.name != 'id':
    df_cen = df_cen.set_index('id')

In [6]:
# Display the dtype pandas inferred for each column.
df_cen.dtypes

created_at                   datetime64[ns, UTC]
id_str                                     int64
text                                      object
source                                    object
truncated                                   bool
in_reply_to_status_id                    float64
in_reply_to_status_id_str                float64
in_reply_to_user_id                      float64
in_reply_to_user_id_str                  float64
in_reply_to_screen_name                   object
user                                      object
geo                                       object
coordinates                               object
place                                     object
contributors                             float64
quoted_status_id                         float64
quoted_status_id_str                     float64
quoted_status                             object
quoted_status_permalink                   object
is_quote_status                             bool
extended_tweet      

### Preclean dataset

In [7]:
# (rows, columns) before duplicate texts are removed.
df_cen.shape

(41727, 38)

In [8]:
# Collapse rows sharing the same tweet text, keeping the first occurrence of each.
df_without_duplicate = df_cen.drop_duplicates(subset=["text"], keep="first")

In [9]:
# (rows, columns) after dropping rows with duplicate text.
df_without_duplicate.shape

(23081, 38)

In [10]:
# Run the project's tweet cleaner over the raw text column.
# NOTE(review): later cells index clean_t with a boolean mask built from
# df_without_duplicate and assign it to df_clean['text'], so it is presumably a
# Series sharing the input's index -- confirm in pre_processing.clean_tweets.
clean_t = pre_processing.clean_tweets(df_without_duplicate["text"])

In [11]:
## Inspect tweets withheld in Germany (DE)

In [12]:
# Show the tweets whose withheld-country list is exactly ['DE'].
df_without_duplicate.loc[
    df_without_duplicate["withheld_in_countries"].map(lambda countries: countries == ['DE'])
]

Unnamed: 0_level_0,created_at,id_str,text,source,truncated,in_reply_to_status_id,in_reply_to_status_id_str,in_reply_to_user_id,in_reply_to_user_id_str,in_reply_to_screen_name,...,filter_level,lang,timestamp_ms,linked,display_text_range,withheld_in_countries,extended_entities,possibly_sensitive,retweeted_status,withheld_copyright
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1365260577564745734,2021-02-26 11:21:01+00:00,1365260577564745728,RT @maurodestefani: STRIP CHAT ⭐️ https://t.co/OSgHIoslTo\nSTRIP CHAT ⭐️ https://t.co/OSgHIoslTo https://t.co/PkyUucxYWH,"<a href=""https://mobile.twitter.com"" rel=""nofollow"">Twitter Web App</a>",False,,,,,,...,low,en,2021-02-26 11:21:01.663,no,,[DE],"{'media': [{'id': 1271048566983872512, 'id_str': '1271048566983872512', 'indices': [96, 119], 'additional_media_info': {'monetizable': False}, 'media_url': 'http://pbs.twimg.com/ext_tw_video_thumb/1271048566983872512/pu/img/3IgGhfJt9xxdjrkU.jpg', 'media_url_https': 'https://pbs.twimg.com/ext_tw_video_thumb/1271048566983872512/pu/img/3IgGhfJt9xxdjrkU.jpg', 'url': 'https://t.co/PkyUucxYWH', 'display_url': 'pic.twitter.com/PkyUucxYWH', 'expanded_url': 'https://twitter.com/maurodestefani/status/...",1.0,"{'created_at': 'Thu Jun 11 11:57:10 +0000 2020', 'id': 1271048827361988608, 'id_str': '1271048827361988608', 'text': 'STRIP CHAT ⭐️ https://t.co/OSgHIoslTo STRIP CHAT ⭐️ https://t.co/OSgHIoslTo https://t.co/PkyUucxYWH', 'display_text_range': [0, 75], 'source': '<a href=""https://socialbee.io/"" rel=""nofollow"">SocialBee.io v2</a>', 'truncated': False, 'in_reply_to_status_id': None, 'in_reply_to_status_id_str': None, 'in_reply_to_user_id': None, 'in_reply_to_user_id_str': None, 'in_reply_to_scre...",
1365226716940042247,2021-02-26 09:06:28+00:00,1365226716940042240,RT @LaureenPinkXXX: 📣⤵️📣\n\n⚠️#LaureenPink @clips4sale\n\n👉🆕blonde pornwhores are better 🆕⚠️\n\n https://t.co/LN6vNa3Bid⚠️\n\n👇🅙🅞🅘🅝 🅐🅝🅓 🅔🅝🅙🅞🅨👇\n\n🐥↪️…,"<a href=""https://mobile.twitter.com"" rel=""nofollow"">Twitter Web App</a>",False,,,,,,...,low,de,2021-02-26 09:06:28.661,no,,[DE],,1.0,"{'created_at': 'Fri Feb 19 09:31:05 +0000 2021', 'id': 1362696195743887362, 'id_str': '1362696195743887362', 'text': '📣⤵️📣 ⚠️#LaureenPink @clips4sale 👉🆕blonde pornwhores are better 🆕⚠️  https://t.co/LN6vNa3Bid⚠️ 👇🅙🅞🅘🅝 🅐🅝🅓 🅔🅝🅙🅞🅨👇… https://t.co/pVf9TcBvSr', 'display_text_range': [0, 140], 'source': '<a href=""https://mobile.twitter.com"" rel=""nofollow"">Twitter Web App</a>', 'truncated': True, 'in_reply_to_status_id': None, 'in_reply_to_status_id_str': None, 'in_reply_to_user_id': None, 'in_r...",
1365226825983557632,2021-02-26 09:06:54+00:00,1365226825983557632,RT @LaureenPinkXXX: 🌹#SHOUTOUT #KevinsAngels🌹\n\n🌹CHECK OUT @EroticismEzine Eroticism Magazin 🌹\n\n🔝🌹@LaureenPinkCOVER/Centerfold🌹🔝\n\n#KevinsAn…,"<a href=""https://mobile.twitter.com"" rel=""nofollow"">Twitter Web App</a>",False,,,,,,...,low,en,2021-02-26 09:06:54.659,no,,[DE],,,"{'created_at': 'Sun Jan 31 14:10:59 +0000 2021', 'id': 1355881264524488705, 'id_str': '1355881264524488705', 'text': '🌹#SHOUTOUT #KevinsAngels🌹 🌹CHECK OUT @EroticismEzine Eroticism Magazin 🌹 🔝🌹@LaureenPinkCOVER/Centerfold🌹🔝… https://t.co/mYB91K2mrB', 'display_text_range': [0, 140], 'source': '<a href=""https://mobile.twitter.com"" rel=""nofollow"">Twitter Web App</a>', 'truncated': True, 'in_reply_to_status_id': None, 'in_reply_to_status_id_str': None, 'in_reply_to_user_id': None, 'in_reply_t...",
1365480904328679429,2021-02-27 01:56:31+00:00,1365480904328679424,RT @LaureenPinkXXX: 💋🅼🆈 🅽🅰🆄🅶🅷🆃🆈 🅲🅾🅽🆃🅴🅽🆃❗ 💋\n\n👉#mustfollow #LaureenPink #MDH👈\n\n👉Laureens heisses Profil - Laureens megahot profile 👈\n\n👉🆓💯SIG…,"<a href=""http://twitter.com/download/android"" rel=""nofollow"">Twitter for Android</a>",False,,,,,,...,low,en,2021-02-27 01:56:31.657,no,,[DE],,,"{'created_at': 'Sun Jan 31 14:14:51 +0000 2021', 'id': 1355882239159054341, 'id_str': '1355882239159054341', 'text': '💋🅼🆈 🅽🅰🆄🅶🅷🆃🆈 🅲🅾🅽🆃🅴🅽🆃❗ 💋 👉#mustfollow #LaureenPink #MDH👈 👉Laureens heisses Profil - Laureens megahot profile 👈 👉🆓… https://t.co/DAWnVKfplr', 'display_text_range': [0, 140], 'source': '<a href=""https://mobile.twitter.com"" rel=""nofollow"">Twitter Web App</a>', 'truncated': True, 'in_reply_to_status_id': None, 'in_reply_to_status_id_str': None, 'in_reply_to_user_id': None, 'in_...",
1365363136698679299,2021-02-26 18:08:33+00:00,1365363136698679296,RT @sIutstark: https://t.co/s1GPJCbIcp,"<a href=""https://mobile.twitter.com"" rel=""nofollow"">Twitter Web App</a>",False,,,,,,...,low,und,2021-02-26 18:08:33.666,no,,[DE],"{'media': [{'id': 1362354264799141889, 'id_str': '1362354264799141889', 'indices': [15, 38], 'additional_media_info': {'title': '', 'description': '', 'embeddable': True, 'monetizable': False}, 'media_url': 'http://pbs.twimg.com/amplify_video_thumb/1362354264799141889/img/ids1V7jMPmEVjiah.jpg', 'media_url_https': 'https://pbs.twimg.com/amplify_video_thumb/1362354264799141889/img/ids1V7jMPmEVjiah.jpg', 'url': 'https://t.co/s1GPJCbIcp', 'display_url': 'pic.twitter.com/s1GPJCbIcp', 'expanded_ur...",1.0,"{'created_at': 'Thu Feb 25 00:02:20 +0000 2021', 'id': 1364727393403867138, 'id_str': '1364727393403867138', 'text': 'https://t.co/s1GPJCbIcp', 'source': '<a href=""http://twitter.com/download/android"" rel=""nofollow"">Twitter for Android</a>', 'truncated': False, 'in_reply_to_status_id': None, 'in_reply_to_status_id_str': None, 'in_reply_to_user_id': None, 'in_reply_to_user_id_str': None, 'in_reply_to_screen_name': None, 'user': {'id': 4047831672, 'id_str': '4047831672', 'name': '18+', 'screen...",
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1361663278204739593,2021-02-16 13:06:38+00:00,1361663278204739584,RT @togks0513: https://t.co/bTTeIbeIjo https://t.co/UXWteK4lYT,"<a href=""https://mobile.twitter.com"" rel=""nofollow"">Twitter Web App</a>",False,,,,,,...,low,und,2021-02-16 13:06:38.661,no,,[DE],"{'media': [{'id': 1354929760678371336, 'id_str': '1354929760678371336', 'indices': [39, 62], 'additional_media_info': {'monetizable': False}, 'media_url': 'http://pbs.twimg.com/ext_tw_video_thumb/1354929760678371336/pu/img/yAr-M2xa1PigvJDV.jpg', 'media_url_https': 'https://pbs.twimg.com/ext_tw_video_thumb/1354929760678371336/pu/img/yAr-M2xa1PigvJDV.jpg', 'url': 'https://t.co/UXWteK4lYT', 'display_url': 'pic.twitter.com/UXWteK4lYT', 'expanded_url': 'https://twitter.com/togks0513/status/135492...",1.0,"{'created_at': 'Thu Jan 28 23:10:30 +0000 2021', 'id': 1354929874088185856, 'id_str': '1354929874088185856', 'text': 'https://t.co/bTTeIbeIjo https://t.co/UXWteK4lYT', 'display_text_range': [0, 23], 'source': '<a href=""https://socialbee.io/"" rel=""nofollow"">SocialBee.io v2</a>', 'truncated': False, 'in_reply_to_status_id': None, 'in_reply_to_status_id_str': None, 'in_reply_to_user_id': None, 'in_reply_to_user_id_str': None, 'in_reply_to_screen_name': None, 'user': {'id': 323011971, 'id_str': ...",
1361757582927495169,2021-02-16 19:21:22+00:00,1361757582927495168,RT @GoldenBoy6422: 🅕⃤ @Teicu6\n🅞⃤ @Sade6422\n🅛⃤ @CoffeeMaid4\n🅛⃤ \n🅞⃤˙·٠•●♥𝕊𝔼𝕏●𝕊ℍ𝕆ℙ●𝕆ℕ𝕃𝕀ℕ𝔼♥●•٠·˙\n🅦⃤ https://t.co/IpzLHLguub https://t.co/G…,"<a href=""https://mobile.twitter.com"" rel=""nofollow"">Twitter Web App</a>",False,,,,,,...,low,vi,2021-02-16 19:21:22.659,no,,[DE],,1.0,"{'created_at': 'Tue Feb 09 23:28:41 +0000 2021', 'id': 1359283106839166977, 'id_str': '1359283106839166977', 'text': '🅕⃤ @Teicu6 🅞⃤ @Sade6422 🅛⃤ @CoffeeMaid4 🅛⃤ 🅞⃤˙·٠•●♥𝕊𝔼𝕏●𝕊ℍ𝕆ℙ●𝕆ℕ𝕃𝕀ℕ𝔼♥●•٠·˙ 🅦⃤ https://t.co/IpzLHLguub https://t.co/GyIJt0HiUr', 'display_text_range': [0, 105], 'source': '<a href=""https://studio.twitter.com"" rel=""nofollow"">Twitter Media Studio</a>', 'truncated': False, 'in_reply_to_status_id': None, 'in_reply_to_status_id_str': None, 'in_reply_to_user_id': None, 'in_reply...",
1361755624191778818,2021-02-16 19:13:35+00:00,1361755624191778816,RT @AtilaPromo: Come and hang out with this sexy model #MistressLucilla ⤵️\n\n🔗 https://t.co/vmz0z8rsin https://t.co/RYRphXghU5,"<a href=""https://mobile.twitter.com"" rel=""nofollow"">Twitter Web App</a>",False,,,,,,...,low,en,2021-02-16 19:13:35.660,no,,[DE],"{'media': [{'id': 1354361564330078212, 'id_str': '1354361564330078212', 'indices': [105, 128], 'media_url': 'http://pbs.twimg.com/media/EsuoqsNWMAQl42D.jpg', 'media_url_https': 'https://pbs.twimg.com/media/EsuoqsNWMAQl42D.jpg', 'url': 'https://t.co/RYRphXghU5', 'display_url': 'pic.twitter.com/RYRphXghU5', 'expanded_url': 'https://twitter.com/AtilaPromo/status/1354361571498192896/photo/1', 'type': 'photo', 'sizes': {'large': {'w': 1534, 'h': 1024, 'resize': 'fit'}, 'thumb': {'w': 150, 'h': 15...",1.0,"{'created_at': 'Wed Jan 27 09:32:16 +0000 2021', 'id': 1354361571498192896, 'id_str': '1354361571498192896', 'text': 'Come and hang out with this sexy model #MistressLucilla ⤵️ 🔗 https://t.co/vmz0z8rsin https://t.co/RYRphXghU5', 'display_text_range': [0, 88], 'source': '<a href=""https://videomodels.club/"" rel=""nofollow"">VMClub</a>', 'truncated': False, 'in_reply_to_status_id': None, 'in_reply_to_status_id_str': None, 'in_reply_to_user_id': None, 'in_reply_to_user_id_str': None, 'in_reply...",
1361757016700690436,2021-02-16 19:19:07+00:00,1361757016700690432,RT @eyedustcartel: https://t.co/xZsquEgfHJ,"<a href=""https://mobile.twitter.com"" rel=""nofollow"">Twitter Web App</a>",False,,,,,,...,low,und,2021-02-16 19:19:07.660,no,,[DE],"{'media': [{'id': 1361403716100882438, 'id_str': '1361403716100882438', 'indices': [19, 42], 'additional_media_info': {'monetizable': False}, 'media_url': 'http://pbs.twimg.com/ext_tw_video_thumb/1361403716100882438/pu/img/C8XVTyBtmtwVP4Ix.jpg', 'media_url_https': 'https://pbs.twimg.com/ext_tw_video_thumb/1361403716100882438/pu/img/C8XVTyBtmtwVP4Ix.jpg', 'url': 'https://t.co/xZsquEgfHJ', 'display_url': 'pic.twitter.com/xZsquEgfHJ', 'expanded_url': 'https://twitter.com/eyedustcartel/status/13...",1.0,"{'created_at': 'Mon Feb 15 19:55:40 +0000 2021', 'id': 1361403828055265281, 'id_str': '1361403828055265281', 'text': 'https://t.co/xZsquEgfHJ', 'display_text_range': [0, 0], 'source': '<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>', 'truncated': False, 'in_reply_to_status_id': None, 'in_reply_to_status_id_str': None, 'in_reply_to_user_id': None, 'in_reply_to_user_id_str': None, 'in_reply_to_screen_name': None, 'user': {'id': 1232060260569034752, 'id_str':...",


In [13]:
# Print up to the first 100 cleaned tweets whose withheld-country list is
# exactly ['DE'], each preceded by a separator line.
# The boolean mask does not depend on the loop variable, so it is built once
# instead of being recomputed on every iteration; head(100) also avoids an
# IndexError when fewer than 100 tweets match.
de_only_mask = df_without_duplicate.withheld_in_countries.apply(lambda x: x == ['DE'])
for tweet in clean_t[de_only_mask].head(100):
    print("********************************************")
    print(tweet)

********************************************
  STRIP CHAT   STRIP CHAT   
********************************************
    #LaureenPink    blonde pornwhores are better          ↪…
********************************************
  #SHOUTOUT #KevinsAngels  CHECK OUT    Eroticism Magazin    /Centerfold  #KevinsAn…
********************************************
   my   naughty   content    #mustfollow  #LaureenPink #MDH  Laureens heisses Profil - Laureens megahot profile   SIG…
********************************************
  
********************************************
   I hope she's ok!  #AvaParker 
********************************************
  #LaureenPink   Wolles Geburtstagsparty     VIDEO ONLINE  Laureens    …
********************************************
  So did Syria question the election or something?
********************************************
  Cock sucking should always be trained. So that I don't get out of practice. Do you see the lipstick stain on the dick?…
*******************

 my   naughty   content   #LaureenPink #MDH   BUKKAKESÜCHTIG   click  here   VIDEO ONLINE 
********************************************
  
********************************************
  »»————-.————-«« The best anal attitude  /   Obtain your Trial Membership for $  or Full Membership up t…
********************************************
         #LaureenPink… 
********************************************
  
********************************************
  Guys, please give  's NEW ACCOUNT a follow!  
********************************************
     are a couple who #LOVE FUCK!   the first   people who subscribe to their FREE onlyfans page  and  write a personal…
********************************************
      ƒσℓℓσω  (¯`•.•´¯)    `•.¸.•´  Uncensored!   
********************************************
               _________…
********************************************
  Se uma é bom,imagina duas VOU SORTEAR DOIS LINKS Quer ganhar  mensal no meu  e ter acesso livre…
**********************

In [14]:
# Work on an independent (deep) copy so the de-duplicated frame stays untouched.
df_clean = df_without_duplicate.copy(deep=True)

In [15]:
# Replace the raw tweet text with its cleaned version.
df_clean = df_clean.assign(text=clean_t)

In [16]:
# (rows, columns) of the frame holding the cleaned text.
df_clean.shape

(23081, 38)

#### remove empty tweets

In [17]:
# Shape of the subset of tweets left with no content after cleaning.
# str.isspace() returns False for the empty string, so strip-and-compare is
# used to catch both '' and whitespace-only text.
df_clean[clean_t.str.strip().eq("")].shape

(725, 38)

In [18]:
# Keep only tweets that still have text after cleaning.
# - strip-and-compare also removes fully empty strings, which str.isspace()
#   reports as False and would therefore leave in the data;
# - a boolean mask removes exactly the blank rows, whereas dropping by index
#   label would also remove any non-blank row sharing a label with a blank one;
# - .copy() makes df an independent frame, so adding columns to it later does
#   not trigger SettingWithCopyWarning.
df = df_clean[df_clean.text.str.strip().ne("")].copy()

In [19]:
# (rows, columns) after blank tweets have been removed.
df.shape

(22356, 38)

In [24]:
# Translate every cleaned tweet to English (source language auto-detected).
# A single translator instance is reused for all rows instead of constructing
# a new GoogleTranslator for each tweet.
# NOTE(review): this issues one remote request per tweet and nothing is cached,
# so an interruption loses all progress -- consider persisting the result.
translator = GoogleTranslator(source='auto', target='en')
translated = df.text.swifter.apply(lambda x: translator.translate(str(x)))

Pandas Apply:   0%|          | 0/22356 [00:00<?, ?it/s]

In [None]:
# Attach the English translations as a new 'translated' column.
df = df.assign(translated=translated)

In [23]:
# Trial run of the translation on the first 200 tweets only.
# NOTE(review): this cell carries an earlier execution count than the full
# translation cell yet sits below it, and it rebinds `translated` to the
# 200-row result -- decide whether it should be kept in the final notebook.
translated = df["text"].head(200).swifter.apply(
    lambda tweet: GoogleTranslator(source='auto', target='en').translate(str(tweet))
)

Pandas Apply:   0%|          | 0/200 [00:00<?, ?it/s]

KeyboardInterrupt: 