In [31]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [32]:
import pandas as pd
import numpy as np
import src.data.utils as utils
import os
import plotly.express as px
from tqdm.auto import tqdm
import regex as re
import plotly.io as pio
pio.renderers.default="browser"

In [33]:
# Load the tweet table from the pickle file datasets/tweets.pkl.gz.
# NOTE(review): unpickling can execute arbitrary code, so only load this file
# when it comes from a trusted source.
df = pd.read_pickle(os.path.join("datasets", "tweets.pkl.gz"))

In [34]:
# Quick look at the schema: column index first, then (rows, columns).
print(df.columns, df.shape, sep="\n")

Index(['id', 'conversation_id', 'created_at', 'date', 'time', 'timezone',
       'user_id', 'username', 'name', 'place', 'tweet', 'language', 'mentions',
       'urls', 'photos', 'replies_count', 'retweets_count', 'likes_count',
       'hashtags', 'cashtags', 'link', 'retweet', 'quote_url', 'video',
       'thumbnail', 'near', 'geo', 'source', 'user_rt_id', 'user_rt',
       'retweet_id', 'reply_to', 'retweet_date', 'translate', 'trans_src',
       'trans_dest'],
      dtype='object')
(1659884, 36)


In [35]:
# Register tqdm with pandas; the cells below rely on this for `progress_apply`.
tqdm.pandas()

In [36]:
# Length (via `len`) of every raw, unprocessed tweet, computed with a progress bar.
df["tweet_length"] = df["tweet"].progress_apply(len)

100%|██████████| 1659884/1659884 [00:01<00:00, 1075039.19it/s]


In [37]:
# Show the first tweet whose raw length is exactly 842.
print(df.loc[df["tweet_length"].eq(842), "tweet"].iloc[0])

@PiS_WarmiaMazur @sobecka_anna @ASztandur @Cyntia_Harasim @ZChodowski @ajsyk23 @emilabogusz1958 @PiS_Pomorski @rem_tene @BitwaoPL @Karina_August @1973Buba @MrCichy84 @Jowita_W @clodin56 @KsiadzRobak_ @Jowita77732510 @29Joanna09 @PolskaKobieta @A_Pawlikowski @HGeeka @HonoriOjczyzna @Agapo59641184 @Sawek34797408 @jeste_samolotem @MaGuszka @Jolanta6Jolanta @MarekS98837118 @Maximus13305555 @SympatykPIS @Autolikos1 @AndreWisniewsk2 @JacekWronaCBS @distefano565 @Taplarski_Dziad @AsiaNietop @MatkaPolka_ @Elciapelciaba @ZlotPrawych @PeterLukomski @PiotrW1966 @tylko_prawda_ Bardzo dziękuję @PiS_WarmiaMazur za nominację.  Pozdrawiam wszystkich z całego serca ❤️🇵🇱 Nominuję @PiSOkreg5 @PawelKanas @chmielewski2020 @KamilKolata @JDebczynski @Krzysztof_91 @VukoviIrena1  #MaszerujemyOnline  #internetowymarszniepodległości  https://t.co/4Q2Pkd8kGe


In [38]:
# Number of tweets per value of the "language" column, as a two-column frame.
language_counts = (
    df.value_counts("language")
    .to_frame()
    .reset_index()
    # If the counts column comes out labelled 0, name it "count".
    .rename(columns={0: "count"})
)

# Log-scaled y axis so that rarely used languages remain visible.
fig = px.bar(
    language_counts,
    x="language",
    y="count",
    log_y=True,
    title="Log plot of language counts",
)
fig.show()

In [39]:
# Share of tweets labelled "pl" among all counted tweets.
pl_share = (
    language_counts.loc[language_counts["language"].eq("pl"), "count"].iloc[0]
    / language_counts["count"].sum()
)
print("PL ratio to all:", f'{pl_share:.3%}')

PL ratio to all: 90.795%


In [40]:
# Keep the tweet texts labelled "und" and "en" aside before df is narrowed to Polish.
und_language = df.loc[df["language"].eq("und"), "tweet"]
en_language = df.loc[df["language"].eq("en"), "tweet"]

In [41]:
# From here on, work only with tweets whose language label is "pl".
df = df[df["language"].eq("pl")]

In [42]:
def remove_content(text):
    """Strip link, mention and hashtag-marker noise from a tweet.

    The substitutions are applied in this order, each replacing matches
    with the empty string:

    1. anything starting with ``http`` up to the next whitespace,
    2. whitespace-free tokens containing ``.com`` followed by at least one
       more non-space character (e.g. ``pic.twitter.com/xyz``),
    3. ``@`` followed by word characters (user mentions),
    4. the ``#`` character itself (the hashtag word is kept).

    Whitespace around removed pieces is left untouched.

    Args:
        text: Tweet text to clean.

    Returns:
        The text with the patterns above removed.
    """
    noise_patterns = (
        r"http\S+",
        r'\S+\.com\S+',
        r'\@\w+',
        r'\#',
    )
    for pattern in noise_patterns:
        text = re.sub(pattern, '', text)
    return text

def process_text(text):
    """Clean a tweet and normalise its whitespace.

    The text is first passed through ``remove_content``; afterwards every run
    of whitespace characters is replaced by a single space. Leading and
    trailing whitespace is collapsed to one space, not stripped.

    Args:
        text: Raw tweet text.

    Returns:
        The cleaned, whitespace-normalised text.
    """
    return re.sub(r"\s+", ' ', remove_content(text))

In [43]:
# Apply process_text to every tweet and store the result in a new column.
df["tweet_preprocessed"] = df["tweet"].progress_apply(process_text)

100%|██████████| 1507098/1507098 [00:38<00:00, 38657.16it/s]


In [44]:
# Overwrites the "tweet_length" column: from here on it holds the length of the
# preprocessed text, not of the raw tweet.
df["tweet_length"] = df["tweet_preprocessed"].progress_apply(len)

100%|██████████| 1507098/1507098 [00:01<00:00, 1059552.41it/s]


In [45]:
# Summary statistics of the current "tweet_length" column.
print("Max tweet length:", df["tweet_length"].max())
print("Mean tweet length:", df["tweet_length"].mean())

Max tweet length: 296
Mean tweet length: 101.8824781135666


In [46]:
# Distribution of the current "tweet_length" column (length of the preprocessed
# text). A title and a readable axis label are set so the figure can be
# understood on its own, as the language bar chart already is.
fig = px.histogram(
    df,
    x="tweet_length",
    nbins=100,
    title="Histogram of preprocessed tweet lengths",
    labels={"tweet_length": "Tweet length after preprocessing"},
)
fig.show()

In [47]:
# Distinct preprocessed texts shorter than 10, kept for inspection before they are dropped.
short_tweets = df.loc[df["tweet_length"].lt(10), "tweet_preprocessed"].unique()

In [48]:
# Keep only tweets whose preprocessed length is at least 10.
df = df[df["tweet_length"].ge(10)]

In [49]:
# Number of remaining tweets per username (count of non-null "id" values).
tweets_by_username_count = df.groupby("username")[["id"]].count()
tweets_by_username_count

Unnamed: 0_level_0,id
username,Unnamed: 1_level_1
_mkonieczny,568
_piotrcwik,559
_urbaniakj,2589
a_betkowski,20
a_czartoryski,170
...,...
zdzkrasnodebski,12128
ziebadariusz,79
zielinskijaro,670
ziobropl,423


In [50]:
# Usernames with fewer than 20 remaining tweets; reset_index turns the
# username index back into a regular column.
users_to_drop = tweets_by_username_count[
    tweets_by_username_count["id"].lt(20)
].reset_index()
users_to_drop

Unnamed: 0,username,id
0,andrzejszlachta,16
1,ewagaweda,1
2,grabczukk,6
3,grzegorzwojci16,13
4,katarzy48592111,3
5,katarzynaosos,19
6,kkubow,14
7,lawniczaktomek,18
8,radoslawrataj,11
9,teresawargocka,11


In [51]:
# Remove every tweet written by a user listed in `users_to_drop` and keep the
# result. Without the assignment the filtered frame is only displayed and `df`
# still contains those users afterwards.
df = df[~df["username"].isin(users_to_drop["username"].to_list())]
df

Unnamed: 0,id,conversation_id,created_at,date,time,timezone,user_id,username,name,place,...,user_rt_id,user_rt,retweet_id,reply_to,retweet_date,translate,trans_src,trans_dest,tweet_length,tweet_preprocessed
0,1330865505306087424,1330865505306087424,2020-11-23 14:27:17 CET,2020-11-23,14:27:17,100,879756935733682180,achybicka,Alicja Chybicka,,...,,,,[],,,,,263,Środki finansowe z UE są nam bardzo potrzebne....
3,1324683520095256576,1324683520095256576,2020-11-06 13:02:17 CET,2020-11-06,13:02:17,100,879756935733682180,achybicka,Alicja Chybicka,,...,,,,[],,,,,256,Konferencja prasowa z udziałem Parlamentarzyst...
5,1321458785555275776,1321458785555275776,2020-10-28 15:28:20 CET,2020-10-28,15:28:20,100,879756935733682180,achybicka,Alicja Chybicka,,...,,,,[],,,,,49,Pytanie o dostępność szczepionek przeciwko grypie
6,1320780862158524416,1320780862158524416,2020-10-26 18:34:31 CET,2020-10-26,18:34:31,100,879756935733682180,achybicka,Alicja Chybicka,,...,,,,[],,,,,16,Walka z COVID-19
8,1320768971696689153,1320768971696689153,2020-10-26 17:47:16 CET,2020-10-26,17:47:16,100,879756935733682180,achybicka,Alicja Chybicka,,...,,,,[],,,,,17,Czekam na pytania
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1659879,636616652034043906,636616652034043906,2015-08-26 21:10:12 CEST,2015-08-26,21:10:12,100,3437755967,_urbaniakj,Jarosław Urbaniak,,...,,,,[],,,,,12,No właśnie.
1659880,636615744034340864,636615744034340864,2015-08-26 21:06:35 CEST,2015-08-26,21:06:35,100,3437755967,_urbaniakj,Jarosław Urbaniak,,...,,,,[],,,,,98,Bardzo dobre informacje dl całej Południowej W...
1659881,636253338514903041,636253338514903041,2015-08-25 21:06:31 CEST,2015-08-25,21:06:31,100,3437755967,_urbaniakj,Jarosław Urbaniak,,...,,,,[],,,,,78,"Inwestycja LPR niezwykle potrzebna i ważna, dl..."
1659882,636252520977952772,636252520977952772,2015-08-25 21:03:16 CEST,2015-08-25,21:03:16,100,3437755967,_urbaniakj,Jarosław Urbaniak,,...,,,,[],,,,,116,"Drodzy Państwo, dziś mija dokładnie 25 lat od ..."
