In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import pandas as pd
import numpy as np
import src.data.utils as utils
import os
import plotly.express as px
from tqdm.auto import tqdm
import regex as re

In [None]:
# Load the tweets frame from the pickled dataset file.
# NOTE(review): unpickling can execute arbitrary code - only load files from a trusted source.
tweets_path = os.path.join("datasets", "tweets.pkl.gz")
df = pd.read_pickle(tweets_path)

In [None]:
# Show the column labels and the (rows, columns) shape, one per line.
print(df.columns, df.shape, sep="\n")

In [None]:
# Register tqdm's pandas integration; `progress_apply` used in later cells relies on it.
tqdm.pandas()

In [None]:
# Print the first tweet whose recorded `tweet_length` is 842.
print(df.loc[df["tweet_length"] == 842, "tweet"].values[0])

In [None]:
# Count rows per `language` value and plot the counts as a bar chart with a log-scaled y axis.
language_counts = (
    df.value_counts("language")
    .to_frame()
    .reset_index()
    .rename(columns={0: "count"})  # if the counts column comes back labelled 0, call it "count"
)
fig = px.bar(
    data_frame=language_counts,
    x="language",
    y="count",
    log_y=True,
    title="Log plot of language counts",
)
fig.show()

In [None]:
# Share of rows tagged "pl" among all rows, printed as a percentage with three decimals.
pl_count = language_counts.loc[language_counts["language"] == "pl", "count"].values[0]
total_count = language_counts["count"].sum()
print("PL ratio to all:", f"{pl_count / total_count:.3%}")

In [None]:
# Tweet texts of the rows tagged with language "und" and "en" respectively.
und_language = df.loc[df["language"] == "und", "tweet"]
en_language = df.loc[df["language"] == "en", "tweet"]

In [None]:
# Keep only the rows whose language is "pl".
df = df.loc[df["language"] == "pl"]

In [None]:
def remove_content(text):
    """Strip links and mentions from a tweet and unwrap its hashtags.

    Removes, in order:
      1. every run that begins with ``http`` up to the next whitespace,
      2. every whitespace-delimited token that contains ``.com`` after at
         least one character (scheme-less links such as
         ``pic.twitter.com/abc`` as well as bare domains such as
         ``example.com``),
      3. ``@mention`` handles,
      4. the ``#`` character (the hashtag word itself is kept).

    Whitespace is not touched, so gaps remain where content was removed.

    Args:
        text: Raw tweet text.

    Returns:
        The text with the content listed above removed.
    """
    text = re.sub(r"http\S+", "", text)
    # `\S*` rather than `\S+` after `.com`: a domain that ends the token
    # (e.g. "example.com") must be removed too, not only "example.com/path".
    text = re.sub(r"\S+\.com\S*", "", text)
    text = re.sub(r"@\w+", "", text)
    text = re.sub(r"#", "", text)
    return text

def process_text(text):
    """Clean a tweet and normalise its whitespace.

    Runs ``remove_content`` on the text, collapses every run of whitespace
    into a single space, and trims leading/trailing whitespace.

    Args:
        text: Raw tweet text.

    Returns:
        The cleaned, single-spaced, trimmed text.
    """
    text = remove_content(text)
    text = re.sub(r"\s+", " ", text)
    # Content removed at either end of the tweet leaves a stray space behind;
    # trim it so it does not count toward the length of the result.
    return text.strip()

In [None]:
# Run process_text over every tweet (with a progress bar) and store the result as `tweet_preprocessed`.
preprocessed_tweets = df["tweet"].progress_apply(process_text)
df["tweet_preprocessed"] = preprocessed_tweets

In [None]:
# Character count of each preprocessed tweet; overwrites the existing `tweet_length` values.
# Uses the pandas string accessor instead of a row-wise `progress_apply(len)`, so this cell
# no longer depends on `tqdm.pandas()` having been run.
df["tweet_length"] = df["tweet_preprocessed"].str.len()

In [None]:
# Summary statistics of the tweet lengths.
tweet_lengths = df["tweet_length"]
print("Max tweet length:", tweet_lengths.max())
print("Mean tweet length:", tweet_lengths.mean())

In [None]:
# Histogram of tweet lengths using 100 bins.
fig = px.histogram(
    data_frame=df,
    x="tweet_length",
    nbins=100,
)
fig.show()

In [None]:
# Distinct preprocessed texts of the tweets shorter than 10 characters.
short_tweets = df.loc[df["tweet_length"] < 10, "tweet_preprocessed"].unique()

In [None]:
# Keep only the tweets that are at least 10 characters long.
df = df.loc[df["tweet_length"] >= 10]

In [None]:
# Per-username count of non-null `id` values; the frame is shown as the cell output.
tweets_by_username_count = df.groupby("username")[["id"]].count()
tweets_by_username_count

In [None]:
# Users with fewer than 20 counted tweets, with `username` moved from the index back to a column.
users_to_drop = tweets_by_username_count.loc[
    tweets_by_username_count["id"] < 20
].reset_index()
users_to_drop

In [None]:
# Show the tweets of users that are not listed in users_to_drop.
# NOTE(review): the filtered frame is only displayed, not assigned - `df` itself is unchanged.
df.loc[~df["username"].isin(users_to_drop["username"].to_list())]