In [None]:
import scipy
import numpy as np
from scipy.spatial.distance import directed_hausdorff
from sklearn.metrics.pairwise import euclidean_distances, cosine_distances, pairwise_distances
import src.data.utils as utils
import os
import pandas as pd

def read_tweets(username: str):
    return pd.read_csv(os.path.join("..", "datasets", "tweets", f"{username}.csv"))

def read_tweets_with_embeddings(embeddings_path: str):
    embeddings_df = pd.read_pickle(embeddings_path)
    username = embeddings_df.iloc[0]["username"]
    tweets_df = read_tweets(username)
    whole_df = pd.merge(embeddings_df, tweets_df, left_on="tweet_id", right_on="id")
    if len(whole_df) != len(embeddings_df):
        print(username)
    return username, whole_df


In [None]:
# embeddings = utils.list_full_paths(os.path.join("..", "datasets", "embeddings"))
embeddings = utils.list_full_paths(os.path.join("embeddings_50_20"))

all_embeddings = pd.concat(list(map(lambda path: pd.read_pickle(path), embeddings)))

In [None]:
tweets_with_embeddings_dfs = {}

for path in embeddings:
    username, whole_df = read_tweets_with_embeddings(path)
    tweets_with_embeddings_dfs[username] = whole_df

In [None]:
from src.data.utils import read_embeddings_dataframe
users_embeddings_df = read_embeddings_dataframe(os.path.join("..", "datasets", "embeddings.csv"))

In [None]:
def get_embeddings_array(df):
    return np.array(df["tweet_embedding"].to_list())

def explain_closeness(first_username: str, second_username: str):
    first_user_df = tweets_with_embeddings_dfs[first_username]
    second_user_df = tweets_with_embeddings_dfs[second_username]
    first_np = get_embeddings_array(first_user_df)
    second_np = get_embeddings_array(second_user_df)

    first_user_embedding = users_embeddings_df[users_embeddings_df["username"] == first_username]["embedding"].values[0]
    second_user_embedding = users_embeddings_df[users_embeddings_df["username"] == second_username]["embedding"].values[0]


    def get_tweet_and_similar_tweets(user_tweet_embeddings: np.array, another_user_embedding: np.array, another_user_tweet_embeddings: np.array,
        user_df, another_user_df):
        if len(another_user_embedding.shape) < 2:
            another_user_embedding = another_user_embedding[None, :]

        distances = euclidean_distances(user_tweet_embeddings, another_user_embedding).squeeze()
        closest_tweets_indices = np.argsort(distances)[:20]
        closest_tweets = user_df.iloc[closest_tweets_indices]["tweet"].tolist()
        closest_tweets_embeddings = user_tweet_embeddings[closest_tweets_indices, :]

        another_user_closest_tweets_distances = euclidean_distances(closest_tweets_embeddings, another_user_tweet_embeddings)
        another_user_closest_tweets_indices = np.argsort(another_user_closest_tweets_distances, axis=1)[:, 0].reshape(-1)

        another_user_closest_tweets = another_user_df.iloc[another_user_closest_tweets_indices]["tweet"].to_list()

        return closest_tweets, another_user_closest_tweets 

    first_user_tweet_closest_to_second, similar_second_user_tweets = get_tweet_and_similar_tweets(first_np, second_user_embedding, second_np, first_user_df, second_user_df)
    second_user_tweet_closest_to_first, similar_first_user_tweets = get_tweet_and_similar_tweets(second_np, first_user_embedding, first_np, second_user_df, first_user_df)

    return first_user_tweet_closest_to_second, similar_second_user_tweets, second_user_tweet_closest_to_first, similar_first_user_tweets


def get_closest_tweet_pairs(first_username: str, second_username: str, n_pairs=50, metric='euclidean'):
    first_user_df = tweets_with_embeddings_dfs[first_username]
    second_user_df = tweets_with_embeddings_dfs[second_username]
    first_np = get_embeddings_array(first_user_df)
    second_np = get_embeddings_array(second_user_df)

    distances = pairwise_distances(first_np, second_np, metric=metric)

    distances_flat = distances.ravel()
    first_user_indices, second_user_indices = np.unravel_index(np.argsort(distances_flat), distances.shape)
    first_user_indices, second_user_indices = first_user_indices[:n_pairs], second_user_indices[:n_pairs]

    result_tweet_pairs = list(zip(first_user_df.iloc[first_user_indices]["tweet"].to_list(), second_user_df.iloc[second_user_indices]["tweet"].to_list()))
    return result_tweet_pairs

    # print(distances.shape)

    # smallest_distances_indices = np.argsort(distances, axis=1)[:, 0][:, None]
    # smallest_pair_distances = distances[np.arange(distances.shape[0])[:, None], smallest_distances_indices]
    # overall_smallest_distances_indices = np.argsort(smallest_pair_distances, axis=0)
    # overall_smallest_distances = smallest_pair_distances[overall_smallest_distances_indices].squeeze()

    # row_indices = 



In [None]:
# explain_closeness("kulesza_pl", "klubnauer")

get_closest_tweet_pairs("witkomarcin", "iwonaarent")

In [None]:
from tqdm import tqdm
for i, vi in tqdm(enumerate(embeddings)):
    for j, vj in enumerate(embeddings):
        d, _, _ = directed_hausdorff(vi, vj)
        distances[i, j] = d


In [29]:
df = pd.read_pickle("../datasets/tweets_cleaned.pkl.gz")

In [30]:
embedding_array = np.array(all_embeddings["tweet_embedding"].to_list())

In [31]:
from tqdm import tqdm
n_closest_for_each = 5
n_to_show = 20
closest_tweets_indices = np.zeros((n_to_show, n_closest_for_each))

for i, vector in tqdm(enumerate(embedding_array[:n_to_show, :])):
    # distances = euclidean_distances(embedding_array, vector[None, :]).squeeze()
    distances = cosine_distances(embedding_array, vector[None, :]).squeeze()
    closest_indices = np.argsort(distances)[1:n_closest_for_each+1]
    closest_tweets_indices[i, :] = closest_indices

20it [00:35,  1.78s/it]


In [32]:
anchor_tweet_ids = all_embeddings.iloc[:20].tweet_id.to_list()

In [33]:
closest_ids = np.array(all_embeddings.iloc[closest_tweets_indices.reshape(-1)].tweet_id.to_list())

In [34]:
closests_tweets = df.loc[df["id"].isin(closest_ids)].tweet.to_numpy().reshape(-1, 5)

In [35]:
anchor_tweets = df.loc[df["id"].isin(closest_ids)].tweet.to_numpy()

In [36]:
closest_ids = closest_ids.reshape(-1, 5)

In [37]:
for i in range(20):
    print(df.loc[df["id"] == anchor_tweet_ids[i]].tweet.values[0])
    print(df.loc[df["id"].isin(closest_ids[i, :])].tweet.values)

Środki finansowe z UE są nam bardzo potrzebne. Najlepszym przykładem jest , który został wybudowany z tych funduszy. To również szczególny dzień dla personelu , ale również małych pacjentów z Kliniki „Przylądek Nadziei”, gdyż dzisiaj obchodzimy 5latPrzylądka 🎉🎉🎉 
['Dalsza część uroczystości odbyła się na Cmentarzu Obrońców Wybrzeża, gdzie również tego dnia miało miejsce wspomnienie Zbrodni Katyńskiej (przesunięte w związku w wybuchem pandemii) w 80. lecie jej dokonania. W imieniu AnnaFotyga 🇵🇱 udział w uroczystościach brał pracownik biura'
 'Konkurs "Gotuj z klasą" wygrali Natalia Farfułowska i Piotr Pastorczyk. Gratuluję 👏🏻 Zachwycili jury kapuśniakiem na pstrągu z pierożkiem ziemniaczanym, żołądkami gęsimi w sosie z grzybów leśnych i kaszą pęczak oraz ciastem Marcinek. Specjalnie wrzucam info w porze obiadowej😉 '
 'Polska = Wisła Kraków? Wisła: ✔ oddanie władzy hochsztaplerom z gębami pełnymi wielkich słów ✔ uprzywilejowani nieliczni rabują, reszta wierzy w cuda ✔ krach i potem żal d

In [None]:
closests_tweets