In [2]:
import pandas as pd
import networkx
import pyvis

df = pd.read_csv('../posts.tsv', sep='\t')
df = df.drop_duplicates(subset=['Title', 'Description'])

In [13]:
import spacy

# create word list from posts
nlp = spacy.load('en_core_web_sm')
words = []
for i, row in df.iterrows():
    words.append([])
    # doc = nlp(row["Description"])
    # for token in doc:
    #     if token.is_alpha:
    #         words[-1].append(token.text.lower())
    words[-1].extend(row["Keywords"])

In [14]:
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)


from gensim.models.doc2vec import Doc2Vec, TaggedDocument

documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(words)]
model = Doc2Vec(documents, vector_size=5, window=2, min_count=1, workers=4)

2024-06-28 12:41:31,781 : INFO : collecting all words and their counts
2024-06-28 12:41:31,782 : INFO : PROGRESS: at example #0, processed 0 words (0 words/s), 0 word types, 0 tags
2024-06-28 12:41:31,784 : INFO : collected 38 word types and 217 unique tags from a corpus of 217 examples and 20300 words
2024-06-28 12:41:31,785 : INFO : Creating a fresh vocabulary
2024-06-28 12:41:31,786 : INFO : Doc2Vec lifecycle event {'msg': 'effective_min_count=1 retains 38 unique words (100.00% of original 38, drops 0)', 'datetime': '2024-06-28T12:41:31.786041', 'gensim': '4.3.2', 'python': '3.9.1 (v3.9.1:1e5d33e9b9, Dec  7 2020, 12:44:01) \n[Clang 12.0.0 (clang-1200.0.32.27)]', 'platform': 'macOS-14.5-arm64-arm-64bit', 'event': 'prepare_vocab'}
2024-06-28 12:41:31,786 : INFO : Doc2Vec lifecycle event {'msg': 'effective_min_count=1 leaves 20300 word corpus (100.00% of original 20300, drops 0)', 'datetime': '2024-06-28T12:41:31.786415', 'gensim': '4.3.2', 'python': '3.9.1 (v3.9.1:1e5d33e9b9, Dec  7 2

In [15]:
# plot the t-SNE of the embeddings of the descriptions
import numpy as np
from sklearn.manifold import TSNE
import plotly.express as px

X = np.array([model.infer_vector(words[i]) for i in range(len(words))])
X_embedded = TSNE(n_components=2).fit_transform(X)

df["desc_length"] = df["Description"].str.len()

fig = px.scatter(x=X_embedded[:,0], y=X_embedded[:,1], hover_name=df['Title'], color=df['desc_length'])
fig.update_traces(textposition='top center')
fig.update_layout(title='t-SNE of the embeddings of the descriptions')
fig.show()

In [27]:
import ipywidgets

@ipywidgets.interact(title=ipywidgets.Dropdown(options=df['Title'], description='Title:'))
def tsne_closest_titles(title, n_closest=10):
    idx = df[df['Title'] == title].index[0]
    x = X_embedded[idx]
    distances = np.linalg.norm(X_embedded - x, axis=1)
    closest = distances.argsort()[1:n_closest]

    # plot the n_closest on t-SNE
    fig = px.scatter(x=X_embedded[:,0], y=X_embedded[:,1], hover_name=df['Title'],
                     color=np.where(np.any(np.arange(len(words)) == closest.reshape(-1, 1), axis=0), f'{n_closest} closest', 'others'))
    fig.update_traces(textposition='top center')
    fig.update_layout(title='t-SNE of the embeddings of the descriptions')
    fig.show()

    return df.iloc[closest]

interactive(children=(Dropdown(description='Title:', options=('1000 Genomes Project', 'American Gut Project', …