In [None]:
import ast
import pandas as pd


import numpy as np
from sklearn.manifold import TSNE
import plotly.express as px

# Load the merged metadata/popularity/feature table.
df = pd.read_csv("../metadata/merged_metadata_popularity_features.csv")

# Every entry of `features_embedding_mean` is a stringified Python literal;
# parse each one and stack the results into a single NumPy array.
embeddings = np.array(list(map(ast.literal_eval, df['features_embedding_mean'])))

# Values used to colour the scatter points (one per row of `df`).
colors = df['log_Informative_norm_cat']

# Project the embeddings down to two dimensions; the fixed seed keeps the
# layout reproducible between runs.
tsne = TSNE(n_components=2, random_state=42)
embeddings_2d = tsne.fit_transform(embeddings)

# Wrap the 2-D coordinates in a DataFrame so Plotly can address them by name.
df_embeddings = pd.DataFrame(
    {
        'Dimension 1': embeddings_2d[:, 0],
        'Dimension 2': embeddings_2d[:, 1],
    }
)

# Interactive scatter plot of the projection.
fig = px.scatter(
    df_embeddings,
    x='Dimension 1',
    y='Dimension 2',
    color=colors,
    title='t-SNE Plot of Embeddings',
)
fig.show()


In [None]:
import ast
import pandas as pd
import numpy as np
from sklearn.manifold import TSNE
import plotly.express as px


def parse_embeddings(column):
    """Parse a column of stringified Python literals into a NumPy array.

    Each entry is passed through ``ast.literal_eval`` and the parsed values
    are stacked with ``np.array`` (one row per entry).
    """
    return np.array([ast.literal_eval(emb) for emb in column])


# Load both tables and normalise the join key (cast to str, strip whitespace)
# so that URLs differing only by surrounding whitespace still match.
df_1 = pd.read_csv("../metadata/merged_metadata_popularity_features.csv")
df_1['url'] = df_1['url'].astype(str).str.strip()

df_2 = pd.read_csv("../metadata/embeddings_transcript.csv")
df_2['url'] = df_2['url'].astype(str).str.strip()

df = pd.merge(df_1, df_2, on="url")

# Rank once by the metric. Rows with a missing score are dropped first:
# sort_values() places NaN last, so without this the "least" slice could be
# filled with unscored rows instead of the genuinely lowest-scoring ones.
metric = "log_Beautiful_views_norm"
n_extreme = 100
ranked = df.dropna(subset=[metric]).sort_values(by=metric, ascending=False)
most = ranked.iloc[:n_extreme, :]
least = ranked.iloc[-n_extreme:, :]

# Embeddings from `features_embedding_mean` (named "audio" in this cell),
# stacked as [most rows..., least rows...].
embeddings_audio_most = parse_embeddings(most['features_embedding_mean'])
embeddings_audio_least = parse_embeddings(least['features_embedding_mean'])
embeddings_audio = np.concatenate([embeddings_audio_most, embeddings_audio_least], axis=0)

# Embeddings from the transcript table's `embeddings` column (named "text"
# in this cell), stacked in the same row order.
embeddings_text_most = parse_embeddings(most['embeddings'])
embeddings_text_least = parse_embeddings(least['embeddings'])
embeddings_text = np.concatenate([embeddings_text_most, embeddings_text_least], axis=0)

# Both embedding sets side by side (feature-wise concatenation).
# NOTE: this is prepared but not plotted below; t-SNE is fitted on
# `embeddings_audio` only.
embeddings_all = np.concatenate([embeddings_audio, embeddings_text], axis=-1)

# Group label per row, in the same order as the stacked embeddings.
colors = ["most"]*len(embeddings_audio_most) + ["least"]*len(embeddings_audio_least)

# Compute t-SNE transformation (fixed seed for a reproducible layout)
tsne = TSNE(n_components=2, random_state=42)
embeddings_2d = tsne.fit_transform(embeddings_audio)

# Create a DataFrame for Plotly
df_embeddings = pd.DataFrame(embeddings_2d, columns=['Dimension 1', 'Dimension 2'])

# Plot using Plotly Express
fig = px.scatter(df_embeddings, x='Dimension 1', y='Dimension 2', title='t-SNE Plot of Embeddings', color=colors)
fig.show()

In [39]:
import ast
import pandas as pd
import numpy as np
from sklearn.manifold import TSNE
import plotly.express as px


# Load both tables and normalise the join key (cast to str, strip whitespace)
# so that URLs differing only by surrounding whitespace still match.
df_1 = pd.read_csv("../metadata/merged_metadata_popularity_features.csv")
df_1['url'] = df_1['url'].astype(str).str.strip()

df_2 = pd.read_csv("../metadata/embeddings_transcript.csv")
df_2['url'] = df_2['url'].astype(str).str.strip()

df = pd.merge(df_1, df_2, on="url")

# Rank once by the metric. Rows with a missing score are dropped first:
# sort_values() places NaN last, so without this the "least" slice could be
# filled with unscored rows instead of the genuinely lowest-scoring ones.
metric = "log_Funny_norm"
n_extreme = 100
ranked = df.dropna(subset=[metric]).sort_values(by=metric, ascending=False)

# .assign() returns new frames, so the group label is never written into a
# slice of `ranked` (which would raise SettingWithCopyWarning).
most = ranked.iloc[:n_extreme, :].assign(rank="most")
least = ranked.iloc[-n_extreme:, :].assign(rank="least")

# Feature columns whose relationship with the metric is examined.
post_cols = ['emotion_angry_mean', 'emotion_angry_90p', 'emotion_angry_std',
                 'emotion_happy_mean', 'emotion_happy_90p', 'emotion_happy_std',
                 'emotion_sad_mean', 'emotion_sad_90p', 'emotion_sad_std',
                 'emotion_neutral_mean', 'emotion_neutral_90p', 'emotion_neutral_std',
                 'strength_weak_mean', 'strength_weak_90p', 'strength_weak_std',
                 'strength_neutral_mean', 'strength_neutral_90p', 'strength_neutral_std',
                 'strength_strong_mean', 'strength_strong_90p', 'strength_strong_std',
                 'positivity_negative_mean', 'positivity_negative_90p',
                 'positivity_negative_std', 'positivity_neutral_mean',
                 'positivity_neutral_90p', 'positivity_neutral_std',
                 'positivity_positive_mean', 'positivity_positive_90p',
                 'positivity_positive_std', 'pauses_mean', 'pauses_std', 'pauses_10p',
                 'pauses_90p', 'turn_durations_mean', 'turn_durations_std',
                 'turn_durations_10p', 'turn_durations_90p', 'gender_male_mean']

merged = pd.concat([most, least])

# Correlation of every feature with the metric, ordered by absolute strength.
# NOTE: this is computed on the extreme rows only (top and bottom groups),
# not on the full dataset.
corr = merged[post_cols + [metric]].corr()[metric].sort_values(key=np.abs, ascending=False)
print(corr)

# Alternative feature groups from earlier exploration; swap one into the loop
# below to plot it instead:
#   ["emotion_sad_90p", "emotion_sad_std", "gender_male_mean", "strength_weak_90p", "pauses_10p"]
#   ["positivity_positive_90p", "turn_durations_10p"]

# Overlaid histograms comparing the "most" and "least" groups per feature.
for feat in ["emotion_happy_90p", "positivity_positive_std", "strength_neutral_std", "turn_durations_mean"]:
    px.histogram(merged, x=feat, color="rank", barmode="overlay").show()

log_Funny_norm              1.000000
emotion_happy_std           0.536789
emotion_happy_90p           0.524810
positivity_positive_std     0.521344
strength_neutral_std        0.519604
turn_durations_mean        -0.517629
turn_durations_10p         -0.516989
emotion_happy_mean          0.462641
turn_durations_90p         -0.460997
positivity_positive_90p     0.460387
strength_strong_90p         0.451641
strength_neutral_mean      -0.440178
pauses_10p                 -0.431373
strength_strong_mean        0.429082
emotion_neutral_std         0.425750
strength_strong_std         0.421136
positivity_neutral_std      0.416570
positivity_neutral_mean    -0.407927
emotion_neutral_mean       -0.403909
positivity_negative_std     0.396483
positivity_positive_mean    0.371520
turn_durations_std         -0.344700
positivity_neutral_90p     -0.340025
emotion_angry_std           0.333546
emotion_neutral_90p        -0.324858
strength_neutral_90p       -0.310602
emotion_sad_mean           -0.281549
s