In [83]:
import ast

import numpy as np
import pandas as pd
import plotly.express as px
from sklearn.manifold import TSNE

# t-SNE view of the mean feature embeddings, coloured by the
# `log_Beautiful_norm_cat` column of the same table.
df = pd.read_csv("../metadata/merged_metadata_popularity_features_std.csv")

# Every entry of `features_embedding_mean` is run through ast.literal_eval
# (presumably the vectors are stored as text in the CSV) and the parsed rows
# are stacked into a single array.
embeddings = np.array(list(map(ast.literal_eval, df['features_embedding_mean'])))
colors = df['log_Beautiful_norm_cat']

# Project to two dimensions; random_state is pinned so re-runs use the same seed.
tsne = TSNE(n_components=2, random_state=42)
embeddings_2d = tsne.fit_transform(embeddings)

# Wrap the 2-D coordinates in a frame so Plotly can address the axes by name.
df_embeddings = pd.DataFrame(
    embeddings_2d,
    columns=['Dimension 1', 'Dimension 2'],
)

fig = px.scatter(
    df_embeddings,
    x='Dimension 1',
    y='Dimension 2',
    color=colors,
    title='t-SNE Plot of Embeddings',
)
fig.show()


In [68]:
import ast

import numpy as np
import pandas as pd
import plotly.express as px
from sklearn.manifold import TSNE


def _parse_embeddings(column):
    """Parse a column of stringified vectors into a NumPy array (one row per entry)."""
    return np.array([ast.literal_eval(emb) for emb in column])


# Load the popularity/feature table and the transcript embeddings. URLs are
# cast to str and stripped so stray whitespace cannot break the join.
df_1 = pd.read_csv("../metadata/merged_metadata_popularity_features.csv")
df_1['url'] = df_1['url'].astype(str).str.strip()

df_2 = pd.read_csv("../metadata/embeddings_transcript.csv")
df_2['url'] = df_2['url'].astype(str).str.strip()

df = pd.merge(df_1, df_2, on="url")

# Rank once, highest metric first. Rows with a missing metric are dropped
# beforehand: sort_values places NaN last by default, so they would otherwise
# be picked up as the "least" group.
ranked = (
    df.dropna(subset=["log_Beautiful_views_norm"])
      .sort_values(by="log_Beautiful_views_norm", ascending=False)
)
most = ranked.iloc[:100, :]
least = ranked.iloc[-100:, :]

# Audio embeddings: "most" rows first, then "least" rows.
embeddings_audio_most = _parse_embeddings(most['features_embedding_mean'])
embeddings_audio_least = _parse_embeddings(least['features_embedding_mean'])
embeddings_audio = np.concatenate([embeddings_audio_most, embeddings_audio_least], axis=0)

# Transcript embeddings, stacked in the same row order.
embeddings_text_most = _parse_embeddings(most['embeddings'])
embeddings_text_least = _parse_embeddings(least['embeddings'])
embeddings_text = np.concatenate([embeddings_text_most, embeddings_text_least], axis=0)

# Audio and text vectors side by side for each row.
embeddings_all = np.concatenate([embeddings_audio, embeddings_text], axis=-1)

# One label per stacked row, matching the most-then-least order above.
colors = ["most"] * len(embeddings_audio_most) + ["least"] * len(embeddings_audio_least)

# Compute t-SNE transformation.
# NOTE: only the audio embeddings are projected here; `embeddings_text` and
# `embeddings_all` are built above but not used in this cell.
tsne = TSNE(n_components=2, random_state=42)
embeddings_2d = tsne.fit_transform(embeddings_audio)

# Create a DataFrame for Plotly
df_embeddings = pd.DataFrame(embeddings_2d, columns=['Dimension 1', 'Dimension 2'])

# Plot using Plotly Express
fig = px.scatter(df_embeddings, x='Dimension 1', y='Dimension 2', title='t-SNE Plot of Embeddings', color=colors)
fig.show()

In [69]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# PCA of the audio embeddings built in an earlier cell (`embeddings_audio`),
# coloured with that cell's `colors` labels.

# Standardise every feature first so that no single dimension dominates the
# principal components purely because of its scale.
X = embeddings_audio
X_scaled = StandardScaler().fit_transform(X)

# Keep two components: enough for a flat scatter plot.
pca = PCA(n_components=2)
principalComponents = pca.fit_transform(X_scaled)
principalDf = pd.DataFrame(
    data=principalComponents,
    columns=['principal component 1', 'principal component 2'],
)

# Attach the group label as a 'rank' column next to the two components.
rank_column = pd.DataFrame({'rank': colors})
finalDf = pd.concat([principalDf, rank_column], axis=1)

fig = px.scatter(
    finalDf,
    x='principal component 1',
    y='principal component 2',
    color='rank',
    title="PCA Plot of Most and Least based on Specified Metrics",
)
fig.show()


In [86]:
import ast

import numpy as np
import pandas as pd
import plotly.express as px
from sklearn.manifold import TSNE

# Correlate the posterior features with a popularity metric on the 100
# highest- and 100 lowest-ranked rows, then overlay per-group histograms of a
# few selected features.
df = pd.read_csv("../metadata/merged_metadata_popularity_features_std.csv")

metric = "log_Beautiful_norm"

# Rank once, highest metric first. Rows with a missing metric are dropped
# beforehand: sort_values places NaN last by default, so they would otherwise
# be picked up as the "least" group.
ranked = df.dropna(subset=[metric]).sort_values(by=metric, ascending=False)

post_cols = ['emotion_angry_mean', 'emotion_angry_90p', 'emotion_angry_std',
             'emotion_happy_mean', 'emotion_happy_90p', 'emotion_happy_std',
             'emotion_sad_mean', 'emotion_sad_90p', 'emotion_sad_std',
             'emotion_neutral_mean', 'emotion_neutral_90p', 'emotion_neutral_std',
             'strength_weak_mean', 'strength_weak_90p', 'strength_weak_std',
             'strength_neutral_mean', 'strength_neutral_90p', 'strength_neutral_std',
             'strength_strong_mean', 'strength_strong_90p', 'strength_strong_std',
             'positivity_negative_mean', 'positivity_negative_90p',
             'positivity_negative_std', 'positivity_neutral_mean',
             'positivity_neutral_90p', 'positivity_neutral_std',
             'positivity_positive_mean', 'positivity_positive_90p',
             'positivity_positive_std', 'pauses_mean', 'pauses_std', 'pauses_10p',
             'pauses_90p', 'turn_durations_mean', 'turn_durations_std',
             'turn_durations_10p', 'turn_durations_90p', 'gender_male_mean']

# .assign returns new frames, so the label is never written into a slice of
# `ranked` (which is what raises SettingWithCopyWarning).
most = ranked.iloc[:100, :].assign(rank="most")
least = ranked.iloc[-100:, :].assign(rank="least")
merged = pd.concat([most, least])

# Correlation of every feature with the metric, strongest (by absolute value)
# first. The metric's own row (1.0) is included.
corr = merged[post_cols + [metric]].corr()[metric].sort_values(key=np.abs, ascending=False)
print(corr)

# Features to plot for each metric.
hist_features = {
    "log_Beautiful_norm": ["emotion_sad_std", "gender_male_mean", "strength_weak_90p", "pauses_10p"],
    "log_Funny_norm": ["emotion_happy_90p", "positivity_positive_std", "strength_neutral_std", "turn_durations_mean"],
    "log_Informative_norm": ["turn_durations_mean", "emotion_sad_std", "strength_neutral_std"],
}
# Additional candidates kept from an earlier commented-out loop (listed there
# without a metric label): "positivity_positive_90p", "turn_durations_10p".

# Metrics without an entry above simply produce no histograms.
for feat in hist_features.get(metric, []):
    px.histogram(merged, x=feat, color="rank", barmode="overlay").show()

log_Beautiful_norm          1.000000
emotion_sad_std             0.547088
emotion_sad_90p             0.518191
emotion_sad_mean            0.487761
gender_male_mean           -0.449059
strength_weak_90p           0.369510
strength_weak_mean          0.361581
strength_weak_std           0.346354
pauses_10p                  0.328379
positivity_positive_std     0.321909
emotion_happy_std           0.289088
positivity_positive_90p     0.274172
emotion_angry_mean         -0.251305
pauses_std                  0.246476
pauses_mean                 0.239883
positivity_neutral_std      0.236837
strength_strong_mean       -0.232704
emotion_angry_90p          -0.228250
turn_durations_10p         -0.219764
positivity_positive_mean    0.213766
pauses_90p                  0.207281
strength_strong_90p        -0.184856
turn_durations_mean        -0.176157
emotion_angry_std          -0.165683
emotion_neutral_std         0.164086
positivity_neutral_mean    -0.156166
emotion_neutral_mean       -0.148109
t

In [70]:
# PCA visualization of the posterior features wrt the target label

import ast

import numpy as np
import pandas as pd
import plotly.express as px
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.preprocessing import StandardScaler

# Load the popularity/feature table and the transcript embeddings. URLs are
# cast to str and stripped so stray whitespace cannot break the join.
df_1 = pd.read_csv("../metadata/merged_metadata_popularity_features.csv")
df_1['url'] = df_1['url'].astype(str).str.strip()

df_2 = pd.read_csv("../metadata/embeddings_transcript.csv")
df_2['url'] = df_2['url'].astype(str).str.strip()

df = pd.merge(df_1, df_2, on="url")

metric = "log_Beautiful_norm"

# Rank once, highest metric first. Rows with a missing metric are dropped
# beforehand: sort_values places NaN last by default, so they would otherwise
# be picked up as the "least" group.
ranked = df.dropna(subset=[metric]).sort_values(by=metric, ascending=False)

post_cols = ['emotion_angry_mean', 'emotion_angry_90p', 'emotion_angry_std',
             'emotion_happy_mean', 'emotion_happy_90p', 'emotion_happy_std',
             'emotion_sad_mean', 'emotion_sad_90p', 'emotion_sad_std',
             'emotion_neutral_mean', 'emotion_neutral_90p', 'emotion_neutral_std',
             'strength_weak_mean', 'strength_weak_90p', 'strength_weak_std',
             'strength_neutral_mean', 'strength_neutral_90p', 'strength_neutral_std',
             'strength_strong_mean', 'strength_strong_90p', 'strength_strong_std',
             'positivity_negative_mean', 'positivity_negative_90p',
             'positivity_negative_std', 'positivity_neutral_mean',
             'positivity_neutral_90p', 'positivity_neutral_std',
             'positivity_positive_mean', 'positivity_positive_90p',
             'positivity_positive_std', 'pauses_mean', 'pauses_std', 'pauses_10p',
             'pauses_90p', 'turn_durations_mean', 'turn_durations_std',
             'turn_durations_10p', 'turn_durations_90p']

# .assign returns new frames, so the label is never written into a slice of
# `ranked` (which is what raises SettingWithCopyWarning).
most = ranked.iloc[:100, :].assign(rank="most")
least = ranked.iloc[-100:, :].assign(rank="least")
merged = pd.concat([most, least])

# Standardizing the features (important for PCA)
X = merged[post_cols]
X_scaled = StandardScaler().fit_transform(X)

# Performing PCA
pca = PCA(n_components=2)  # Reducing to 2 components for visualization
principalComponents = pca.fit_transform(X_scaled)
principalDf = pd.DataFrame(data=principalComponents, columns=['principal component 1', 'principal component 2'])

# Adding the 'rank' back for coloring. `merged` still carries the original row
# labels, so its index is reset to line up with principalDf's 0..n-1 index.
finalDf = pd.concat([principalDf, merged[['rank']].reset_index(drop=True)], axis=1)

# Plotting with Plotly Express
fig = px.scatter(finalDf, x='principal component 1', y='principal component 2', color='rank',
                 title="PCA Plot of Most and Least based on Specified Metrics")
fig.show()