In [1]:
import ast
import csv
import os
import pickle
import time

import numpy as np
import pandas as pd
import requests
from scipy.spatial.distance import cosine
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Read the OpenAI API key from the environment so the secret is never stored in
# the notebook. If the variable is unset this is None and the API rejects the
# request. NOTE(review): the key that used to be hard-coded on this line has
# been exposed to anyone with access to the file and should be revoked.
api_key = os.environ.get("OPENAI_API_KEY")

def get_embeddings(text, timeout=60):
    """Request an embedding for ``text`` from the OpenAI embeddings endpoint.

    Parameters
    ----------
    text : str
        Text to embed; sent as the ``input`` field of the request body.
    timeout : float, optional
        Seconds to wait for the HTTP request before giving up. Without a
        timeout a stalled connection would block the notebook indefinitely.

    Returns
    -------
    numpy.ndarray or None
        The first embedding of the response as an array, or ``None`` when the
        request fails (network error, timeout, or a non-200 status code).
    """
    headers = {"Authorization": f"Bearer {api_key}"}
    data = {
        "input": text,
        "model": "text-embedding-3-large"
    }
    try:
        response = requests.post(
            "https://api.openai.com/v1/embeddings",
            headers=headers,
            json=data,
            timeout=timeout,
        )
    except requests.RequestException as exc:
        # Report transport-level failures the same way as HTTP errors: the
        # caller treats None as "retry", so one bad request must not abort a batch.
        print(f"Error with text: {text[:30]}... Request failed: {exc}")
        return None

    if response.status_code == 200:
        embedding = response.json()['data'][0]['embedding']
        return np.array(embedding)

    print(f"Error with text: {text[:30]}... Status code: {response.status_code}")
    return None




def similarity(text_embeddings_real, text_embeddings_synthetic):
    """Average pairwise cosine similarity between groups of embeddings.

    For every (real group, synthetic group) pair, the cosine similarity of
    each real embedding with each synthetic embedding is computed and averaged.

    The pairwise matrix is computed with NumPy directly. This notebook defines
    its own two-vector ``cosine_similarity`` further down, which replaces the
    scikit-learn function of the same name imported at the top, so calling
    ``cosine_similarity`` here on whole groups would not yield a matrix.

    Parameters
    ----------
    text_embeddings_real, text_embeddings_synthetic : iterable
        Each item is one group: a sequence of equally long embedding vectors
        (a single vector is treated as a group of one).

    Returns
    -------
    (mean_similarity, average_similarities)
        ``average_similarities`` holds one average per (real, synthetic) group
        pair, real groups in the outer loop; ``mean_similarity`` is their mean.
    """
    def _unit_rows(group):
        # Scale every row to unit length so a dot product equals the cosine.
        rows = np.atleast_2d(np.asarray(group, dtype=float))
        norms = np.linalg.norm(rows, axis=1, keepdims=True)
        norms[norms == 0] = 1.0  # keep all-zero rows at zero instead of dividing by 0
        return rows / norms

    real_groups = [_unit_rows(group) for group in text_embeddings_real]
    synthetic_groups = [_unit_rows(group) for group in text_embeddings_synthetic]

    average_similarities = [
        np.mean(real @ synthetic.T)
        for real in real_groups
        for synthetic in synthetic_groups
    ]

    mean_similarity = np.mean(average_similarities)
    return mean_similarity, average_similarities

def cosine_similarity(vec1, vec2):
    """Return the cosine similarity of two vectors (1 minus SciPy's cosine distance).

    Note: this definition rebinds the name ``cosine_similarity`` that the
    notebook also imports from ``sklearn.metrics.pairwise``.
    """
    distance = cosine(vec1, vec2)
    return 1 - distance

In [3]:
import re
import emoji

def remove_urls(text):
    """Return ``text`` with every token starting with http, https or www removed."""
    return re.sub(r'http\S+|www\S+|https\S+', '', text)

def remove_emojis(text):
    """Return ``text`` with every emoji replaced by the empty string."""
    without_emojis = emoji.replace_emoji(text, replace='')
    return without_emojis

def text_cleaning(text):
    """Clean every row of ``text``: strip URLs first, then emojis.

    Returns a new list with one cleaned string per input row, in input order.
    """
    return [remove_emojis(remove_urls(row)) for row in text]

def _embed_texts(texts, fixed_message):
    """Embed each text once, then retry every failed request a single time."""
    vectors = [get_embeddings(text) for text in texts]
    for i, vector in enumerate(vectors):
        if vector is None:
            print("Retrying...")
            vectors[i] = get_embeddings(texts[i])
            # Only report success when the retry actually produced an embedding.
            if vectors[i] is not None:
                print(fixed_message)
            else:
                print(f"Retry failed for text at position {i}; embedding left as None")
    return vectors


def embeddings(data, real_path, synthetic_path):
    """Embed the real and synthetic texts of ``data`` and pickle both results.

    Parameters
    ----------
    data : pandas.DataFrame
        Column ``1`` is used as the real texts and column ``0`` as the
        synthetic texts. Both are cleaned with ``text_cleaning`` before embedding.
    real_path, synthetic_path : str
        Destination files for the pickled lists of real / synthetic embeddings.

    Returns
    -------
    (real_embeddings, synthetic_embeddings)
        Two lists with one embedding per text. An entry is ``None`` if the
        request for that text failed twice; a message is printed in that case.
    """
    text_real = text_cleaning(data[1].tolist())
    text_synthetic = text_cleaning(data[0].tolist())

    real_embeddings = _embed_texts(text_real, "Fixed!")
    synthetic_embeddings = _embed_texts(text_synthetic, "Fixed")

    with open(real_path, 'wb') as f:
        pickle.dump(real_embeddings, f)
    with open(synthetic_path, 'wb') as f:
        pickle.dump(synthetic_embeddings, f)
    return real_embeddings, synthetic_embeddings

def simat(real_embeddings, synthetic_embeddings, group_size=3):
    """Build one cosine-similarity matrix per consecutive group of embeddings.

    The embeddings are consumed in consecutive groups of ``group_size``. For
    group ``k``, entry ``[i][j]`` of its matrix is the cosine similarity of the
    ``i``-th real embedding and the ``j``-th synthetic embedding of that group.

    Parameters
    ----------
    real_embeddings, synthetic_embeddings : sequence
        Embedding vectors, indexed in parallel. ``synthetic_embeddings`` must
        be at least as long as the part of ``real_embeddings`` that is used.
    group_size : int, optional
        Number of embeddings per group (default 3, the original fixed size).
        Trailing embeddings that do not fill a whole group are ignored.

    Returns
    -------
    list of numpy.ndarray
        ``len(real_embeddings) // group_size`` matrices of shape
        ``(group_size, group_size)``.
    """
    if group_size < 1:
        raise ValueError("group_size must be a positive integer")

    num_matrices = len(real_embeddings) // group_size
    similarity_matrices = []
    for k in range(num_matrices):
        start_idx = k * group_size
        similarity_matrix = np.zeros((group_size, group_size))
        for i in range(group_size):
            for j in range(group_size):
                similarity_matrix[i][j] = cosine_similarity(
                    real_embeddings[start_idx + i],
                    synthetic_embeddings[start_idx + j],
                )
        similarity_matrices.append(similarity_matrix)
    return similarity_matrices

## TikTok

In [None]:
# TikTok / Generic prompt: embed the texts of the generation CSV and cache both
# embedding lists as pickles. The paths are absolute and machine-specific.
file_path_synthetic = 'C:/Users/I6240624/Documents/BISS/Master Thesis/Code/DarianOthmanMasterThesis/Generation_Dutch/Generic_t=1_P=1_TikTok_Dutch.csv'
data = pd.read_csv(file_path_synthetic, sep=';', header=None)
real_path = "C:/Users/I6240624/Documents/BISS/Master Thesis/Code/DarianOthmanMasterThesis/Emb_Dutch/Generic_t=1_P=1_TikTok_Dutch_emb_real.pkl"
synthetic_path = "C:/Users/I6240624/Documents/BISS/Master Thesis/Code/DarianOthmanMasterThesis/Emb_Dutch/Generic_t=1_P=1_TikTok_Dutch_emb_synthetic.pkl"
gen = embeddings(data, real_path, synthetic_path)
real_embeddings_tt_gen, synthetic_embeddings_tt_gen = gen

In [None]:
# TikTok / Content-Aware prompt: embed the texts of the generation CSV and cache
# both embedding lists as pickles. The paths are absolute and machine-specific.
file_path_synthetic = 'C:/Users/I6240624/Documents/BISS/Master Thesis/Code/DarianOthmanMasterThesis/Generation_Dutch/Content_Aware_t=1_P=1_TikTok_Dutch.csv'
data = pd.read_csv(file_path_synthetic, sep=';', header=None)
real_path = "C:/Users/I6240624/Documents/BISS/Master Thesis/Code/DarianOthmanMasterThesis/Emb_Dutch/Content_Aware_t=1_P=1_TikTok_Dutch_emb_real.pkl"
synthetic_path = "C:/Users/I6240624/Documents/BISS/Master Thesis/Code/DarianOthmanMasterThesis/Emb_Dutch/Content_Aware_t=1_P=1_TikTok_Dutch_emb_synthetic.pkl"
gen = embeddings(data, real_path, synthetic_path)
real_embeddings_tt_ca, synthetic_embeddings_tt_ca = gen

## Instagram

In [None]:
# Instagram / Generic prompt: embed the texts of the generation CSV and cache
# both embedding lists as pickles. The paths are absolute and machine-specific.
file_path_synthetic = 'C:/Users/I6240624/Documents/BISS/Master Thesis/Code/DarianOthmanMasterThesis/Generation_Dutch/Generic_t=1_P=1_Instagram_Dutch.csv'
data = pd.read_csv(file_path_synthetic, sep=';', header=None)
real_path = "C:/Users/I6240624/Documents/BISS/Master Thesis/Code/DarianOthmanMasterThesis/Emb_Dutch/Generic_t=1_P=1_Instagram_Dutch_emb_real.pkl"
synthetic_path = "C:/Users/I6240624/Documents/BISS/Master Thesis/Code/DarianOthmanMasterThesis/Emb_Dutch/Generic_t=1_P=1_Instagram_Dutch_emb_synthetic.pkl"
gen = embeddings(data, real_path, synthetic_path)
real_embeddings_ins_gen, synthetic_embeddings_ins_gen = gen

In [None]:
# Instagram / Content-Aware prompt: embed the texts of the generation CSV and
# cache both embedding lists as pickles. The paths are absolute and machine-specific.
file_path_synthetic = 'C:/Users/I6240624/Documents/BISS/Master Thesis/Code/DarianOthmanMasterThesis/Generation_Dutch/Content_Aware_t=1_P=1_Instagram_Dutch.csv'
data = pd.read_csv(file_path_synthetic, sep=';', header=None)
real_path = "C:/Users/I6240624/Documents/BISS/Master Thesis/Code/DarianOthmanMasterThesis/Emb_Dutch/Content_Aware_t=1_P=1_Instagram_Dutch_emb_real.pkl"
synthetic_path = "C:/Users/I6240624/Documents/BISS/Master Thesis/Code/DarianOthmanMasterThesis/Emb_Dutch/Content_Aware_t=1_P=1_Instagram_Dutch_emb_synthetic.pkl"
gen = embeddings(data, real_path, synthetic_path)
real_embeddings_ins_ca, synthetic_embeddings_ins_ca = gen

## YouTube

In [None]:
# YouTube / Generic prompt: embed the texts of the generation CSV and cache both
# embedding lists as pickles. The paths are absolute and machine-specific.
file_path_synthetic = 'C:/Users/I6240624/Documents/BISS/Master Thesis/Code/DarianOthmanMasterThesis/Generation_Dutch/Generic_t=1_P=1_YouTube_Dutch.csv'
data = pd.read_csv(file_path_synthetic, sep=';', header=None)
real_path = "C:/Users/I6240624/Documents/BISS/Master Thesis/Code/DarianOthmanMasterThesis/Emb_Dutch/Generic_t=1_P=1_YouTube_Dutch_emb_real.pkl"
synthetic_path = "C:/Users/I6240624/Documents/BISS/Master Thesis/Code/DarianOthmanMasterThesis/Emb_Dutch/Generic_t=1_P=1_YouTube_Dutch_emb_synthetic.pkl"
gen = embeddings(data, real_path, synthetic_path)
real_embeddings_yt_gen, synthetic_embeddings_yt_gen = gen

In [None]:
# YouTube / Content-Aware prompt: embed the texts of the generation CSV and
# cache both embedding lists as pickles. The paths are absolute and machine-specific.
file_path_synthetic = 'C:/Users/I6240624/Documents/BISS/Master Thesis/Code/DarianOthmanMasterThesis/Generation_Dutch/Content_Aware_t=1_P=1_YouTube_Dutch.csv'
data = pd.read_csv(file_path_synthetic, sep=';', header=None)
real_path = "C:/Users/I6240624/Documents/BISS/Master Thesis/Code/DarianOthmanMasterThesis/Emb_Dutch/Content_Aware_t=1_P=1_YouTube_Dutch_emb_real.pkl"
synthetic_path = "C:/Users/I6240624/Documents/BISS/Master Thesis/Code/DarianOthmanMasterThesis/Emb_Dutch/Content_Aware_t=1_P=1_YouTube_Dutch_emb_synthetic.pkl"
gen = embeddings(data, real_path, synthetic_path)
real_embeddings_yt_ca, synthetic_embeddings_yt_ca = gen

In [None]:
# Reload the cached embedding lists written by the embedding cells above.
# The directory is absolute and machine-specific.
_EMB_DIR = "C:/Users/I6240624/Documents/BISS/Master Thesis/Code/DarianOthmanMasterThesis/Emb_Dutch"


def _load_embeddings(file_name):
    """Unpickle one embedding file from ``_EMB_DIR`` and close the file afterwards.

    NOTE: ``pickle.load`` can execute arbitrary code; only load files that this
    notebook produced itself.
    """
    with open(f"{_EMB_DIR}/{file_name}", "rb") as f:
        return pickle.load(f)


real_embeddings_ins_gen = _load_embeddings("Generic_t=1_P=1_Instagram_Dutch_emb_real.pkl")
synthetic_embeddings_ins_gen = _load_embeddings("Generic_t=1_P=1_Instagram_Dutch_emb_synthetic.pkl")
real_embeddings_ins_ca = _load_embeddings("Content_Aware_t=1_P=1_Instagram_Dutch_emb_real.pkl")
synthetic_embeddings_ins_ca = _load_embeddings("Content_Aware_t=1_P=1_Instagram_Dutch_emb_synthetic.pkl")
real_embeddings_tt_gen = _load_embeddings("Generic_t=1_P=1_TikTok_Dutch_emb_real.pkl")
synthetic_embeddings_tt_gen = _load_embeddings("Generic_t=1_P=1_TikTok_Dutch_emb_synthetic.pkl")
real_embeddings_tt_ca = _load_embeddings("Content_Aware_t=1_P=1_TikTok_Dutch_emb_real.pkl")
synthetic_embeddings_tt_ca = _load_embeddings("Content_Aware_t=1_P=1_TikTok_Dutch_emb_synthetic.pkl")
real_embeddings_yt_gen = _load_embeddings("Generic_t=1_P=1_YouTube_Dutch_emb_real.pkl")
synthetic_embeddings_yt_gen = _load_embeddings("Generic_t=1_P=1_YouTube_Dutch_emb_synthetic.pkl")
real_embeddings_yt_ca = _load_embeddings("Content_Aware_t=1_P=1_YouTube_Dutch_emb_real.pkl")
synthetic_embeddings_yt_ca = _load_embeddings("Content_Aware_t=1_P=1_YouTube_Dutch_emb_synthetic.pkl")

In [None]:
# Build the per-group similarity matrices for every platform / prompt-strategy
# combination (ins = Instagram, tt = TikTok, yt = YouTube; gen = Generic,
# ca = Content-Aware).
sim_ins_gen = simat(real_embeddings_ins_gen, synthetic_embeddings_ins_gen)
sim_ins_ca = simat(real_embeddings_ins_ca, synthetic_embeddings_ins_ca)
sim_tt_gen = simat(real_embeddings_tt_gen, synthetic_embeddings_tt_gen)
sim_tt_ca = simat(real_embeddings_tt_ca, synthetic_embeddings_tt_ca)
sim_yt_gen = simat(real_embeddings_yt_gen, synthetic_embeddings_yt_gen)
sim_yt_ca = simat(real_embeddings_yt_ca, synthetic_embeddings_yt_ca)

In [None]:
# Summarise every Instagram / Generic similarity matrix by its three largest
# and three smallest entries, together with the mean of each triple.
results = []
for i, matrix in enumerate(sim_ins_gen):
    # Work on the flattened matrix; argsort orders its entries ascending.
    flat_scores = matrix.flatten()
    sorted_indices = np.argsort(flat_scores)

    highest_three_indices = sorted_indices[-3:]
    lowest_three_indices = sorted_indices[:3]
    highest_three_values = flat_scores[highest_three_indices]
    lowest_three_values = flat_scores[lowest_three_indices]

    average_of_top_3 = highest_three_values.mean()
    average_of_bottom_3 = lowest_three_values.mean()

    results.append([
        i,
        highest_three_values.tolist(),
        average_of_top_3,
        lowest_three_values.tolist(),
        average_of_bottom_3,
    ])

result_columns = [
    'Set Index',
    'Top 3 Similarity Scores',
    'Average of Top 3',
    'Bottom 3 Similarity Scores',
    'Average of Bottom 3',
]
results_df = pd.DataFrame(results, columns=result_columns)
results_df

In [8]:
# Cosine similarity between all synthetic and all real embeddings, each
# concatenated into one long vector.
# NOTE(review): `synthetic_embeddings` and `real_embeddings` are not assigned in
# any visible cell of this notebook; this cell presumably relied on leftover
# kernel state. Confirm which embedding lists were meant before re-running.
synthetic_flat = np.array(synthetic_embeddings).ravel()
real_flat = np.array(real_embeddings).ravel()
1 - cosine(synthetic_flat, real_flat)

0.48518060052193135

In [None]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
# Cluster every embedding set with k-means and visualise all centroids in one
# shared 2-D t-SNE projection.
list_embeddings = [
    real_embeddings_ins_gen, synthetic_embeddings_ins_gen,
    real_embeddings_ins_ca, synthetic_embeddings_ins_ca,
    real_embeddings_tt_gen, synthetic_embeddings_tt_gen,
    real_embeddings_tt_ca, synthetic_embeddings_tt_ca,
    real_embeddings_yt_gen, synthetic_embeddings_yt_gen,
    real_embeddings_yt_ca, synthetic_embeddings_yt_ca,
]
label_names = ['Instagram Generic Real', 'Instagram Generic Synthetic', 'Instagram Content Aware Real', 'Instagram Content Aware Synthetic', 'TikTok Generic Real', 'TikTok Generic Synthetic', 'TikTok Content Aware Real', 'TikTok Content Aware Synthetic', 'YouTube Generic Real', 'YouTube Generic Synthetic', 'YouTube Content Aware Real', 'YouTube Content Aware Synthetic']
color_map = {'Instagram Generic Real': 'blue', 'Instagram Generic Synthetic': 'red', 'Instagram Content Aware Real': 'green', 'Instagram Content Aware Synthetic': 'yellow', 'TikTok Generic Real': 'purple', 'TikTok Generic Synthetic': 'orange', 'TikTok Content Aware Real': 'brown', 'TikTok Content Aware Synthetic': 'pink', 'YouTube Generic Real': 'black', 'YouTube Generic Synthetic': 'grey', 'YouTube Content Aware Real': 'cyan', 'YouTube Content Aware Synthetic': 'magenta'}

# Step 1: k-means per embedding set. The loop variables deliberately avoid the
# names `list` (builtin) and `embeddings` (the notebook's embedding function),
# which the previous version overwrote and thereby broke for later re-runs.
all_centers = []
labels = []
for group_name, group_embeddings in zip(label_names, list_embeddings):
    kmeans = KMeans(n_clusters=50, random_state=42)
    kmeans.fit(group_embeddings)
    cluster_centers = kmeans.cluster_centers_
    all_centers.append(cluster_centers)
    labels.extend([group_name] * len(cluster_centers))

# Step 2: one t-SNE fit over ALL centroids. Separate fits per group would each
# produce their own arbitrary coordinate system, so points of different groups
# could not be compared on the same axes.
tsne = TSNE(n_components=2, random_state=42)
tsne_results = tsne.fit_transform(np.vstack(all_centers))

# Plotting: one scatter call per group gives exactly one legend entry each.
label_array = np.array(labels)
plt.figure(figsize=(12, 8))
for group_name in label_names:
    group_mask = label_array == group_name
    plt.scatter(
        tsne_results[group_mask, 0],
        tsne_results[group_mask, 1],
        color=color_map[group_name],
        label=f'{group_name}',
        alpha=0.7,
    )
plt.legend()

plt.title('t-SNE visualization of k-means centroids')
plt.xlabel('t-SNE axis 1')
plt.ylabel('t-SNE axis 2')
plt.show()