In [1]:
import os
import json
import time

import numpy as np
import pandas as pd
from tqdm import tqdm
from sentence_transformers import SentenceTransformer

from dotenv import load_dotenv

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Load environment variables from .env
load_dotenv()

# Access variables
api_key = os.getenv("API_KEY")
huggingface_key = os.getenv("HUGGINGFACE_KEY")

In [2]:
movies = pd.read_csv('/Users/trihoang/Downloads/TMDB_all_movies.csv')

In [3]:
series = pd.read_csv('/Users/trihoang/Downloads/TMDB_tv_dataset_v3.csv')

In [4]:
#full movies
movies.shape

(1066506, 28)

In [5]:
#full_series
series.shape

(168639, 29)

In [6]:
used_cols = ['title', 'original_title','release_date', 'runtime', 'genres', 'overview', 'poster_path', 'imdb_rating', 'imdb_votes']

movies = movies[(movies[used_cols].isna().any(axis=1)==False)][used_cols]

In [7]:
movies['overview_word_num'] = movies['overview'].str.split().str.len()

In [8]:
filtered_movies = movies[movies['overview_word_num'].between(20, 250, 'both')]

In [9]:
filtered_movies.shape

(267669, 10)

In [10]:
used_series_cols = ['name', 'original_name', 'first_air_date', 'number_of_seasons', 'number_of_episodes', 'genres', 'overview', 'backdrop_path', 'vote_average', 'vote_count']

series = series[(series[used_series_cols].isna().any(axis=1)==False)][used_series_cols]

In [11]:
series.columns = ['title', 'original_title', 'first_air_date', 'number_of_seasons', 'number_of_episodes', 'genres', 'overview', 'poster_path', 'vote_average', 'vote_count']

In [12]:
series['overview_word_num'] = series['overview'].str.split().str.len()

In [13]:
filtered_series = series[series['overview_word_num'].between(20, 250, 'both')]

In [15]:
# combined_df = pd.concat([filtered_movies, filtered_series])
sampled_df = pd.concat([
    filtered_series.sample(frac=0.13, random_state=41),
    filtered_movies[filtered_movies['overview_word_num']<=50][used_cols].sample(frac=0.166, random_state=31),
    filtered_movies[(filtered_movies['overview_word_num']>50)&(filtered_movies['overview_word_num']<=120)][used_cols].sample(frac=0.17, random_state=31),
    filtered_movies[(filtered_movies['overview_word_num']>120)&(filtered_movies['overview_word_num']<=180)][used_cols].sample(frac=0.17, random_state=31),
    filtered_movies[(filtered_movies['overview_word_num']>180)][used_cols].sample(frac=0.17, random_state=31)
])


In [41]:
sampled_df.shape

(50026, 15)

In [None]:
import tiktoken
from tqdm import tqdm

def num_tokens_from_string(string: str, encoding_name: str) -> int:
    """Returns the number of tokens in a text string."""
    encoding = tiktoken.get_encoding(encoding_name)
    num_tokens = len(encoding.encode(string))
    return num_tokens

def calculate_cost(embedding_model, text_list: list[str]):
    model_cost = {
        'text-embedding-3-small': 0.02,
        'text-embedding-3-large': 0.13,
        'text-embedding-ada-002': 0.10
    }
    token_num  = []
    for row in tqdm(text_list, total=len(text_list)):
        token_num.append(num_tokens_from_string(row, "cl100k_base"))    

    return sum(token_num) / 1_000_000 * model_cost[embedding_model]

In [85]:
print(calculate_cost('text-embedding-3-small', sampled_df['overview'].tolist()))

100%|██████████| 50026/50026 [00:01<00:00, 28818.20it/s]

0.07120362000000001





In [11]:
import openai
import asyncio
import nest_asyncio

In [88]:
nest_asyncio.apply()

In [12]:
client = openai.AsyncOpenAI(api_key=api_key)

In [None]:
sampled_df['embeddings'] = ''

In [None]:
sampled_df = sampled_df.iloc[26:]

In [140]:
sampled_df.reset_index(inplace=True)
sampled_df.drop(columns='index', inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sampled_df.drop(columns='index', inplace=True)


In [None]:
# model distillation, eval in openai
# maybe test the 100k version with embedding-large later
step = 1000
checkpoint_interval = 5000

def save_checkpoint(df: pd.DataFrame, file_path="checkpoint.pkl"):
    df.to_pickle(file_path)

async def get_openai_response(text):
    embedding = await client.embeddings.create(input=[text],
                                               model="text-embedding-3-small")
    return embedding.data[0].embedding

async def process_all_prompts(texts):
    tasks = [get_openai_response(text) for text in texts]  # Create async tasks
    return await asyncio.gather(*tasks)  # Run all tasks at once

for i in tqdm(range(0, len(sampled_df), step)):
    print(i)
    embeddings = asyncio.run(process_all_prompts(sampled_df[i:i+step]))
    sampled_df.iloc[i:i+step, -1] = pd.Series(embeddings, index=sampled_df.index[i:i+step])
    time.sleep(15)

    # Save checkpoint at every interval
    if (i + step) % checkpoint_interval == 0:
        save_checkpoint(sampled_df, f'checkpoint_{i+step}.pkl')
        print(f"Checkpoint saved at {i + step} rows.")

sampled_df.to_pickle("openai_embeddings.pkl")
print("Results saved to openai_embeddings.csv")

  0%|          | 0/50 [00:00<?, ?it/s]

0


  2%|▏         | 1/50 [00:47<38:50, 47.57s/it]

1000


  4%|▍         | 2/50 [01:09<25:58, 32.47s/it]

2000


  6%|▌         | 3/50 [02:28<42:13, 53.91s/it]

3000


  8%|▊         | 4/50 [02:52<32:03, 41.82s/it]

4000


 10%|█         | 5/50 [03:27<29:38, 39.52s/it]

Checkpoint saved at 5000 rows.
5000


 12%|█▏        | 6/50 [03:49<24:37, 33.59s/it]

6000


 14%|█▍        | 7/50 [04:32<26:13, 36.60s/it]

7000


 16%|█▌        | 8/50 [05:17<27:27, 39.23s/it]

8000


 18%|█▊        | 9/50 [05:41<23:32, 34.45s/it]

9000


 20%|██        | 10/50 [06:07<21:22, 32.05s/it]

Checkpoint saved at 10000 rows.
10000


 22%|██▏       | 11/50 [06:31<19:07, 29.42s/it]

11000


 24%|██▍       | 12/50 [06:53<17:16, 27.28s/it]

12000


 26%|██▌       | 13/50 [07:15<15:44, 25.54s/it]

13000


 28%|██▊       | 14/50 [07:36<14:32, 24.23s/it]

14000


 30%|███       | 15/50 [07:59<13:54, 23.83s/it]

Checkpoint saved at 15000 rows.
15000


 32%|███▏      | 16/50 [08:22<13:19, 23.52s/it]

16000


 34%|███▍      | 17/50 [08:45<12:48, 23.29s/it]

17000


 36%|███▌      | 18/50 [09:20<14:25, 27.03s/it]

18000


 38%|███▊      | 19/50 [10:02<16:11, 31.33s/it]

19000


 40%|████      | 20/50 [10:28<14:57, 29.90s/it]

Checkpoint saved at 20000 rows.
20000


 42%|████▏     | 21/50 [11:00<14:45, 30.54s/it]

21000


 44%|████▍     | 22/50 [11:37<15:06, 32.38s/it]

22000


 46%|████▌     | 23/50 [12:02<13:33, 30.11s/it]

23000


 48%|████▊     | 24/50 [12:47<15:03, 34.75s/it]

24000


 50%|█████     | 25/50 [13:11<13:06, 31.44s/it]

Checkpoint saved at 25000 rows.
25000


 52%|█████▏    | 26/50 [13:32<11:20, 28.34s/it]

26000


 54%|█████▍    | 27/50 [14:16<12:41, 33.10s/it]

27000


 56%|█████▌    | 28/50 [14:42<11:16, 30.74s/it]

28000


 58%|█████▊    | 29/50 [15:15<11:00, 31.47s/it]

29000


 60%|██████    | 30/50 [15:41<09:59, 29.99s/it]

Checkpoint saved at 30000 rows.
30000


 62%|██████▏   | 31/50 [16:10<09:24, 29.69s/it]

31000


 64%|██████▍   | 32/50 [16:32<08:12, 27.35s/it]

32000


 66%|██████▌   | 33/50 [16:56<07:28, 26.39s/it]

33000


 68%|██████▊   | 34/50 [17:20<06:50, 25.64s/it]

34000


 70%|███████   | 35/50 [18:09<08:10, 32.67s/it]

Checkpoint saved at 35000 rows.
35000


 72%|███████▏  | 36/50 [18:57<08:41, 37.25s/it]

36000


 74%|███████▍  | 37/50 [20:08<10:15, 47.33s/it]

37000


 76%|███████▌  | 38/50 [20:45<08:51, 44.25s/it]

38000


 78%|███████▊  | 39/50 [21:12<07:08, 38.94s/it]

39000


 80%|████████  | 40/50 [24:03<13:07, 78.73s/it]

Checkpoint saved at 40000 rows.
40000


 82%|████████▏ | 41/50 [24:28<09:23, 62.65s/it]

41000


 84%|████████▍ | 42/50 [25:39<08:40, 65.10s/it]

42000


 86%|████████▌ | 43/50 [43:47<43:22, 371.80s/it]

43000


 88%|████████▊ | 44/50 [44:30<27:20, 273.41s/it]

44000


 90%|█████████ | 45/50 [44:55<16:33, 198.72s/it]

Checkpoint saved at 45000 rows.
45000


 92%|█████████▏| 46/50 [45:21<09:47, 146.82s/it]

46000


 94%|█████████▍| 47/50 [45:45<05:29, 109.97s/it]

47000


 96%|█████████▌| 48/50 [46:11<02:49, 84.85s/it] 

48000


 98%|█████████▊| 49/50 [46:35<01:06, 66.63s/it]

49000


100%|██████████| 50/50 [47:01<00:00, 56.43s/it]

Checkpoint saved at 50000 rows.





Results saved to openai_embeddings.csv


In [50]:
embeddings_df = pd.read_pickle("openai_embeddings.pkl")

In [110]:
shuffled_embeddings_df = embeddings_df.sample(frac=1, random_state=42)

In [None]:
shuffled_embeddings_df.iloc[:25000]['embeddings']

Unnamed: 0,embeddings
33553,"[0.028374338522553444, 0.0070473141968250275, ..."
9427,"[0.012964552268385887, 0.03231743350625038, -0..."
199,"[0.0018504128092899919, -0.005899613257497549,..."
12447,"[0.015956750139594078, 0.024705497547984123, 0..."
39489,"[0.025965319946408272, 0.05835741385817528, -0..."
...,...
7110,"[0.0397479310631752, 0.07066299021244049, -0.0..."
46643,"[0.027213390916585922, 0.060060519725084305, 0..."
5440,"[0.01369396410882473, 0.017240161076188087, -0..."
33017,"[-0.000996461370959878, 0.024415280669927597, ..."


In [122]:
import numpy as np

embeddings_1 = np.array(shuffled_embeddings_df.iloc[:25000]['embeddings'].tolist(), dtype=np.float32)
embeddings_2 = np.array(embeddings_df['embeddings'].tolist(), dtype=np.float32)

In [124]:
from sentence_transformers.util import pytorch_cos_sim
import torch


# Convert NumPy embeddings to PyTorch tensors
embedding_tensor1 = torch.tensor(embeddings_1)
embedding_tensor2 = torch.tensor(embeddings_2)

# Compute cosine similarity
similarity_score = pytorch_cos_sim(embedding_tensor1, embedding_tensor2)
print("Cosine Similarity:", similarity_score)

Cosine Similarity: tensor([[0.2962, 0.1831, 0.1834,  ..., 0.3664, 0.1478, 0.2298],
        [0.0980, 0.2334, 0.2922,  ..., 0.2832, 0.0750, 0.1842],
        [0.1774, 0.1822, 0.1762,  ..., 0.1841, 0.2156, 0.2734],
        ...,
        [0.1667, 0.1929, 0.3352,  ..., 0.3547, 0.0802, 0.2822],
        [0.1019, 0.2300, 0.3428,  ..., 0.3088, 0.0975, 0.2038],
        [0.2708, 0.3231, 0.3365,  ..., 0.4159, 0.1464, 0.3002]])


In [131]:
similarity_score.shape

torch.Size([25000, 50000])

In [125]:
sorted_similarity_indices = torch.argsort(similarity_score,  dim=1)

In [188]:
negative_pair_index_list = []

for i in tqdm(range(len(similarity_score))):

    # Start with the smallest similarity index for the current row
    neg_j = torch.randint(0, 500, (1,)).item()

    neg_index = int(sorted_similarity_indices[i][neg_j])

    # Ensure the index is unique
    while neg_index in negative_pair_index_list:
        neg_index = int(sorted_similarity_indices[i][torch.randint(0, 3000, (1,)).item()])

    negative_pair_index_list.append(neg_index)


100%|██████████| 25000/25000 [00:10<00:00, 2497.51it/s]


In [200]:
positive_pair_index_list = []

for i in tqdm(range(len(similarity_score))):

    pos_j = torch.randint(-30, -1, (1,)).item()

    neg_index = int(sorted_similarity_indices[i][neg_j])
    # pos_index = int(sorted_similarity_indices[i][pos_j])

    while pos_index in positive_pair_index_list:
        pos_index = int(sorted_similarity_indices[i][torch.randint(-60, -1, (1,)).item()])

    positive_pair_index_list.append(pos_index)

100%|██████████| 25000/25000 [00:05<00:00, 4721.03it/s]


In [151]:
shuffled_embeddings_df = shuffled_embeddings_df.iloc[:25000]
shuffled_embeddings_df.shape

(25000, 16)

In [201]:
shuffled_embeddings_df['negative_overview'] = embeddings_df['overview'].iloc[negative_pair_index_list].values
shuffled_embeddings_df['positive_overview'] = embeddings_df['overview'].iloc[positive_pair_index_list].values

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  shuffled_embeddings_df['negative_overview'] = embeddings_df['overview'].iloc[negative_pair_index_list].values
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  shuffled_embeddings_df['positive_overview'] = embeddings_df['overview'].iloc[positive_pair_index_list].values


In [202]:
shuffled_embeddings_df[['overview', 'negative_overview', 'positive_overview']]

Unnamed: 0,overview,negative_overview,positive_overview
33553,Gabrielle has just joined a prestigious news p...,What caused Building 7 to collapse on 9/11? Dr...,A woman with a shadowed past lures a couple in...
9427,"Fay reconnects with her sister, Alice, as she ...","«Svejk v civilu» (also known as ""Svejk as a Ci...",A television star goes home to Texas for her f...
199,After a billionaire engineer witnesses his bes...,Hitotsu Yane no Shita is a Japanese television...,"When one of his former colleagues is murdered,..."
12447,A large group of criminals escapes from prison...,Clare Balding rounds up every GB medal as the ...,"During a routine prison work detail, convict P..."
39489,"The sun is setting and we see Dave, an artist,...","Recording the journey of Raisa, a great Indone...","After the murder of his beloved wife, a man in..."
...,...,...,...
7110,"Home with their newly-formed family, happy par...",Following his discharge from the Navy for hitt...,Jake and Desmond (Paranormal Exterminators) te...
46643,November 1970. Several human remains appear in...,"In New York City's Chinatown, an ornery, chain...",Prisoners await execution by firing squad when...
5440,"Léo goes on vacation at his cousin's, in a fis...",Putting together stunning visual and performan...,On her trip back from a working holiday abroad...
33017,Story set in the Middle Ages. A page and the d...,"It’s summer again, and everyone’s favorite Jun...",A teenage girl in Medieval England navigates l...


In [173]:
# Shuffle the dataset
df = shuffled_embeddings_df[['overview', 'negative_overview', 'positive_overview']].sample(frac=1, random_state=42).reset_index(drop=True)

# Split into train, validation, and test sets (e.g., 80% train, 10% validation, 10% test)
train_frac = 0.8
valid_frac = 0.1
test_frac = 0.1

# define train and validation size
train_size = int(train_frac * len(df))
valid_size = int(valid_frac * len(df))

# create train, validation, and test datasets
df_train = df[:train_size]
df_valid = df[train_size:train_size + valid_size]
df_test = df[train_size + valid_size:]

In [None]:
from huggingface_hub import login

login(huggingface_key)

In [219]:
from datasets import DatasetDict, Dataset

train_ds = Dataset.from_pandas(df_train)
valid_ds = Dataset.from_pandas(df_valid)
test_ds = Dataset.from_pandas(df_test)

# Combine into a DatasetDict
dataset_dict = DatasetDict({
    'train': train_ds,
    'validation': valid_ds,
    'test': test_ds
})

dataset_dict.push_to_hub("trihoang131/movie_dataset_50K")

Creating parquet from Arrow format: 100%|██████████| 20/20 [00:00<00:00, 294.38ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:07<00:00,  7.05s/it]
Creating parquet from Arrow format: 100%|██████████| 3/3 [00:00<00:00, 152.56ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:03<00:00,  3.07s/it]
Creating parquet from Arrow format: 100%|██████████| 3/3 [00:00<00:00, 148.17ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:03<00:00,  3.47s/it]


CommitInfo(commit_url='https://huggingface.co/datasets/trihoang131/movie_dataset_50K/commit/060877eb323903c322d0733b3c9bfaaa698311a6', commit_message='Upload dataset', commit_description='', oid='060877eb323903c322d0733b3c9bfaaa698311a6', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/trihoang131/movie_dataset_50K', endpoint='https://huggingface.co', repo_type='dataset', repo_id='trihoang131/movie_dataset_50K'), pr_revision=None, pr_num=None)