In [None]:
!pip install transformers

In [None]:
# import extract_text
import os
from tqdm import tqdm
import pandas as pd
# from preprocess import save_df_to_pickle
import pickle
import gc

# Download and process data

In [None]:
import requests
import zipfile

def download_file_from_dropbox(url, destination, timeout=60, chunk_size=1 << 20):
    """Download ``url`` and write the response body to ``destination``.

    The body is streamed to disk chunk by chunk instead of being held in
    memory as a whole, and the request carries a timeout so a stalled
    connection cannot block the notebook forever.

    Args:
        url: Address to fetch with an HTTP GET.
        destination: Path of the file to (over)write in binary mode.
        timeout: Seconds ``requests`` waits on connect/read before raising.
        chunk_size: Number of bytes requested per streamed chunk.

    Returns:
        True when the server answered 200 and the body was written;
        False otherwise (a message with the status code is printed and
        ``destination`` is left untouched).
    """
    with requests.get(url, stream=True, timeout=timeout) as r:
        if r.status_code != 200:
            print(f"Failed to download file, status code: {r.status_code}")
            return False
        with open(destination, 'wb') as f:
            for chunk in r.iter_content(chunk_size=chunk_size):
                f.write(chunk)
    return True

def unzip_file(zip_filepath, dest_directory):
    """Extract every member of the archive at ``zip_filepath`` into ``dest_directory``."""
    archive = zipfile.ZipFile(zip_filepath, 'r')
    try:
        archive.extractall(path=dest_directory)
    finally:
        # Mirrors what leaving a ``with`` block would do: always release the file handle.
        archive.close()

# Dropbox link to fetch; the result is stored locally under the name `text.pkl`.
dropbox_url = "https://www.dropbox.com/scl/fi/pd4lj3edgdwdeun3ngdvd/text.pkl?rlkey=7xu6jr2nktlxxahp5mse6vpkh&dl=1"
# Local path the downloaded bytes are written to (relative to the working directory).
destination = 'text.pkl'
# Runs at cell execution time: performs the HTTP download.
download_file_from_dropbox(dropbox_url, destination)
# Unzipping is disabled: the call below is intentionally left commented out.
# unzip_file(destination, './data')

In [None]:
# def extract():
#     directory_path = "./data"
#     output_dir = "./text"

#     files = sorted(os.listdir(directory_path))
#     columns = ["id", "lang", "title", "summary", "article_count"]
#     data = pd.DataFrame(columns=columns)
#     data.set_index("id", inplace=True)

#     for filename in tqdm(files, ncols=100, desc="Processing"):
#         file_path = os.path.join(directory_path, filename)
#         file_name = os.path.splitext(filename)[0]
#         if file_path.endswith(".pkl"):
#             df = extract_text.extract_text(file_path)
#             data = pd.concat([data, df])

#     save_df_to_pickle(df, output_dir, 'text')

In [None]:
# extract()

# Embed

In [None]:
import torch
# Use the first CUDA device when one is available, otherwise fall back to the CPU.
device = "cuda:0" if torch.cuda.is_available() else "cpu"

from transformers import BertTokenizer, BertModel
# Tokenizer and model are loaded from the same 'bert-base-multilingual-cased' checkpoint;
# the model is moved to `device` so inputs must be placed there as well.
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
model = BertModel.from_pretrained("bert-base-multilingual-cased").to(device)

def generate_embedding(text):
    """Embed ``text`` with the module-level ``tokenizer`` and ``model``.

    The input is tokenized (padded, truncated to at most 512 tokens) into
    PyTorch tensors on ``device`` and passed through the model. The forward
    pass runs under ``torch.no_grad()`` because the result is only used for
    inference; this avoids building an autograd graph for every call.

    Args:
        text: Input handed directly to ``tokenizer``.

    Returns:
        The element at index 1 of the model output, as a tensor on ``device``
        that does not require grad. For ``BertModel`` this is presumably the
        pooled output (``pooler_output``) -- confirm against the installed
        transformers version.
    """
    encoded_input = tokenizer(
        text, padding=True, truncation=True, max_length=512, return_tensors='pt'
    ).to(device)
    with torch.no_grad():
        output = model(**encoded_input)
    return output[1]

In [None]:
def save_to_pickle(data, folder, filename):
    """Pickle ``data`` to ``<folder>/<filename>.pkl``, creating ``folder`` if needed.

    Args:
        data: Any picklable object.
        folder: Directory to write into; missing parent directories are created.
        filename: File name without the ``.pkl`` extension.
    """
    # exist_ok=True avoids the check-then-create race of a separate
    # os.path.exists() test and is a no-op when the directory is already there.
    os.makedirs(folder, exist_ok=True)
    with open(f"{folder}/{filename}.pkl", "wb") as f:
        pickle.dump(data, f)

In [None]:
chunk_size = 1000

# Read the text table from the local pickle.
# NOTE(review): unpickling can execute arbitrary code -- only load files from a trusted source.
data = pd.read_pickle("./text.pkl")

# Per-batch accumulator (a dict, emptied after every save):
#   row index label -> {'title_embed': ndarray, 'summary_embed': ndarray}
embeddings = {}

# Walk the frame in slices of `chunk_size` rows; `b` is the offset of the slice's first row
# and is also used in the name of the file written for that slice.
for b in tqdm(range(0, len(data), chunk_size), desc="Processing batches", ncols=100):
    batch = data.iloc[b:b + chunk_size]

    # Embed title and summary of every row, one text at a time, and copy the
    # results off the device as NumPy arrays.
    for i, row in batch.iterrows():
        title_vec = generate_embedding(row["title"]).cpu().detach().numpy()
        summary_vec = generate_embedding(row["summary"]).cpu().detach().numpy()
        embeddings[i] = {
            'title_embed': title_vec,
            'summary_embed': summary_vec,
        }

    # Release unreferenced Python objects, then ask PyTorch to free its cached GPU memory.
    gc.collect()
    torch.cuda.empty_cache()

    # Write this slice's embeddings to ./embeds/batch_<b>.pkl and start the next slice empty.
    save_to_pickle(embeddings, './embeds', f'batch_{b}')
    embeddings.clear()

Processing batches: 100%|███████████████████████████████████████| 195/195 [1:40:45<00:00, 31.00s/it]


# Zip

In [None]:
import shutil

def zip_folder(folder_path, zip_path):
    """Pack the contents of ``folder_path`` into a zip archive.

    ``zip_path`` is the archive's base name; ``shutil.make_archive`` appends
    the ``.zip`` extension itself.
    """
    shutil.make_archive(base_name=zip_path, format='zip', root_dir=folder_path)

In [None]:
# Archive the ./embeds directory; the base name 'embeds' yields embeds.zip in the working directory.
zip_folder('./embeds', 'embeds')

In [None]:
!ls -A

.config  embeds  embeds.zip  sample_data  text.pkl


In [None]:
!du -sh embeds.zip

1.1G	embeds.zip
