In [12]:
from google.cloud import storage
import os
import tarfile

# Remote location of the archive and the local path it is saved under
bucket_name = "nlp_proj"  # GCS bucket to read from (change to your own bucket)
source_blob_name = "datasets/translated_dataset_v3.tar.gz"  # object key inside the bucket
# NOTE: the suffix order here is unusual (".tar_v3.gz"); kept unchanged on purpose.
destination_file_name = "translated_dataset.tar_v3.gz"  # where the download is written locally

# Function to download a file from GCS
def download_from_gcs(bucket_name, source_blob_name, destination_file_name):
    """Copy a single object out of a GCS bucket into a local file.

    Args:
        bucket_name: Name of the bucket that holds the object.
        source_blob_name: Path (key) of the object inside the bucket.
        destination_file_name: Local path the object's contents are written to.

    Returns:
        None. A confirmation line is printed after the download call returns.
    """
    # Client() is built with no arguments here; presumably it relies on
    # credentials configured in the environment — confirm for your setup.
    remote_object = storage.Client().bucket(bucket_name).blob(source_blob_name)
    remote_object.download_to_filename(destination_file_name)
    print(f"File {source_blob_name} downloaded to {destination_file_name}.")

# Fetch the archive into the working directory
download_from_gcs(
    bucket_name=bucket_name,
    source_blob_name=source_blob_name,
    destination_file_name=destination_file_name,
)

# Unzip the downloaded tar.gz file
def extract_tar_gz(file_name, extract_path):
    """Extract a gzip-compressed tar archive into ``extract_path``.

    The archive is unpacked defensively: members that would be written
    outside ``extract_path`` (absolute paths, ``..`` components) are rejected
    instead of being extracted.

    Args:
        file_name: Path of the ``.tar.gz`` archive to unpack.
        extract_path: Directory the archive contents are written into.

    Raises:
        tarfile.TarError: If the archive is unreadable or contains a member
            that would escape ``extract_path``.
    """
    with tarfile.open(file_name, "r:gz") as tar:
        if hasattr(tarfile, "data_filter"):
            # Extraction filters are available: "data" blocks path traversal,
            # unsafe links and special files, and avoids the deprecation
            # warning raised when no filter is given.
            tar.extractall(path=extract_path, filter="data")
        else:
            # Older interpreters: best-effort containment check on member
            # names before extracting (does not inspect link targets).
            root = os.path.realpath(extract_path)
            for member in tar.getmembers():
                target = os.path.realpath(os.path.join(root, member.name))
                if os.path.commonpath([root, target]) != root:
                    raise tarfile.TarError(
                        f"Refusing to extract {member.name!r}: "
                        f"path escapes {extract_path}"
                    )
            tar.extractall(path=extract_path)
    print(f"Extracted {file_name} to {extract_path}.")

# Folder that will hold the unpacked dataset
extracted_path = "./extracted_translated_dataset_v3"

# Create the folder up front; this is a no-op when it is already present
os.makedirs(extracted_path, exist_ok=True)

# Unpack the downloaded archive into that folder
extract_tar_gz(file_name=destination_file_name, extract_path=extracted_path)


File datasets/translated_dataset_v3.tar.gz downloaded to translated_dataset.tar_v3.gz.
Extracted translated_dataset.tar_v3.gz to ./extracted_translated_dataset_v3.


In [13]:
from datasets import load_from_disk

# Read the saved dataset back from the directory the archive was unpacked into
dataset = load_from_disk(dataset_path=extracted_path)
print(dataset)

Dataset({
    features: ['summary', 'topic', 'title', 'date', 'translated_text'],
    num_rows: 3000
})


In [14]:
# Convert the loaded dataset to pandas and preview its first five rows
df = dataset.to_pandas()
print(df.head(5))

                                             summary     topic  \
0  Transport in a cattle carriage, smell of meat ...  Politics   
1  Marble zebra stripes, pompous buildings: Sinde...  Politics   
2  Oskar Lafontaine resigns as party chairman of ...  Politics   
3  The roots of poverty lie in the past. Haiti is...  Politics   
4  Black-yellow is not yet the dream coalition th...  Politics   

                                               title        date  \
0        Auschwitz: Memories of a Holocaust Survivor  00/01/2010   
1  Municipalities in Need (3): Sindelfingen - Bey...  00/01/2010   
2  Staff debate on the left - who is coming to La...  00/01/2010   
3             History of Haiti - Napoleon's disgrace  00/01/2010   
4  Black-and-yellow cabinet - Merkel's team in th...  00/01/2010   

                                     translated_text  
0  Transport in a cattle carriage, smell of meat ...  
1  Marble zebra stripes, pompous buildings: Sinde...  
2  This Monday, when the coun