In [None]:
#@title ## Setting up the Environment
#@markdown This section installs the libraries used in the Big Data demonstration.

# Install required libraries
!pip install datasets dask distributed pandas
import time
import dask.dataframe as dd
from datasets import load_dataset
import pandas as pd

# Reaching this line means the install and the imports above finished without raising.
print("Environment setup completed!")

#@markdown ## 1. Loading a Small Dataset
#@markdown We'll load a small subset of the `ag_news` dataset from the Hugging Face Hub.

# Load the ag_news dataset (using a small portion for demonstration purposes).
# The split expression 'train[:10000]' restricts the result to 10,000 training rows.
# NOTE: on a first run the measured time also includes downloading the dataset files.
# perf_counter() is a monotonic, high-resolution clock meant for measuring elapsed
# time; time.time() follows the wall clock and can jump if the system clock changes.
t0 = time.perf_counter()
dataset = load_dataset("ag_news", split='train[:10000]')
t1 = time.perf_counter()
print(f'Dataset loaded in: {t1-t0:.2f} seconds')
print(f'Dataset contains {len(dataset)} entries.')

#@markdown ## 2. Convert to pandas DataFrame

#@markdown This step converts the Hugging Face dataset to a pandas DataFrame.
# Convert Hugging Face dataset to pandas DataFrame.
# Dataset.to_pandas() converts the dataset's backing table in one call, whereas
# pd.DataFrame(dataset) has to iterate the dataset row by row to build the frame.
# The resulting columns are the same as before.
# perf_counter() is used because it is a monotonic clock intended for timing.
t0 = time.perf_counter()
df = dataset.to_pandas()
t1 = time.perf_counter()
print(f'Pandas DataFrame Created in {t1-t0:.2f} seconds')

#@markdown ## 3.  Create a Dask DataFrame
#@markdown Here we create a Dask DataFrame so that computations can run in parallel across partitions.
# Convert Pandas DataFrame to Dask DataFrame for parallel processing.
# The frame is split into `npartitions` pieces that Dask can process independently.
# perf_counter() is a monotonic clock intended for measuring elapsed time, unlike
# the wall-clock time.time().
t0 = time.perf_counter()
ddf = dd.from_pandas(df, npartitions=4)  # npartitions can be adjusted based on resources
t1 = time.perf_counter()
print(f'Dask DataFrame Created in {t1-t0:.2f} seconds')

#@markdown ## 4. Basic Data Analysis with Dask

#@markdown Perform a basic data analysis using Dask.
# Example Analysis (counting the number of words in each sentence using Dask)

# Function to count words in a string
def count_words(text):
    """Return the number of whitespace-separated tokens in ``text``.

    Any value that is not a ``str`` (for example a missing value) counts as
    zero words instead of raising.
    """
    if not isinstance(text, str):
        return 0
    words = text.split()
    return len(words)

# Dask is lazy: this statement only records the per-row apply in the task graph,
# so the timing printed below measures graph construction, not the word counting
# itself (that happens on the .compute() calls further down).
# `meta` declares the name and dtype of the resulting Series; it is named after
# the column being produced rather than the input column.
t0 = time.perf_counter()
ddf['word_count'] = ddf['text'].apply(count_words, meta=('word_count', 'int64'))
t1 = time.perf_counter()
print(f"Word Count Column Created in {t1-t0:.2f} seconds")

# Compute the mean word count (.compute() triggers the actual work)
t0 = time.perf_counter()
mean_word_count = ddf['word_count'].mean().compute()
t1 = time.perf_counter()

print(f"Mean Word Count Computed in {t1-t0:.2f} seconds, Value: {mean_word_count:.2f}")

# Calculate the number of sentences with more than 20 words.
# Summing the boolean mask counts the matching rows directly, without first
# materialising a filtered frame; count_words always returns an int, so there
# are no missing values that could make this differ from filter-then-count.
t0 = time.perf_counter()
large_sentences = (ddf['word_count'] > 20).sum().compute()
t1 = time.perf_counter()
print(f"Number of sentences with more than 20 words calculated in {t1-t0:.2f} seconds, Value {large_sentences}")

# Print the first five rows to verify that word_count column has been created.
print("Sample of the data with the word_count column")
print(ddf.head())

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting distributed
  Downloading distributed-2024.12.1-py3-none-any.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Collecting dask
  Downloading dask-2024.12.1-py3-none-any.whl.metadata (3.7 kB)
Collecting sortedcontainers>=2.0.5 (from distributed)
  Downloading sortedcontainers-2.4.0-py2.py3-none-any.whl.metadata (10 kB)
Collecting tblib>=1.6.0 (from distributed)
  Downloading tblib-3.0.0-py3-none-any.whl.metadata (25

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



Environment setup completed!


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/8.07k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/18.6M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/1.23M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/120000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/7600 [00:00<?, ? examples/s]

Dataset loaded in: 5.90 seconds
Dataset contains 10000 entries.
Pandas DataFrame Created in 0.46 seconds
Dask DataFrame Created in 0.02 seconds
Word Count Column Created in 0.01 seconds
Mean Word Count Computed in 0.49 seconds, Value: 38.66
Number of sentences with more than 20 words calculated in 0.06 seconds, Value 9626
Sample of the data with the word_count column
                                                text  label  word_count
0  Wall St. Bears Claw Back Into the Black (Reute...      2          21
1  Carlyle Looks Toward Commercial Aerospace (Reu...      2          36
2  Oil and Economy Cloud Stocks' Outlook (Reuters...      2          36
3  Iraq Halts Oil Exports from Main Southern Pipe...      2          36
4  Oil prices soar to all-time record, posing new...      2          37
