In [None]:
import tensorflow_datasets as tfds

In [None]:
# Creating a .kaggle folder to store Kaggle API credentials
!mkdir ~/.kaggle
# Installed Kaggle API client
!pip install -q kaggle

In [None]:
import getpass
import json
import os

# Take the credentials from the environment when available, otherwise prompt for
# them, so that no secret is ever stored in the notebook itself.
kaggle_username = os.environ.get('KAGGLE_USERNAME') or input('Kaggle username: ')
kaggle_key = os.environ.get('KAGGLE_KEY') or getpass.getpass('Kaggle API key: ')

# Write the credentials file with json.dump: the previous single-quoted shell
# `echo` wrote the literal words kaggle_username / kaggle_key (no interpolation),
# which is not valid JSON.
kaggle_json_path = os.path.expanduser('~/.kaggle/kaggle.json')
os.makedirs(os.path.dirname(kaggle_json_path), exist_ok=True)
with open(kaggle_json_path, 'w') as credentials_file:
    json.dump({'username': kaggle_username, 'key': kaggle_key}, credentials_file)
# Restrict the file to the current user so the API key is not readable by others.
os.chmod(kaggle_json_path, 0o600)

In [None]:
# Download the c4-200m dataset archive with the Kaggle API client.
# -d names the dataset and -p sets the target directory, so the archive lands in /content.
!kaggle datasets download -d a0155991rliwei/c4-200m -p /content

Downloading c4-200m.zip to /content
100% 19.3G/19.3G [03:35<00:00, 79.4MB/s]
100% 19.3G/19.3G [03:35<00:00, 96.1MB/s]


### Load the dataset and extract training subsets

In [None]:
# Installing sentencepiece library (used for tokenization)
!pip install sentencepiece

Collecting sentencepiece
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 4.9 MB/s 
[?25hInstalling collected packages: sentencepiece
Successfully installed sentencepiece-0.1.96


In [None]:
# for working with pre-trained models
!pip install -q transformers

In [None]:
from transformers import (
   AutoConfig,
   AutoTokenizer,
   AutoModelForSequenceClassification,
)
import pandas as pd

In [None]:
# Checkpoint name of the T5 model whose tokenizer is loaded.
model_name = "t5-base"
# Build the tokenizer that belongs to that checkpoint.
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path=model_name,
)

Downloading:   0%|          | 0.00/1.17k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/773k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.32M [00:00<?, ?B/s]

In [None]:
# Unzip the dataset to extract the contents
!unzip -q /content/c4-200m.zip

In [None]:
# Directory of the extracted, already-prepared dataset version.
c4_dataset_dir = '/content/c4200m/1.0.0'
# Builder object that reads the dataset straight from that directory.
c4_builder = tfds.core.builder_from_directory(c4_dataset_dir)


INFO:absl:Load dataset info from /content/c4200m/1.0.0


In [None]:
# Look up the metadata of the training split and report how many examples it holds.
train_split_info = c4_builder.info.splits['train']
num_train_examples = train_split_info.num_examples
print(num_train_examples)

183894319


In [None]:
# Run the builder's download-and-prepare step. The recorded log for this cell shows
# the already-extracted dataset under /content/c4200m/1.0.0 being reused.
c4_builder.download_and_prepare()

INFO:absl:Reusing dataset c4200m (/content/c4200m/1.0.0)


In [None]:
# Load the training split as a tf.data.Dataset with the input files shuffled.
# The fixed shuffle seed makes the file order, and therefore the sample taken below,
# repeatable when the notebook is re-run from a fresh kernel.
train_data = c4_builder.as_dataset(
    split='train',
    shuffle_files=True,
    read_config=tfds.ReadConfig(shuffle_seed=42),
)
# Materialise the first 550,000 examples as a pandas DataFrame.
train_df = tfds.as_dataframe(train_data.take(550000))
train_df.shape

INFO:absl:Constructing tf.data.Dataset c4200m for split train, from /content/c4200m/1.0.0


(550000, 2)

In [None]:
# Preview the first rows; in the recorded output the text columns are still raw byte strings (b'...').
train_df.head()

Unnamed: 0,input,output
0,"b""Can be empenty'' for more damage, but not ne...","b'Can be empathy for more damage, but not need..."
1,b'Miguelx completed Pollster Badge.',b'Miguelx completed the Pollster Badge.'
2,b'This classic three day itinerary is take you...,b'This classic three-day itinerary takes you t...
3,b'Kimbrelle shares an inspirational story wher...,b'Kimbrelle shares an inspirational story wher...
4,b'Variation: The utility players get a guideli...,b'Variation: The utility players get a guideli...


In [None]:
# Decode the byte strings in the 'input' and 'output' columns to UTF-8 text.
# Only bytes values are decoded, so re-running this cell on an already-decoded
# frame leaves it unchanged instead of corrupting the columns.
for text_column in ('input', 'output'):
    train_df[text_column] = train_df[text_column].map(
        lambda value: value.decode('UTF-8') if isinstance(value, bytes) else value
    )
train_df.head()

Unnamed: 0,input,output
0,The steps below describe how to remove data fo...,The steps below describe how to remove data fo...
1,When I wake up it\'s usually comes out dreamsI...,When I wake up it\'s usually dreams I\'m think...
2,One of the cardinal factors to be considered t...,One of the cardinal factors to consider when t...
3,Answers » Regions » Is in Nagorno-Karabakt reg...,Answers » Regions » Is Nagorno-Karabakh region...
4,Flaneuring in fun at maple creek SK!,Flaneuring Fun in Maple Creek SK!


In [None]:
# Write the current train_df to a CSV file, without the index column.
# NOTE(review): assumes Google Drive is mounted at /content/drive and the c4_200m
# folder already exists; no mount call is visible in this notebook, so confirm.
train_df.to_csv('/content/drive/MyDrive/c4_200m/c4_200m_550k.csv', index=False)

#### Also create a 1M-example dataset

In [None]:
# Load the training split again with shuffled input files; the fixed shuffle seed
# makes the sample repeatable when the notebook is re-run from a fresh kernel.
train_data = c4_builder.as_dataset(
    split='train',
    shuffle_files=True,
    read_config=tfds.ReadConfig(shuffle_seed=42),
)
# Materialise the first 1,000,000 examples. The previous take(550000) contradicted
# both the recorded (1000000, 2) shape and the 1M file name used when saving.
train_df = tfds.as_dataframe(train_data.take(1_000_000))
train_df.shape

INFO:absl:Constructing tf.data.Dataset c4200m for split train, from /content/c4200m/1.0.0


(1000000, 2)

In [None]:
# Decode the byte strings in the 'input' and 'output' columns to UTF-8 text.
# Only bytes values are decoded, so re-running this cell on an already-decoded
# frame leaves it unchanged instead of corrupting the columns.
for text_column in ('input', 'output'):
    train_df[text_column] = train_df[text_column].map(
        lambda value: value.decode('UTF-8') if isinstance(value, bytes) else value
    )
train_df.head()

Unnamed: 0,input,output
0,"Medell he, Ohio W. Shannon Kansas,R. C.","Medell, Ohio; W. Shannon. Kansas; R. C."
1,quarter of 1999 $ 25 million was repaid under ...,"quarter of 1999, $25 million was repaid under ..."
2,It used as service center by the Block office ...,It can be used as a service center by the Bloc...
3,"Tom offered two this time, one of old restaura...","Tom offered two this time, one of old restaura..."
4,You can see 'Spring beauties' at The Botanical...,You can see Spring Beauties at The Botanical G...


In [None]:
# Write the current train_df to a CSV file, without the index column.
# NOTE(review): assumes Google Drive is mounted at /content/drive and the c4_200m
# folder already exists; no mount call is visible in this notebook, so confirm.
train_df.to_csv('/content/drive/MyDrive/c4_200m/c4_200m_1M.csv', index=False)