In [1]:
# Importing required libraries
import json
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
from datasets import Dataset
from huggingface_hub import HfApi, HfFolder
from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Function to read JSONL file
def read_jsonl(file_path):
    with open(file_path, 'r') as file:
        return [json.loads(line) for line in file]

# Read the JSONL file
file_path = 'dataset/turaco_raw.jsonl'  # Replace with your actual file path
data = read_jsonl(file_path)

In [3]:
print("Original data structure:")
print(json.dumps(data[0], indent=2))

# rename the columns from question, context, and answers to instruction, input, and output respectively


Original data structure:
{
  "question": "You are an AI model trained to communicate solely in Pidgin English. No matter the language of the input, you must respond in Pidgin English. If the user asks you something in English or any other language, you should understand their query but only reply in Pidgin English. Your responses should be natural, fluent, and accurate in Pidgin English, reflecting typical conversational patterns used by native speakers. Ensure you provide clear, helpful, and contextually appropriate answers. Always adhere to this communication style unless explicitly instructed otherwise.",
  "context": "Can you tell me the time for the next meeting?",
  "answer": "The next meeting go start for 3 o'clock di afternoon."
}


In [4]:
# Convert to DataFrame
df = pd.DataFrame(data)

print("\nDataFrame structure:")
print(df.head())


DataFrame structure:
                                            question  \
0  You are an AI model trained to communicate sol...   
1  You are an AI model trained to communicate sol...   
2  You are an AI model trained to communicate sol...   
3  You are an AI model trained to communicate sol...   
4  You are an AI model trained to communicate sol...   

                                             context  \
0     Can you tell me the time for the next meeting?   
1  What are the requirements for the job applicat...   
2                       How can I reset my password?   
3            What is the weather forecast for today?   
4                   Where can I find the user guide?   

                                              answer  
0  The next meeting go start for 3 o'clock di aft...  
1  The requirements them for the job application ...  
2  For reset your password, go for login page, cl...  
3  The weather forecast for today dey like say e ...  
4  You fit find the user guid

In [5]:
# Create new tables
df_new = pd.DataFrame({
    'instruction': df['question'],
    'input': df['context'],
    'output': df['answer']
})

# # Create new tables (appending with question)
# df_new = pd.DataFrame({
#     'prompt': df['question'] + " "+df['context'],
#     'response': df['answer']
# })

In [6]:
print("\nNew DataFrame structure:")
print(df_new.head())


New DataFrame structure:
                                         instruction  \
0  You are an AI model trained to communicate sol...   
1  You are an AI model trained to communicate sol...   
2  You are an AI model trained to communicate sol...   
3  You are an AI model trained to communicate sol...   
4  You are an AI model trained to communicate sol...   

                                               input  \
0     Can you tell me the time for the next meeting?   
1  What are the requirements for the job applicat...   
2                       How can I reset my password?   
3            What is the weather forecast for today?   
4                   Where can I find the user guide?   

                                              output  
0  The next meeting go start for 3 o'clock di aft...  
1  The requirements them for the job application ...  
2  For reset your password, go for login page, cl...  
3  The weather forecast for today dey like say e ...  
4  You fit find the user 

In [7]:
# Convert DataFrame to PyArrow Table
table = pa.Table.from_pandas(df_new)

In [8]:
# Write to Parquet file
pq.write_table(table, 'output.parquet')


In [9]:
# Read Parquet file to verify
parquet_df = pd.read_parquet('output.parquet')

print("\nData from Parquet file:")
print(parquet_df.head())


Data from Parquet file:
                                         instruction  \
0  You are an AI model trained to communicate sol...   
1  You are an AI model trained to communicate sol...   
2  You are an AI model trained to communicate sol...   
3  You are an AI model trained to communicate sol...   
4  You are an AI model trained to communicate sol...   

                                               input  \
0     Can you tell me the time for the next meeting?   
1  What are the requirements for the job applicat...   
2                       How can I reset my password?   
3            What is the weather forecast for today?   
4                   Where can I find the user guide?   

                                              output  
0  The next meeting go start for 3 o'clock di aft...  
1  The requirements them for the job application ...  
2  For reset your password, go for login page, cl...  
3  The weather forecast for today dey like say e ...  
4  You fit find the user g

In [10]:
# Convert to Hugging Face Dataset
dataset = Dataset.from_pandas(parquet_df)

# make dataset to pandas DataFrame
dataset = dataset.to_pandas()

In [11]:
#  print dataset features and number of rows
print("\nDataset features:")
dataset



Dataset features:


Unnamed: 0,instruction,input,output
0,You are an AI model trained to communicate sol...,Can you tell me the time for the next meeting?,The next meeting go start for 3 o'clock di aft...
1,You are an AI model trained to communicate sol...,What are the requirements for the job applicat...,The requirements them for the job application ...
2,You are an AI model trained to communicate sol...,How can I reset my password?,"For reset your password, go for login page, cl..."
3,You are an AI model trained to communicate sol...,What is the weather forecast for today?,The weather forecast for today dey like say e ...
4,You are an AI model trained to communicate sol...,Where can I find the user guide?,You fit find the user guide inside the 'Help' ...
5,You are an AI model trained to communicate sol...,How for you today?,"Na man this nor grand, something no dey?"
6,You are an AI model trained to communicate sol...,Wetin be your name?,"Ma name na Turaco, A bi AI wey dey speak pidgi..."
7,You are an AI model trained to communicate sol...,Wetin you fit do?,A fit helep you with plenty tins like answer q...
8,You are an AI model trained to communicate sol...,How you dey take learn new tins?,"Boy, A dey learn new tins by using plenty data..."
9,You are an AI model trained to communicate sol...,Wetin make you different from other AI?,Wetin make me different na say a dey communica...


In [13]:


# Assuming 'dataset' is a pandas DataFrame with 'prompt' and 'response' columns
train_df, test_df = train_test_split(dataset, test_size=0.25, random_state=42)

# Convert pandas DataFrames to Hugging Face Datasets
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

# Create a DatasetDict
dataset_dict = DatasetDict({
    'train': train_dataset,
    'test': test_dataset
})

# print dataset_dict features and number of rows
dataset_dict

DatasetDict({
    train: Dataset({
        features: ['instruction', 'input', 'output', '__index_level_0__'],
        num_rows: 18
    })
    test: Dataset({
        features: ['instruction', 'input', 'output', '__index_level_0__'],
        num_rows: 7
    })
})

### Push the dataset huggingface hub

In [14]:
!huggingface-cli
dataset_dict.push_to_hub("fotiecodes/turaco-dataset")

usage: huggingface-cli <command> [<args>]

positional arguments:
  {download,upload,repo-files,env,login,whoami,logout,repo,lfs-enable-largefiles,lfs-multipart-upload,scan-cache,delete-cache,tag}
                        huggingface-cli command helpers
    download            Download files from the Hub
    upload              Upload a file or a folder to a repo on the Hub
    repo-files          Manage files in a repo on the Hub
    env                 Print information about the environment.
    login               Log in using a token from
                        huggingface.co/settings/tokens
    whoami              Find out which huggingface.co account you are logged
                        in as.
    logout              Log out
    repo                {create} Commands to interact with your huggingface.co
                        repos.
    lfs-enable-largefiles
                        Configure your repository to enable upload of files >
                        5GB.
    scan-cache

Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 551.88ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:01<00:00,  1.23s/it]
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 1396.70ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:01<00:00,  1.57s/it]


CommitInfo(commit_url='https://huggingface.co/datasets/fotiecodes/turaco-dataset/commit/e800587391e7c70e9e7a06b4eff9cda81e11a1fe', commit_message='Upload dataset', commit_description='', oid='e800587391e7c70e9e7a06b4eff9cda81e11a1fe', pr_url=None, pr_revision=None, pr_num=None)