In [1]:
%pip install datasets

Collecting datasets
  Downloading datasets-3.0.2-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-win_amd64.whl.metadata (13 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.0.2-py3-none-any.whl (472 kB)
Downloading dill-0.3.8-py3-none-any.whl (116 kB)
Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
Downloading xxhash-3.5.0-cp310-cp310-win_amd64.whl (30 kB)
Installing collected packages: xxhash, dill, multiprocess, datasets
Successfully installed datasets-3.0.2 dill-0.3.8 multiprocess-0.70.16 xxhash-3.5.0
Note: you may need to restart the kernel to use updated packages.


In [7]:
from datasets import Dataset
import json
 
def load_translation_dataset(file_path):
    """
    Load and prepare the translation dataset
    """
    # Read the JSON file
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    # Prepare the examples in Llama instruction format
    formatted_data = []
    for example in data['training_data']:
        # Create instruction that includes preservation information
        instruction = f"""Translate from {example['source_lang']} to {example['target_lang']}.
Context: {example['context']}
Preserve these words unchanged and it has to be transliterated to target langauge keeping Abbreviations as is: {', '.join(example['domain_terms'])}
 
Text: {example['source_text']}"""
        formatted_data.append({
            'instruction': instruction,
            'input': '',  # Empty as instruction contains the source text
            'output': example['target_text']
        })
    # Create Hugging Face dataset
    dataset = Dataset.from_list(formatted_data)
    dataset.save_to_disk(r'C:\Interview_Preparation\LLM_Finetuning\final_instructional_format') 
    return dataset

In [8]:
 # Example usage
if __name__ == "__main__":
    # Load the dataset
    dataset = load_translation_dataset(r'C:\Interview_Preparation\LLM_Finetuning\translation_dataset_v1.json')
    # Print a sample
    print("\nSample instruction format:")
    print(dataset[0]['instruction'])
    print("\nExpected output:")
    print(dataset[1]['output'])

Saving the dataset (0/1 shards):   0%|          | 0/50 [00:00<?, ? examples/s]


Sample instruction format:
Translate from en to ja.
Context: dashboard_header_notification
Preserve these words unchanged and it has to be transliterated to target langauge keeping Abbreviations as is: EY, Mobility, Pathway, GTR, APAC
 
Text: EY Mobility Pathway Dashboard: GTR approval pending for APAC assignment

Expected output:
Bienvenido a EY Mobilidad PathFainder. Su acceso al portal GMS para la revisión de cumplimiento FSO está listo.


In [10]:
import json
from datasets import Dataset
from typing import Dict
from pathlib import Path

# Define the Alpaca prompt format globally
ALPACA_PROMPT = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

def load_json_dataset(file_path: str) -> Dict:
    """
    Load JSON dataset from file path.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        return json.load(f)

def convert_to_alpaca_format(examples):
    """
    Convert translation dataset to Alpaca format with clear translation instructions.
    """
    alpaca_format = {
        "instruction": [],
        "input": [],
        "output": []
    }
    
    for item in examples["training_data"]:
        # Create comprehensive instruction
        instruction_parts = [
            f"Translate the following text from {item['source_lang']} to {item['target_lang']}",
            "Translation guidelines:",
            "1. Maintain the original meaning and context",
            "2. Preserve sentence structure where appropriate for the target language",
            f"3. Domain terms ({', '.join(item.get('domain_terms', []))}):",
            "   - Transliterate non-abbreviation terms using target language characters while preserving their pronunciation",
            "   - Keep abbreviations as is in the target text"
        ]
        
        instruction = "\n".join(instruction_parts)
        
        # Prepare input with context
        input_parts = [item["source_text"]]
        
        context_info = []
        if item.get("context"):
            context_info.append(f"Context: {item['context']}")
        if item.get("content_type"):
            context_info.append(f"Content type: {item['content_type']}")
        if item.get("domain_terms"):
            context_info.append(f"Domain terms: {', '.join(item['domain_terms'])}")
            
        if context_info:
            input_parts.append("Additional information:")
            input_parts.extend(context_info)
        
        alpaca_format["instruction"].append(instruction)
        alpaca_format["input"].append("\n".join(input_parts))
        alpaca_format["output"].append(item["target_text"])
    
    return alpaca_format

def process_dataset(json_path: str, output_path: str = None, eos_token: str = "</s>"):
    """
    Process JSON dataset and convert it to Alpaca format.
    
    Args:
        json_path: Path to input JSON file
        output_path: Optional path to save processed dataset
        eos_token: End of sequence token (default: "</s>")
    
    Returns:
        datasets.Dataset: Processed dataset in Alpaca format
    """
    # Load JSON data
    print(f"Loading dataset from {json_path}")
    data = load_json_dataset(json_path)
    
    # Convert to Alpaca format
    print("Converting to Alpaca format")
    alpaca_formatted = convert_to_alpaca_format(data)
    
    # Create Dataset object
    dataset = Dataset.from_dict(alpaca_formatted)
    
    # Apply final formatting
    print("Applying final formatting")
    final_dataset = dataset.map(
        lambda examples: {
            "text": [
                ALPACA_PROMPT.format(i, inp, o) + eos_token
                for i, inp, o in zip(examples["instruction"], examples["input"], examples["output"])
            ]
        },
        batched=True
    )
    
    # Save processed dataset if output path is provided
    if output_path:
        print(f"Saving processed dataset to {output_path}")
        final_dataset.save_to_disk(output_path)
    
    return final_dataset

In [12]:
if __name__ == "__main__":
    try:
        # Example usage
        json_path = r"C:\Interview_Preparation\LLM_Finetuning\translation_dataset_v1.json"
        output_path = r"C:\Interview_Preparation\LLM_Finetuning\final_instructional_format"  # Optional
        
        # Process with output saving
        dataset = process_dataset(
            json_path=json_path,
            output_path=output_path,
            eos_token="</s>"  # You can change this to match your tokenizer's EOS token
        )
        
        # Print sample to verify
        print("\nSample from processed dataset:")
        print(dataset[0])
        
        print(f"\nTotal examples processed: {len(dataset)}")
        
    except FileNotFoundError:
        print(f"Error: Could not find file at {json_path}")
    except json.JSONDecodeError:
        print(f"Error: Invalid JSON format in {json_path}")
    except Exception as e:
        print(f"Error occurred: {str(e)}")

Loading dataset from C:\Interview_Preparation\LLM_Finetuning\translation_dataset_v1.json
Converting to Alpaca format
Applying final formatting


Map:   0%|          | 0/50 [00:00<?, ? examples/s]

Saving processed dataset to C:\Interview_Preparation\LLM_Finetuning\final_instructional_format


Saving the dataset (0/1 shards):   0%|          | 0/50 [00:00<?, ? examples/s]


Sample from processed dataset:
{'instruction': 'Translate the following text from en to ja\nTranslation guidelines:\n1. Maintain the original meaning and context\n2. Preserve sentence structure where appropriate for the target language\n3. Domain terms (EY, Mobility, Pathway, GTR, APAC):\n   - Transliterate non-abbreviation terms using target language characters while preserving their pronunciation\n   - Keep abbreviations as is in the target text', 'input': 'EY Mobility Pathway Dashboard: GTR approval pending for APAC assignment\nAdditional information:\nContext: dashboard_header_notification\nContent type: header\nDomain terms: EY, Mobility, Pathway, GTR, APAC', 'output': 'イーワイ・モビリティ・パスウェイ ダッシュボード：APACアサインメントのGTR承認待ち', 'text': 'Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n### Instruction:\nTranslate the following text from en to ja\nTranslation guidelines:\n1. Maintain