### Converting JSON to JSONL

In [2]:
import json

# Read the whole JSON document. The encoding is set explicitly so the result
# does not depend on the platform's default text encoding.
with open('10000dataset.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# JSONL stores one record per line, so the top-level value has to be an array.
# Iterating a JSON object here would silently write only its keys.
if not isinstance(data, list):
    raise TypeError(
        f"Expected a top-level JSON array, got {type(data).__name__}"
    )

# Write each item as a compact JSON document on its own line.
# json.dumps without `indent` escapes newlines, so one item never spans lines.
with open('output.jsonl', 'w', encoding='utf-8') as f:
    f.writelines(json.dumps(item) + '\n' for item in data)


### Shuffling JSONL Randomly

In [2]:
import json
import random

# Shuffle the records of a JSONL file into a new file.
# NOTE(review): the conversion cell above writes 'output.jsonl', while this cell
# reads '10000dataset.jsonl' - confirm the file is renamed/copied in between.

# Fixed seed => the same order on every run; set to None for a fresh order each time.
SHUFFLE_SEED = 42

# Read the JSONL file
with open('10000dataset.jsonl', 'r', encoding='utf-8') as f:
    # Drop blank lines and make sure every record ends with exactly one newline.
    # Without this, a last line that lacks '\n' is glued to whichever record
    # follows it after shuffling, corrupting both.
    lines = [line.rstrip('\r\n') + '\n' for line in f if line.strip()]

# Shuffle with a dedicated generator so the global `random` state is untouched.
random.Random(SHUFFLE_SEED).shuffle(lines)

# Write the shuffled lines to a new JSONL file
with open('shuffled_output.jsonl', 'w', encoding='utf-8') as f:
    f.writelines(lines)


### Convert the `input` Field from a Dictionary to a String

In [3]:
import json

# File paths
input_file = 'test1.jsonl'  # Replace with your input JSONL file
output_file = 'test.jsonl'  # Replace with your desired output JSONL file

# Rewrite the JSONL file record by record so that every "input" value is a
# JSON-encoded string rather than a nested object.
with open(input_file, 'r', encoding='utf-8') as infile, \
        open(output_file, 'w', encoding='utf-8') as outfile:
    for line in infile:
        record_text = line.strip()
        if not record_text:
            # Empty lines are not records; json.loads('') would raise and
            # abort the whole conversion.
            continue

        # Parse each line as JSON
        data = json.loads(record_text)

        # Convert the 'input' dictionary to a JSON string if it's a dictionary;
        # values that are already strings (or missing) are left untouched.
        if isinstance(data.get("input"), dict):
            data["input"] = json.dumps(data["input"])

        # Write the (possibly modified) record to the output file
        outfile.write(json.dumps(data) + '\n')

print(f"Conversion complete! Saved to '{output_file}'.")


Conversion complete! Saved to 'test.jsonl'.


### Steps to Convert Data to Llama-3.1 Format

In [10]:
import json

# Source and destination of the Llama-3.1 conversion
input_file, output_file = (
    "train1.jsonl",  # raw dataset file to read
    "train.jsonl",   # receives the output in Llama-3.1 format
)

def _metric(fields, metrics, key):
    """Return ``fields[key]`` if present, otherwise the value from ``metrics``.

    Inside ``metrics`` the space-separated spelling (e.g. "market cap") is
    tried first, then the underscore spelling (e.g. "market_cap").
    Falls back to "unknown" when the key is found nowhere.
    """
    if key in fields:
        return fields[key]
    spaced_key = key.replace("_", " ")
    if spaced_key in metrics:
        return metrics[spaced_key]
    return metrics.get(key, "unknown")


def _build_user_input(fields):
    """Render the decoded ``input`` fields as the single-line user prompt."""
    # A missing, null or non-object "metrics" value is treated as "no metrics".
    metrics = fields.get("metrics")
    if not isinstance(metrics, dict):
        metrics = {}

    # Extract values with defaults for missing fields
    token = fields.get("token", "unknown")
    category = fields.get("category", "unknown")
    market_scenario = fields.get("market_scenario", "unknown")
    # "handle" is accepted as an alternative key for the Twitter handle.
    twitter_handle = fields.get("twitter_handle", fields.get("handle", "unknown"))
    tone = fields.get("tone", "unknown")

    # Every metric may live at the top level or inside "metrics".
    market_cap = _metric(fields, metrics, "market_cap")
    price = _metric(fields, metrics, "price")
    price_change = _metric(fields, metrics, "price_change")
    volume = _metric(fields, metrics, "volume")
    volume_change = _metric(fields, metrics, "volume_change")

    return (
        f"Token: {token}, "
        f"Category: {category}, "
        f"Market Scenario: {market_scenario}, "
        f"Metrics: Market Cap {market_cap}, Price {price}, "
        f"Price Change {price_change}, Volume {volume}, Volume Change {volume_change}, "
        f"Twitter Handle: {twitter_handle}, "
        f"Tone: {tone}"
    )


def convert_to_llama3_format(input_file, output_file):
    """Convert a JSONL dataset into Llama-3 style chat-template text.

    Each non-blank line of ``input_file`` must be a JSON object with:
      * "input"  - the prompt fields, either as a JSON-encoded string or as
                   an already-decoded object;
      * "output" - the assistant reply.

    For every valid record one user/assistant exchange is appended to
    ``output_file``. The output is plain text wrapped in special tokens, not
    JSON, and a reply containing newlines spans several lines of the file.

    Records that cannot be processed (invalid JSON, missing "input"/"output",
    wrong types) are reported on stdout and skipped; file-level I/O errors
    propagate to the caller.

    Args:
        input_file: Path of the JSONL file to read (UTF-8).
        output_file: Path of the text file to write (UTF-8, overwritten).
    """
    with open(input_file, "r", encoding="utf-8") as infile, \
            open(output_file, "w", encoding="utf-8") as outfile:
        for line in infile:
            record_text = line.strip()
            if not record_text:
                # Blank lines carry no record; skip them instead of logging
                # a spurious JSON error.
                continue

            try:
                # Parse the current line as JSON
                data = json.loads(record_text)

                # Deserialize the input field unless it is already decoded
                raw_input = data["input"]
                fields = json.loads(raw_input) if isinstance(raw_input, str) else raw_input
                if not isinstance(fields, dict):
                    raise TypeError(
                        f"'input' must be a JSON object, got {type(fields).__name__}"
                    )

                # Construct user input string
                user_input = _build_user_input(fields)

                # Get the assistant's output
                assistant_output = data["output"]

                # Format into Llama-3.1 style
                # NOTE(review): Meta's published template puts a blank line
                # ("\n\n") after <|end_header_id|>; a single "\n" is kept here
                # to preserve the existing output - confirm which one the
                # training/inference code expects.
                formatted_conversation = (
                    "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n"
                    f"{user_input}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n"
                    f"{assistant_output}<|eot_id|>\n"
                )

                # Write to output file
                outfile.write(formatted_conversation)

            except (ValueError, KeyError, TypeError) as e:
                # Log any issues with processing specific lines.
                # ValueError covers json.JSONDecodeError and encoding errors.
                print(f"Error processing line: {line.strip()}\nError: {e}")

# Run the conversion on the configured paths, then report where the result went.
convert_to_llama3_format(input_file=input_file, output_file=output_file)
print(f"Formatted dataset saved to {output_file}")


Formatted dataset saved to train.jsonl
