<a href="https://colab.research.google.com/github/ycsama0703/FinGPT/blob/master/Copy_of_FinGPT_Training_with_LoRA_and_ChatGLM2%E2%80%936B.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# FinGPT: Training with LoRA and ChatGLM2–6B

## Part 1: Preparing the Data

### 1.1 Initialize Directories

In [None]:
import os
import shutil

In [None]:
# Local working directory of the notebook.
# exist_ok=True makes the creation idempotent and avoids the check-then-create
# race of a separate os.path.exists() test.
os.makedirs('./data', exist_ok=True)


# Location of the formatted JSONL file and of the tokenized dataset directory.
# NOTE(review): the tokenized dataset is later saved under './data/dataset_new'
# (save_path is reassigned in section 2.3), so the '../data/dataset_new' cleanup
# below does not remove that output -- confirm which location is intended.
jsonl_path = "../data/dataset_new.jsonl"
save_path = '../data/dataset_new'


# Remove artifacts of a previous run so the notebook starts from a clean state.
if os.path.exists(jsonl_path):
    os.remove(jsonl_path)

if os.path.exists(save_path):
    shutil.rmtree(save_path)

# Parent-level data directory that will receive the JSONL file.
directory = "../data"
os.makedirs(directory, exist_ok=True)

In [None]:
!ls -l ./data/dataset_new

total 5148
-rw-r--r-- 1 root root 5263248 Sep 14 05:07 data-00000-of-00001.arrow
-rw-r--r-- 1 root root     742 Sep 14 05:07 dataset_info.json
-rw-r--r-- 1 root root     250 Sep 14 05:07 state.json


### 1.2 Load and Prepare Dataset

In [None]:
!pip install datasets



In [None]:
from datasets import load_dataset
import datasets

In [None]:
# Sentiment name for each integer label of the dataset (0, 1, 2).
dic = dict(enumerate(('negative', 'positive', 'neutral')))

In [None]:
# Load the Twitter Financial News Sentiment dataset (abbreviated "tfns") by its identifier.
tfns = load_dataset('zeroshot/twitter-financial-news-sentiment')

In [None]:
# Work on the 'train' split as a pandas DataFrame for easy column manipulation.
tfns = tfns['train']
tfns = tfns.to_pandas()

# Map numerical labels to their corresponding sentiment names.
tfns['label'] = tfns['label'].apply(lambda x: dic[x])

# Add the instruction for each data entry, which is crucial for Instruction Tuning.
tfns['instruction'] = 'What is the sentiment of this tweet? Please choose an answer from {negative/neutral/positive}.'

# Rename by column name rather than by position, so a change in the upstream
# column order cannot silently swap 'input' and 'output'. errors='raise' makes
# a missing source column fail loudly instead of being ignored.
tfns = tfns.rename(columns={'text': 'input', 'label': 'output'}, errors='raise')

# Convert the pandas DataFrame back to a Hugging Face Dataset object.
tfns = datasets.Dataset.from_pandas(tfns)
tfns

Dataset({
    features: ['input', 'output', 'instruction'],
    num_rows: 9543
})

### 1.3 Concatenate and Shuffle Dataset

In [None]:
# Concatenate tfns with itself, so every row appears twice in the result.
# Both names refer to the same concatenated dataset object.
train_dataset = tmp_dataset = datasets.concatenate_datasets([tfns, tfns])
print(train_dataset.num_rows)

19086


In [None]:
# Shuffle the rows using a fixed seed (42), then display (rows, columns).
all_dataset = train_dataset.shuffle(seed = 42)
all_dataset.shape

(19086, 3)

The training data is all set

## Part 2: Dataset Formatting and Tokenization

### 2.1 Dataset Formatting

You must structure your data in a specific format that aligns with the training process.

In [None]:
import json
from tqdm.notebook import tqdm
# Used to display a progress bar in Jupyter Notebook to help visualize the progress of data processing

In [None]:
def format_examle(example: dict) -> dict:
  """Build the prompt/answer pair for one instruction-tuning record.

  Args:
    example: Dict with the keys 'instruction' and 'output'; an optional
      'input' entry is included in the prompt only when it is truthy.

  Returns:
    Dict with 'context' (the prompt text, ending in 'Answer: ') and
    'target' (the value of example['output']).
  """
  pieces = [f"Instruction:{example['instruction']}\n"]
  if example.get('input'):
    # Only add the Input line when there is something to show.
    pieces.append(f"Input:{example['input']}\n")
  pieces.append('Answer: ')
  return {"context": "".join(pieces), "target": example['output']}



# Turn every row of all_dataset (via a pandas DataFrame) into a plain dict
# holding only the three fields needed for formatting.
data_list = [
  {'instruction': row.instruction, 'input': row.input, 'output': row.output}
  for row in all_dataset.to_pandas().itertuples()
]


This is what the elements in data_list look like before formatting

---



In [None]:
data_list[0]

{'instruction': 'What is the sentiment of this tweet? Please choose an answer from {negative/neutral/positive}.',
 'input': '$DRIP $LABU $GASX - SOXL, LABU, JO and GUSH among weekly ETF movers https://t.co/FntrWNY9sn',
 'output': 'neutral'}

In [None]:
# Write one formatted example per line (JSON Lines), with a progress bar.
with open("../data/dataset_new.jsonl",'w') as out_file:
  progress = tqdm(data_list,desc = 'formatting..')
  out_file.writelines(
      json.dumps(format_examle(record)) + '\n' for record in progress
  )

formatting..:   0%|          | 0/19086 [00:00<?, ?it/s]

In [None]:
json_data_list = []  # Parsed records read back from the JSONL file

# Read the JSONL file back and parse each line into a dict.
with open("../data/dataset_new.jsonl", 'r') as jsonl_file:
    json_data_list.extend(json.loads(raw.strip()) for raw in jsonl_file)

This is what it looks like after formatting

In [None]:
json_data_list[0]

{'context': 'Instruction:What is the sentiment of this tweet? Please choose an answer from {negative/neutral/positive}.\nInput:$DRIP $LABU $GASX - SOXL, LABU, JO and GUSH among weekly ETF movers https://t.co/FntrWNY9sn\nAnswer: ',
 'target': 'neutral'}

In [None]:
json_data_list[0]['context']

'Instruction:What is the sentiment of this tweet? Please choose an answer from {negative/neutral/positive}.\nInput:$DRIP $LABU $GASX - SOXL, LABU, JO and GUSH among weekly ETF movers https://t.co/FntrWNY9sn\nAnswer: '

In [None]:
json_data_list[0]['target']

'neutral'

### 2.2 Tokenization

Tokenization is the process of converting input text into tokens that can be fed into the model.

In [None]:
## need to set the packages to run this code block
!pip install protobuf transformers==4.30.2 cpm_kernels torch>=2.0 gradio mdtex2html sentencepiece accelerate

In [None]:
from transformers import AutoTokenizer, AutoConfig

In [None]:
# Identifier of the model whose tokenizer and config are loaded in read_jsonl.
model_name = 'THUDM/chatglm2-6b'
# Formatted JSONL file produced in section 2.1.
jsonl_path = '../data/dataset_new.jsonl'
# The path where the processed dataset will be saved after tokenization.
# NOTE: this value is reassigned to './data/dataset_new' in section 2.3 before it is used.
save_path = '../data/dataset_new'
# Maximum sequence length (in token ids) for the inputs.
max_seq_length = 512
# When True, examples whose token ids exceed max_seq_length are skipped;
# when False, they are truncated to max_seq_length instead.
skip_overlength = True

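This preprocess function tokenizes the prompt and the target (each truncated to the maximum sequence length) and combines them, followed by the end-of-sequence token, into the input IDs. It does not pad; trimming of the combined sequence happens later in read_jsonl.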

In [None]:
def preprocess(tokenizer, config, example, max_seq_length):
  """Tokenize one formatted example into a single list of token ids.

  Args:
    tokenizer: Object exposing ``encode(text, max_length=..., truncation=...,
      add_special_tokens=...)`` and returning a list of token ids.
    config: Object exposing ``eos_token_id``.
    example: Dict with the keys 'context' (prompt text) and 'target' (answer text).
    max_seq_length: Truncation limit applied to the prompt and to the target
      separately; the combined list is neither truncated nor padded here.

  Returns:
    Dict with 'input_ids' (prompt ids + target ids + EOS id) and
    'seq_len' (the number of prompt ids).
  """
  prompt = example['context']
  target = example['target']
  # Token ids are the numerical identifiers the model consumes instead of raw text.
  prompt_ids = tokenizer.encode(
      prompt,
      max_length = max_seq_length,
      truncation = True
      )
  # The target must be encoded WITHOUT special tokens; otherwise the
  # tokenizer's prefix tokens end up in the middle of the sequence, between
  # the prompt and the answer. (The keyword was previously misspelled and
  # therefore had no effect.)
  target_ids = tokenizer.encode(
      target,
      max_length = max_seq_length,
      truncation = True,
      add_special_tokens = False
      )
  # The EOS id marks the end of the sequence.
  input_ids = prompt_ids + target_ids + [config.eos_token_id]
  return {'input_ids':input_ids,'seq_len':len(prompt_ids)}

input_ids is a complete list of token IDs that combines the input sentence (prompt), the target sentence (target), and the end-of-sequence token (eos_token_id).
This list is fed into the model for training or inference. The model uses these IDs to understand and process the input and generate the corresponding output.

The read_jsonl function reads each line from the JSONL file, preprocesses it using the preprocess function,
and then yields each preprocessed example.

In [None]:
def read_jsonl(path, max_seq_length, skip_overlength=False):
    """Yield one tokenized feature dict per line of a JSONL file.

    The tokenizer and config are loaded from the module-level ``model_name``
    each time the generator is started.

    Args:
        path: JSONL file in which every line is a JSON object accepted by
            ``preprocess``.
        max_seq_length: Maximum number of token ids kept per example.
        skip_overlength: When True, examples whose 'input_ids' are longer than
            ``max_seq_length`` are dropped; otherwise they are truncated.

    Yields:
        The dict returned by ``preprocess`` with 'input_ids' cut to at most
        ``max_seq_length`` entries.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    # NOTE(review): device_map is normally a model-loading argument; its effect
    # on a config load is unclear -- confirm whether it can be dropped.
    config = AutoConfig.from_pretrained(
        model_name, trust_remote_code=True, device_map='auto')
    with open(path, "r") as jsonl_file:
        # The raw lines are read up front (this also gives tqdm a total);
        # tokenization itself happens lazily, one example per iteration.
        raw_lines = jsonl_file.readlines()
        for raw_line in tqdm(raw_lines):
            feature = preprocess(
                tokenizer, config, json.loads(raw_line), max_seq_length)
            token_ids = feature["input_ids"]
            if len(token_ids) > max_seq_length and skip_overlength:
                continue
            # Ensure the ids never exceed max_seq_length.
            feature["input_ids"] = token_ids[:max_seq_length]
            yield feature

### 2.3 Save the Dataset

In [None]:
# Directory the tokenized dataset is saved to; this replaces the
# '../data/dataset_new' value assigned to save_path in earlier cells.
save_path = './data/dataset_new'

In [None]:
def _tokenized_features():
    """Yield the tokenized features read from the formatted JSONL file."""
    yield from read_jsonl(jsonl_path, max_seq_length, skip_overlength)


# Build a Dataset from the generator and write it to save_path.
dataset = datasets.Dataset.from_generator(_tokenized_features)
dataset.save_to_disk(save_path)

Saving the dataset (0/1 shards):   0%|          | 0/19086 [00:00<?, ? examples/s]

In [None]:
from datasets import load_from_disk

# Load the tokenized dataset back from disk.
loaded_dataset = load_from_disk('./data/dataset_new')

# Check the structure of the dataset (feature names and number of rows).
print(loaded_dataset)

# Print the token ids of the first sample. Indexing the row first fetches a
# single example instead of materializing the whole 'input_ids' column.
print(loaded_dataset[0]['input_ids'])


Dataset({
    features: ['input_ids', 'seq_len'],
    num_rows: 19086
})
[64790, 64792, 29101, 30954, 5011, 323, 267, 17523, 290, 434, 16933, 30987, 3485, 3331, 284, 3238, 428, 729, 26058, 30967, 679, 24961, 30967, 25545, 19784, 13, 10426, 30954, 30975, 9084, 3839, 506, 30957, 4573, 30976, 506, 30964, 2080, 31017, 560, 10391, 31017, 30957, 30932, 380, 4573, 30976, 30932, 20242, 293, 402, 3290, 30956, 2087, 9467, 8898, 30960, 5869, 663, 2886, 1214, 30912, 30930, 2074, 30967, 30960, 30916, 1884, 30959, 13121, 30969, 14862, 13, 4244, 1902, 266, 30954, 30910, 64790, 64792, 11043, 2]
