diff --git a/FinGPT_Training_LoRA_with_ChatGLM2_6B_for_Beginners.ipynb b/FinGPT_Training_LoRA_with_ChatGLM2_6B_for_Beginners.ipynb index fce31d5..1d9ba2a 100644 --- a/FinGPT_Training_LoRA_with_ChatGLM2_6B_for_Beginners.ipynb +++ b/FinGPT_Training_LoRA_with_ChatGLM2_6B_for_Beginners.ipynb @@ -3,8 +3,8 @@ { "cell_type": "markdown", "metadata": { - "id": "view-in-github", - "colab_type": "text" + "colab_type": "text", + "id": "view-in-github" }, "source": [ "\"Open" @@ -12,35 +12,31 @@ }, { "cell_type": "markdown", + "metadata": { + "id": "X8H-Vc6w6WSU" + }, "source": [ "# Getting Started with FinGPT\n", "Welcome to this comprehensive guide aimed at beginners diving into the realm of Financial Large Language Models (FinLLMs) with FinGPT. This blog post demystifies the process of training FinGPT using Low-Rank Adaptation (LoRA) with the robust base model ChatGlm2-6b.\n", "\n" - ], - "metadata": { - "id": "X8H-Vc6w6WSU" - } + ] }, { "cell_type": "markdown", + "metadata": { + "id": "4oLjc7bbv0tO" + }, "source": [ "## Part 1: Preparing the Data\n", "Data preparation is a crucial step when it comes to training Financial Large Language Models. Here, we’ll guide you on how to get your dataset ready for FinGPT using Python.\n", "\n", "In this section, you’ve initialized your working directory and loaded a financial sentiment dataset. Let’s break down the steps:\n", "\n" - ], - "metadata": { - "id": "4oLjc7bbv0tO" - } + ] }, { "cell_type": "code", - "source": [ - "!pip install datasets transformers torch tqdm pandas huggingface_hub\n", - "!pip install sentencepiece\n", - "!pip install protobuf transformers==4.30.2 cpm_kernels torch>=2.0 gradio mdtex2html sentencepiece accelerate\n" - ], + "execution_count": 12, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -48,11 +44,10 @@ "id": "-maUV8CH7JPB", "outputId": "dc512a8f-b4e3-44cc-f489-b8f768d82f7e" }, - "execution_count": 12, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "Requirement already satisfied: datasets in /usr/local/lib/python3.10/dist-packages (2.14.5)\n", "Requirement already satisfied: transformers in /usr/local/lib/python3.10/dist-packages (4.30.2)\n", @@ -99,18 +94,23 @@ "Requirement already satisfied: sentencepiece in /usr/local/lib/python3.10/dist-packages (0.1.99)\n" ] } + ], + "source": [ + "!pip install datasets transformers torch tqdm pandas huggingface_hub\n", + "!pip install sentencepiece\n", + "!pip install protobuf transformers==4.30.2 cpm_kernels torch>=2.0 gradio mdtex2html sentencepiece accelerate\n" ] }, { "cell_type": "markdown", + "metadata": { + "id": "hJp_UOiB70o3" + }, "source": [ "### 1.1 Initialize Directories:\n", "This block checks if certain paths exist; if they do, it deletes them to avoid data conflicts, and then creates a new directory for the upcoming data.\n", "\n" - ], - "metadata": { - "id": "hJp_UOiB70o3" - } + ] }, { "cell_type": "code", @@ -247,105 +247,104 @@ }, "outputs": [ { - "output_type": "display_data", "data": { - "text/plain": [ - "Downloading readme: 0%| | 0.00/1.57k [00:00 Change runtime type -> A100 GPU**\n", "* retart runtime and run again if not working\n" - ], - "metadata": { - "id": "WMqFjp_mBVqO" - } + ] }, { "cell_type": "code", "execution_count": 6, "metadata": { - "id": "th_3Rnqy9Rkg", "colab": { "base_uri": "https://localhost:8080/", "height": 392, @@ -1305,152 +1303,153 @@ "169215329249494aaacfa43106b7f542" ] }, + "id": "th_3Rnqy9Rkg", "outputId": "1e91c290-beec-416f-f7ec-5f33a2953581" }, "outputs": [ { - "output_type": "display_data", "data": { - "text/plain": [ - "Downloading (…)model.bin.index.json: 0%| | 0.00/20.4k [00:00 to the callbacks of this Trainer, but there is already one. The currentlist of callbacks is\n", ":DefaultFlowCallback\n", @@ -1731,11 +1730,7 @@ ] }, { - "output_type": "display_data", "data": { - "text/plain": [ - "" - ], "text/html": [ "\n", "
\n", @@ -1759,9 +1754,13 @@ " \n", " \n", "

" + ], + "text/plain": [ + "" ] }, - "metadata": {} + "metadata": {}, + "output_type": "display_data" } ], "source": [ @@ -1784,20 +1783,18 @@ }, { "cell_type": "markdown", + "metadata": { + "id": "brHTWfnmCn5D" + }, "source": [ "### 4.3 Model Saving and Download:\n", "After training, save and download your model. You can also check the model's size.\n", "\n" - ], - "metadata": { - "id": "brHTWfnmCn5D" - } + ] }, { "cell_type": "code", - "source": [ - "!zip -r /content/saved_model.zip /content/{training_args.output_dir}\n" - ], + "execution_count": 19, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -1805,11 +1802,10 @@ "id": "HUYxzwS_9lMI", "outputId": "7840afdf-ab03-4664-fd1b-99aa55f81125" }, - "execution_count": 19, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ " adding: content/./finetuned_model/ (stored 0%)\n", " adding: content/./finetuned_model/checkpoint-500/ (stored 0%)\n", @@ -1827,15 +1823,14 @@ " adding: content/./finetuned_model/adapter_config.json (deflated 42%)\n" ] } + ], + "source": [ + "!zip -r /content/saved_model.zip /content/{training_args.output_dir}\n" ] }, { "cell_type": "code", - "source": [ - "# download to local\n", - "from google.colab import files\n", - "files.download('/content/saved_model.zip')" - ], + "execution_count": 20, "metadata": { "colab": { "base_uri": "https://localhost:8080/", @@ -1844,81 +1839,37 @@ "id": "s-7s2Cjw9pAM", "outputId": "f00c656b-1122-4a81-896b-c8d94a31979c" }, - "execution_count": 20, "outputs": [ { - "output_type": "display_data", "data": { + "application/javascript": "\n async function download(id, filename, size) {\n if (!google.colab.kernel.accessAllowed) {\n return;\n }\n const div = document.createElement('div');\n const label = document.createElement('label');\n label.textContent = `Downloading \"${filename}\": `;\n div.appendChild(label);\n const progress = document.createElement('progress');\n progress.max = size;\n div.appendChild(progress);\n document.body.appendChild(div);\n\n const buffers = [];\n let downloaded = 0;\n\n const channel = await google.colab.kernel.comms.open(id);\n // Send a message to notify the kernel that we're ready.\n channel.send({})\n\n for await (const message of channel.messages) {\n // Send a message to notify the kernel that we're ready.\n channel.send({})\n if (message.buffers) {\n for (const buffer of message.buffers) {\n buffers.push(buffer);\n downloaded += buffer.byteLength;\n progress.value = downloaded;\n }\n }\n }\n const blob = new Blob(buffers, {type: 'application/binary'});\n const a = document.createElement('a');\n a.href = window.URL.createObjectURL(blob);\n a.download = filename;\n div.appendChild(a);\n a.click();\n div.remove();\n }\n ", "text/plain": [ "" - ], - "application/javascript": [ - "\n", - " async function download(id, filename, size) {\n", - " if (!google.colab.kernel.accessAllowed) {\n", - " return;\n", - " }\n", - " const div = document.createElement('div');\n", - " const label = document.createElement('label');\n", - " label.textContent = `Downloading \"${filename}\": `;\n", - " div.appendChild(label);\n", - " const progress = document.createElement('progress');\n", - " progress.max = size;\n", - " div.appendChild(progress);\n", - " document.body.appendChild(div);\n", - "\n", - " const buffers = [];\n", - " let downloaded = 0;\n", - "\n", - " const channel = await google.colab.kernel.comms.open(id);\n", - " // Send a message to notify the kernel that we're ready.\n", - " channel.send({})\n", - "\n", - " for await (const message of channel.messages) {\n", - " // Send a message to notify the kernel that we're ready.\n", - " channel.send({})\n", - " if (message.buffers) {\n", - " for (const buffer of message.buffers) {\n", - " buffers.push(buffer);\n", - " downloaded += buffer.byteLength;\n", - " progress.value = downloaded;\n", - " }\n", - " }\n", - " }\n", - " const blob = new Blob(buffers, {type: 'application/binary'});\n", - " const a = document.createElement('a');\n", - " a.href = window.URL.createObjectURL(blob);\n", - " a.download = filename;\n", - " div.appendChild(a);\n", - " a.click();\n", - " div.remove();\n", - " }\n", - " " ] }, - "metadata": {} + "metadata": {}, + "output_type": "display_data" }, { - "output_type": "display_data", "data": { + "application/javascript": "download(\"download_0a618fc5-4945-4791-b2cd-8ac358536eeb\", \"saved_model.zip\", 28956881)", "text/plain": [ "" - ], - "application/javascript": [ - "download(\"download_0a618fc5-4945-4791-b2cd-8ac358536eeb\", \"saved_model.zip\", 28956881)" ] }, - "metadata": {} + "metadata": {}, + "output_type": "display_data" } + ], + "source": [ + "# download to local\n", + "from google.colab import files\n", + "files.download('/content/saved_model.zip')" ] }, { "cell_type": "code", - "source": [ - "# save to google drive\n", - "from google.colab import drive\n", - "drive.mount('/content/drive')" - ], + "execution_count": 22, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -1926,43 +1877,47 @@ "id": "rvBgMgO8RADU", "outputId": "52bd847d-5bef-4b12-b9ed-d7e9795b3fcb" }, - "execution_count": 22, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "Mounted at /content/drive\n" ] } + ], + "source": [ + "# save to google drive\n", + "from google.colab import drive\n", + "drive.mount('/content/drive')" ] }, { "cell_type": "code", - "source": [ - "# save the finetuned model to google drive\n", - "!cp -r \"/content/finetuned_model\" \"/content/drive/MyDrive\"\n" - ], + "execution_count": 23, "metadata": { "id": "UUctmjm8RIfQ" }, - "execution_count": 23, - "outputs": [] + "outputs": [], + "source": [ + "# save the finetuned model to google drive\n", + "!cp -r \"/content/finetuned_model\" \"/content/drive/MyDrive\"\n" + ] }, { "cell_type": "code", "execution_count": 21, "metadata": { - "id": "unRoLshR9RQZ", "colab": { "base_uri": "https://localhost:8080/" }, + "id": "unRoLshR9RQZ", "outputId": "9e39e6a5-cbc4-4459-9346-7df02f7c5c5d" }, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "Model size: 29.84746265411377 MB\n" ] @@ -1983,51 +1938,42 @@ }, { "cell_type": "markdown", + "metadata": { + "id": "1LCjYKuoCusU" + }, "source": [ "Now your model is trained and saved! You can download it and use it for generating financial insights or any other relevant tasks in the finance domain. The usage of TensorBoard allows you to deeply understand and visualize the training dynamics and performance of your model in real-time.\n", "\n", "Happy FinGPT Training! 🚀" - ], - "metadata": { - "id": "1LCjYKuoCusU" - } + ] }, { "cell_type": "markdown", + "metadata": { + "id": "76g_Qlp8t_Yp" + }, "source": [ "## Part 5: Inference and Benchmarks using FinGPT\n", "Now that your model is trained, let’s understand how to use it to infer and run benchmarks.\n", "* Took about 10 compute units\n", "\n" - ], - "metadata": { - "id": "76g_Qlp8t_Yp" - } + ] }, { "cell_type": "code", - "source": [ - "!pip install transformers==4.30.2 peft==0.4.0\n", - "!pip install sentencepiece\n", - "!pip install accelerate\n", - "!pip install torch\n", - "!pip install peft\n", - "!pip install datasets\n", - "!pip install bitsandbytes" - ], + "execution_count": 25, "metadata": { - "id": "ehjG2bpft_OH", "colab": { "base_uri": "https://localhost:8080/", "height": 1000 }, + "id": "ehjG2bpft_OH", "outputId": "1806add3-b383-41eb-f6c9-76f982162b1b" }, - "execution_count": 25, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "Requirement already satisfied: transformers==4.30.2 in /usr/local/lib/python3.10/dist-packages (4.30.2)\n", "Collecting peft==0.4.0\n", @@ -2069,7 +2015,6 @@ ] }, { - "output_type": "display_data", "data": { "application/vnd.colab-display-data+json": { "pip_warning": { @@ -2079,11 +2024,12 @@ } } }, - "metadata": {} + "metadata": {}, + "output_type": "display_data" }, { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "Requirement already satisfied: sentencepiece in /usr/local/lib/python3.10/dist-packages (0.1.99)\n", "Requirement already satisfied: accelerate in /usr/local/lib/python3.10/dist-packages (0.23.0)\n", @@ -2182,26 +2128,29 @@ "Requirement already satisfied: bitsandbytes in /usr/local/lib/python3.10/dist-packages (0.41.1)\n" ] } + ], + "source": [ + "!pip install transformers==4.30.2 peft==0.4.0\n", + "!pip install sentencepiece\n", + "!pip install accelerate\n", + "!pip install torch\n", + "!pip install peft\n", + "!pip install datasets\n", + "!pip install bitsandbytes" ] }, { "cell_type": "markdown", - "source": [ - "### 5.1 Load the model" - ], "metadata": { "id": "P91SXTrLS34i" - } + }, + "source": [ + "### 5.1 Load the model" + ] }, { "cell_type": "code", - "source": [ - "#clone the FinNLP repository\n", - "!git clone https://github.com/AI4Finance-Foundation/FinNLP.git\n", - "\n", - "import sys\n", - "sys.path.append('/content/FinNLP/')" - ], + "execution_count": 26, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -2209,11 +2158,10 @@ "id": "y5jyY7S_uEls", "outputId": "739e49ed-0b31-46dd-96e0-162b4ef8073d" }, - "execution_count": 26, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "Cloning into 'FinNLP'...\n", "remote: Enumerating objects: 1316, done.\u001b[K\n", @@ -2224,10 +2172,22 @@ "Resolving deltas: 100% (592/592), done.\n" ] } - ] - }, - { - "cell_type": "code", + ], + "source": [ + "#clone the FinNLP repository\n", + "!git clone https://github.com/AI4Finance-Foundation/FinNLP.git\n", + "\n", + "import sys\n", + "sys.path.append('/content/FinNLP/')" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": { + "id": "zRsmSTFZuEjt" + }, + "outputs": [], "source": [ "from transformers import AutoModel, AutoTokenizer, AutoModelForCausalLM\n", "\n", @@ -2239,31 +2199,23 @@ "from finnlp.benchmarks.fiqa import test_fiqa , add_instructions\n", "from finnlp.benchmarks.tfns import test_tfns\n", "from finnlp.benchmarks.nwgi import test_nwgi" - ], - "metadata": { - "id": "zRsmSTFZuEjt" - }, - "execution_count": 27, - "outputs": [] + ] }, { "cell_type": "code", - "source": [ - "!pip install --upgrade peft" - ], + "execution_count": 28, "metadata": { - "id": "EBqKeUYV9VjF", "colab": { "base_uri": "https://localhost:8080/", "height": 756 }, + "id": "EBqKeUYV9VjF", "outputId": "20f6ad0e-9e52-4e8b-dde8-33509ef2f381" }, - "execution_count": 28, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "Requirement already satisfied: peft in /usr/local/lib/python3.10/dist-packages (0.4.0)\n", "Collecting peft\n", @@ -2305,7 +2257,6 @@ ] }, { - "output_type": "display_data", "data": { "application/vnd.colab-display-data+json": { "pip_warning": { @@ -2315,47 +2266,42 @@ } } }, - "metadata": {} + "metadata": {}, + "output_type": "display_data" } + ], + "source": [ + "!pip install --upgrade peft" ] }, { "cell_type": "code", - "source": [ - "# load model from google drive\n", - "from google.colab import drive\n", - "drive.mount('/content/drive')" - ], + "execution_count": 29, "metadata": { - "id": "gRRw9drdA2hv", "colab": { "base_uri": "https://localhost:8080/" }, + "id": "gRRw9drdA2hv", "outputId": "d75720b1-35da-4305-cd38-309f64d38eb0" }, - "execution_count": 29, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount(\"/content/drive\", force_remount=True).\n" ] } + ], + "source": [ + "# load model from google drive\n", + "from google.colab import drive\n", + "drive.mount('/content/drive')" ] }, { "cell_type": "code", - "source": [ - "# Define the path you want to check\n", - "path_to_check = \"/content/drive/My Drive/finetuned_model\"\n", - "\n", - "# Check if the specified path exists\n", - "if os.path.exists(path_to_check):\n", - " print(\"Path exists.\")\n", - "else:\n", - " print(\"Path does not exist.\")\n" - ], + "execution_count": 30, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -2363,31 +2309,29 @@ "id": "LxCAhg9QpkyI", "outputId": "81989b85-4abf-4403-c730-ac6e8ebe3488" }, - "execution_count": 30, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "Path exists.\n" ] } + ], + "source": [ + "# Define the path you want to check\n", + "path_to_check = \"/content/drive/My Drive/finetuned_model\"\n", + "\n", + "# Check if the specified path exists\n", + "if os.path.exists(path_to_check):\n", + " print(\"Path exists.\")\n", + "else:\n", + " print(\"Path does not exist.\")\n" ] }, { "cell_type": "code", - "source": [ - "## load the chatglm2-6b base model\n", - "base_model = \"THUDM/chatglm2-6b\"\n", - "peft_model = training_args.output_dir\n", - "\n", - "tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)\n", - "model = AutoModel.from_pretrained(base_model, trust_remote_code=True, load_in_8bit=True, device_map=\"auto\")\n", - "\n", - "model = PeftModel.from_pretrained(model, peft_model)\n", - "\n", - "model = model.eval()" - ], + "execution_count": 33, "metadata": { "colab": { "base_uri": "https://localhost:8080/", @@ -2409,38 +2353,38 @@ "id": "bRljPCKC_srt", "outputId": "b7ab1104-fae1-4f31-9bfc-06cc53b3fbc9" }, - "execution_count": 33, "outputs": [ { - "output_type": "display_data", "data": { - "text/plain": [ - "Loading checkpoint shards: 0%| | 0/7 [00:00