In [8]:
# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.
import kagglehub
amalrajsingh_cryptocurrency_blockchain_and_stock_market_qa_path = kagglehub.dataset_download('amalrajsingh/cryptocurrency-blockchain-and-stock-market-qa')

print('Data source import complete.')


Data source import complete.


<div class="welcome-header" style="font-family: Arial, sans-serif; background-color: #4A90E2; border-radius: 5px; padding: 20px; margin-bottom: 20px; text-align: center;">
    <h1 style="color: white; margin: 0;">Welcome to this Notebook!</h1>
    <p style="color: white; font-size: 1.2em; margin-top: 10px; text-align: center;"">Explore the exciting world of Cryptocurrencies using a fine-tuned Large Language Model§</p>
</div>


<div class="description-container" style="font-family: Arial, sans-serif; background-color: #f9f9f9; border: 1px solid #ddd; border-radius: 5px; padding: 15px; margin-bottom: 20px;">
  <div class="description" style="margin-bottom: 20px;">
    <h2 style="color: white; background-color: navy; padding: 10px; border-radius: 3px;">Overview</h2>
    <p style="line-height: 1.6; padding: 10px; background-color: white; border: 1px solid navy; border-radius: 3px;">
      This notebook demonstrates how to finetune GPT-2, a powerful Transformer model for natural language generation, on the <strong>cryptocurrency-blockchain-and-stock-market</strong> dataset using Hugging Face's <code>Trainer</code> API.
    </p>
  </div>

  <div class="description" style="margin-bottom: 20px;">
    <h2 style="color: white; background-color: teal; padding: 10px; border-radius: 3px;">Dataset</h2>
    <p style="line-height: 1.6; padding: 10px; background-color: white; border: 1px solid teal; border-radius: 3px;">
      The dataset, <strong>cryptocurrency-blockchain-and-stock-market</strong>, contains diverse information on cryptocurrency and stock markets. It's an excellent choice for creating a model that can generate summaries or answer questions related to finance and blockchain.
    </p>
  </div>

  <div class="description" style="margin-bottom: 20px;">
    <h2 style="color: white; background-color: indigo; padding: 10px; border-radius: 3px;">Model Selection - GPT2</h2>
    <p style="line-height: 1.6; padding: 10px; background-color: white; border: 1px solid indigo; border-radius: 3px;">
      <strong>GPT-2</strong> (Generative Pre-trained Transformer 2) is a powerful, unsupervised Transformer model known for its text generation capabilities. It's highly effective for language modeling, enabling it to produce coherent and contextually relevant sentences based on input data.
    </p>
  </div>

  <div class="description" style="margin-bottom: 20px;">
    <h2 style="color: white; background-color: maroon; padding: 10px; border-radius: 3px;">Training Approach</h2>
    <p style="line-height: 1.6; padding: 10px; background-color: white; border: 1px solid maroon; border-radius: 3px;">
      Using the <code>Trainer</code> API, we’ll finetune BART on this dataset. The <code>Trainer</code> provides robust tools for model training, evaluation, and hyperparameter tuning, making it ideal for our project.
    </p>
  </div>

  <div class="description" style="margin-bottom: 20px;">
    <h2 style="color: white; background-color: olive; padding: 10px; border-radius: 3px;">Objective</h2>
    <p style="line-height: 1.6; padding: 10px; background-color: white; border: 1px solid olive; border-radius: 3px;">
      The goal is to train GPT-2 to handle question-answering tasks related to finance, cryptocurrency, and blockchain topics. This will enhance the model's ability to provide concise and relevant answers to specific queries in these domains.
    </p>
  </div>

</div>


<div class="description" style="margin-bottom: 20px;">
    <h2 style="color: white; background-color: teal; padding: 10px; border-radius: 3px;">Code</h2>
  </div>

<div class="description" style="margin-bottom: 20px;">
    <h2 style="color: white; background-color: indigo; padding: 10px; border-radius: 3px;">1. Importing libraries</h2>
  </div>

In [5]:
!pip install transformers datasets




<div class="description" style="margin-bottom: 20px;">
    <h2 style="color: white; background-color: indigo; padding: 10px; border-radius: 3px;">2. Importing dataset</h2>
</div>

In [9]:
import pandas as pd
from datasets import Dataset

df=pd.read_csv('/kaggle/input/cryptocurrency-blockchain-and-stock-market-qa/train.csv')

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 804 entries, 0 to 803
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  804 non-null    int64 
 1   question    804 non-null    object
 2   answer      804 non-null    object
 3   text        804 non-null    object
dtypes: int64(1), object(3)
memory usage: 25.3+ KB


In [10]:
df.head()

Unnamed: 0.1,Unnamed: 0,question,answer,text
0,0,What significant challenges does the rapid exp...,"Balancing scalability and security, computatio...",###Human:\nAnswer this question in the context...
1,1,How does the proposed framework address the in...,By employing edge aggregating servers and Ethe...,###Human:\nAnswer this question in the context...
2,2,What are the primary benefits of using blockch...,"Data integrity, device authentication, and pro...",###Human:\nAnswer this question in the context...
3,3,Why are traditional blockchain-based solutions...,"Due to scalability, cost issues, and computati...",###Human:\nAnswer this question in the context...
4,4,How does the proposed framework ensure data pr...,Through the use of Zero-Knowledge Proofs (ZKPs...,###Human:\nAnswer this question in the context...


<div class="description" style="margin-bottom: 20px;">
    <h2 style="color: white; background-color: indigo; padding: 10px; border-radius: 3px;">3. Convert data to transformers dataset</h2>
    <p style="line-height: 1.6; padding: 10px; background-color: white; border: 1px solid indigo; border-radius: 3px;">
Rename the columns in the dataframe to align with the model’s expected input format, changing "question" to "input" and "answer" to "output.". Then, Convert the modified dataframe into a <code>Dataset</code> object, making it ready for tokenization and further preprocessing steps.    </p>
  </div>




In [11]:
df = df.rename(columns={"question": "input", "answer": "output"})
dataset = Dataset.from_pandas(df)
dataset

Dataset({
    features: ['Unnamed: 0', 'input', 'output', 'text'],
    num_rows: 804
})

<div class="description" style="margin-bottom: 20px;">
    <h2 style="color: white; background-color: indigo; padding: 10px; border-radius: 3px;">4. Load the model and Tokenizer</h2>
    <p style="line-height: 1.6; padding: 10px; background-color: white; border: 1px solid indigo; border-radius: 3px;">
Load the pre-trained GPT-2 model and its corresponding tokenizer from the <code>transformers</code> library. The model handles text generation and language understanding, while the tokenizer preprocesses raw text into token sequences that GPT-2 can interpret. Together, these components form the foundation for finetuning the model on specific tasks.    </p>
  </div>



In [12]:
import os
from getpass import getpass
from transformers import AutoTokenizer, AutoModelForCausalLM


In [14]:
tokenizer = AutoTokenizer.from_pretrained("gpt2", use_auth_token=os.getenv("HUGGINGFACE_TOKEN"))
model = AutoModelForCausalLM.from_pretrained("gpt2", use_auth_token=os.getenv("HUGGINGFACE_TOKEN"))



tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

<div class="description" style="margin-bottom: 20px;">
    <h2 style="color: white; background-color: indigo; padding: 10px; border-radius: 3px;">5. Tokenize Question and answer</h2>
    <div style="line-height: 1.6; padding: 10px; background-color: white; border: 1px solid indigo; border-radius: 3px;">
        We concatenate Question and Answer together because :
        <ul>
            <li>GPT expects a single input sequence</li>
            <li>It helps the model understand the relationship between both of them</li>
        </ul>
    </div>
  </div>



In [15]:
tokenizer.pad_token = tokenizer.eos_token

def tokenize_data(batch):
    # Concatenate each question and answer pair
    input_texts = ["Q: " + q + " A: " + a for q, a in zip(batch["input"], batch["output"])]
    return tokenizer(input_texts, padding="max_length", truncation=True, max_length=512)

tokenized_dataset = dataset.map(tokenize_data, batched=True)


Map:   0%|          | 0/804 [00:00<?, ? examples/s]

In [16]:
tokenized_dataset

Dataset({
    features: ['Unnamed: 0', 'input', 'output', 'text', 'input_ids', 'attention_mask'],
    num_rows: 804
})

<div class="description" style="margin-bottom: 20px;">
    <h2 style="color: white; background-color: indigo; padding: 10px; border-radius: 3px;">6. Fine tuning</h2>
    <div style="line-height: 1.6; padding: 10px; background-color: white; border: 1px solid indigo; border-radius: 3px;">
        <ul>
            <li> Trainer: A high-level API for training models with pre-defined functionalities.</li>
            <li>DataCollatorForLanguageModeling: A utility for preparing batches.</li>
            <li>Output dir contains the model checkpoints</li>
            <li>Epochs : number of times the model wwill iterate through the whole dataset</li>
        </ul>
    </div>
  </div>


In [18]:
from google.colab import userdata

# Access secrets stored in Colab's 'Secrets' tab
secret_value_0 = userdata.get("HUGGINGFACE_TOKEN")
secret_value_1 = userdata.get("WANDB_API_KEY") # Using a more conventional secret name

In [19]:
import os
import wandb

wandb_api_key = secret_value_1

if wandb_api_key is None:
    raise ValueError("Please set the WANDB_API_KEY environment variable.")

wandb.login(key=wandb_api_key)

[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33makshat2003singh[0m ([33makshat2003singh-bennett-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [20]:
from transformers import Trainer, TrainingArguments, DataCollatorForLanguageModeling

training_args = TrainingArguments(
    output_dir='./results',
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=4,
    save_steps=10_000,
    save_total_limit=2,
    logging_steps=500,
)

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=data_collator
)

In [21]:
trainer.train()



`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss
500,2.4516


TrainOutput(global_step=603, training_loss=2.369443422130882, metrics={'train_runtime': 345.3545, 'train_samples_per_second': 6.984, 'train_steps_per_second': 1.746, 'total_flos': 630236381184000.0, 'train_loss': 2.369443422130882, 'epoch': 3.0})

<div class="description" style="margin-bottom: 20px;">
    <h2 style="color: white; background-color: indigo; padding: 10px; border-radius: 3px;">7. Saving the finetuned model and its tokenizer</h2>
    <p style="line-height: 1.6; padding: 10px; background-color: white; border: 1px solid indigo; border-radius: 3px;">
      Tokenization is the process of breaking down text into smaller units, typically words or subwords (tokens).
    </p>
  </div>


In [22]:
model.save_pretrained('path_to_save_model')
tokenizer.save_pretrained('path_to_save_tokenizer')


('path_to_save_tokenizer/tokenizer_config.json',
 'path_to_save_tokenizer/special_tokens_map.json',
 'path_to_save_tokenizer/vocab.json',
 'path_to_save_tokenizer/merges.txt',
 'path_to_save_tokenizer/added_tokens.json',
 'path_to_save_tokenizer/tokenizer.json')

<div class="description" style="margin-bottom: 20px;">
    <h2 style="color: white; background-color: red; padding: 10px; border-radius: 3px;">Testing</h2>
  </div>


**model.generate()** is a function from the Huggingface Transformers library that returns an output related to the statement (the input). Doesn't matter if the input is a question or a statement.

In [23]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

model = GPT2LMHeadModel.from_pretrained('path_to_save_model')
tokenizer = GPT2Tokenizer.from_pretrained('path_to_save_tokenizer')


In [24]:
input_text = "What is blockchain?"
input_ids = tokenizer.encode(input_text, return_tensors='pt') #PyTorch tensor
output = model.generate(input_ids, max_length=50, num_return_sequences=1, do_sample=True, top_k=50, top_p=0.95)
generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
print(generated_text)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


What is blockchain? It combines information exchange and tamper-proof proofs in a secure, decentralized, and transparent environment for the trading and trading of cryptocurrency.

Blockchain technology revolutionizes the trading environment through decentralized trading and tamper-proof proofs


In [36]:
input_text = "What is a cryptocurrency?"
input_ids = tokenizer.encode(input_text, return_tensors='pt')
output = model.generate(input_ids, max_length=50, num_return_sequences=1, do_sample=True, top_k=50, top_p=0.95)
generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
print(generated_text)


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


What is a cryptocurrency? A cryptocurrency is a term applied to digital assets that are immutable and unchangeable in the digital economy, such as digital stock portfolios. Bitcoin is a hybrid of a traditional and digital asset class, characterized by the potential to outper


In [37]:
input_text = "Use of public blockchain networks?"
input_ids = tokenizer.encode(input_text, return_tensors='pt')
output = model.generate(input_ids, max_length=50, num_return_sequences=1, do_sample=True, top_k=50, top_p=0.95)
generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
print(generated_text)


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Use of public blockchain networks? A: Social media platforms facilitate offline collaboration and transparency, improving humanitarian access and enhancing disaster response. A focus is on data privacy and security, with interest in privacy protection and the application of distributed ledger technology. A focus is


In [38]:
input_text = "What is Industry 4.0?"
input_ids = tokenizer.encode(input_text, return_tensors='pt')
output = model.generate(input_ids, max_length=50, num_return_sequences=1, do_sample=True, top_k=50, top_p=0.95)
generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
print(generated_text)


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


What is Industry 4.0? A framework developed by Industry 4.0 stakeholders to address complex industries in IoT. Industry 4.0 stakeholders include traditional data management platforms and blockchain as platforms for decentralized, tamper-proof applications in gaming. Industry 4


In [39]:
input_text = "What is a smart contract?"
input_ids = tokenizer.encode(input_text, return_tensors='pt')
output = model.generate(input_ids, max_length=50, num_return_sequences=1, do_sample=True, top_k=50, top_p=0.95)
generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
print(generated_text)


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


What is a smart contract? A smart contract is a framework that facilitates data exchange between parties within an ecosystem. Smart contracts ensure that data is securely stored, transmitted, and processed without intermediaries. Through smart contracts, participants are authorized to securely access,


In [40]:
input_text = "Who can add a block into the blockchain?"
input_ids = tokenizer.encode(input_text, return_tensors='pt')
output = model.generate(input_ids, max_length=50, num_return_sequences=1, do_sample=True, top_k=50, top_p=0.95)
generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
print(generated_text)


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Who can add a block into the blockchain? A consensus algorithm can add the block to the blockchain, allowing for faster transactions and enhanced security and privacy. Blockchain-based solutions can address privacy and scalability challenges while improving transparency and secure data handling. Benefits


In [41]:
input_text = "What is the Bitcoin?"
input_ids = tokenizer.encode(input_text, return_tensors='pt')
output = model.generate(input_ids, max_length=50, num_return_sequences=1, do_sample=True, top_k=50, top_p=0.95)
generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
print(generated_text)


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


What is the Bitcoin? A cryptocurrency is defined as an immutable, unchangeable, verifiable record of all transactions, verifiable by consensus. It is an asset under international and state-owned financial and financial rules, and is not subject to the


In [27]:
input_text = "Top cryptocurrencies"
input_ids = tokenizer.encode(input_text, return_tensors='pt')
output = model.generate(input_ids, max_length=50, num_return_sequences=1, do_sample=True, top_k=50, top_p=0.95)
generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
print(generated_text)


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Top cryptocurrencies are in the same round of volatility as traditional stock prices. Lending companies are valued based on the volatility of their returns and holdings, thus influencing the cryptocurrency's price.

Market research indicates Lending companies' returns are in the same


In [43]:
input_text = "Ethereum?"
input_ids = tokenizer.encode(input_text, return_tensors='pt')
output = model.generate(input_ids, max_length=50, num_return_sequences=1, do_sample=True, top_k=50, top_p=0.95)
generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
print(generated_text)


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Ethereum? How is it achieved in the context of IoT automation and smart building systems? A: Through decentralized applications and smart contracts, enabling secure, transparent, and efficient ownership, recording, and monitoring of transactions.

How does the proposed system


In [26]:
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Load the fine-tuned model and tokenizer from saved directory
model_path = 'path_to_save_model'
tokenizer_path = 'path_to_save_tokenizer'

# Load tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained(tokenizer_path)
model = GPT2LMHeadModel.from_pretrained(model_path)

# Fix for pad token (GPT-2 doesn't have one by default)
tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = tokenizer.pad_token_id

# Input text
input_text = "What is blockchain?"

# Encode input with attention mask
inputs = tokenizer.encode_plus(
    input_text,
    return_tensors='pt',
    padding='max_length',
    truncation=True,
    max_length=50 # Keep max_length here for input encoding
)

# Generate output
output = model.generate(
    input_ids=inputs['input_ids'],
    attention_mask=inputs['attention_mask'],
    max_new_tokens=50,  # Use max_new_tokens to generate 50 new tokens
    num_return_sequences=1,
    do_sample=True,
    top_k=50,
    top_p=0.95,
    pad_token_id=tokenizer.pad_token_id
)

# Decode and print
generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
print("Generated Output:\n", generated_text)

Generated Output:
 What is blockchain?Abstract: Blockchain enables secure, transparent and transparent access to secret transmission channels between blockchain devices and the public, protecting the integrity of private transactions. Blockchain technology revolutionizes data transmission and data integrity, enhancing privacy and ensuring security and privacy-preserving data.


<div class="description-container" style="font-family: Arial, sans-serif; background-color: #f9f9f9; border: 1px solid #ddd; border-radius: 5px; padding: 15px; margin-bottom: 20px;">
    <h3 style="color: white; background-color: navy; padding: 10px; border-radius: 3px;">Conclusion</h3>
    <center><p style="line-height: 1.6; padding: 10px; border-radius: 3px; border: 1px solid teal; background-color: white;">
        Thank you for taking the time to read this notebook! <br/>I hope you found the insights and analyses helpful and informative.
If you have any questions or feedback, feel free to reach out. Your thoughts and suggestions are greatly appreciated!<br/>
Happy coding and best of luck with your data science journey! 🚀<br/>
If you found this notebook useful, please consider giving it an upvote! It encourages me to share more content. <br/>Thank you!</p></center>
</div>

