In [None]:
pip install PyPDF2 transformers datasets torch

Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Collecting datasets
  Downloading datasets-3.0.1-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.17-py310-none-any.whl.metadata (7.2 kB)
INFO: pip is looking at multiple versions of multiprocess to determine which version is compatible with other requirements. This could take a while.
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading datasets-3.0.1-py3-none-any.whl (471 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━

In [None]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
import os
import PyPDF2

# Path to the folder containing PDFs in Google Drive
pdf_folder_path = '/content/drive/MyDrive/stock_market_pdfs/'

# Function to extract text from PDFs
def extract_text_from_pdfs(folder_path):
    extracted_texts = []
    for file_name in os.listdir(folder_path):
        print(file_name)  # Print the file name for debugging
        if file_name.endswith('.pdf'):
            file_path = os.path.join(folder_path, file_name)
            with open(file_path, 'rb') as pdf_file:
                reader = PyPDF2.PdfReader(pdf_file)
                text = ""
                for page_num in range(len(reader.pages)):
                    text += reader.pages[page_num].extract_text()
                extracted_texts.append(text)
    return extracted_texts

# Extract text from all PDFs in the folder
pdf_texts = extract_text_from_pdfs(pdf_folder_path)

# Print out the first extracted text (for sanity check)
# print(pdf_texts[0])


Module 1_Introduction to Stock Markets (1).pdf
Module 2_Technical Analysis.pdf
Module 3_Fundamental Analysis.pdf
Module 4_Futures Trading.pdf
Module 5_Options-Theory-for-Professional-Trading.pdf
Module 6_Option Strategies.pdf
Module 7_Markets & Taxation.pdf
Module 8_Currency and Commodity Futures.pdf
Module 9_Risk Management & Trading Psychology.pdf
Module 10_Trading Systems.pdf
Module11_Personal-Finance.pdf


In [None]:
import json

# Function to create "Did you know?" style training data
def create_training_data(texts):
    training_data = []
    for text in texts:
        sentences = text.split('. ')  # Split the text into sentences
        for sentence in sentences:
            if len(sentence.strip()) > 20:  # Ensure the sentence is long enough to be a fact
                training_data.append({
                    'prompt': 'Did you know?',
                    'completion': sentence.strip() + '.'
                })
    return training_data

# Create training data from extracted PDF texts
training_data = create_training_data(pdf_texts)

# Save the training data as a JSON file for fine-tuning
with open('/content/training_data.json', 'w') as f:
    json.dump(training_data, f, indent=4)

print(f"Created {len(training_data)} 'Did you know?' samples.")


Created 19056 'Did you know?' samples.


In [None]:
from datasets import load_dataset
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments, DataCollatorForLanguageModeling

# Load the dataset we prepared
dataset = load_dataset('json', data_files='/content/training_data.json')

# Load the GPT-2 tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

# Add a padding token to the GPT-2 tokenizer
tokenizer.pad_token = tokenizer.eos_token  # Use the end-of-sequence token as the padding token

model = GPT2LMHeadModel.from_pretrained('gpt2')

# Tokenize the dataset
def tokenize_function(examples):
    return tokenizer(examples['prompt'] + examples['completion'], truncation=True, padding='max_length', max_length=128)

tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=["prompt", "completion"])

# Data collator to handle batching of inputs
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=5e-5,
    weight_decay=0.01,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    logging_dir='./logs',
    save_steps=500,
    logging_steps=500,
    save_total_limit=2,
)

# Create the Trainer instance
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['train'],  # You can split it to use a validation set
    data_collator=data_collator,
    tokenizer=tokenizer,
)

# Fine-tune the model
trainer.train()

# Save the fine-tuned model
model.save_pretrained('/content/fine_tuned_gpt2')
tokenizer.save_pretrained('/content/fine_tuned_gpt2')


Map:   0%|          | 0/19056 [00:00<?, ? examples/s]



Epoch,Training Loss,Validation Loss
1,2.3038,1.171177


Epoch,Training Loss,Validation Loss
1,2.3038,1.171177
2,2.0562,1.041796
3,1.8939,0.993097


('/content/fine_tuned_gpt2/tokenizer_config.json',
 '/content/fine_tuned_gpt2/special_tokens_map.json',
 '/content/fine_tuned_gpt2/vocab.json',
 '/content/fine_tuned_gpt2/merges.txt',
 '/content/fine_tuned_gpt2/added_tokens.json')

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
model = GPT2LMHeadModel.from_pretrained('gpt2')


NameError: name 'GPT2LMHeadModel' is not defined

In [None]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments, DataCollatorForLanguageModeling
model = GPT2LMHeadModel.from_pretrained('gpt2')


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [None]:
model.save_pretrained('/content/fine_tuned_gpt2')
tokenizer.save_pretrained('/content/fine_tuned_gpt2')

NameError: name 'tokenizer' is not defined

In [None]:
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]



In [None]:
tokenizer.save_pretrained('/content/fine_tuned_gpt2')


NameError: name 'tokenizer' is not defined

In [None]:
def movedir(src, dst):
    try:
        os.rename(src, dst)
        return
    except FileExistsError:
        pass
    for root, dirs, files in os.walk(src):
        dest_root = os.path.join(dst, os.path.relpath(root, src))
        done = []
        for dir_ in dirs:
            try:
                os.rename(os.path.join(root, dir_), os.path.join(dest_root, dir_))
                done.append(dir_)
            except FileExistsError:
                pass
        for dir_ in done:
            dirs.remove(dir_)
        for file in files:
            os.replace(os.path.join(root, file), os.path.join(dest_root, file))
    for root, dirs, files in os.walk(src, topdown=False):
        os.rmdir(root)

In [None]:
from google.colab import files

# Zip the folder containing the fine-tuned model
!zip -r fine_tuned_gpt2.zip /content/fine_tuned_gpt2

# Download the zipped file
files.download('/content/fine_tuned_gpt2.zip')


  adding: content/fine_tuned_gpt2/ (stored 0%)
  adding: content/fine_tuned_gpt2/tokenizer_config.json (deflated 55%)
  adding: content/fine_tuned_gpt2/config.json (deflated 52%)
  adding: content/fine_tuned_gpt2/generation_config.json (deflated 24%)
  adding: content/fine_tuned_gpt2/vocab.json (deflated 68%)
  adding: content/fine_tuned_gpt2/merges.txt (deflated 53%)
  adding: content/fine_tuned_gpt2/special_tokens_map.json (deflated 74%)
  adding: content/fine_tuned_gpt2/model.safetensors (deflated 7%)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Create a directory in your Google Drive to store the model (optional)
!mkdir -p /content/drive/MyDrive/fine_tuned_gpt2

# Copy the model files to Google Drive
!cp -r /content/fine_tuned_gpt2 /content/drive/MyDrive/fine_tuned_gpt2


In [None]:
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import json
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel

# Load the fine-tuned model and tokenizer
model_path = "/content/drive/MyDrive/fine_tuned_gpt2/fine_tuned_gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_path)
model = GPT2LMHeadModel.from_pretrained(model_path)

# Set the model to evaluation mode
model.eval()

# Function to generate stock market facts or explanations
def generate_facts(prompt, max_length=100, num_return_sequences=5):
    inputs = tokenizer(prompt, return_tensors="pt")
    outputs = model.generate(
        inputs["input_ids"],
        max_length=max_length,
        num_return_sequences=num_return_sequences,
        no_repeat_ngram_size=2,
        do_sample=True,
        top_p=0.95,
        temperature=0.9
    )
    generated_texts = [tokenizer.decode(output, skip_special_tokens=True).strip() for output in outputs]
    return generated_texts

# List to store generated facts
all_facts = []

# Custom prompts for generating facts
prompts = [
    "Tell me a fact about the stock market.",
    "Explain what stock market indices are.",
    "What is the P/E ratio?",
    "Explain dividend stocks in simple terms.",
    "How does technical analysis help in stock trading?",
    "Give me a fact about market capitalization.",
    "Explain the concept of liquidity in the stock market.",
    "What is a blue-chip stock?",
    "How are stock prices determined?",
    "Explain the role of brokers in the stock market.",
    "What is the difference between a bear and bull market?",
    "Explain the significance of volume in stock trading.",
    "How does stock market volatility affect traders?",
    "What is a stock split?",
    "Give me a fact about the bid-ask spread."
]

# Generate facts iteratively from each prompt
for prompt in prompts:
    facts = generate_facts(prompt, max_length=100, num_return_sequences=10)  # 10 facts per prompt
    all_facts.extend(facts)

# Ensure at least 100 facts
while len(all_facts) < 100:
    # Continue generating more from the same prompts if needed
    for prompt in prompts:
        facts = generate_facts(prompt, max_length=100, num_return_sequences=5)
        all_facts.extend(facts)
        if len(all_facts) >= 100:
            break

# Slice the list to get exactly 100 facts
final_facts = all_facts[:100]

# Save the facts to a JSON file as an array of strings
json_output_path = "/content/drive/MyDrive/stock_market_facts_100.json"
with open(json_output_path, 'w') as json_file:
    json.dump(final_facts, json_file, indent=4)

print(f"Generated 100 stock market facts and saved to {json_output_path}")


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generati

Generated 100 stock market facts and saved to /content/drive/MyDrive/stock_market_facts_1001.json
