In [1]:
# import sys
# print(sys.executable)
# !{sys.executable} -m pip install transformers

In [2]:
# %pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121

import torch
print(torch.cuda.is_available())  # Should print True
print(torch.cuda.device_count())  # Number of GPUs
print(torch.cuda.get_device_name(0))  # GPU Name


True
1
NVIDIA GeForce GTX 1660 Ti


In [3]:
# prompt: load the gpt-2 medium model so i can use it to train my data and build a qa chat bot

# !pip3 install transformers

from transformers import pipeline, GPT2Tokenizer, GPT2LMHeadModel

# Load pre-trained model and tokenizer
model_name = "gpt2-medium"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)
tokenizer.model_max_length = 512  # Reduce if necessary



In [4]:
# %pip install --upgrade jupyter
# %pip install ipywidgets

In [5]:
# Example usage (replace with your Q&A data)
question = "What is the capital of France?"
inputs = tokenizer(question, return_tensors="pt")
outputs = model.generate(**inputs)
answer = tokenizer.decode(outputs[0], skip_special_tokens=True)

print(f"Question: {question}")
print(f"Answer: {answer}")


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Question: What is the capital of France?
Answer: What is the capital of France?

The capital of France is Paris.

What is the capital of France?




In [6]:
# prompt: I have loaded my gpt2 medium model.. i have a json file that has 'question', and 'answer' stuff... and i have .txt files that have content i scraped from wikipedia. THey are in the 'data' folder.. so i need them to be loaded and preprocessed

import json
import os
from transformers import pipeline, GPT2Tokenizer, GPT2LMHeadModel


# Function to preprocess text (example)
def preprocess_text(text):
    # Add your text preprocessing steps here (e.g., lowercasing, removing punctuation)
    text = text.lower()  # Example: Convert to lowercase
    return text


In [7]:
# Load and preprocess data from JSON file
qa_data = []
try:
  with open('data/chess_com_qa.json', 'r') as f:  # Replace 'your_json_file.json' with your JSON filename
    qa_data = json.load(f)
except FileNotFoundError:
    print("Error: 'data/chess_com_qa.json' not found. Please make sure the file exists in the current directory or provide the correct path.")


In [8]:
# Load and preprocess data from text files
data_dir = 'data'  # Assuming text files are in the 'data' folder
wikipedia_content = []
for filename in os.listdir(data_dir):
    if filename.endswith('.txt'):
        filepath = os.path.join(data_dir, filename)
        try:
            with open(filepath, 'r', encoding='utf-8') as file:  # Specify encoding if needed
                content = file.read()
                preprocessed_content = preprocess_text(content)
                wikipedia_content.append(preprocessed_content)
        except Exception as e:
            print(f"Error reading or processing file '{filename}': {e}")


In [9]:
# Example: Accessing a question and answer from the JSON data
if qa_data:  # Check if qa_data is not empty
    question = qa_data[0].get('question', 'No question found') #safe access if key does not exist
    answer = qa_data[0].get('answer', 'No answer found')    #safe access if key does not exist
    print("Example from JSON:")
    print(f"Question: {question}")
    print(f"Answer: {answer}")

# Example: Accessing the preprocessed Wikipedia content
if wikipedia_content:
  print("\nExample from Wikipedia:")
  print(f"First Wikipedia text file content:\n{wikipedia_content[0][:200]}...") # Print the first 200 characters

Example from JSON:
Question: What is a more efficient thinking process?
Answer: I'm a bit of a perfectionist and strive to make the best move every game. As a result I think I expend too much brain power throughout the game and get burned out or overwhelmed by the possibilites, for example in the opening I would try consider all my opponents responses of which there are many in the opening, sometimes I feel habits such as these are unnecessary and are just a waste of energy. What is the most correct way to expend your mental energy in a chess game?

Example from Wikipedia:
First Wikipedia text file content:
from wikipedia, the free encyclopedia
this article is about the western board game. for other chess games or other uses, see chess (disambiguation).
chess
a selection of white and black chess pieces o...


## Training the Model

In [10]:
# prompt: now use the data that we have loaded to train the model so that it will be domain specific to chess..

from transformers import Trainer, TrainingArguments, GPT2Tokenizer, GPT2LMHeadModel
import json
import os
import torch   # Import torch


# Prepare data for training (example using a simple list of text)
train_data = []
if qa_data:
    for item in qa_data:
        question = item.get('question', '')
        answer = item.get('answer', '')
        train_data.append(question + " " + answer)

if wikipedia_content:
    train_data.extend(wikipedia_content)




In [13]:
# Tokenize the data
# Add the padding token to the tokenizer
tokenizer.pad_token = tokenizer.eos_token  # Set pad_token to eos_token
train_encodings = tokenizer(train_data, truncation=True, padding=True, return_tensors='pt')

# Create a custom dataset
class ChessDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        return {key: val[idx].clone().detach() for key, val in self.encodings.items()}

    def __len__(self):
        return len(self.encodings.input_ids)

train_dataset = ChessDataset(train_encodings)


In [14]:
import sys
!{sys.executable} -m pip install --upgrade accelerate

import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Define training arguments
training_args = TrainingArguments(
    output_dir='./chess_gpt2_results',          # output directory
    num_train_epochs=3,              # total number of training epochs
    per_device_train_batch_size=1,  # batch size per device during training
    gradient_accumulation_steps=4,  # Accumulate gradients to simulate a larger batch
    fp16=True,
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./chess_gpt2_logs',            # directory for storing logs
    logging_steps=10,
)

# Create Trainer instance
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
)

# Train the model
trainer.train()

# Save the fine-tuned model
model.save_pretrained("./chess_gpt2_fine_tuned")
tokenizer.save_pretrained("./chess_gpt2_fine_tuned")

Defaulting to user installation because normal site-packages is not writeable


ValueError: The model did not return a loss from the inputs, only the following keys: logits,past_key_values. For reference, the inputs it received are input_ids,attention_mask.

In [None]:
# prompt: evaluate the model and do an extensive eda of the data we loaded using matplotlib

import matplotlib.pyplot as plt
import pandas as pd
import json
import os

# Load the JSON data (assuming qa_data is already loaded as in the previous code)
try:
    with open('/content/data/chess_com_qa.json', 'r') as f:
        qa_data = json.load(f)
except FileNotFoundError:
    print("Error: '/content/data/chess_com_qa.json' not found.")
    qa_data = [] # Initialize as empty list to avoid errors

# EDA for JSON data
if qa_data:
    df_qa = pd.DataFrame(qa_data)

    # Example 1: Question Length Distribution
    df_qa['question_length'] = df_qa['question'].apply(len)
    plt.figure(figsize=(10, 6))
    plt.hist(df_qa['question_length'], bins=20)
    plt.title('Distribution of Question Lengths')
    plt.xlabel('Question Length')
    plt.ylabel('Frequency')
    plt.show()


    # Example 2: Answer Length Distribution
    df_qa['answer_length'] = df_qa['answer'].apply(len)
    plt.figure(figsize=(10, 6))
    plt.hist(df_qa['answer_length'], bins=20)
    plt.title('Distribution of Answer Lengths')
    plt.xlabel('Answer Length')
    plt.ylabel('Frequency')
    plt.show()


    # Example 3: Scatter plot of question vs answer length
    plt.figure(figsize=(10,6))
    plt.scatter(df_qa['question_length'], df_qa['answer_length'])
    plt.title('Question Length vs Answer Length')
    plt.xlabel('Question Length')
    plt.ylabel('Answer Length')
    plt.show()

# Load and preprocess text data (wikipedia_content) - assuming it's available
wikipedia_content = []
data_dir = 'data'
for filename in os.listdir(data_dir):
    if filename.endswith('.txt'):
        filepath = os.path.join(data_dir, filename)
        try:
            with open(filepath, 'r', encoding='utf-8') as file:
                content = file.read()
                wikipedia_content.append(content)
        except Exception as e:
            print(f"Error reading file '{filename}': {e}")

# EDA for Text Data
if wikipedia_content:
    # Example 4: Word count distribution in text files
    word_counts = [len(text.split()) for text in wikipedia_content]
    plt.figure(figsize=(10,6))
    plt.hist(word_counts, bins=20)
    plt.title("Word count Distribution in Wikipedia Text Files")
    plt.xlabel("Number of words")
    plt.ylabel("Number of files")
    plt.show()

# Model Evaluation (Example: Perplexity - you'll need to calculate perplexity)

# Note: Replace with your actual model evaluation metrics

# Assuming 'perplexity' is already calculated
# perplexity = your_model_evaluation_metric
# print(f"Perplexity: {perplexity}")


