In [None]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

df = pd.read_csv('text_data.csv')

stop_words = set(stopwords.words('english'))

lemmatizer = WordNetLemmatizer()

def clean_text(text):
    text = text.lower()
    text = re.sub(r'http\S+', '', text)
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    words = word_tokenize(text)
    
    words = [word for word in words if word not in stop_words]
    words = [lemmatizer.lemmatize(word) for word in words]

    return ' '.join(words)

df['cleaned_text'] = df['text_column'].apply(clean_text)
print(df[['text_column', 'cleaned_text']].head())
df.to_csv('cleaned_text_data.csv', index=False)

## Imports & Installation

### Intallation

| Library         | Description                                                                                                           |
|-----------------|-----------------------------------------------------------------------------------------------------------------------|
| transformers    | A library that provides state-of-the-art pretrained models for various NLP tasks.                                     |
| datasets        | A library that simplifies the process of accessing and working with a wide range of machine learning datasets.        |
| mlflow          | A platform for managing the end-to-end machine learning lifecycle, from experimentation to deployment.                |
| torch (PyTorch) | A powerful deep learning framework used for building, training, and deploying neural networks.                        |
| pyngrok         | A tool that allows local servers (like Gradio apps) to be exposed to the internet for easy testing and sharing.       |
| gradio          | A user-friendly library for creating interactive UIs for machine learning models, enabling easy sharing and testing.  |


In [None]:
        # You can use this cell if you don't need to run requirements.txt
# !pip install transformers
# !pip install datasets
# !pip install mlflow
# !pip install torch
# !pip install pyngrok -q
# !pip install gradio
# !pip install accelerate>=0.26.0
# !ngrok config add-authtoken 0000000000000000000000000000000000000000000000000

### Imports

In [5]:
# Import necessary modules for subprocess management
import subprocess
# Import pyngrok for handling public access tunnels and configurations
from pyngrok import ngrok, conf
# For securely handling password inputs
import getpass
# Importing os module to interact with the operating system
import os
# Importing MLflow to track machine learning experiments with PyTorch models
import mlflow
import mlflow.pytorch
# Import transformers' pre-trained GPT-2 model and tokenizer, as well as Trainer utilities
from transformers import GPT2LMHeadModel, GPT2Tokenizer, Trainer, TrainingArguments, EarlyStoppingCallback
# For loading and handling datasets
from datasets import load_dataset, DatasetDict
# Import PyTorch, a machine learning framework
import torch
# Import pre-trained models and tokenizers for causal language modeling tasks
from transformers import AutoTokenizer, AutoModelForCausalLM
# Importing Gradio, a framework to create web interfaces for machine learning models
import gradio as gr

## Initialization

Initializing MLflow Tracking with a SQLite Backend

In [6]:
# Set the URI for MLflow to use a SQLite database as the backend store for tracking experiments.
MLFLOW_TRACKING_URI = "sqlite:///mlflow.db"

# Start the MLflow tracking UI in a new process, using the specified SQLite database as the backend store.
subprocess.Popen(["mlflow", "ui", "--backend-store-uri", MLFLOW_TRACKING_URI])

<Popen: returncode: None args: ['mlflow', 'ui', '--backend-store-uri', 'sqli...>

Establishing MLflow Tracking Configuration for Experiment Management

In [7]:
# Set the MLflow tracking URI to specify where the tracking data will be stored.
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)

# Set the name of the experiment to track runs under a specific experiment name in MLflow.
mlflow.set_experiment("duration-prediction-experiment")

<Experiment: artifact_location='file:d:/MO/Ai Projects/creative writing/mlruns/1', creation_time=1728799754107, experiment_id='1', last_update_time=1728799754107, lifecycle_stage='active', name='duration-prediction-experiment', tags={}>

Configuring ngrok with Authentication Token for Secure Tunneling

In [8]:
# Prompt the user to enter their ngrok authentication token
print("Enter your authtoken, which can be copied from https://dashboard.ngrok.com/auth")

# Get the authentication token securely (input will not be shown on the console)
conf.get_default().auth_token = getpass.getpass()

# Set the local port number that the ngrok tunnel will forward to
port = 5000

# Establish an ngrok tunnel to the specified local port and retrieve the public URL
public_url = ngrok.connect(port).public_url

# Print the public URL provided by ngrok, which forwards to the local server
print(f' * ngrok tunnel \"{public_url}\" -> \"http://127.0.0.1:{port}\"')

Enter your authtoken, which can be copied from https://dashboard.ngrok.com/auth
 * ngrok tunnel "https://b8d8-154-239-194-127.ngrok-free.app" -> "http://127.0.0.1:5000"


Creating MLflow Directory and Starting a New Experiment Run

In [9]:
# Create a directory named "mlruns" to store MLflow tracking data.
# The exist_ok=True parameter means that no error will be raised if the directory already exists.
os.makedirs("mlruns", exist_ok=True)

# End any active MLflow run to ensure that there are no overlapping runs.
# This is useful to clean up before starting a new run.
mlflow.end_run()

# Start a new MLflow run to track metrics, parameters, and models associated with this particular experiment.
mlflow.start_run()

<ActiveRun: >

## Tokenization and Training

### Importing Dataset and Configuring GPT-2 for Language Modeling

In [10]:
# Specify the path to the cleaned dataset CSV file
data_files = 'cleaned_creative_writing_dataset.csv'

# Load the dataset from the specified CSV file
# The 'csv' argument indicates the file format, and the 'data_files' argument specifies the path to the file
dataset = load_dataset('csv', data_files=data_files)

# Remove the 'text' column from the dataset
# This is done to avoid any potential conflicts or redundant information
dataset = dataset['train'].remove_columns(['text'])

# Rename the 'cleaned_text' column to 'text' for consistency
# This makes it easier to refer to the main text column in subsequent processing
dataset = dataset.rename_column('cleaned_text', 'text')

# Load the pre-trained GPT-2 tokenizer
# The tokenizer is responsible for converting text into token IDs that the model can understand
# it uses BPETokenizer for subword tokenization and Byte-Pair Encoding (BPE) algorithm, it has 117M parameters
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

# Load the pre-trained GPT-2 model
# The 'GPT2LMHeadModel' is the model architecture that can generate text
model = GPT2LMHeadModel.from_pretrained('gpt2')

# Set the tokenizer's padding token to the end-of-sequence token
# This is important for ensuring that input sequences have consistent lengths during training or inference
tokenizer.pad_token = tokenizer.eos_token

In [11]:
# Tokenization example on a sample sentence
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
sentence = "I love Dotpy"
tokens = tokenizer.encode(sentence)
tokenized_words = tokenizer.convert_ids_to_tokens(tokens)
print("    Token IDs    :", tokens)
print("Tokenized Words  :", tokenized_words)

    Token IDs    : [40, 1842, 22875, 9078]
Tokenized Words  : ['I', 'Ġlove', 'ĠDot', 'py']


In [13]:
sequences = ["I love DotPy", "DotPy is amazing and cool"]
tokenizer.pad_token = tokenizer.eos_token
tokens = tokenizer(sequences, padding=True, return_tensors='pt')
tokenized_output = [tokenizer.convert_ids_to_tokens(token_ids) for token_ids in tokens['input_ids']]
print("Token IDs      :", *tokens['input_ids'], sep='\n')
print("Tokenized Words:", *tokenized_output,sep='\n')

Token IDs      :
tensor([   40,  1842, 22875, 20519, 50256, 50256, 50256])
tensor([   35,   313, 20519,   318,  4998,   290,  3608])
Tokenized Words:
['I', 'Ġlove', 'ĠDot', 'Py', '<|endoftext|>', '<|endoftext|>', '<|endoftext|>']
['D', 'ot', 'Py', 'Ġis', 'Ġamazing', 'Ġand', 'Ġcool']


### Dataset Preprocessing: Tokenization and Train-Test Split for GPT-2

In [14]:
# Define a tokenization function to process the dataset
def tokenize_function(examples):
    # Tokenize the 'text' field from the dataset examples using the pre-loaded tokenizer
    # padding='max_length' ensures that all sequences are padded to the maximum length
    # truncation=True cuts off sequences that exceed the max_length
    # max_length=32 sets a fixed length of 32 tokens for each input
    input_ids = tokenizer(
        examples['text'],
        padding='max_length',  # Pads to 32 tokens per sequence
        truncation=True,       # Truncates sequences longer than 32 tokens
        max_length=32          # Sets the maximum token length to 32
    )

    # Copy the 'input_ids' into a new field 'labels' to use as the target for training
    # This is often done in language models to predict the next word in a sequence
    input_ids['labels'] = input_ids['input_ids'].copy()
    
    # Return the tokenized input dictionary, including both 'input_ids' and 'labels'
    return input_ids

# Apply the tokenization function to the entire dataset
# The map() method applies the function to each example in the dataset, with batched=True
# meaning that multiple examples are passed in a single batch for faster processing
tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Split the tokenized dataset into a training set and a validation set
# train_test_split(test_size=0.2) splits 80% of the data for training and 20% for validation
train_test_split = tokenized_datasets.train_test_split(test_size=0.2)

# Organize the train and validation datasets into a DatasetDict for easy reference
tokenized_datasets = DatasetDict({
    'train': train_test_split['train'],        # Training dataset (80% of the data)
    'validation': train_test_split['test']     # Validation dataset (20% of the data)
})

### Training GPT-2 with Custom Hyperparameters and Logging with MLflow

In [15]:
# Define a function to compute evaluation metrics
# This function will be called during the evaluation phase of the model
def compute_metrics(eval_pred):
    logits, labels = eval_pred  # Extract the logits (model outputs) and true labels
    predictions = logits.argmax(axis=-1)  # Get the predicted class by taking the argmax along the last axis
    accuracy = (predictions == labels).mean()  # Compute the accuracy by comparing predictions with labels
    return {'accuracy': accuracy}  # Return accuracy as a dictionary for logging

# Define training hyperparameters
learning_rate = 2e-5  # The learning rate for the optimizer
per_device_train_batch_size = 1  # Batch size per device (1 sample per training step)
num_train_epochs = 1  # Number of training epochs
max_length = 32  # Maximum sequence length for inputs

# Set up training arguments using Hugging Face's TrainingArguments class
training_args = TrainingArguments(
    output_dir='./results',  # Directory where results (like checkpoints and logs) will be saved
    evaluation_strategy='epoch',  # Evaluate the model at the end of each epoch
    save_strategy='epoch',  # Save the model at the end of each epoch
    learning_rate=learning_rate,  # Set the learning rate for training
    per_device_train_batch_size=per_device_train_batch_size,  # Set the batch size per device
    num_train_epochs=num_train_epochs,  # Define the number of training epochs
    weight_decay=0.01,  # Weight decay to avoid overfitting (used in regularization)
    load_best_model_at_end=True,  # Load the best model based on evaluation metrics after training ends
    metric_for_best_model="accuracy",  # The metric used to select the best model (accuracy in this case)
    no_cuda=True,  # Force training on CPU, set to False if using GPU
)

# Instantiate a Trainer to manage the training loop
trainer = Trainer(
    model=model,  # The pre-trained GPT-2 model that you want to fine-tune
    args=training_args,  # Training arguments defined above
    train_dataset=tokenized_datasets['train'],  # The training dataset
    eval_dataset=tokenized_datasets['validation'],  # The validation dataset for evaluation
    compute_metrics=compute_metrics,  # The function to compute evaluation metrics (accuracy here)
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],  # Early stopping callback to avoid overfitting
    # The model stops training if it doesn't improve for 3 evaluation cycles (epochs in this case)
)

# Log hyperparameters using MLflow
mlflow.log_param("data_files", data_files)  # Log the dataset file used for training
mlflow.log_param("learning_rate", learning_rate)  # Log the learning rate used
mlflow.log_param("per_device_train_batch_size", per_device_train_batch_size)  # Log batch size
mlflow.log_param("num_train_epochs", num_train_epochs)  # Log number of epochs
mlflow.log_param("max_length", max_length)  # Log the max sequence length for tokenization
mlflow.log_param("model_name", "gpt2")  # Log the model name (GPT-2 in this case)

# Start training the model using the Trainer instance
trainer.train()


2024/10/19 16:02:37 ERROR mlflow.utils.async_logging.async_logging_queue: Run Id 89472e3da6cf45ba9d0ff405f8cd676f: Failed to log run data: Exception: Changing param values is not allowed. Params were already logged='[{'key': 'max_length', 'old_value': '32', 'new_value': '20'}]' for run ID='89472e3da6cf45ba9d0ff405f8cd676f'.


  0%|          | 0/1143 [00:00<?, ?it/s]

{'loss': 5.7737, 'grad_norm': 44.217384338378906, 'learning_rate': 1.1251093613298338e-05, 'epoch': 0.44}
{'loss': 5.7421, 'grad_norm': 14.4274320602417, 'learning_rate': 2.502187226596676e-06, 'epoch': 0.87}


  0%|          | 0/36 [00:00<?, ?it/s]

{'eval_loss': 5.446900367736816, 'eval_accuracy': 0.17755681818181818, 'eval_runtime': 28.9373, 'eval_samples_per_second': 9.883, 'eval_steps_per_second': 1.244, 'epoch': 1.0}


There were missing keys in the checkpoint model loaded: ['lm_head.weight'].


{'train_runtime': 1645.038, 'train_samples_per_second': 0.695, 'train_steps_per_second': 0.695, 'train_loss': 5.7256335694854545, 'epoch': 1.0}


TrainOutput(global_step=1143, training_loss=5.7256335694854545, metrics={'train_runtime': 1645.038, 'train_samples_per_second': 0.695, 'train_steps_per_second': 0.695, 'total_flos': 18666049536000.0, 'train_loss': 5.7256335694854545, 'epoch': 1.0})

## Model Evaluation and Metrics Logging with MLflow for GPT-2

### Completion of Training: Saving Model, Logging Metrics, and Ending MLflow Run

In [16]:
# Save the fine-tuned model and tokenizer locally
model.save_pretrained('./fine_tuned_gpt2')  # Save the fine-tuned GPT-2 model to the specified directory
tokenizer.save_pretrained('./fine_tuned_gpt2')  # Save the tokenizer (required for text preprocessing) to the same directory

# Log the model to MLflow using the PyTorch logging interface
# This will store the model artifact in the MLflow tracking system for later use
mlflow.pytorch.log_model(model, "fine_tuned_gpt2")

# Evaluate the model using the trainer and store the evaluation metrics (e.g., loss, accuracy)
eval_metrics = trainer.evaluate()

# Extract the training loss from the trainer's state history if it exists
# The state.log_history holds a record of logs during training
if 'loss' in trainer.state.log_history[-1]:
    train_loss = trainer.state.log_history[-1]['loss']  # Get the last logged training loss
else:
    train_loss = None  # If not found, set training loss to None

# Log metrics to MLflow
mlflow.log_metric("Training Loss", train_loss if train_loss is not None else 0.0)  # Log training loss (set to 0.0 if unavailable)
mlflow.log_metric("Validation Loss", eval_metrics['eval_loss'])  # Log the validation loss from evaluation
mlflow.log_metric("Accuracy", eval_metrics.get('eval_accuracy', 0.0))  # Log the accuracy (default to 0.0 if not found)

# End the MLflow run to ensure all logs and artifacts are finalized
mlflow.end_run()

# Print confirmation message to indicate that the model training and saving process is complete
print("Model training and saving completed.")




  0%|          | 0/36 [00:00<?, ?it/s]

Model training and saving completed.


### Generating Text with GPT-2: Story Creation and Experiment Tracking

In [17]:
# Load the fine-tuned model and tokenizer from the specified directory
model_name = "./fine_tuned_gpt2"  # Path to the saved fine-tuned model directory
tokenizer = AutoTokenizer.from_pretrained(model_name)  # Load the tokenizer associated with the model
model = AutoModelForCausalLM.from_pretrained(model_name)  # Load the model for causal language modeling

# Set the model to evaluation mode
# This is essential for inference, disabling dropout and other training-specific behaviors
model.eval()

# Define a function to generate stories based on a given prompt
# max_length: maximum length of the generated story
# temperature: controls randomness in the generation process (higher values = more random)
# top_k: limits the sampling pool to the top-k most likely next words
def generate_story(prompt, max_length=1000, temperature=1.5, top_k=100):
    # Tokenize the input prompt and convert it into input IDs (tensor format)
    input_ids = tokenizer.encode(prompt, return_tensors='pt')  # 'pt' indicates PyTorch tensors
    
    # Disable gradient computation during generation for efficiency
    with torch.no_grad():
        # Generate text from the input prompt
        output = model.generate(
            input_ids,  # The input prompt as tokenized IDs
            max_length=max_length,  # Maximum length of the generated sequence
            temperature=temperature,  # Controls diversity in the output
            top_k=top_k,  # Limits sampling to the top-k most probable tokens
            do_sample=True,  # Enables sampling for more varied text generation
            num_return_sequences=1,  # Generate only one sequence
            pad_token_id=tokenizer.eos_token_id  # Use EOS token for padding
        )

    # Decode the generated tokens back into human-readable text
    generated_story = tokenizer.decode(output[0], skip_special_tokens=True)  # Skip special tokens in the output

    # Start a new MLflow run to log parameters related to text generation
    mlflow.start_run()
    mlflow.log_param("generation_max_length", max_length)  # Log the maximum length for generation
    mlflow.log_param("temperature", temperature)  # Log the temperature setting
    mlflow.log_param("top_k", top_k)  # Log the top-k value used for sampling
    mlflow.log_param("prompt", prompt)  # Log the prompt that was used for generation
    mlflow.end_run()  # End the MLflow run to save the logged parameters

    return generated_story  # Return the generated story

# Define a prompt to initiate the story generation
prompt = "Write a story about a girl's adventures in a magical forest where she finds strange creatures"
# Generate the story based on the prompt and specified parameters
generated_text = generate_story(prompt, max_length=1000)  # Adjust max_length as needed
# Print the generated story
print(generated_text)


The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Write a story about a girl's adventures in a magical forest where she finds strange creatures always found dreams so natural, her mother feasts far later upon story stories saying first son feasting children feasting strange trees would never come though memories go never come canard later memories could no need even children find children without strange encounters never found life no wonder creatures love children even living creatures known strange creatures found many animals even life find others time appear strange even dead man mike scipaul wrote first man half grown could be human later writing tales almost every account of could be like first man year aged born man aged sixteen oldest could still alive year without words narrator james raul quanich holds no love love love became would make life love one final time write new poem still unknown characters love grows would die even although death begins never felt anyone can can actually make anything love mike snobbish shuck far

### Gradio-Powered Story Generation: Generate Tales with Fine-Tuned GPT-2

In [19]:
# Import Gradio for creating web interfaces
import gradio as gr

# Define a wrapper function for generating stories using the previously defined generate_story function
def gradio_generate(prompt):
    # Call the generate_story function with the provided prompt to get the generated text
    generated_text = generate_story(prompt)  # This utilizes the fine-tuned model to generate the story
    return generated_text  # Return the generated story for display in the Gradio interface

# Create a Gradio interface
gradio_interface = gr.Interface(
    fn=gradio_generate,  # Function that will be called to generate text
    inputs="text",  # Input type is a text box for users to enter their prompts
    outputs="text",  # Output type is a text box for displaying the generated story
    title="Story Hallucinator",  # Title of the Gradio app displayed at the top
    description="Enter a prompt to generate a story using the fine-tuned GPT-2 model.",  # Description shown to users
)

# Launch the Gradio interface
# The share=True option allows the interface to be shared via a public link
gradio_interface.launch(share=True)


* Running on local URL:  http://127.0.0.1:7861
* Running on public URL: https://6399ed0431c95b509e.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


