In [1]:
# from google.colab import drive
# drive.mount('/content/drive')


In [38]:
import pandas as pd
df = pd.read_csv('/kaggle/input/amazon-review/Reviews.csv')

In [39]:
df.shape

(568454, 10)

In [40]:
# df["Text"]
df["Text"]

0         I have bought several of the Vitality canned d...
1         Product arrived labeled as Jumbo Salted Peanut...
2         This is a confection that has been around a fe...
3         If you are looking for the secret ingredient i...
4         Great taffy at a great price.  There was a wid...
                                ...                        
568449    Great for sesame chicken..this is a good if no...
568450    I'm disappointed with the flavor. The chocolat...
568451    These stars are small, so you can give 10-15 o...
568452    These are the BEST treats for training and rew...
568453    I am very satisfied ,product is as advertised,...
Name: Text, Length: 568454, dtype: object

In [41]:
import nltk
import re

# Download NLTK resources if not already installed
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [42]:
import nltk
import subprocess

# Download and unzip wordnet
try:
    nltk.data.find('wordnet.zip')
except:
    nltk.download('wordnet', download_dir='/kaggle/working/')
    command = "unzip /kaggle/working/corpora/wordnet.zip -d /kaggle/working/corpora"
    subprocess.run(command.split())
    nltk.data.path.append('/kaggle/working/')

# Now you can import the NLTK resources as usual
from nltk.corpus import wordnet

[nltk_data] Downloading package wordnet to /kaggle/working/...
[nltk_data]   Package wordnet is already up-to-date!
Archive:  /kaggle/working/corpora/wordnet.zip


replace /kaggle/working/corpora/wordnet/lexnames? [y]es, [n]o, [A]ll, [N]one, [r]ename:  NULL
(EOF or read error, treating as "[N]one" ...)


In [43]:
import nltk
# nltk.download('wordnet')
def preprocess_text(text):
    # Check if the input is a string
    if isinstance(text, str):
        # Convert to lowercase
        text = text.lower()
        # Remove special characters and digits
        text = re.sub(r'[^a-zA-Z\s]', '', text)
        # Tokenize words
        tokens = word_tokenize(text)
        # Remove stopwords
        stop_words = set(stopwords.words('english'))
        tokens = [word for word in tokens if word not in stop_words]
        # Lemmatize words
        lemmatizer = WordNetLemmatizer()
        tokens = [lemmatizer.lemmatize(word) for word in tokens]
        # Join tokens back into text
        cleaned_text = ' '.join(tokens)
        return cleaned_text
    else:
        return ''  # Return empty string for non-string values

# Create an empty DataFrame to store the chunks
df_preprocessed = pd.DataFrame()

# Preprocess the 'Text' and 'Summary' columns
df_preprocessed['Text'] = df['Text'].apply(preprocess_text)
df_preprocessed['Summary'] = df['Summary'].apply(preprocess_text)

# Print the first few rows to verify
print(df_preprocessed.head())


                                                Text                Summary
0  bought several vitality canned dog food produc...  good quality dog food
1  product arrived labeled jumbo salted peanutsth...             advertised
2  confection around century light pillowy citrus...            delight say
3  looking secret ingredient robitussin believe f...         cough medicine
4  great taffy great price wide assortment yummy ...            great taffy


In [44]:
df_preprocessed.shape
# df.shape

(568454, 2)

In [45]:
from transformers import GPT2Tokenizer, GPT2Model
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2Model.from_pretrained('gpt2')

In [46]:
# Set the padding token index to a valid value explicitly
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
padding_token_id = tokenizer.pad_token_id
print("New padding token index:", padding_token_id)


New padding token index: 50257


In [47]:
# Set the padding token index to a valid value explicitly
tokenizer.pad_token = tokenizer.eos_token  # Setting padding token to the end-of-sequence token
padding_token_id = tokenizer.pad_token_id
print("New padding token index:", padding_token_id)


New padding token index: 50256


In [48]:
# Step 7: Verify Padding Token Index
padding_token_id = tokenizer.pad_token_id
print("Padding token index:", padding_token_id)

# If padding token index is outside the valid range, you can set it explicitly
if padding_token_id >= tokenizer.vocab_size:
    print("Padding token index is outside the valid range. Setting it to a valid value.")
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    padding_token_id = tokenizer.pad_token_id
    print("New padding token index:", padding_token_id)


Padding token index: 50256


In [49]:
from sklearn.model_selection import train_test_split

# Split the dataset into training and testing sets
train_df, test_df = train_test_split(df_preprocessed, test_size=0.25, random_state=42)

# Print the shapes of the training and testing sets
print("Training set shape:", train_df.shape)
print("Testing set shape:", test_df.shape)


Training set shape: (426340, 2)
Testing set shape: (142114, 2)


In [50]:
from transformers import GPT2Tokenizer

# Initialize GPT-2 tokenizer
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

# Add padding token
tokenizer.add_special_tokens({'pad_token': '[PAD]'})

# Verify the added padding token
print("Padding token:", tokenizer.pad_token)

# Now you can use the tokenizer for data loading


Padding token: [PAD]


In [51]:
import torch
from torch.utils.data import Dataset, DataLoader

# Define a custom dataset class
class ReviewsDataset(Dataset):
    def __init__(self, texts, summaries, tokenizer, max_length=300):
        self.texts = texts
        self.summaries = summaries
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts.iloc[idx]
        summary = self.summaries.iloc[idx]

        # Encode the text and summary
        inputs = self.tokenizer(text, padding='max_length', truncation=True, max_length=self.max_length, return_tensors='pt')
        return inputs


# # Instantiate the custom dataset with the training data
# train_dataset = ReviewsDataset(train_df['Text'], train_df['Summary'], tokenizer)

# # Create a data loader for the training data
# train_data_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)


In [16]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [52]:
import torch
from torch.utils.data import DataLoader, RandomSampler
from transformers import AdamW, get_linear_schedule_with_warmup

# Define batch size and number of epochs
batch_size = 32
num_train_epochs = 3

# Convert training and testing data to DataLoader
train_data_loader = DataLoader(
    ReviewsDataset(train_df['Text'], train_df['Summary'], tokenizer),
    batch_size=batch_size,
    sampler=RandomSampler(train_df),  # Random sampler for training
)
test_data_loader = DataLoader(
    ReviewsDataset(test_df['Text'], test_df['Summary'], tokenizer),
    batch_size=batch_size,
)

# Define optimizer and scheduler
optimizer = AdamW(model.parameters())
total_steps = len(train_data_loader) * num_train_epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Move model to appropriate device
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
model.to(device)
model.train()

# Fine-tuning loop
for epoch in range(num_train_epochs):
    total_loss = 0
    for batch in train_data_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)

        # Forward pass
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits

        # Calculate loss (cross-entropy loss)
        loss = torch.nn.functional.cross_entropy(logits.view(-1, logits.shape[-1]), input_ids.view(-1))
        total_loss += loss.item()

        # Backward pass
        loss.backward()
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

    avg_train_loss = total_loss / len(train_data_loader)
    print(f'Epoch {epoch + 1}/{num_train_epochs}, Average Training Loss: {avg_train_loss:.4f}')

# Save the fine-tuned model
torch.save(model.state_dict(), '/kaggle/working/output_model.pth')




RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [16]:
import torch
from torch.utils.data import DataLoader, RandomSampler
from transformers import AdamW, get_linear_schedule_with_warmup
from tqdm import tqdm,tqdm_notebook
# Define batch size and number of epochs
batch_size = 32
num_train_epochs = 3

tran_bar=tqdm_notebook(desc='traning_routine',
                      total=13324,
                      position=0)
# Convert training and testing data to DataLoader
train_data_loader = DataLoader(
    ReviewsDataset(train_df['Text'], train_df['Summary'], tokenizer),
    batch_size=batch_size,
    sampler=RandomSampler(train_df),  # Random sampler for training
)
test_data_loader = DataLoader(
    ReviewsDataset(test_df['Text'], test_df['Summary'], tokenizer),
    batch_size=batch_size,
)

# Define optimizer and scheduler
optimizer = AdamW(model.parameters())
total_steps = len(train_data_loader) * num_train_epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Move model to appropriate device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
model.train()

# Fine-tuning loop
for epoch in tqdm(range(num_train_epochs)):
    total_loss = 0
    for batch in train_data_loader:
        input_ids = batch['input_ids'].clone().to(device)
        attention_mask = batch['attention_mask'].clone().to(device)

        # Forward pass
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits

        # Calculate loss (cross-entropy loss)
        loss = torch.nn.functional.cross_entropy(logits.view(-1, logits.shape[-1]), input_ids.view(-1))
        total_loss += loss.item()

        # Backward pass
        loss.backward()
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()
        tran_bar.update()
    avg_train_loss = total_loss / len(train_data_loader)
    print(f'Epoch {epoch + 1}/{num_train_epochs}, Average Training Loss: {avg_train_loss:.4f}')

# Save the fine-tuned model
torch.save(model.state_dict(), '/kaggle/working/output_model.pth')


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  tran_bar=tqdm_notebook(desc='traning_routine',


traning_routine:   0%|          | 0/13324 [00:00<?, ?it/s]


  0%|          | 0/3 [00:00<?, ?it/s][A


IndexError: index out of range in self

In [None]:
torch.save(model.state_dict(), '/kaggle/working/output_model.pth')

In [17]:
import torch

# Step 1: Check Input Indices
# Print the range of values in input_ids tensor
print("Minimum input_ids value:", torch.min(input_ids))
print("Maximum input_ids value:", torch.max(input_ids))

# Step 2: Inspect Embedding Weight Matrix
# Print the shape of the embedding weight matrix
print(model)

# Step 3: Check Padding Index
# Print the padding index used in the embedding layer
# print("Padding index:", model.embeddings.padding_idx)

# Step 4: Verify Model Initialization
# Ensure that the model is correctly initialized and loaded
# For example, if you are using a pretrained model, check its configuration
print("Model configuration:", model.config)

# Step 5: Verify Tokenization
# If tokenization was used, print the tokenizer configuration or inspect tokenized input
print("Tokenizer configuration:", tokenizer)

# Step 6: Inspect Input Data
# Print and inspect the input_ids tensor before passing it to the model
print("Input_ids tensor:", input_ids)


Minimum input_ids value: tensor(64)
Maximum input_ids value: tensor(50257)
GPT2Model(
  (wte): Embedding(50257, 768)
  (wpe): Embedding(1024, 768)
  (drop): Dropout(p=0.1, inplace=False)
  (h): ModuleList(
    (0-11): 12 x GPT2Block(
      (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (attn): GPT2Attention(
        (c_attn): Conv1D()
        (c_proj): Conv1D()
        (attn_dropout): Dropout(p=0.1, inplace=False)
        (resid_dropout): Dropout(p=0.1, inplace=False)
      )
      (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (mlp): GPT2MLP(
        (c_fc): Conv1D()
        (c_proj): Conv1D()
        (act): NewGELUActivation()
        (dropout): Dropout(p=0.1, inplace=False)
      )
    )
  )
  (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
)
Model configuration: GPT2Config {
  "_name_or_path": "gpt2",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 502

In [18]:
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel

# Load tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')

# Step 1: Check Input Indices
# Print the range of values in input_ids tensor
input_ids = tokenizer.encode("Sample input text", return_tensors="pt")
print("Minimum input_ids value:", torch.min(input_ids))
print("Maximum input_ids value:", torch.max(input_ids))

# Step 2: Inspect Embedding Weight Matrix
# Print the shape of the embedding weight matrix
print("Embedding weight matrix shape:", model.transformer.wte.weight.shape)

# Step 3: Check Padding Index
# Print the padding index used in the tokenizer
print("Padding token index:", tokenizer.pad_token_id)

# Step 4: Verify Model Initialization
# Ensure that the model is correctly initialized and loaded
print("Model configuration:", model.config)

# Step 5: Verify Tokenization
# If tokenization was used, print the tokenizer configuration or inspect tokenized input
print("Tokenizer configuration:", tokenizer)
print("Tokenized input:", tokenizer.tokenize("Sample input text"))

# Step 6: Inspect Input Data
# Print and inspect the input_ids tensor before passing it to the model
print("Input_ids tensor:", input_ids)


generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Minimum input_ids value: tensor(2420)
Maximum input_ids value: tensor(36674)
Embedding weight matrix shape: torch.Size([50257, 768])
Padding token index: None
Model configuration: GPT2Config {
  "_name_or_path": "gpt2",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 50
    }
  },
  "transfor

In [19]:
# Step 3: Check Padding Index
# Print the padding index used in the tokenizer
print("Padding token index:", tokenizer.pad_token_id)


Padding token index: None


In [20]:
# Add padding token to the tokenizer
tokenizer.add_special_tokens({'pad_token': '[PAD]'})


1

In [21]:
# Print the padding index used in the tokenizer
print("Padding token index:", tokenizer.pad_token_id)


Padding token index: 50257


In [22]:
import torch
from torch.utils.data import DataLoader, RandomSampler
from transformers import AdamW, get_linear_schedule_with_warmup

# Define batch size and number of epochs
batch_size = 8
num_train_epochs = 3

# Convert training and testing data to DataLoader
train_data_loader = DataLoader(
    ReviewsDataset(train_df['Text'], train_df['Summary'], tokenizer),
    batch_size=batch_size,
    sampler=RandomSampler(train_df),  # Random sampler for training
)
test_data_loader = DataLoader(
    ReviewsDataset(test_df['Text'], test_df['Summary'], tokenizer),
    batch_size=batch_size,
)

# Define optimizer and scheduler
optimizer = AdamW(model.parameters())
total_steps = len(train_data_loader) * num_train_epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Move model to appropriate device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
model.train()

# Fine-tuning loop
for epoch in range(num_train_epochs):
    total_loss = 0
    for batch in train_data_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        # labels = batch['labels'].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        loss = outputs.loss
        total_loss += loss.item()

        loss.backward()
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

    avg_train_loss = total_loss / len(train_data_loader)
    print(f'Epoch {epoch + 1}/{num_train_epochs}, Average Training Loss: {avg_train_loss:.4f}')

# Save the fine-tuned model
torch.save(model.state_dict(), output_dir)


IndexError: index out of range in self

In [None]:
import torch

# Step 1: Check Tokenization
print("Tokenized input:", tokenizer.tokenize("Sample input text"))

# Step 2: Inspect Token IDs
print("Minimum input_ids value:", torch.min(input_ids))
print("Maximum input_ids value:", torch.max(input_ids))

# Step 3: Verify Vocabulary Size
print("Vocabulary size:", tokenizer.vocab_size)

# Step 4: Check Padding Token Index
print("Padding token index:", tokenizer.pad_token_id)

# Step 5: Verify Model Initialization
print("Model configuration:", model.config)

# Step 6: Inspect Input Data
print("Input_ids tensor:", input_ids)


In [None]:
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import GPT2Tokenizer, GPT2LMHeadModel, DataCollatorForLanguageModeling, Trainer, TrainingArguments

# Define custom dataset class
class ReviewsDataset(Dataset):
    def __init__(self, texts, summaries, tokenizer, max_length=128):
        self.texts = texts
        self.summaries = summaries
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts.iloc[idx]
        summary = self.summaries.iloc[idx]

        inputs = self.tokenizer(text, summary, padding='max_length', truncation=True, max_length=self.max_length, return_tensors='pt')
        return inputs

# Initialize tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=4,
    save_steps=10_000,
    save_total_limit=2,
)

# Create data collator for language modeling
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)

# Create datasets and data loaders
train_dataset = ReviewsDataset(train_df['Text'], train_df['Summary'], tokenizer)
test_dataset = ReviewsDataset(test_df['Text'], test_df['Summary'], tokenizer)

train_loader = DataLoader(train_dataset, batch_size=training_args.per_device_train_batch_size, shuffle=True)

# Create Trainer instance
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
)

# Fine-tune the model
trainer.train()

# Save the fine-tuned model
trainer.save_model()


In [None]:
import torch
from torch.utils.data import Dataset

class CustomDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_length):
        self.dataframe = dataframe
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        text = self.dataframe.iloc[idx]['Text']
        summary = self.dataframe.iloc[idx]['Summary']

        # Tokenize text and summary
        encoded_input = self.tokenizer(text, summary,
                                       padding='max_length',
                                       truncation=True,
                                       max_length=self.max_length,
                                       return_tensors='pt')

        input_ids = encoded_input['input_ids'].squeeze(0)
        attention_mask = encoded_input['attention_mask'].squeeze(0)

        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask
        }


In [None]:
import torch
from torch.utils.data import Dataset

class ReviewsDataset(Dataset):
    def __init__(self, texts, summaries, tokenizer, max_length=128):
        self.texts = texts
        self.summaries = summaries
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts.iloc[idx]
        summary = self.summaries.iloc[idx]

        inputs = self.tokenizer(text, summary, padding='max_length', truncation=True, max_length=self.max_length, return_tensors='pt')
        return inputs


In [None]:
!pip install accelerate -U

In [None]:
from torch.utils.data import DataLoader

# Define batch size and number of workers for DataLoader
batch_size = 4
num_workers = 2

# Create DataLoader for training dataset
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=num_workers)


In [None]:
import torch
from transformers import AdamW, get_linear_schedule_with_warmup

# Define optimizer and scheduler
optimizer = AdamW(model.parameters())
total_steps = len(train_loader) * num_train_epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Move model to appropriate device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
model.train()

# Fine-tuning loop
for epoch in range(num_train_epochs):
    total_loss = 0
    for batch in train_loader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch, labels=batch['input_ids'])
        loss = outputs.loss
        total_loss += loss.item()

        loss.backward()
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

    avg_train_loss = total_loss / len(train_loader)
    print(f'Epoch {epoch + 1}/{num_train_epochs}, Average Training Loss: {avg_train_loss:.4f}')

# Save the fine-tuned model
torch.save(model.state_dict(), output_dir)


In [None]:
from transformers import GPT2LMHeadModel, Trainer, TrainingArguments
from transformers import DataCollatorForLanguageModeling

# Define fine-tuning parameters
output_dir = './fine_tuned_model'  # Output directory for the fine-tuned model
num_train_epochs = 3  # Number of training epochs
per_device_train_batch_size = 4  # Batch size per device during training
save_steps = 10_000  # Save checkpoint every 10,000 steps
save_total_limit = 2  # Limit the total number of checkpoints saved

# Initialize GPT-2 model and tokenizer
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')

# Prepare training and testing datasets
train_dataset = ReviewsDataset(train_df['Text'], train_df['Summary'], tokenizer)
test_dataset = ReviewsDataset(test_df['Text'], test_df['Summary'], tokenizer)

# Define data collator for language modeling
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)

# Define training arguments
training_args = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=per_device_train_batch_size,
    save_steps=save_steps,
    save_total_limit=save_total_limit
)

# Create Trainer instance
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset
)

# Fine-tune the model
trainer.train()

# Save the fine-tuned model
trainer.save_model()

# Generate summaries using the fine-tuned model
# You can use the generate_text function defined earlier for this purpose
# Example usage:
# sequence = "The Fender CD-60S Dreadnought Acoustic Guitar is a great instrument for beginners."
# max_len = 20
# generate_text(sequence, max_len)


In [None]:
import pandas as pd
from transformers import GPT2Tokenizer, GPT2LMHeadModel, TextDataset, DataCollatorForLanguageModeling, Trainer, TrainingArguments

# Load the review dataset
reviews_df = pd.read_csv('reviews.csv')  # Adjust the file path as needed

# Preprocess the review dataset (assuming you have already defined preprocess_text function)
reviews_df['Text'] = reviews_df['Text'].apply(preprocess_text)
reviews_df['Summary'] = reviews_df['Summary'].apply(preprocess_text)

# Fine-tuning parameters
output_dir = './fine_tuned_model'  # Output directory for the fine-tuned model
num_train_epochs = 3  # Number of training epochs
per_device_train_batch_size = 4  # Batch size per device during training
save_steps = 10_000  # Save checkpoint every 10,000 steps
save_total_limit = 2  # Limit the total number of checkpoints saved

# Initialize tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')

# Prepare dataset for fine-tuning
train_dataset = TextDataset(
    tokenizer=tokenizer,
    file_path='reviews.csv',  # Adjust the file path as needed
    block_size=128  # Maximum sequence length
)

# Prepare data collator
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)

# Define training arguments
training_args = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=per_device_train_batch_size,
    save_steps=save_steps,
    save_total_limit=save_total_limit
)

# Create Trainer instance
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset
)

# Fine-tune the model
trainer.train()

# Save the fine-tuned model
model.save_pretrained(output_dir)

# Generate summaries using the fine-tuned model
# You can use the generate_text function defined earlier for this purpose
# Example usage:
# sequence = "The Fender CD-60S Dreadnought Acoustic Guitar is a great instrument for beginners."
# max_len = 20
# generate_text(sequence, max_len)


In [None]:
import torch
from torch.utils.data import DataLoader
from transformers import GPT2LMHeadModel, GPT2Tokenizer, AdamW, get_linear_schedule_with_warmup

# Initialize tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
# Set padding token for the tokenizer
tokenizer.pad_token = tokenizer.eos_token
model = GPT2LMHeadModel.from_pretrained('gpt2')

# Custom training parameters
learning_rate = 5e-5
batch_size = 8
num_epochs = 3

# Prepare DataLoader
train_dataset = ReviewsDataset(train_df['Text'], train_df['Summary'], tokenizer)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)


# Prepare optimizer and scheduler
optimizer = AdamW(model.parameters(), lr=learning_rate)
total_steps = len(train_loader) * num_epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Fine-tuning loop
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
model.train()

for epoch in range(num_epochs):
    total_loss = 0
    for batch in train_loader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch, labels=batch['input_ids'])
        loss = outputs.loss
        total_loss += loss.item()

        loss.backward()
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

    avg_train_loss = total_loss / len(train_loader)
    print(f'Epoch {epoch + 1}/{num_epochs}, Average Training Loss: {avg_train_loss:.4f}')
