In [1]:
import pandas as pd

In [2]:
df = pd.read_csv("Apps-Apps.csv")

In [3]:
df.dropna(inplace=True)

In [4]:
df['training'] = df['Text']  + 'TL;DR' + df['Summary']

In [None]:
df[].info()

In [6]:
min_df = df[['Summary','Text','training']][:1000]

In [7]:
sum_all_tokens = sum([len(review.split()) for review in min_df['training']])
avg_length = sum_all_tokens / len(min_df['training'])
print(avg_length)

75.852


In [8]:
import torch
from torch.utils.data import Dataset, DataLoader
 
class GPT2ReviewDataset(Dataset):  
    def __init__(self, tokenizer, reviews, max_len=100):
        self.max_len = max_len
        self.tokenizer = tokenizer
        self.eos = self.tokenizer.eos_token
        self.eos_id = self.tokenizer.eos_token_id
        self.reviews = reviews
        self.result = []

        for review in self.reviews:
            # Encode the text using tokenizer.encode(). We add EOS at the end
            tokenized = self.tokenizer.encode(review + self.eos)
            
            # Padding/truncating the encoded sequence to max_len 
            padded = self.pad_truncate(tokenized)            

            # Creating a tensor and adding to the result
            self.result.append(torch.tensor(padded))

    def __len__(self):
        return len(self.result)


    def __getitem__(self, item):
        return self.result[item]

    def pad_truncate(self, name):
        extra_length = 4
        name_length = len(name) - extra_length
        if name_length < self.max_len:
            difference = self.max_len - name_length
            result = name + [self.eos_id] * difference
        elif name_length > self.max_len:
            result = name[:self.max_len + 3]+[self.eos_id] 
        else:
            result = name
        return result

In [4]:
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from torch.utils.data import DataLoader

# Check if GPU is available, otherwise use CPU
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device = torch.device ("cpu")

# Load pre-trained GPT-2 model and tokenizer
model_name = 'gpt2'
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name).to(device)

In [10]:
train_df = min_df.head (750)
test_df = min_df.tail (250)

In [11]:
train_dataset = GPT2ReviewDataset(tokenizer, train_df['training'], 100)
test_dataset = GPT2ReviewDataset(tokenizer, test_df['training'], 100)

Token indices sequence length is longer than the specified maximum sequence length for this model (1430 > 1024). Running this sequence through the model will result in indexing errors


In [12]:
train_dataloader = DataLoader(train_dataset, batch_size=4, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=4, shuffle=True)

In [13]:
def train(model, optimizer, dl, epochs, device):    
    for epoch in range(epochs):
        print (f"Epoch {epoch}")
        for idx, batch in enumerate(dl):
             with torch.set_grad_enabled(True):
                optimizer.zero_grad()
                batch = batch.to(device)
                output = model(batch, labels=batch)
                loss = output[0]
                loss.backward()
                optimizer.step()
                if idx % 50 == 0:
                    print("loss: %f, %d"%(loss, idx))

In [14]:
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from torch import cuda
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoModelWithLMHead

parameters={
    "BATCH_SIZE":4,          
    "EPOCHS":3,              
    "LEARNING_RATE":1e-4,          
    "MAX_TARGET_TEXT_LENGTH":100
}

# Load pretrained model from Hugging face
model = AutoModelWithLMHead.from_pretrained("gpt2")
device = 'cuda' if cuda.is_available() else 'cpu'
device = torch.device ("cpu")
model.to(device)

optimizer = torch.optim.AdamW(params = model.parameters(), lr=parameters['LEARNING_RATE'])
train(model, optimizer, train_dataset, epochs=parameters['EPOCHS'], device=device)



Epoch 0
loss: 7.990021, 0
loss: 0.958284, 50
loss: 2.280205, 100
loss: 2.489177, 150
loss: 3.791001, 200
loss: 3.893262, 250
loss: 1.879658, 300
loss: 1.739902, 350
loss: 2.014563, 400
loss: 3.689391, 450
loss: 1.533364, 500
loss: 3.820602, 550
loss: 2.326473, 600
loss: 2.449049, 650
loss: 2.228420, 700
Epoch 1
loss: 1.411688, 0
loss: 0.721575, 50
loss: 1.638689, 100
loss: 1.892254, 150
loss: 2.658958, 200
loss: 2.881997, 250
loss: 1.382528, 300
loss: 1.297520, 350
loss: 1.360433, 400
loss: 2.460260, 450
loss: 1.271334, 500
loss: 1.774893, 550
loss: 1.649122, 600
loss: 1.682940, 650
loss: 1.458864, 700
Epoch 2
loss: 1.080663, 0
loss: 0.714477, 50
loss: 0.976882, 100
loss: 1.415865, 150
loss: 1.685136, 200
loss: 1.871129, 250
loss: 0.862328, 300
loss: 0.923563, 350
loss: 0.821626, 400
loss: 1.342887, 450
loss: 0.889995, 500
loss: 0.950819, 550
loss: 1.001452, 600
loss: 1.114013, 650
loss: 0.781810, 700


In [16]:
save_directory = "model"
model.save_pretrained(save_directory)
tokenizer.save_pretrained(save_directory)

('model/tokenizer_config.json',
 'model/special_tokens_map.json',
 'model/vocab.json',
 'model/merges.txt',
 'model/added_tokens.json')

In [1]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Define directory where the model was saved
save_directory = "model"

# Load the model and tokenizer
loaded_model = GPT2LMHeadModel.from_pretrained(save_directory)
loaded_tokenizer = GPT2Tokenizer.from_pretrained(save_directory)

In [2]:
def generate_summary (input_text):
  input_text += " TL;DR"
  tokenized_input = tokenizer.encode(input_text, return_tensors='pt')

  # Generate summary
  model.eval()
  with torch.no_grad():
      generated_ids = model.generate(
          input_ids=tokenized_input.to(device),
          max_length= 100,  # Adjust the max length as needed
          num_beams=5,    # Adjust the number of beams for beam search
          early_stopping=True
      )

  # Decode generated summary
  return tokenizer.decode(generated_ids[0], skip_special_tokens=True).split ("TL;DR")[1]

In [5]:
input_text = "I first had this drink when my mother brought it back with her from a visit to her home in Brittany France and thought it was delicious!  I wish it were a bit cheaper but other than that, I love this product!"
print("Generated Summary:", generate_summary (input_text))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Generated Summary: : This is a great drink for those who are looking for a quick, refreshing drink.

Reviewed By Date Rating Strength Flavoring Taste Room Note Emeritus Account (28392) 2008-10-29 Medium None Detected Medium Pleasant to Tolerable I


In [17]:
from rouge import Rouge
rouge = Rouge()

In [None]:
generated = list ()
reference = list ()
for index, row in test_df.iterrows():
    generated.append (generate_summary (row["Text"][:100]))
    reference.append (row["Summary"][:100])

In [24]:
from rouge import Rouge

rouge = Rouge()

rouge_scores = []
for generated_text, reference_summary in zip(generated, test_df['Summary']):
    rouge_scores.append(rouge.get_scores(generated_text, reference_summary))
 

In [27]:
avg_rouge_scores = {
        'rouge-1': {
            'f': sum(score[0]['rouge-1']['f'] for score in rouge_scores) / len(rouge_scores),
            'p': sum(score[0]['rouge-1']['p'] for score in rouge_scores) / len(rouge_scores),
            'r': sum(score[0]['rouge-1']['r'] for score in rouge_scores) / len(rouge_scores)
        },
        'rouge-2': {
            'f': sum(score[0]['rouge-2']['f'] for score in rouge_scores) / len(rouge_scores),
            'p': sum(score[0]['rouge-2']['p'] for score in rouge_scores) / len(rouge_scores),
            'r': sum(score[0]['rouge-2']['r'] for score in rouge_scores) / len(rouge_scores)
        },
        'rouge-l': {
            'f': sum(score[0]['rouge-l']['f'] for score in rouge_scores) / len(rouge_scores),
            'p': sum(score[0]['rouge-l']['p'] for score in rouge_scores) / len(rouge_scores),
            'r': sum(score[0]['rouge-l']['r'] for score in rouge_scores) / len(rouge_scores)
        }
    }
print(f'Average ROUGE scores: {avg_rouge_scores}')

Average ROUGE scores: {'rouge-1': {'f': 0.12873521765863918, 'p': 0.5228629789022853, 'r': 0.07385728715728716}, 'rouge-2': {'f': 0.03908225040021741, 'p': 0.21843137254901965, 'r': 0.0223}, 'rouge-l': {'f': 0.11494087063455671, 'p': 0.4528629789022853, 'r': 0.06771457431457432}}
