In [6]:
!pip install transformers



In [7]:
import pandas as pd
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import numpy as np
import random
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2Tokenizer, GPT2LMHeadModel, AdamW, get_linear_schedule_with_warmup
from tqdm import tqdm, trange
import torch.nn.functional as F
import csv

comments = pd.read_csv('/content/drive/MyDrive/ASRCML/comments.csv')
code = pd.read_csv('/content/drive/MyDrive/ASRCML/code.csv')

#df = comments.merge(code, how="inner")

#Create a very small test set to compare generated text with the reality
#test_set = df.sample(n=200)
#df = df.loc[~df.index.isin(test_set.index)]

#Reset the indexes
#test_set = test_set.reset_index()
#df = df.reset_index()

df = comments
print(df)
test_set = df.sample(n=200)
df = df.loc[~df.index.isin(test_set.index)]

#Reset the indexes
test_set = test_set.reset_index()
df = df.reset_index()

print(test_set)
#print(test_set['Comments'][0])

class Comments(Dataset):  
    def __init__(self, control_code, truncate=False, gpt2_type="gpt2", max_length=1024):

        self.tokenizer = GPT2Tokenizer.from_pretrained(gpt2_type)
        self.comments = []

        for row in df['Comments']:
          self.comments.append(torch.tensor(
                self.tokenizer.encode(f"<|{control_code}|>{row[:max_length]}<|endoftext|>")
            ))               
        if truncate:
            self.comments = self.comments[:20000]
        self.comments_count = len(self.comments)
        
    def __len__(self):
        return self.comments_count

    def __getitem__(self, item):
        return self.comments[item]
    


#ready the dataset
#dataset = SongLyrics(df['Lyric'], truncate=True, gpt2_type="gpt2") 

#Get the tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')

#Accumulated batch size (since GPT2 is so big)
def pack_tensor(new_tensor, packed_tensor, max_seq_len):
    if packed_tensor is None:
        return new_tensor, True, None
    if new_tensor.size()[1] + packed_tensor.size()[1] > max_seq_len:
        return packed_tensor, False, new_tensor
    else:
        packed_tensor = torch.cat([new_tensor, packed_tensor[:, 1:]], dim=1)
        return packed_tensor, True, None

def train(
    dataset, model, tokenizer,
    batch_size=16, epochs=5, lr=2e-5,
    max_seq_len=400, warmup_steps=200,
    gpt2_type="gpt2", output_dir=".", output_prefix="wreckgar",
    test_mode=False,save_model_on_epoch=False,
):
    acc_steps = 100
    device=torch.device("cuda")
    model = model.cuda()
    model.train()

    optimizer = AdamW(model.parameters(), lr=lr)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=warmup_steps, num_training_steps=-1
    )

    train_dataloader = DataLoader(dataset, batch_size=1, shuffle=True)
    loss=0
    accumulating_batch_count = 0
    input_tensor = None

    for epoch in range(epochs):

        print(f"Training epoch {epoch}")
        print(loss)
        for idx, entry in tqdm(enumerate(train_dataloader)):
            (input_tensor, carry_on, remainder) = pack_tensor(entry, input_tensor, 768)

            if carry_on and idx != len(train_dataloader) - 1:
                continue

            input_tensor = input_tensor.to(device)
            outputs = model(input_tensor, labels=input_tensor)
            loss = outputs[0]
            loss.backward()

            if (accumulating_batch_count % batch_size) == 0:
                optimizer.step()
                scheduler.step()
                optimizer.zero_grad()
                model.zero_grad()

            accumulating_batch_count += 1
            input_tensor = None
        if save_model_on_epoch:
            torch.save(
                model.state_dict(),
                os.path.join(output_dir, f"{output_prefix}-{epoch}.pt"),
            )
    return model

                                                 Comments Unnamed: 1
0       /**     * Chu Li Mei Yi Tiao Ji Lu      * <b>Y...        NaN
1       /**     * Fen Ye Chu Li      * @param batchPar...        NaN
2       /* ---------------------- Ende der inneren Kla...        NaN
3                    /**         * Konstruktor         */        NaN
4       /**         * Diese Methode fugt ein neues Obj...        NaN
...                                                   ...        ...
547646                       /** * @return the regdate */        NaN
547647         /** * @param regdate the regdate to set */        NaN
547648                    /** * @return the grpRegDate */        NaN
547649   /** * @param grpRegDate the grpRegDate to set */        NaN
547650                        /** * @return the grpCnt */        NaN

[547651 rows x 2 columns]
      index  ...                                         Unnamed: 1
0    155948  ...                                                NaN
1     969

In [8]:
print(test_set['Comments'][0])

/**   * Registers a {@link Closeable} resource that should be closed after the suite   * completes.   *    * @return <code>resource</code> (for call chaining).   */


In [9]:
def generate(
    model,
    tokenizer,
    prompt,
    entry_count=10,
    entry_length=30, #maximum number of words
    top_p=0.8,
    temperature=1.,
):
    model.eval()
    generated_num = 0
    generated_list = []

    filter_value = -float("Inf")

    with torch.no_grad():

        for entry_idx in trange(entry_count):

            entry_finished = False
            generated = torch.tensor(tokenizer.encode(prompt)).unsqueeze(0)

            for i in range(entry_length):
                outputs = model(generated, labels=generated)
                loss, logits = outputs[:2]
                logits = logits[:, -1, :] / (temperature if temperature > 0 else 1.0)

                sorted_logits, sorted_indices = torch.sort(logits, descending=True)
                cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)

                sorted_indices_to_remove = cumulative_probs > top_p
                sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[
                    ..., :-1
                ].clone()
                sorted_indices_to_remove[..., 0] = 0

                indices_to_remove = sorted_indices[sorted_indices_to_remove]
                logits[:, indices_to_remove] = filter_value

                next_token = torch.multinomial(F.softmax(logits, dim=-1), num_samples=1)
                generated = torch.cat((generated, next_token), dim=1)

                if next_token in tokenizer.encode("<|endoftext|>"):
                    entry_finished = True

                if entry_finished:

                    generated_num = generated_num + 1

                    output_list = list(generated.squeeze().numpy())
                    output_text = tokenizer.decode(output_list)
                    generated_list.append(output_text)
                    break
            
            if not entry_finished:
              output_list = list(generated.squeeze().numpy())
              output_text = f"{tokenizer.decode(output_list)}<|endoftext|>" 
              generated_list.append(output_text)
                
    return generated_list

def text_generation(test_data):
  generated_comments = []
  for i in range(len(test_data)):
    x = generate(model.to('cpu'), tokenizer, test_data['Comments'][i], entry_count=1)
    generated_comments.append(x)
  return generated_comments

generated_comments = text_generation(test_set)

100%|██████████| 1/1 [00:15<00:00, 15.57s/it]
100%|██████████| 1/1 [00:06<00:00,  6.64s/it]
100%|██████████| 1/1 [00:07<00:00,  7.50s/it]
100%|██████████| 1/1 [00:17<00:00, 17.62s/it]
100%|██████████| 1/1 [00:05<00:00,  5.50s/it]
100%|██████████| 1/1 [00:08<00:00,  8.01s/it]
100%|██████████| 1/1 [00:05<00:00,  5.62s/it]
100%|██████████| 1/1 [00:09<00:00,  9.74s/it]
100%|██████████| 1/1 [00:25<00:00, 25.80s/it]
100%|██████████| 1/1 [00:13<00:00, 13.39s/it]
100%|██████████| 1/1 [00:06<00:00,  6.93s/it]
100%|██████████| 1/1 [00:21<00:00, 21.08s/it]
100%|██████████| 1/1 [00:22<00:00, 22.23s/it]
100%|██████████| 1/1 [00:05<00:00,  5.63s/it]
100%|██████████| 1/1 [00:11<00:00, 11.08s/it]
100%|██████████| 1/1 [00:18<00:00, 18.50s/it]
100%|██████████| 1/1 [00:26<00:00, 26.41s/it]
100%|██████████| 1/1 [00:06<00:00,  6.22s/it]
100%|██████████| 1/1 [00:10<00:00, 10.49s/it]
100%|██████████| 1/1 [00:10<00:00, 10.85s/it]
100%|██████████| 1/1 [00:06<00:00,  6.48s/it]
100%|██████████| 1/1 [00:31<00:00,

In [10]:
#Loop to keep only generated text and add it as a new column in the dataframe
my_generations=[]

for i in range(len(generated_comments)):
  a = test_set['Comments'][i].split()[-30:] #Get the matching string we want (30 words)
  b = ' '.join(a)
  c = ' '.join(generated_comments[i]) #Get all that comes after the matching string
  my_generations.append(c.split(b)[-1])

test_set['Generated_comments'] = my_generations


#Finish the sentences when there is a point, remove after that
final=[]

for i in range(len(test_set)):
  to_remove = test_set['Generated_comments'][i].split('.')[-1]
  final.append(test_set['Generated_comments'][i].replace(to_remove,''))

test_set['Generated_comments'] = final
test_set.head()

Unnamed: 0,index,Comments,Unnamed: 1,Generated_comments
0,155948,/** * Registers a {@link Closeable} resource...,,/** * Registers a {@link Closeable} resource...
1,96982,/** * Create the frame. * Don't touch the code...,,/** * Create the frame.
2,130308,/** * Return the information of the curren...,,/** * Return the information of the curren...
3,429966,/** * This method was generated by Apache iBAT...,,/** * This method was generated by Apache iBAT...
4,196813,/** * enum please.. */,,"\n\nreturn valid_types ( this, _this, get_vali..."


In [18]:
print(test_set['Comments'][10])
print(test_set['Generated_comments'][10])


/** Returns an upper bound for the utility of refinements for the given hypothesis. */
 public function validateInput(Pipeline params) { if(params.length > 2) { return params.


 public function validateInput(Pipeline params) { if(params.length > 2) { return params.

In [12]:
#Using BLEU score to compare the real sentences with the generated ones
import statistics
from nltk.translate.bleu_score import sentence_bleu

scores=[]

for i in range(len(test_set)):
  reference = test_set['Comments'][i]
  candidate = test_set['Generated_comments'][i]
  scores.append(sentence_bleu(reference, candidate))

statistics.mean(scores)

Corpus/Sentence contains 0 counts of 2-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().


0.4979932979543359

In [20]:
print(test_set['Comments'])
print(test_set['Generated_comments'])

0      /**   * Registers a {@link Closeable} resource...
1      /** * Create the frame. * Don't touch the code...
2      /**     * Return the information of the curren...
3      /** * This method was generated by Apache iBAT...
4                                 /** * enum please.. */
                             ...                        
195    /**     * This does BlueZ version detection   ...
196    /**     * Validates <a href="https://issues.al...
197    /**     * @see org.alfresco.repo.dictionary.Di...
198    /**     * This method will be called whenever ...
199    /**     * Triggers the request for a new api c...
Name: Comments, Length: 200, dtype: object
0      /**   * Registers a {@link Closeable} resource...
1                                /** * Create the frame.
2      /**     * Return the information of the curren...
3      /** * This method was generated by Apache iBAT...
4      \n\nreturn valid_types ( this, _this, get_vali...
                             ...             