In [None]:
!pip install transformers

In [None]:
import json
from sklearn.model_selection import train_test_split

# The review file is JSON (UTF-8 by specification), so the encoding is stated
# explicitly instead of relying on the platform's default.
with open('/content/drive/MyDrive/ResearchProject/083/reviews_083_training.json', encoding='utf-8') as f: #specify training data path
    data = json.load(f)

# Hold out 15% of the reviews for evaluation. A fixed random_state makes the
# split identical across kernel restarts, so reruns are comparable.
train, test = train_test_split(data, test_size=0.15, random_state=42)

def build_text_files_no_class(data_json, dest_path):
    """Write one training line per review, tagging every aspect as ``<newaspect>``.

    Each line has the form
    ``<startoftext><reviewtext>TEXT<aspects><newaspect>A1<newaspect>A2...<endoftext>``
    where the aspects are the review's pros followed by its cons.

    Args:
        data_json: iterable of review dicts with the keys ``"text"``,
            ``"pros"`` and ``"cons"``.
        dest_path: path of the text file to create (overwritten if present).
    """
    lines = []
    for review in data_json:
        aspects = "".join(f'<newaspect>{pro}' for pro in review["pros"])
        aspects += "".join(f'<newaspect>{con}' for con in review["cons"])
        lines.append(f'<startoftext><reviewtext>{review["text"]}<aspects>{aspects}<endoftext>\n')
    # The context manager guarantees the file is flushed and closed; UTF-8 is
    # given explicitly so non-ASCII review text does not depend on the locale.
    with open(dest_path, 'w', encoding='utf-8') as f:
        f.write("".join(lines))

def build_text_files_with_class(data_json, dest_path):
    """Write one training line per review, tagging aspects as ``<pro>`` or ``<con>``.

    Each line has the form
    ``<startoftext><reviewtext>TEXT<aspects><pro>P1...<con>C1...<endoftext>``
    with all pros listed before all cons.

    Args:
        data_json: iterable of review dicts with the keys ``"text"``,
            ``"pros"`` and ``"cons"``.
        dest_path: path of the text file to create (overwritten if present).
    """
    lines = []
    for review in data_json:
        aspects = "".join(f'<pro>{pro}' for pro in review["pros"])
        aspects += "".join(f'<con>{con}' for con in review["cons"])
        lines.append(f'<startoftext><reviewtext>{review["text"]}<aspects>{aspects}<endoftext>\n')
    # The context manager guarantees the file is flushed and closed; UTF-8 is
    # given explicitly so non-ASCII review text does not depend on the locale.
    with open(dest_path, 'w', encoding='utf-8') as f:
        f.write("".join(lines))


# Write the <pro>/<con>-tagged variant of both splits to disk.
build_text_files_with_class(data_json=train, dest_path='train_dataset.txt')
build_text_files_with_class(data_json=test, dest_path='test_dataset.txt')

# Report how many reviews ended up in each split.
print(f"Train dataset length: {len(train)}")
print(f"Test dataset length: {len(test)}")


In [None]:
from transformers import AutoTokenizer

# Text files holding the formatted training and evaluation reviews.
train_path, test_path = 'train_dataset.txt', 'test_dataset.txt'

# Tokenizer belonging to the GroNLP/gpt2-small-dutch checkpoint.
tokenizer = AutoTokenizer.from_pretrained("GroNLP/gpt2-small-dutch")

In [None]:
from transformers import TextDataset,DataCollatorForLanguageModeling

def load_dataset(train_path,test_path,tokenizer,block_size=128):
    """Build the train/test datasets and the data collator used for fine-tuning.

    Args:
        train_path: path of the text file holding the training examples.
        test_path: path of the text file holding the evaluation examples.
        tokenizer: tokenizer handed to both datasets and to the collator.
        block_size: ``block_size`` forwarded to each ``TextDataset``. Defaults
            to 128, the value that used to be hard-coded.

    Returns:
        A ``(train_dataset, test_dataset, data_collator)`` tuple. The collator
        is created with ``mlm=False``.
    """
    def make_dataset(file_path):
        # Both splits must be built with identical settings.
        return TextDataset(
            tokenizer=tokenizer,
            file_path=file_path,
            block_size=block_size)

    train_dataset = make_dataset(train_path)
    test_dataset = make_dataset(test_path)

    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer, mlm=False,
    )
    return train_dataset,test_dataset,data_collator

# Build the datasets and the collator from the files written earlier.
train_dataset, test_dataset, data_collator = load_dataset(
    train_path=train_path,
    test_path=test_path,
    tokenizer=tokenizer,
)

In [None]:
from transformers import Trainer, TrainingArguments,AutoModelWithLMHead

# AutoModelWithLMHead is deprecated in transformers; AutoModelForCausalLM is the
# dedicated auto-class for left-to-right (GPT-2 style) language models.
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained("GroNLP/gpt2-small-dutch")


# Hyper-parameters for the fine-tuning run, grouped by concern.
training_args = TrainingArguments(
    # -- output --
    output_dir="./gpt2-with-classification",  # directory that receives the run's output
    overwrite_output_dir=True,                # overwrite the content of that directory
    save_steps=2000000,                       # number of steps after which the model is saved
    # -- optimisation --
    num_train_epochs=3,                       # number of training epochs
    per_device_train_batch_size=32,           # batch size for training
    warmup_steps=250,                         # warmup steps for the learning-rate scheduler
    # -- evaluation --
    per_device_eval_batch_size=64,            # batch size for evaluation
    eval_steps=200,                           # update steps between two evaluations
    # NOTE(review): no evaluation strategy is set here; confirm that eval_steps
    # alone triggers periodic evaluation in the installed transformers version.
    prediction_loss_only=True,
)


# Bundle the model, the hyper-parameters and the data into one Trainer.
trainer = Trainer(
    args=training_args,
    model=model,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    data_collator=data_collator,
)



Downloading:   0%|          | 0.00/457M [00:00<?, ?B/s]

In [None]:
from google.colab import files

# Run the fine-tuning loop with the Trainer configured in the previous cell.
trainer.train()
# No path is passed; presumably the model is written to the Trainer's
# configured output_dir — confirm before relying on the location.
trainer.save_model()

In [None]:
from transformers import pipeline

# Text-generation pipeline over the model saved in /content/gpt2-with-classification,
# paired with the tokenizer of the GroNLP/gpt2-small-dutch checkpoint.
pipe = pipeline(
    task='text-generation',
    model='/content/gpt2-with-classification',
    tokenizer="GroNLP/gpt2-small-dutch",
)

In [None]:
# Manual check: the prompt stops right after <aspects>, so the continuation
# shown below is what the model generates as aspects for this review.
pipe("<startoftext><reviewtext>Geweldige wasmachine, draait hard. Maar wel luidruchtig. Ik vind het lawaai erg vervelend. De was wordt wel goed schoon. zeer mooie kleur<aspects>")

[{'generated_text': '<startoftext><reviewtext>Geweldige wasmachine, draait hard. Maar wel luidruchtig. Ik vind het lawaai erg vervelend. De was wordt wel goed schoon. zeer mooie kleur<aspects><pro>lekkere koffie<endoftext>\nDe stofzuiger heeft 2 opzetstukken waardoor hij heel makkelijk in gebruik is. Het enige nadeel dat je niet hoeft te kiezen tussen verschillende opzetstukken zou kunnen zijn met de kleine zuigmond maar die kun je gemakkelijk vervangen door een'}]

In [None]:
# Manual check with a short review that contains both a positive and a negative remark.
pipe("<startoftext><reviewtext>De stoel zit lekker, maar ziet er niet uit.<aspects>")

[{'generated_text': '<startoftext><reviewtext>De stoel zit lekker, maar ziet er niet uit.<aspects><con>doet wat het moet doen<endoftext>\nHet apparaat is eenvoudig in te stellen via de app en kan op je telefoon worden opgeladen. De batterijduur van dit apparaat is lang genoeg dat hij wel eens 2 uur aan gaat zonder problemen. Het enige minpunt waar ik erg mee over nagedacht ben zijn de reinigingscancelling functies'}]

In [None]:
# Manual check with another short mixed review; the prompt again ends at <aspects>.
pipe("<startoftext><reviewtext>Mooi design maar de software is slecht<aspects>")

[{'generated_text': "<startoftext><reviewtext>Mooi design maar de software is slecht<aspects><con>Slechte kwaliteit<endoftext>\n<startafleveringtext>Ik heb deze stofzuiger nu enkele weken in huis en ben zeer tevreden met mijn aankoop. Ik had verwacht dat ik dit product zou kopen, omdat het apparaat zo'n goed zuigkracht maakt (en daardoor veel te weinig geluid geeft). Maar na 2 maanden gebruik vond ik er wel iets minder kracht aan"}]

In [None]:
# Download the model: zip the saved model directory, then send the archive to
# the browser through the Colab `files` helper.
!zip -r /content/gpt2-with-classification.zip /content/gpt2-with-classification
files.download('/content/gpt2-with-classification.zip')

# Load existing model

In [None]:
from transformers import AutoModel
# AutoModel would load only the bare transformer, without the language-modelling
# head; a fine-tuned generator has to be restored through the causal-LM
# auto-class so that the head weights are loaded too.
from transformers import AutoModelForCausalLM
model = AutoModelForCausalLM.from_pretrained("/content/model")

In [None]:
from transformers import AutoTokenizer
# Tokenizer of the GroNLP/gpt2-small-dutch checkpoint; the evaluation loop
# further down uses it to count the tokens of each review text.
tokenizer = AutoTokenizer.from_pretrained("GroNLP/gpt2-small-dutch")

In [None]:
from transformers import pipeline

# Text-generation pipeline over the model directory stored on Google Drive,
# paired with the tokenizer of the GroNLP/gpt2-small-dutch checkpoint.
pipe = pipeline(
    task='text-generation',
    model='/content/drive/MyDrive/ResearchProject/083/GPT2_with_class/model',
    tokenizer="GroNLP/gpt2-small-dutch",
)

# Evaluate model

In [None]:
import json

# JSON is UTF-8 by specification, so the encoding is stated explicitly rather
# than left to the platform's default.
with open('/content/drive/MyDrive/ResearchProject/083/reviews_083_testing.json', encoding='utf-8') as f: #specify path to testing data
    test_data = json.load(f)

In [None]:
import json

import os

# File that receives the generated aspects; it also serves as the checkpoint
# that lets an interrupted run resume.
outpath = '/content/drive/MyDrive/ResearchProject/083/GPT2_with_class/GPT2_with_class_083_result.json'

# Resume from a previous (partial) run when its result file exists. On the
# first run there is no such file yet, so the `test_data` loaded from the
# testing set is kept instead of failing with FileNotFoundError.
if os.path.exists(outpath):
  with open(outpath, encoding='utf-8') as f:
    test_data = json.load(f)

def check_count():
  """Report progress and checkpoint the results every 100th review.

  Reads the module-level ``count``, ``total_num_reviews``, ``outpath`` and
  ``test_data``. Does nothing unless ``count`` is a multiple of 100; otherwise
  prints a progress line and rewrites ``outpath`` with ``test_data`` as JSON.
  """
  if count % 100 != 0:
    return
  print(f"processed {count} reviews out of {total_num_reviews}")
  with open(outpath, 'w') as checkpoint_file:
    json.dump(test_data, checkpoint_file)

def _parse_generated_aspects(completion):
  """Extract the pros and cons from the text generated after ``<aspects>``.

  Args:
    completion: the generated text with the prompt already removed.

  Returns:
    ``(pros, cons)`` — two lists of strings — or ``None`` when the completion
    is unusable: it contains no ``<endoftext>`` marker, or it starts with one.
  """
  if completion.startswith("<endoftext>") or "<endoftext>" not in completion:
    return None
  # Only the part before the first end marker belongs to this review.
  completion = completion.split("<endoftext>")[0]

  pros, cons = [], []
  for chunk in completion.split("<"):
    parts = chunk.split(">")
    # A well-formed chunk is exactly "tag>value". Compared with != because
    # `is not` tests identity, which is not a valid way to compare integers.
    if len(parts) != 2:
      continue
    tag, value = parts
    if tag == "pro":
      pros.append(value)
    elif tag == "con":
      cons.append(value)
  return pros, cons


# Number of reviews handled so far; check_count() reads these module-level
# names to report progress and to checkpoint periodically.
count = 0
total_num_reviews = len(test_data)

for review in test_data:
  # Reviews that already carry generated aspects (resumed run) are not regenerated.
  already_done = review.get("generated_pros") or review.get("generated_cons")
  if not already_done:
    review["generated_pros"] = []
    review["generated_cons"] = []

    prompt = "<startoftext><reviewtext>" + review["text"] + "<aspects>"
    # Token count of the review text plus 15 + 15 extra tokens — presumably
    # headroom for the prompt tags and for the generated aspects (confirm).
    max_length = len(tokenizer(review["text"])["input_ids"]) + 15 + 15
    generated = pipe(prompt, max_length = max_length)
    generated = generated[0]["generated_text"]
    # Drop the echoed prompt so only the continuation is parsed.
    generated = generated.replace(prompt, "")

    parsed = _parse_generated_aspects(generated)
    if parsed is not None:
      review["generated_pros"], review["generated_cons"] = parsed

  # Every review counts as processed, including those whose generation was
  # unusable; previously those were skipped, so progress was under-reported
  # and the periodic checkpoint drifted.
  count += 1
  check_count()


# Final save: persist the results for every review, not only up to the last
# multiple-of-100 checkpoint.
with open(outpath, 'w') as outfile:
  json.dump(test_data, outfile)

# Imported here because the only other import of `files` sits in the training
# cell; running just the load + evaluate sections in a fresh kernel would
# otherwise end in a NameError after the whole loop has completed.
from google.colab import files

files.download(outpath)