In [None]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.16.1-py3-none-any.whl (3.5 MB)
[K     |████████████████████████████████| 3.5 MB 7.9 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 55.2 MB/s 
Collecting tokenizers!=0.11.3,>=0.10.1
  Downloading tokenizers-0.11.4-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.8 MB)
[K     |████████████████████████████████| 6.8 MB 37.0 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.47-py2.py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 68.9 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.4.0-py3-none-any.whl (67 kB)
[K     |████████████████████████████████| 67 kB 6.6 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Foun

In [None]:
import pandas as pd

# Read the cleaned tweets, then split the frame by the value of the `sarcastic` column
# (1 -> sarcastic, 0 -> non_sarcastic).
df = pd.read_csv('/content/cleaned_dataset.csv')
sarcastic, non_sarcastic = (
    df.loc[df['sarcastic'].eq(flag)] for flag in (1, 0)
)

In [None]:
sarcastic.head()

Unnamed: 0,tweet,sarcastic
0,the only thing i get from college be a caffein...,1
1,i love it when professor draw a big question m...,1
2,remember the hundred email from company when c...,1
3,today my pop pop tell me i be not force to go ...,1
4,i do too and i also report cancun cruz not wor...,1


In [None]:
non_sarcastic.head()

Unnamed: 0,tweet,sarcastic
867,i always think go braless be a good idea until...,0
868,life be so much good with a heating blanket,0
869,sometimes i just go through my phone and look ...,0
870,be not back in the state for even 5 minute bef...,0
871,in desperate need of and i can not stress this...,0


In [None]:
from torch.utils.data import Dataset, DataLoader

class MyDataset(Dataset):
  """Dataset of text samples, each terminated by the GPT-2 end-of-text marker.

  Args:
    df: DataFrame holding the raw text.
    text_column: Which column holds the text. An ``int`` is a position
      (default ``0`` = first column, as before); any other value is used
      as a column label.

  Each item is the string ``"<text> <|endoftext|> "``.
  """

  def __init__(self, df, text_column=0):
    super().__init__()

    self.end_of_text_token = " <|endoftext|> "

    # Select the column explicitly instead of indexing each row with `row[0]`:
    # on a string-labelled row that relied on the positional fallback of
    # Series.__getitem__, which pandas has deprecated.
    if isinstance(text_column, int):
      texts = df.iloc[:, text_column]
    else:
      texts = df[text_column]

    self.data_list = [f"{text}{self.end_of_text_token}" for text in texts]

  def __len__(self):
    """Number of text samples."""
    return len(self.data_list)

  def __getitem__(self, item):
    """Return the sample at position ``item`` (text plus end-of-text marker)."""
    return self.data_list[item]


# One dataset per class, each wrapped in a loader that yields one shuffled sample per step.
dataset_sarcastic, dataset_non_sarcastic = (
    MyDataset(frame) for frame in (sarcastic, non_sarcastic)
)
data_loader_sarcastic, data_loader_non_sarcastic = (
    DataLoader(samples, batch_size=1, shuffle=True)
    for samples in (dataset_sarcastic, dataset_non_sarcastic)
)

In [None]:
import torch

# Use the GPU when CUDA is available, otherwise stay on the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)

cuda


In [None]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel

# Tokenizer and language-model head are both loaded from the same 'gpt2-medium' checkpoint.
tokenizer, model = (
    loader.from_pretrained('gpt2-medium')
    for loader in (GPT2Tokenizer, GPT2LMHeadModel)
)

In [None]:
import gc

# Run a Python garbage-collection pass first so unreachable objects are released ...
gc.collect()

# ... then ask PyTorch to release its cached CUDA memory. This call stays last so the
# cell produces no output (it returns None, whereas gc.collect() returns a count).
torch.cuda.empty_cache()

In [None]:
# Move the model to the device selected earlier ('cuda' or 'cpu').
model = model.to(device)

In [None]:
def train(epochs, data_loader, batch_size, tokenizer, model, device, optimizer=None, scheduler=None):
  """Fine-tune a causal language model with gradient accumulation.

  Args:
    epochs: Number of passes over ``data_loader``.
    data_loader: Iterable whose items are indexable, with the raw text at
      position 0 (a ``DataLoader`` with ``batch_size=1`` over strings).
    batch_size: Number of samples whose gradients are accumulated before one
      optimizer update. Gradients are summed, not averaged.
    tokenizer: Object exposing ``encode(str) -> list[int]``.
    model: Module called as ``model(ids, labels=ids)`` whose first output is
      the loss.
    device: Device the token tensor is moved to.
    optimizer: Optimizer to step. When omitted, the module-level name
      ``optimizer`` is used (the previous implicit behaviour).
    scheduler: Optional LR scheduler stepped after every optimizer update.
      When omitted, the module-level name ``scheduler`` is used if it exists.

  Returns:
    The same ``model`` object, left in training mode.

  Raises:
    NameError: If no optimizer is passed and none is defined at module level.
  """
  # Keep old call sites working: they relied on notebook-level globals.
  if optimizer is None:
    optimizer = globals().get('optimizer')
    if optimizer is None:
      raise NameError("name 'optimizer' is not defined; pass optimizer=... to train()")
  if scheduler is None:
    scheduler = globals().get('scheduler')

  batch_counter = 0
  sum_loss = 0.0

  def apply_update():
    """Apply the accumulated gradients and report the loss every 10 updates."""
    nonlocal batch_counter, sum_loss
    optimizer.step()
    if scheduler is not None:
      scheduler.step()
    optimizer.zero_grad()

    batch_counter += 1
    if batch_counter == 10:
      print(f"Total Loss is {sum_loss}")
      batch_counter = 0
      sum_loss = 0.0

  model.train()
  # Start from clean gradients so nothing left over from earlier cells leaks in.
  optimizer.zero_grad()

  for epoch in range(epochs):
    print (f'Running {epoch+1} epoch')

    pending = 0  # samples back-propagated since the last optimizer update
    for txt in data_loader:
      input_ids = torch.tensor(tokenizer.encode(txt[0])).unsqueeze(0).to(device)
      outputs = model(input_ids, labels=input_ids)
      loss = outputs[0]
      loss.backward()
      # .item() keeps a plain float instead of holding on to a device tensor.
      sum_loss += loss.item()
      pending += 1

      # Update only after a full group of `batch_size` samples. The old test
      # `idx % batch_size == 0` fired right after the very first sample.
      if pending >= batch_size:
        apply_update()
        pending = 0

    # Apply the gradients of a trailing, incomplete group so no sample is
    # dropped or carried over into the next epoch.
    if pending:
      apply_update()

  return model

In [None]:
from transformers import AdamW, get_cosine_with_hard_restarts_schedule_with_warmup

# The schedule has to know how many optimizer updates will happen. With
# num_training_steps=-1 the post-warm-up progress is already >= 1 at the first step
# after warm-up, so the LR multiplier drops to 0 and training silently stops.
# These two values must mirror the train(4, data_loader_sarcastic, 8, ...) call.
EPOCHS = 4
ACCUMULATION_STEPS = 8
updates_per_epoch = (len(data_loader_sarcastic) + ACCUMULATION_STEPS - 1) // ACCUMULATION_STEPS

model.train()
# NOTE(review): lr=1e-3 looks high for fine-tuning a pretrained GPT-2; kept as-is, confirm.
optimizer = AdamW(model.parameters(), lr=1e-3)
scheduler = get_cosine_with_hard_restarts_schedule_with_warmup(
    optimizer,
    num_warmup_steps=50,
    num_training_steps=EPOCHS * updates_per_epoch,
)



In [None]:
# Fine-tune on the sarcastic tweets for 4 epochs; batch_size=8 sets how often the
# optimizer is stepped inside train().
model = train(
    epochs=4,
    data_loader=data_loader_sarcastic,
    batch_size=8,
    tokenizer=tokenizer,
    model=model,
    device=device,
)

Running 1 epoch
Total Loss is 400.5715637207031
Total Loss is 379.5576477050781
Total Loss is 374.8067321777344
Total Loss is 368.4908142089844
Total Loss is 396.3864440917969
Total Loss is 393.97509765625
Total Loss is 396.3054504394531
Total Loss is 385.0041198730469
Total Loss is 394.3728942871094
Total Loss is 401.3706970214844
Running 2 epoch
Total Loss is 377.47705078125
Total Loss is 342.4095458984375
Total Loss is 332.7271423339844
Total Loss is 329.0473327636719
Total Loss is 346.2327880859375
Total Loss is 336.55633544921875
Total Loss is 337.2743225097656
Total Loss is 341.426513671875
Total Loss is 334.7634582519531
Total Loss is 325.00323486328125
Total Loss is 344.3791198730469
Running 3 epoch
Total Loss is 308.1657409667969
Total Loss is 325.9278564453125
Total Loss is 318.2651672363281
Total Loss is 325.0956726074219
Total Loss is 337.4659118652344
Total Loss is 341.8387145996094
Total Loss is 339.9996643066406
Total Loss is 359.1162109375
Total Loss is 335.83154296875


In [None]:
def save_model(model, name, directory="/content"):
  """Write ``model.state_dict()`` to ``<directory>/<name>.pt``.

  Args:
    model: Module whose ``state_dict()`` is saved.
    name: File name without the ``.pt`` extension.
    directory: Target folder; created if missing. Defaults to ``/content``,
      the location that used to be hard-coded.

  Returns:
    None.
  """
  import os

  os.makedirs(directory, exist_ok=True)
  torch.save(model.state_dict(), os.path.join(directory, f"{name}.pt"))

# Writes the fine-tuned weights to /content/sarcastic.pt.
# NOTE(review): the loading cell further down reads
# /content/drive/MyDrive/GPT2Sarcasm/sarcastic.pt, so the file presumably has to be
# copied to Drive by hand — confirm.
save_model(model, 'sarcastic')

In [None]:
import numpy as np

def choose_from_top_k_top_n(probs, k=50, p=0.8):
  """Sample one index with combined top-k and nucleus (top-p) filtering.

  The ``k`` most probable entries are kept and ordered by decreasing
  probability. The shortest prefix whose cumulative probability reaches ``p``
  forms the candidate set (all ``k`` entries if ``p`` is never reached). One
  index is then drawn from that set with renormalised probabilities using
  NumPy's global random state.

  Args:
    probs: 1-D array-like of probabilities.
    k: Maximum number of candidates; clamped to ``[1, len(probs)]`` so a small
      vocabulary no longer raises inside ``np.argpartition``.
    p: Cumulative-probability threshold for the nucleus.

  Returns:
    The sampled index as a plain ``int``.

  Raises:
    ValueError: If ``probs`` is empty.
  """
  probs = np.asarray(probs, dtype=np.float64).ravel()
  if probs.size == 0:
    raise ValueError("probs must contain at least one entry")

  k = max(1, min(int(k), probs.size))

  # Top-k candidates, most probable first.
  top_k = np.argpartition(probs, -k)[-k:]
  top_k = top_k[np.argsort(probs[top_k])[::-1]]

  # The first position where the running total reaches p is still included.
  cumulative = np.cumsum(probs[top_k])
  cutoff = int(np.searchsorted(cumulative, p, side="left")) + 1
  nucleus = top_k[:cutoff]

  weights = probs[nucleus]
  weights = weights / weights.sum()

  # Draw a scalar; int() on a 1-element array is deprecated in recent NumPy.
  return int(np.random.choice(nucleus, p=weights))

In [None]:
import gc
import torch

# Run a Python garbage-collection pass first so unreachable objects are released ...
gc.collect()

# ... then ask PyTorch to release its cached CUDA memory. This call stays last so the
# cell produces no output (it returns None, whereas gc.collect() returns a count).
torch.cuda.empty_cache()

In [None]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel

def _load_finetuned_gpt2(state_dict_path, device, base_checkpoint='gpt2-medium'):
  """Build a tokenizer/model pair from a base checkpoint and load fine-tuned weights.

  Args:
    state_dict_path: Path of a ``state_dict`` file written with ``torch.save``.
    device: Device the model is moved to after the weights are loaded.
    base_checkpoint: Pretrained checkpoint name providing the architecture and tokenizer.

  Returns:
    ``(tokenizer, model)`` with the model on ``device`` in eval mode.
  """
  tokenizer = GPT2Tokenizer.from_pretrained(base_checkpoint)
  model = GPT2LMHeadModel.from_pretrained(base_checkpoint)
  # map_location='cpu' lets a checkpoint saved from a GPU load on a CPU-only
  # runtime; the model is moved to the target device afterwards.
  model.load_state_dict(torch.load(state_dict_path, map_location='cpu'))
  model.eval()
  return tokenizer, model.to(device)


model_sarcastic_path = "/content/drive/MyDrive/GPT2Sarcasm/sarcastic.pt"
model_non_sarcastic_path = "/content/drive/MyDrive/GPT2Sarcasm/non-sarcastic.pt"

tokenizer_sarcastic, model_sarcastic = _load_finetuned_gpt2(model_sarcastic_path, device)
tokenizer_non_sarcastic, model_non_sarcastic = _load_finetuned_gpt2(model_non_sarcastic_path, device)

In [None]:
from tqdm import tqdm

def generate(tokenizer, model, sentences, label, max_new_tokens=100):
  """Sample ``sentences`` texts from ``model``, each seeded with ``label``.

  Every text starts from the encoded ``label`` and grows one token at a time
  (sampled by ``choose_from_top_k_top_n``) until an end-of-text token is drawn
  or ``max_new_tokens`` tokens have been added.

  Args:
    tokenizer: Tokenizer exposing ``encode`` and ``decode``.
    model: Language model whose first output, when called without labels, is
      the logits tensor. It is used in whatever mode it is in; call
      ``model.eval()`` beforehand if it was just trained.
    sentences: Number of texts to generate.
    label: Prompt string that every text starts with.
    max_new_tokens: Upper bound on sampled tokens per text (default 100, the
      previously hard-coded value).

  Returns:
    List of decoded strings (prompt and end-of-text marker included).
  """
  # Use the model's own device instead of relying on a notebook global.
  model_device = next(model.parameters()).device
  # Both encodings are constant, so compute them once instead of per step.
  prompt_ids = tokenizer.encode(label)
  stop_ids = set(tokenizer.encode('<|endoftext|>'))

  result = []
  with torch.no_grad():
    for _ in tqdm(range(sentences)):
      cur_ids = torch.tensor(prompt_ids).unsqueeze(0).to(model_device)

      for _step in range(max_new_tokens):
        # No labels are passed: the loss was computed every step and never used.
        logits = model(cur_ids)[0]
        next_probs = torch.softmax(logits[0, -1], dim=0)

        next_token_id = choose_from_top_k_top_n(next_probs.cpu().numpy())
        next_token = torch.tensor([[next_token_id]], dtype=cur_ids.dtype, device=model_device)
        cur_ids = torch.cat([cur_ids, next_token], dim=1)

        if next_token_id in stop_ids:
          break

      # Finished and truncated texts are decoded the same way.
      result.append(tokenizer.decode(cur_ids[0].tolist()))
  return result

In [None]:
# Sample 4000 texts from the sarcastic model, each seeded with the prompt 'SAR'.
SAR = generate(
    tokenizer=tokenizer_sarcastic,
    model=model_sarcastic,
    sentences=4000,
    label='SAR',
)

100%|██████████| 4000/4000 [17:34<00:00,  3.79it/s]


In [None]:
# One generated text per line, with the 'SAR' prompt, the end-of-text marker and
# newlines removed and commas replaced by spaces. `with` closes the file even if a
# write fails; the explicit encoding avoids depending on the platform default.
with open('/content/SAR.txt', 'w', encoding='utf-8') as f:
  for l in SAR:
    f.write(l.replace('SAR', '').replace('<|endoftext|>', '').replace("\n", "").replace(",", " "))
    f.write("\n")

In [None]:
# Sample 4000 texts from the non-sarcastic model, each seeded with the prompt 'NON'.
NON = generate(
    tokenizer=tokenizer_non_sarcastic,
    model=model_non_sarcastic,
    sentences=4000,
    label='NON',
)

100%|██████████| 4000/4000 [28:44<00:00,  2.32it/s]


In [None]:
NON[0:5]

['NON mean there be not an ever a good idea not try all your food by hand i want my entire food turn out to be cooked on the <|endoftext|>',
 'NON <|endoftext|>',
 'NON be the same reason that it be night again im not get to think that it wasnt get to do and then im not get to think that it wasnt get to do it yet <|endoftext|>',
 'NON my body to show on the air in the summer please you want to see the body again <|endoftext|>',
 'NON wanna move to something really sad like you dont wanna move it like you want to move to be sad and sad people <|endoftext|>']

In [None]:
# One generated text per line, with the 'NON' prompt, the end-of-text marker and
# newlines removed and commas replaced by spaces. `with` closes the file even if a
# write fails; the explicit encoding avoids depending on the platform default.
with open('/content/NON.txt', 'w', encoding='utf-8') as f:
  for l in NON:
    f.write(l.replace('NON', '').replace('<|endoftext|>', '').replace("\n", "").replace(",", " "))
    f.write("\n")

In [None]:
def _clean_generated(text, marker):
  """Strip the prompt marker, the end-of-text token and newlines from one generated text.

  Returns the cleaned tweet with a single leading space removed, or ``None``
  when nothing but an empty string or a lone space is left.
  """
  tweet = text.replace(marker, '').replace('<|endoftext|>', '').replace("\n", "")
  if tweet in ("", " "):
    return None
  return tweet[1:] if tweet[0] == ' ' else tweet


data = { "tweet": [] , "label": [] }

# Non-sarcastic texts get label 0, sarcastic texts label 1 (same order as before).
# The former `"\n" in tweet` check was dropped: newlines are already removed above.
for generated, marker, target in ((NON, 'NON', 0), (SAR, 'SAR', 1)):
  for raw_text in generated:
    tweet = _clean_generated(raw_text, marker)
    if tweet is None:
      continue
    data["tweet"].append(tweet)
    data["label"].append(target)

print(len(data["tweet"]))

7186


In [None]:
import pandas as pd

# Turn the {"tweet": [...], "label": [...]} dict into a two-column frame.
# Note: this rebinds `df`, which earlier held the frame read from cleaned_dataset.csv.
df = pd.DataFrame(data)

In [None]:
# Shuffle the rows (the labels were appended in two contiguous runs) and write the
# result without the index. A fixed random_state makes the row order reproducible.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)
df.to_csv("/content/gpt.csv", index=False)