In [1]:
#!pip install transformers


In [2]:
import pandas as pd
from bs4 import BeautifulSoup
import re
from transformers import GPT2Tokenizer, GPT2LMHeadModel, GPT2Config, TextDataset, DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments
from nltk.sentiment import SentimentIntensityAnalyzer
from textblob import TextBlob
import spacy
from spacy import displacy


sia = SentimentIntensityAnalyzer()
nlp = spacy.load("en_core_web_sm")

# Load the data
df = pd.read_csv('X:/home/comp creativity/AITA generator/aita_clean.csv', low_memory=True)
df = df.dropna()
# Filter the data for each class
class_0 = df[df['is_asshole'] == 0]
class_1 = df[df['is_asshole'] == 1]
# Sample an equal number of instances from each class
sample_size = 500
class_0_sample = class_0.sample(sample_size, random_state=42)
class_1_sample = class_1.sample(sample_size, random_state=42)
# Combine the sampled data and shuffle it
df = pd.concat([class_0_sample, class_1_sample]).sample(frac=1, random_state=42).reset_index(drop=True)
# Preprocess the data
def remove_tags(text):
    return BeautifulSoup(text, 'html.parser').get_text()
def preprocess_text(text):
    # Lowercase
   # text = text.lower()
    # Remove HTML tags
    text = remove_tags(text)
    # Replace punctuations and numbers with a space (except for '.', ',', '!', '?')
    text = re.sub('[^a-zA-Z.,!?]', ' ', text)
    # Remove multiple spaces
    text = re.sub(r'\s+', ' ', text).strip()
    return text
# Preprocess titles and bodies using the preprocess_text function
preprocessed_titles = [preprocess_text(text) for text in df['title']]
preprocessed_bodies = [preprocess_text(text) for text in df['body']]





In [3]:
def extract_entities_and_pos(text):
    doc = nlp(text)
    named_entities = [(ent.text, ent.label_) for ent in doc.ents]
    pos_tags = [(token.text, token.pos_) for token in doc]

    return named_entities, pos_tags

# Define a grammar_check function
def grammar_check(text):
    tb = TextBlob(text)
    return tb.edit_distance(tb.correct())
def compute_body_reward(preprocessed_body):
    body_reward = 0
    body = preprocessed_body
    sentiment_score = sia.polarity_scores(body)
    sentiment_reward = 1 if abs(sentiment_score['compound']) >= 0.75 else 0

    named_entities, pos_tags = extract_entities_and_pos(preprocessed_body)

    # Reward based on the presence of specific POS tags or entities
    nlp_reward = 0
    if any([tag[1] == 'VERB' for tag in pos_tags]):
        nlp_reward += 1
    if any([ent[1] == 'PERSON' for ent in named_entities]):
        nlp_reward += 1

    grammar_errors = grammar_check(body)
    grammar_reward = 1 if grammar_errors == 0 else 0

    w_sentiment, w_nlp, w_grammar = 1, 2, 1
    reward = (w_sentiment * sentiment_reward +
              w_nlp * nlp_reward +
              w_grammar * grammar_reward)
    reward /= (w_sentiment + w_nlp + w_grammar)

    return reward

    
def compute_body_reward(preprocessed_body):
    body_reward = 0
    body = preprocessed_body
    sentiment_score = sia.polarity_scores(body)
    sentiment_reward = 1 if abs(sentiment_score['compound']) >= 0.75 else 0
    situation_keywords = ['situation', 'happened', 'issue']
    action_keywords = ['action', 'did', 'took']
    justifiable_keywords = ['justifiable', 'wrong', 'right']
    situation_present = any(keyword in token.text for keyword in situation_keywords for token in nlp(preprocessed_body))
    action_present = any(keyword in token.text for keyword in action_keywords for token in nlp(preprocessed_body))
    justifiable_present = any(keyword in token.text for keyword in justifiable_keywords for token in nlp(preprocessed_body))
    structure_reward = sum([situation_present, action_present, justifiable_present])
    grammar_errors = grammar_check(body)
    grammar_reward = 1 if grammar_errors == 0 else 0
    w_sentiment, w_structure, w_grammar = 1, 3, 1
    reward = (w_sentiment * sentiment_reward +
              w_structure * structure_reward +
              w_grammar * grammar_reward)
    reward /= (w_sentiment + w_structure + w_grammar)
    return reward
def compute_reward( generated_title, generated_body):
    # Preprocess the title and body
    preprocessed_title = preprocess_text(generated_title)
    preprocessed_body = preprocess_text(generated_body)
    # Compute rewards for the title and body
    title_reward = compute_title_reward(preprocessed_title)
    body_reward = compute_body_reward(preprocessed_body)
    # Combine the title and body rewards with appropriate weights
    w_title, w_body = 0.5, 0.5
    reward = (w_title * title_reward + w_body * body_reward)
    reward /= (w_title + w_body)
    return reward



In [4]:
import torch
from torch.utils.data import Dataset


# Load GPT-2 tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2")
# Set the padding token to the end-of-sequence (eos) token
tokenizer.pad_token = tokenizer.eos_token

# Update the tokenizer with the separator token
separator_token = "<SEP>"
tokenizer.add_special_tokens({"additional_special_tokens": [separator_token]})
# Update the model with the new vocabulary size
model.resize_token_embeddings(len(tokenizer))
# Tokenize the dataset
texts = [title + separator_token + body for title, body in zip(preprocessed_titles, preprocessed_bodies)]
tokenized_texts = tokenizer(texts, return_tensors="pt", padding=True, truncation=True)




class CustomTextDataset(Dataset):
    def __init__(self, tokenizer, texts, block_size):
        self.tokenizer = tokenizer
        self.texts = texts
        self.block_size = block_size

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        tokenized_text = self.tokenizer(self.texts[idx], return_tensors="pt", padding="max_length", truncation=True, max_length=self.block_size)
        input_ids = tokenized_text["input_ids"].squeeze()
        attention_mask = tokenized_text["attention_mask"].squeeze()

        return {"input_ids": input_ids, "attention_mask": attention_mask}

# Prepare the dataset
dataset = CustomTextDataset(tokenizer=tokenizer, texts=texts, block_size=128)





# Create a data collator for language modeling
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
# Set up the training arguments
training_args = TrainingArguments(
    output_dir="./gpt2-reddit",
    overwrite_output_dir=True,
    num_train_epochs=5,
    per_device_train_batch_size=4,
    save_steps=10_000,
    save_total_limit=2,
)
# Create the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset,
)
# Train the model
trainer.train()



Step,Training Loss
500,4.121
1000,3.0629


TrainOutput(global_step=1250, training_loss=3.46015009765625, metrics={'train_runtime': 4277.4676, 'train_samples_per_second': 1.169, 'train_steps_per_second': 0.292, 'total_flos': 326615040000000.0, 'train_loss': 3.46015009765625, 'epoch': 5.0})

In [5]:
# from sklearn.model_selection import train_test_split
# import torch
# from transformers import DistilBertForSequenceClassification
# from transformers import Trainer, TrainingArguments

# X = preprocessed_bodies
# y = df['is_asshole'].values

# X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# train_encodings = tokenizer(X_train, padding=True, truncation=True, return_tensors="pt")
# val_encodings = tokenizer(X_val, padding=True, truncation=True, return_tensors="pt")



# class AITADataset(torch.utils.data.Dataset):
#     def __init__(self, encodings, labels):
#         self.encodings = encodings
#         self.labels = labels

#     def __getitem__(self, idx):
#         item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
#         item['labels'] = torch.tensor(self.labels[idx])
#         return item

#     def __len__(self):
#         return len(self.labels)

# train_dataset = AITADataset(train_encodings, y_train)
# val_dataset = AITADataset(val_encodings, y_val)


# S_model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)


# training_args = TrainingArguments(
#     output_dir='./sentiment_model',
#     num_train_epochs=3,
#     per_device_train_batch_size=8,
#     per_device_eval_batch_size=8,
#     logging_dir='./logs',
#     logging_steps=100,
#     evaluation_strategy="steps",
# )

# trainer = Trainer(
#     model=S_model,
#     args=training_args,
#     train_dataset=train_dataset,
#     eval_dataset=val_dataset,
# )

# trainer.train()






In [6]:
max_length=150
class TextProcessor:
    def __init__(self, ending_patterns=None):
        self.ending_patterns = ending_patterns if ending_patterns else ['?']

    def check_ending(self, text):
        return any(text.strip().endswith(pattern) for pattern in self.ending_patterns)

    def add_proper_ending(self, text):
        return text.strip() + self.ending_patterns[0]

    def generate_text(self, prompt, model, tokenizer, max_length=max_length, max_attempts=10, temperature=0.7):
        # Tokenize the prompt
        tokenized_prompt = tokenizer(prompt, return_tensors='pt', padding=True, truncation=True)
        input_ids = tokenized_prompt["input_ids"]
        attention_mask = tokenized_prompt["attention_mask"]

        for _ in range(max_attempts):
            # Generate text
            output = model.generate(
                input_ids,
                max_length=max_length,
                num_return_sequences=1,
                no_repeat_ngram_size=2,
                do_sample=True,
                top_k=50,
                top_p=0.95,
                temperature=temperature,  # Pass the temperature parameter
                attention_mask=attention_mask,  # Pass the attention mask
            )

            # Decode the generated text
            generated_text = tokenizer.decode(output[0], skip_special_tokens=True)

            if self.check_ending(generated_text):
                return generated_text

        return self.add_proper_ending(generated_text)  # Add a proper ending if not found within max_attempts

In [9]:
# prompt = " AITA for "
# generated_text = generate_text(prompt, model, tokenizer)


In [11]:
text_processor = TextProcessor(ending_patterns=['.', '?', '!'])


In [None]:
import tkinter as tk
from tkinter import ttk, filedialog, messagebox

def generate():
    prompt = prompt_entry.get()
    if not prompt:
        messagebox.showerror("Error", "Please enter a prompt")
        return

    temperature = temperature_var.get()
    try:
        generated_text = text_processor.generate_text(prompt, model, tokenizer, max_length=max_length, max_attempts=10, temperature=float(temperature))
        result_text.delete(1.0, tk.END)
        result_text.insert(tk.END, generated_text)
    except Exception as e:
        messagebox.showerror("Error", f"An error occurred: {e}")

def classify():
    generated_text = result_text.get(1.0, tk.END).strip()
    if not generated_text:
        messagebox.showerror("Error", "Please generate text before classifying")
        return

    input_text_preprocessed = preprocess_text(generated_text)
    input_text_sequence = word_tokenizer.texts_to_sequences([input_text_preprocessed])
    input_text_padded = pad_sequences(input_text_sequence, padding='post', maxlen=maxlen)

    prediction = lstm_model.predict(input_text_padded)
    assholish_percentage = round(prediction[0][0] * 100, 2)
    not_assholish_percentage = round((1 - prediction[0][0]) * 100, 2)

    messagebox.showinfo("Classification", f"This post is {assholish_percentage}% assholish and {not_assholish_percentage}% not assholish")

def generate_random():
    prompt = " "
    try:
        generated_text = generate_text(prompt, model, tokenizer)
        result_text.delete(1.0, tk.END)
        result_text.insert(tk.END, generated_text)
    except Exception as e:
        messagebox.showerror("Error", f"An error occurred: {e}")

# Create the main window
root = tk.Tk()
root.title("GPT Reddit")

# Create and position the UI elements
prompt_label = ttk.Label(root, text="Enter the prompt:")
prompt_label.grid(row=0, column=0, padx=(10, 0), pady=(10, 5), sticky=tk.W)

prompt_entry = ttk.Entry(root, width=60)
prompt_entry.grid(row=0, column=1, padx=(0, 10), pady=(10, 5), sticky=tk.W)

options_frame = ttk.Frame(root)
options_frame.grid(row=1, column=1, padx=(0, 10), pady=(0, 10), sticky=tk.E)

temperature_label = ttk.Label(options_frame, text="Temperature:")
temperature_label.grid(row=0, column=0, padx=(0, 5), pady=(0, 10), sticky=tk.W)

temperature_var = tk.StringVar(options_frame, value="0.6")
temperature_menu = ttk.OptionMenu(options_frame, temperature_var, "0.2", "0.4", "0.6", "0.8", "1.0")
temperature_menu.grid(row=0, column=1, padx=(0, 10), pady=(0, 10), sticky=tk.W)

generate_button = ttk.Button(options_frame, text="Generate Text", command=generate)
generate_button.grid(row=0, column=2, padx=(0, 5), pady=(0, 10), sticky=tk.E)

generate_random_button = ttk.Button(options_frame, text="I'm Feeling Lucky", command=generate_random)
generate_random_button.grid(row=0, column=3, padx=(0, 5), pady=(0, 10), sticky=tk.E)

classify_button = ttk.Button(options_frame, text="Classify", command=classify)
classify_button.grid(row=0, column=4, padx=(0, 10), pady=(0, 10), sticky=tk.E)

result_label = ttk.Label(root, text="Generated Text:")
result_label.grid(row=2, column=0, padx=(10, 0), pady=(0, 10), sticky=tk.W)

result_text = tk.Text(root, wrap=tk.WORD, width=80, height=20)
result_text.grid(row=2, column=1, padx=(0, 10), pady=(0, 10), sticky=tk.W)

scrollbar = ttk.Scrollbar(root, command=result_text.yview)
scrollbar.grid(row=2, column=2, pady=(0, 10), sticky=tk.NS)
result_text.config(yscrollcommand=scrollbar.set)

# Run the main event loop
root.mainloop()









Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Exception in Tkinter callback
Traceback (most recent call last):
  File "C:\users\public\downloads\Anaconda3\lib\tkinter\__init__.py", line 1892, in __call__
    return self.func(*args)
  File "C:\Users\da476\AppData\Local\Temp\ipykernel_19164\4017118423.py", line 25, in classify
    input_text_sequence = word_tokenizer.texts_to_sequences([input_text_preprocessed])
NameError: name 'word_tokenizer' is not defined
