In [1]:
# !pip install pandas
# !pip install spacy
# !pip install nltk
# !pip install tensorflow
# !pip install python-Levenshtein
# !pip install textblob
# !pip install tensorflow


In [2]:
# Download the small English spaCy pipeline; it is loaded further down with
# spacy.load('en_core_web_sm').
!python -m spacy download en_core_web_sm
import nltk
# Lexicon required by nltk's SentimentIntensityAnalyzer, which is instantiated
# later in this notebook for the sentiment part of the reward.
nltk.download('vader_lexicon')


Collecting en-core-web-sm==3.5.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.5.0/en_core_web_sm-3.5.0-py3-none-any.whl (12.8 MB)
     --------------------------------------- 12.8/12.8 MB 72.5 MB/s eta 0:00:00
[38;5;2m[+] Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     \\files.kent.ac.uk\usersk\ko275\home\Essential-User-
[nltk_data]     Settings\AppData\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [3]:
import pandas as pd
import spacy
import re
from nltk.sentiment import SentimentIntensityAnalyzer
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Input, Embedding, LSTM, TimeDistributed, Dense
from tensorflow.keras.models import Model
from textblob import TextBlob
import Levenshtein


In [4]:
# spaCy is only used for tokenisation in the reward functions (they read
# `token.text`), so the tagger, parser and NER pipes are switched off.
nlp = spacy.load('en_core_web_sm', disable=['tagger', 'parser', 'ner'])
sia = SentimentIntensityAnalyzer()

# Load the data and drop every row that has a missing value in any column.
# NOTE(review): absolute, machine-specific path — consider a configurable DATA_DIR.
df = pd.read_csv('X:/home/aita_clean.csv')
df = df.dropna()


# Filter the data for each class
class_0 = df[df['is_asshole'] == 0]
class_1 = df[df['is_asshole'] == 1]

# Sample an equal number of instances from each class so the corpus is balanced.
# DataFrame.sample raises ValueError if a class has fewer than `sample_size` rows.
sample_size = 25000
class_0_sample = class_0.sample(sample_size, random_state=42)
class_1_sample = class_1.sample(sample_size, random_state=42)

# Combine the sampled data and shuffle it (frac=1 returns every row in random order).
df = pd.concat([class_0_sample, class_1_sample]).sample(frac=1, random_state=42).reset_index(drop=True)

This code preprocesses a dataset of titles and bodies, tokenizes the text, and trains a multi-task seq2seq model to generate titles and bodies.

The script involves preprocessing of text data, creation of sequences from preprocessed data, tokenization, splitting the data into training and validation sets, and defining and training a multi-task learning model. The model consists of an encoder and two decoders (one for the title and one for the body) and is trained to generate text for both title and body simultaneously. Inference models are defined for each decoder to generate new text. The script implements multi-task learning for text generation using seq2seq architecture with LSTM units.


In [5]:
# Preprocess the data
max_title_len = 20
max_body_len = 200
# Title tokens + one separator token + body tokens.
input_len = max_title_len + max_body_len + 1

def preprocess_text(text):
    """Normalise raw post text for tokenisation.

    The text is lowercased, then passed through four substitutions in order:
    drop anything starting with ``http``, drop anything starting with ``www``,
    turn every character other than letters, digits and ``. , ! ?`` into a
    space, and collapse whitespace runs. The result is stripped.

    Args:
        text: raw title or body string.

    Returns:
        The cleaned string (possibly empty).
    """
    cleaned = text.lower()
    substitutions = (
        (r'http\S+', ''),              # links starting with http
        (r'www\S+', ''),               # links starting with www
        ('[^a-zA-Z0-9.,!?]', ' '),     # everything except letters, digits and . , ! ?
        (r'\s+', ' '),                 # collapse repeated whitespace
    )
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()

# Clean titles and bodies first so that emptiness can be tested on the cleaned
# text itself; once '<start>'/'<end>' are attached a string is never empty.
cleaned_titles = [preprocess_text(text) for text in df['title']]
cleaned_bodies = [preprocess_text(text) for text in df['body']]

# Wrap every cleaned text in explicit start/end markers for the decoders.
preprocessed_titles = ['<start> ' + text + ' <end>' for text in cleaned_titles]
preprocessed_bodies = ['<start> ' + text + ' <end>' for text in cleaned_bodies]

# Keep only the pairs whose title AND body still contain text after cleaning.
filtered_data = [
    (title, body)
    for title, body, cleaned_title, cleaned_body
    in zip(preprocessed_titles, preprocessed_bodies, cleaned_titles, cleaned_bodies)
    if cleaned_title and cleaned_body
]

# One training text per post: "<start> title <end> <sep> <start> body <end>".
separator_token = "<sep>"
texts = [title + ' ' + separator_token + ' ' + body for title, body in filtered_data]

# filters='' keeps punctuation and the angle-bracket markers intact;
# lower=False because preprocess_text has already lowercased the text.
tokenizer = Tokenizer(filters='', lower=False)
tokenizer.fit_on_texts(texts)
total_unique_words = len(tokenizer.word_index)
# Fraction of the vocabulary to keep (1.0 = all of it).
max_words = int(total_unique_words * 1.0)
# Keras only emits indices strictly below `num_words`, so `max_words + 1` is
# needed to retain exactly `max_words` words. Setting the attribute avoids
# fitting a second tokenizer on the same texts.
tokenizer.num_words = max_words + 1



# Make sure the separator and the start/end markers have vocabulary entries.
# They normally do already, because they occur in every training text; this
# is a safety net. Each missing token gets a fresh index past the current
# vocabulary so that it cannot collide with an existing word or with another
# special token.
for special_token in (separator_token, '<start>', '<end>'):
    if special_token not in tokenizer.word_index:
        new_index = len(tokenizer.word_index) + 1
        tokenizer.word_index[special_token] = new_index
        tokenizer.index_word[new_index] = special_token
        # texts_to_sequences drops indices >= num_words, so widen the cap.
        if tokenizer.num_words is not None:
            tokenizer.num_words = max(tokenizer.num_words, new_index + 1)

# Embedding/softmax size: every vocabulary index plus index 0 (padding).
# Computed after all special tokens are registered.
total_words = len(tokenizer.word_index) + 1


# Turn the texts into integer sequences, padded/truncated at the end to input_len.
input_sequences = pad_sequences(
    tokenizer.texts_to_sequences(texts),
    maxlen=input_len,
    padding='post',
    truncating='post',
)

# NOTE(review): only rows that contain token id 1 are kept. The recorded run
# ends up with ~17.9k training rows from a 50k sample, so this filter appears
# to discard most of the data — confirm that this is intended.
rows_with_token_one = [row for row in input_sequences if 1 in row]
input_sequences = np.array(rows_with_token_one, dtype=np.int32)


# Sanity check: count rows in which a start/end marker did not survive truncation.
missing_start_count = sum(
    1 for row in input_sequences if tokenizer.word_index['<start>'] not in row
)
missing_end_count = sum(
    1 for row in input_sequences if tokenizer.word_index['<end>'] not in row
)

print(f"Missing start token in {missing_start_count} sequences.")
print(f"Missing end token in {missing_end_count} sequences.")


# Locate the first separator token in every row. Rows without one fall back to
# the last index of the title part and are counted.
separator_indices = []
missing_separator_count = 0
for row in input_sequences:
    hits = np.flatnonzero(row == tokenizer.word_index[separator_token])
    if hits.size > 0:
        separator_indices.append(hits[0])
        continue
    separator_indices.append(max_title_len - 1)
    missing_separator_count += 1

separator_indices = np.array(separator_indices)
print(f"Missing separator token in {missing_separator_count} sequences.")

# Everything before the separator is the title, everything after it the body.
title_slices = [row[:cut] for row, cut in zip(input_sequences, separator_indices)]
body_slices = [row[cut + 1:] for row, cut in zip(input_sequences, separator_indices)]
input_sequences_title = np.array(title_slices, dtype=object)
input_sequences_body = np.array(body_slices, dtype=object)

# Bring both parts to their fixed lengths (pad or cut at the end).
input_sequences_title = pad_sequences(
    input_sequences_title, maxlen=max_title_len, padding='post', truncating='post'
)
input_sequences_body = pad_sequences(
    input_sequences_body, maxlen=max_body_len, padding='post', truncating='post'
)

# Encoder input: padded title, one separator column, padded body.
separator_column = np.full(
    (input_sequences_title.shape[0], 1), tokenizer.word_index[separator_token]
)
input_sequences_combined = np.hstack(
    [input_sequences_title, separator_column, input_sequences_body]
)

# One call splits all three arrays with the same row permutation, so the
# combined/title/body rows stay aligned.
(
    X_train_combined, X_val_combined,
    X_train_title, X_val_title,
    X_train_body, X_val_body,
) = train_test_split(
    input_sequences_combined,
    input_sequences_title,
    input_sequences_body,
    test_size=0.1,
    random_state=42,
)

# Multi-task learning model: one shared encoder, two decoders (title and body).
# `latent_dim` is used both as the embedding size and as the LSTM state size.
latent_dim = 256

# Encoder: embeds the combined title+separator+body sequence and keeps only
# the final LSTM hidden/cell states (the per-step output is discarded).
encoder_inputs = Input(shape=(None,))
encoder_emb_layer = Embedding(total_words, latent_dim)
encoder_emb = encoder_emb_layer(encoder_inputs)
encoder_lstm = LSTM(latent_dim, return_state=True)
_, state_h, state_c = encoder_lstm(encoder_emb)
encoder_states = [state_h, state_c]

# Define the encoder model (used on its own at inference time to get the states)
encoder_model = Model(encoder_inputs, encoder_states)

# Title Decoder: starts from the encoder states and emits a softmax over the
# vocabulary at every time step. The layer objects are kept in variables
# because the inference model below re-uses them.
decoder_inputs_title = Input(shape=(None,))
decoder_emb_layer_title = Embedding(total_words, latent_dim)
decoder_emb_title = decoder_emb_layer_title(decoder_inputs_title)
decoder_lstm_title = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs_title, _, _ = decoder_lstm_title(decoder_emb_title, initial_state=encoder_states)
decoder_dense_title = TimeDistributed(Dense(total_words, activation='softmax'))
decoder_outputs_title = decoder_dense_title(decoder_outputs_title)

# Body Decoder: same structure as the title decoder, with its own weights.
decoder_inputs_body = Input(shape=(None,))
decoder_emb_layer_body = Embedding(total_words, latent_dim)
decoder_emb_body = decoder_emb_layer_body(decoder_inputs_body)
decoder_lstm_body = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs_body, _, _ = decoder_lstm_body(decoder_emb_body, initial_state=encoder_states)
decoder_dense_body = TimeDistributed(Dense(total_words, activation='softmax'))
decoder_outputs_body = decoder_dense_body(decoder_outputs_body)

# Define the multi-task model: three inputs (encoder, title decoder, body
# decoder) and two outputs (title and body token distributions).
model = Model([encoder_inputs, decoder_inputs_title, decoder_inputs_body], [decoder_outputs_title, decoder_outputs_body])

# Compile the model; both decoder losses contribute equally to the total loss.
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', loss_weights=[0.5, 0.5])
print("X_train_title shape:", X_train_title.shape)
print("X_train_body shape:", X_train_body.shape)
 
# Train the seq2seq model
batch_size = 32
epochs = 50
validation_split = 0.1

# Each decoder is fed its sequence without the last token ([:, :-1]) and is
# trained to predict the same sequence shifted one step left ([:, 1:]).
# NOTE(review): validation uses `validation_split` on the training arrays; the
# X_val_* arrays created by the earlier split are not used in the visible code.
model.fit([X_train_combined, X_train_title[:, :-1], X_train_body[:, :-1]], [X_train_title[:, 1:], X_train_body[:, 1:]],
          batch_size=batch_size,
          epochs=epochs,
          validation_split=validation_split,
          )

# Title Decoder Inference: a stand-alone model that takes the previous token
# plus the LSTM states as inputs and returns the next-token distribution plus
# the updated states. It re-uses the embedding, LSTM and dense layers created
# for training, so it shares their weights.
decoder_state_input_h_title = Input(shape=(latent_dim,))
decoder_state_input_c_title = Input(shape=(latent_dim,))
decoder_states_inputs_title = [decoder_state_input_h_title, decoder_state_input_c_title]
decoder_emb_title = decoder_emb_layer_title(decoder_inputs_title)
decoder_outputs_title, state_h_title, state_c_title = decoder_lstm_title(decoder_emb_title, initial_state=decoder_states_inputs_title)
decoder_states_title = [state_h_title, state_c_title]
decoder_outputs_title = decoder_dense_title(decoder_outputs_title)
decoder_model_title = Model([decoder_inputs_title] + decoder_states_inputs_title, [decoder_outputs_title] + decoder_states_title)

# Body Decoder Inference: same construction for the body decoder.
# Note that decoder_emb_* / decoder_outputs_* are rebound here; the training
# `model` was already built from the earlier tensors.
decoder_state_input_h_body = Input(shape=(latent_dim,))
decoder_state_input_c_body = Input(shape=(latent_dim,))
decoder_states_inputs_body = [decoder_state_input_h_body, decoder_state_input_c_body]
decoder_emb_body = decoder_emb_layer_body(decoder_inputs_body)
decoder_outputs_body, state_h_body, state_c_body = decoder_lstm_body(decoder_emb_body, initial_state=decoder_states_inputs_body)
decoder_states_body = [state_h_body, state_c_body]
decoder_outputs_body = decoder_dense_body(decoder_outputs_body)
decoder_model_body = Model([decoder_inputs_body] + decoder_states_inputs_body, [decoder_outputs_body] + decoder_states_body)

Missing start token in 0 sequences.
Missing end token in 0 sequences.
Missing separator token in 0 sequences.
X_train_title shape: (17895, 20)
X_train_body shape: (17895, 200)
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50




The script contains functions for decoding sequences into titles and bodies, checking grammar, computing rewards for generated titles and bodies, and computing the total reward. The reward is based on several factors such as the presence of certain keywords, sentiment score, structure of the text, and number of grammar errors. Note: at this stage the code is not concerned with maximising the reward; it only establishes a baseline to compare against after PPO training.


In [30]:
# To ensure UTF-8 encoding is used
import locale

def getpreferredencoding(do_setlocale = True):
    return "UTF-8"
locale.getpreferredencoding = getpreferredencoding

# Importing the Levenshtein distance for grammar check
import Levenshtein
!pip install python-Levenshtein

def _greedy_decode(input_seq, decoder_model, max_tokens):
    """Greedily decode up to `max_tokens` words with one of the inference decoders.

    The encoder states for `input_seq` seed the decoder; at every step the
    most probable token is chosen and fed back in as the next input.

    Decoding stops when:
      * `max_tokens` words have been produced (a token count, not characters),
      * the '<end>' token is predicted, or
      * an index without a vocabulary entry is predicted (e.g. padding id 0).

    Args:
        input_seq: padded encoder input of shape (1, sequence_length).
        decoder_model: inference model mapping [token, h, c] -> [probs, h, c].
        max_tokens: maximum number of words to generate.

    Returns:
        The generated words joined by single spaces ('' if nothing was generated).
    """
    # verbose=0: predict() is called once per token; progress bars would flood the output.
    states_value = list(encoder_model.predict(input_seq, verbose=0))

    # The decoder is primed with the start token.
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = tokenizer.word_index['<start>']

    decoded_words = []
    while len(decoded_words) < max_tokens:
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value, verbose=0)

        # Greedy choice: highest-probability token at the last time step.
        sampled_token_index = int(np.argmax(output_tokens[0, -1, :]))
        sampled_word = tokenizer.index_word.get(sampled_token_index)
        if sampled_word is None or sampled_word == '<end>':
            break
        decoded_words.append(sampled_word)

        # Feed the chosen token and the updated states into the next step.
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index
        states_value = [h, c]

    return ' '.join(decoded_words)


# Function to decode the title from the input sequence
def decode_sequence_title(input_seq):
    """Generate a title of at most `max_title_len` words for `input_seq`."""
    return _greedy_decode(input_seq, decoder_model_title, max_title_len)


# Same function as above but for the body
def decode_sequence_body(input_seq):
    """Generate a body of at most `max_body_len` words for `input_seq`."""
    return _greedy_decode(input_seq, decoder_model_body, max_body_len)

# Spelling-based "grammar" score: edit distance between the text and its
# TextBlob-corrected version.
def grammar_check(text):
    """Return the Levenshtein distance between `text` and TextBlob's correction of it.

    0 means TextBlob's `correct()` left the text unchanged.
    """
    corrected = str(TextBlob(text).correct())
    return Levenshtein.distance(corrected, text)


def compute_title_reward(preprocessed_title):
    """Score a generated title in [0, 1].

    Two equally weighted binary components:
      * keyword: the title contains 'aita', 'wibta' or the phrase 'am i the'
        as whole tokens (case-insensitive);
      * grammar: `grammar_check` reports zero errors.

    Args:
        preprocessed_title: the cleaned title string.

    Returns:
        0.0, 0.5 or 1.0.
    """
    title_keywords = ('aita', 'wibta', 'am i the')

    # Join the lowercased tokens back into a space-padded string so that the
    # multi-word phrase can match as well; a set of single tokens never
    # contains 'am i the'. The padding keeps matches on token boundaries.
    tokens = [token.text.lower() for token in nlp(preprocessed_title)]
    padded_title = ' ' + ' '.join(tokens) + ' '
    title_present = any(' ' + keyword + ' ' in padded_title for keyword in title_keywords)
    title_reward = 1 if title_present else 0

    grammar_errors = grammar_check(preprocessed_title)
    grammar_reward = 1 if grammar_errors == 0 else 0

    # Weighted mean of the two components.
    w_title, w_grammar = 0.5, 0.5
    reward = (w_title * title_reward + w_grammar * grammar_reward)
    reward /= (w_title + w_grammar)

    return reward

def compute_body_reward(preprocessed_body):
    """Score a generated body in [0, 1].

    Components, each in [0, 1], combined as a weighted mean:
      * sentiment (weight 1): 1 if |VADER compound score| >= 0.75;
      * structure (weight 3): fraction of the three keyword groups
        (situation / action / justification) that occur among the tokens;
      * grammar (weight 1): 1 if `grammar_check` reports zero errors.

    Args:
        preprocessed_body: the cleaned body string.

    Returns:
        The weighted mean of the three components.
    """
    body = preprocessed_body

    sentiment_score = sia.polarity_scores(body)
    sentiment_reward = 1 if abs(sentiment_score['compound']) >= 0.75 else 0

    keyword_groups = (
        {'situation', 'happened', 'issue'},   # describes what happened
        {'action', 'did', 'took'},            # describes what the author did
        {'justifiable', 'wrong', 'right'},    # asks for / gives a judgement
    )

    # Lowercased for a case-insensitive match, as in compute_title_reward.
    tokens = {token.text.lower() for token in nlp(preprocessed_body)}
    groups_present = sum(1 for group in keyword_groups if group & tokens)
    # Normalised to [0, 1] so the weighted mean below stays within [0, 1];
    # a raw 0..3 count would let this component dominate and push it past 1.
    structure_reward = groups_present / len(keyword_groups)

    grammar_errors = grammar_check(body)
    grammar_reward = 1 if grammar_errors == 0 else 0

    w_sentiment, w_structure, w_grammar = 1, 3, 1
    reward = (w_sentiment * sentiment_reward +
              w_structure * structure_reward +
              w_grammar * grammar_reward)
    reward /= (w_sentiment + w_structure + w_grammar)

    return reward



def compute_reward( generated_title, generated_body):
    """Combine the title and body rewards into one score.

    Both texts are cleaned with `preprocess_text`, scored separately, and the
    two scores are averaged with equal weights.

    Args:
        generated_title: raw generated title string.
        generated_body: raw generated body string.

    Returns:
        The equally weighted mean of the title reward and the body reward.
    """
    clean_title = preprocess_text(generated_title)
    clean_body = preprocess_text(generated_body)

    title_score = compute_title_reward(clean_title)
    body_score = compute_body_reward(clean_body)

    # Equal weights for the two parts.
    w_title, w_body = 0.5, 0.5
    weighted_sum = w_title * title_score + w_body * body_score
    return weighted_sum / (w_title + w_body)




In [54]:
def generate_text(input_text):
    """Generate a title and a body for one input text.

    The text is tokenised, padded/truncated at the end to `input_len`, and the
    same encoder input is decoded once by the title decoder and once by the
    body decoder.

    Returns:
        (generated_title, generated_body)
    """
    encoder_input = pad_sequences(
        tokenizer.texts_to_sequences([input_text]),
        maxlen=input_len,
        padding='post',
        truncating='post',
    )
    title = decode_sequence_title(encoder_input)
    body = decode_sequence_body(encoder_input)
    return title, body

def process_example(input_text):
    """Generate title and body for `input_text` and score the pair.

    Returns:
        (generated_title, generated_body, reward)
    """
    title, body = generate_text(input_text)
    return title, body, compute_reward(title, body)

# Example usage
# An empty prompt tokenises to an empty sequence, so the encoder input is all
# padding.
input_text = ""
generated_title, generated_body, reward = process_example(input_text)
print("Generated Title:", generated_title)
print("Generated Body:", generated_body)
print("Reward:", reward)

Generated Title: aita for smoking inside?
Generated Body: hello reddit AITA smoke smoke? i F liked smoking because it looked nice 
Reward: 0.25


ppo section

This code includes libraries and classes necessary to set up a reinforcement learning environment for the task of text generation, specifically for generating titles and bodies of texts. The code uses the Proximal Policy Optimization (PPO) reinforcement learning algorithm to generate text and uses the gym library to set up the reinforcement learning environment.



In [2]:
import pandas as pd
import numpy as np
import re
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, TimeDistributed
from tensorflow.keras.models import Model
import gym
from gym import spaces
from nltk.sentiment import SentimentIntensityAnalyzer
from gensim.models import LdaModel
from gensim.corpora import Dictionary
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import language_tool_python
from stable_baselines3 import PPO
from stable_baselines3.common.callbacks import BaseCallback, EvalCallback
import nltk
import numpy as np
from tensorflow.data import Dataset
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional, Dropout
from tensorflow.keras.models import Sequential
from numpy import asarray
from numpy import zeros

# NLTK resources used in this section: the stop-word list (stopwords.words
# in the LDA preprocessing), the VADER lexicon (SentimentIntensityAnalyzer)
# and the punkt tokenizer models (presumably needed by word_tokenize — confirm).
nltk.download('stopwords')
nltk.download('vader_lexicon')
nltk.download('punkt')

import gym
from gym import spaces
import numpy as np
from typing import Dict
from nltk.sentiment import SentimentIntensityAnalyzer
import language_tool_python

# Shared, module-level helpers used by the reward functions below.
sia = SentimentIntensityAnalyzer()
# LanguageTool checker for US English; used by grammar_check().
tool = language_tool_python.LanguageTool('en-US')
# Maximum number of steps (actions) per episode; the environment reports
# `done` once its step counter reaches this value.
max_episode_length = 150  #max_episode_length is a term commonly used in reinforcement learning, referring to the maximum number of steps (or actions) allowed in each episode of the learning environment.



def grammar_check(sequence):
    """Return the number of issues LanguageTool reports for `sequence`.

    Note: this replaces any `grammar_check` defined earlier in the notebook.
    """
    return len(tool.check(sequence))

def preprocess_for_ppo(generated_text):
    """Tokenise generated text for the PPO environment.

    Thin wrapper that delegates to `preprocess_text_lda` and returns its
    result unchanged.
    """
    return preprocess_text_lda(generated_text)





        
        
class TextGenerationEnv(gym.Env):
    """Gym environment whose observation is a fixed-length window of token ids.

    Every step drops the oldest token, appends the chosen token, decodes the
    window to text, splits it into title and body at the separator token and
    scores both parts with heuristic rewards.

    Relies on these notebook-level names: ``separator_token``, ``sia``,
    ``grammar_check``, ``preprocess_text_lda``, ``dictionary``, ``lda_model``.
    """

    def __init__(self, model, tokenizer, max_title_len, max_body_len, max_episode_length, preprocess_for_ppo):
        """
        Args:
            model: Keras model that is fitted on the current window at every step.
            tokenizer: fitted Keras tokenizer (provides word_index / sequences_to_texts).
            max_title_len: number of leading window positions treated as the title.
            max_body_len: number of window positions for the body.
            max_episode_length: steps after which an episode is reported done.
            preprocess_for_ppo: callable used by `compute_reward` to preprocess text.
        """
        super(TextGenerationEnv, self).__init__()
        self.preprocess_fn = preprocess_for_ppo
        self.max_sequence_len = max_title_len + max_body_len
        self.model = model
        self.tokenizer = tokenizer
        self.max_title_len = max_title_len
        self.max_body_len = max_body_len
        self.max_episode_length = max_episode_length
        # Derived from the tokenizer so the environment does not depend on a
        # notebook global; +1 accounts for index 0 (padding).
        self.vocab_size = len(tokenizer.word_index) + 1
        self.action_space = spaces.Discrete(self.vocab_size)
        self.observation_space = spaces.Box(low=0, high=self.vocab_size - 1, shape=(self.max_sequence_len,), dtype=np.int32)

    def reset(self):
        """Start a new episode from a random token window and return it."""
        self.current_text = self.generate_initial_state()
        # A view, not a copy: it always reflects the first `max_title_len`
        # positions of the current window as the window shifts.
        self.current_title = self.current_text[:self.max_title_len]
        self.current_step = 0
        # Return a copy so callers cannot alias the internal state.
        return self.current_text.copy()

    def generate_initial_state(self):
        """Build a random window: (max_title_len - 1) title ids, the separator, max_body_len body ids."""
        initial_title = np.random.randint(1, self.vocab_size, self.max_title_len - 1)
        initial_body = np.random.randint(1, self.vocab_size, self.max_body_len)
        separator_id = self.tokenizer.word_index[separator_token]
        initial_text = np.concatenate((initial_title, [separator_id], initial_body))
        # Match the dtype declared in observation_space.
        return initial_text.astype(np.int32)

    def step(self, action: np.ndarray):
        """Append `action` (a token id) to the window and score the resulting text.

        Returns:
            (observation, reward, done, info) in the classic gym 4-tuple form.
        """
        # Slide the window one position to the left and append the new token.
        self.current_text[:-1] = self.current_text[1:]
        self.current_text[-1] = action

        self.current_step += 1
        generated_text = self.tokenizer.sequences_to_texts([self.current_text])[0]

        title, body = self._split_title_body(generated_text)
        reward = self._combine_rewards(title, body)

        # Fit the model on the current window (input == target).
        # NOTE(review): in this notebook the model passed in is the three-input
        # seq2seq network, whose fit() signature does not match a single
        # array — confirm which model this environment is meant to train.
        Xy = np.expand_dims(self.current_text, axis=0)
        self.model.fit(Xy, Xy, epochs=1, verbose=0, batch_size=1)

        done = self.current_step >= self.max_episode_length

        return self.current_text.copy(), reward, done, {}

    @staticmethod
    def _as_text(text_or_tokens):
        """Return a plain string for either a string or a sequence of tokens.

        Joining a *string* with ' ' would insert a space between every
        character, so strings are passed through untouched.
        """
        if isinstance(text_or_tokens, str):
            return text_or_tokens
        return ' '.join(text_or_tokens)

    @staticmethod
    def _split_title_body(generated_text):
        """Split at the first separator token; fall back to cutting the text in half."""
        parts = generated_text.split(separator_token, 1)
        if len(parts) == 2:
            title, body = parts
        else:
            half_length = len(generated_text) // 2
            title = generated_text[:half_length]
            body = generated_text[half_length:]
        return title.strip(), body.strip()

    def _combine_rewards(self, title, body):
        """Equally weighted mean of the title reward and the body reward."""
        title_reward = self.compute_title_reward(title)
        body_reward = self.compute_body_reward(body)

        w_title, w_body = 0.5, 0.5
        reward = (w_title * title_reward + w_body * body_reward)
        reward /= (w_title + w_body)
        return reward

    def compute_title_reward(self, preprocessed_title):
        """Score a title (string or token list) in [0, 1].

        Equal-weight mean of a keyword indicator ('aita', 'wibta', 'am i the'
        occurring in the text, case-insensitive) and a zero-grammar-error
        indicator.
        """
        title_text = self._as_text(preprocessed_title)
        lowered_title = title_text.lower()

        title_keywords = ['aita', 'wibta', 'am i the']
        title_present = any(keyword in lowered_title for keyword in title_keywords)
        title_reward = 1 if title_present else 0

        grammar_errors = grammar_check(title_text)
        grammar_reward = 1 if grammar_errors == 0 else 0

        # Combine the weighted scores to obtain the final reward
        w_title, w_grammar = 0.5, 0.5
        reward = (w_title * title_reward + w_grammar * grammar_reward)
        reward /= (w_title + w_grammar)

        return reward

    def compute_body_reward(self, preprocessed_body):
        """Score a body (string or token list) as a weighted mean of four components.

        Components (weights): sentiment strength (1), LDA topic coherence (1),
        post structure (3), grammar (1). Every component lies in [0, 1].
        """
        body = self._as_text(preprocessed_body)
        lowered_body = body.lower()

        # Sentiment analysis reward
        sentiment_score = sia.polarity_scores(body)
        sentiment_reward = 1 if abs(sentiment_score['compound']) >= 0.75 else 0

        # Topical coherence reward
        coherence_reward = self.compute_topic_coherence(body)

        # Post structure reward: fraction of keyword groups that occur in the body.
        keyword_groups = (
            ['situation', 'happened', 'issue'],
            ['action', 'did', 'took'],
            ['justifiable', 'wrong', 'right'],
        )
        groups_present = sum(
            1 for group in keyword_groups
            if any(keyword in lowered_body for keyword in group)
        )
        # Normalised to [0, 1] so the weighted mean stays within [0, 1].
        structure_reward = groups_present / len(keyword_groups)

        # Grammar and fluency reward
        grammar_errors = grammar_check(body)
        grammar_reward = 1 if grammar_errors == 0 else 0

        # Combine the weighted scores to obtain the final reward
        w_sentiment, w_coherence, w_structure, w_grammar = 1, 1, 3, 1
        reward = (w_sentiment * sentiment_reward +
                  w_coherence * coherence_reward +
                  w_structure * structure_reward +
                  w_grammar * grammar_reward)
        reward /= (w_sentiment + w_coherence + w_structure + w_grammar)

        return reward

    def compute_topic_coherence(self, body):
        """Return the probability of the most likely LDA topic for title + body.

        The title is decoded from the current window; both texts are tokenised
        with `preprocess_text_lda` and looked up in the global `dictionary` /
        `lda_model`. Returns 0.0 if the model reports no topics.
        """
        title = self.tokenizer.sequences_to_texts([self.current_title])[0]
        tokenized_title = preprocess_text_lda(title)
        tokenized_body = preprocess_text_lda(body)

        # Bag-of-words over the combined title and body tokens.
        bow_sequence = dictionary.doc2bow(tokenized_title + tokenized_body)

        # Get topic distribution for the generated sequence
        topic_dist = lda_model.get_document_topics(bow_sequence)

        # Coherence = probability mass of the dominant topic.
        return max((prob for _, prob in topic_dist), default=0.0)

    def compute_reward(self, generated_text):
        """Score a full generated text containing the separator token.

        The text is split into title and body, each part is preprocessed with
        the function supplied at construction time, and the two rewards are
        averaged with equal weights.
        """
        title, body = self._split_title_body(generated_text)
        return self._combine_rewards(self.preprocess_fn(title), self.preprocess_fn(body))



[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\da476/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\da476/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package punkt to C:\Users\da476/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:

#Topical coherence

from gensim.models import LdaModel
from gensim.corpora import Dictionary
from nltk.tokenize import word_tokenize

# Preprocess the text
def remove_tags(text):
    """Strip every ``<...>`` tag (a '<', one or more non-'>' characters, a '>') from `text`."""
    return re.sub(r'<[^>]+>', '', text)

# Compiled lazily on first use and then re-used; building the stop-word
# alternation for every document is needlessly expensive.
_STOPWORD_PATTERN = None


def _stopword_pattern():
    """Return the cached regex that matches any English stop word plus trailing spaces."""
    global _STOPWORD_PATTERN
    if _STOPWORD_PATTERN is None:
        _STOPWORD_PATTERN = re.compile(
            r'\b(' + r'|'.join(stopwords.words('english')) + r')\b\s*'
        )
    return _STOPWORD_PATTERN


def preprocess_text_lda(text):
    """Clean `text` and return its tokens for topic modelling.

    Steps, in order: lowercase, strip ``<...>`` tags, replace every non-letter
    with a space, drop single letters surrounded by whitespace, collapse
    whitespace, remove English stop words, tokenise with `word_tokenize`.

    Args:
        text: raw title or body string.

    Returns:
        List of token strings.
    """
    # Lowercase
    text = text.lower()

    # Remove HTML tags
    text = remove_tags(text)

    # Remove punctuations and numbers
    text = re.sub('[^a-zA-Z]', ' ', text)

    # Remove single characters (only those with whitespace on both sides)
    text = re.sub(r'\s+[a-zA-Z]\s+', ' ', text)

    # Remove multiple spaces
    text = re.sub(r'\s+', ' ', text)

    # Remove stopwords
    text = _stopword_pattern().sub('', text)

    # Tokenize the text
    return word_tokenize(text)


# Create a list of tokenized titles and bodies.
# NOTE(review): `df` comes from the data-loading cell in the first part of the
# notebook; the recorded output of this cell is "NameError: name 'df' is not
# defined", so that cell must be run in the same kernel before this one.
tokenized_titles = [preprocess_text_lda(text) for text in df['title']]
tokenized_bodies = [preprocess_text_lda(text) for text in df['body']]

# Combine the tokenized titles and bodies into a single list of documents
# (one document per post: title tokens followed by body tokens).
documents = [title + body for title, body in zip(tokenized_titles, tokenized_bodies)]


# Create a dictionary representation of the documents
dictionary = Dictionary(documents)

# Filter out words that occur in fewer than 20 documents or in more than 50% of the documents
dictionary.filter_extremes(no_below=20, no_above=0.5)

# Create a bag-of-words representation of the documents
corpus = [dictionary.doc2bow(doc) for doc in documents]


# Train the LDA model. `dictionary` and `lda_model` are read as globals by
# TextGenerationEnv.compute_topic_coherence.
num_topics = 10  # Number of topics; tune for the dataset
passes = 20  # Number of passes over the corpus
iterations = 200  # Iteration cap passed to LdaModel
lda_model = LdaModel(corpus, num_topics=num_topics, id2word=dictionary, random_state=42, passes=passes, iterations=iterations)







NameError: name 'df' is not defined

This script is responsible for training, evaluating, and testing a Proximal Policy Optimization (PPO) reinforcement learning agent that generates text in a custom environment defined as TextGenerationEnv.



In [4]:
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.callbacks import EvalCallback


def make_text_env():
    """Build one fresh TextGenerationEnv from the notebook-level objects."""
    return TextGenerationEnv(model, tokenizer, max_title_len, max_body_len, max_episode_length, preprocess_for_ppo)


# Separate training and evaluation environments, each wrapped in a
# single-environment DummyVecEnv as Stable-Baselines3 expects.
train_env = DummyVecEnv([make_text_env])
eval_env = DummyVecEnv([make_text_env])

# Evaluate every 1000 steps DURING training; EvalCallback writes the best
# policy so far to '<best_model_save_path>/best_model.zip'.
best_model_dir = './models/'
eval_callback = EvalCallback(eval_env, best_model_save_path=best_model_dir,
                             log_path='./logs/', eval_freq=1000,
                             deterministic=True, render=False)

# Train the PPO agent
ppo_agent = PPO("MlpPolicy", train_env, verbose=1)
ppo_agent.learn(total_timesteps=50000, callback=eval_callback)  # Train for 50,000 time steps

# Load the best model saved by the evaluation callback
best_agent = PPO.load(best_model_dir + 'best_model')

# Generate text using the trained agent: run one episode and decode the final
# token window.
obs = eval_env.reset()
done = False
final_obs = obs[0]

while not done:
    action, _states = best_agent.predict(obs)
    obs, rewards, dones, infos = eval_env.step(action)
    done = bool(dones[0])
    # A VecEnv resets itself when an episode ends, so at that point `obs` is
    # already the first observation of the next episode; the last real one is
    # reported under 'terminal_observation'.
    final_obs = infos[0].get('terminal_observation', obs[0]) if done else obs[0]

generated_text = tokenizer.sequences_to_texts([final_obs])[0]
# Everything before the first separator is the title, the rest is the body.
generated_title, _, generated_body = generated_text.partition(separator_token)
generated_title = generated_title.strip()
generated_body = generated_body.strip()
print("Generated title:", generated_title)
print("Generated body:", generated_body)

# Evaluate the agent
total_rewards = []
for _ in range(5):  # You can adjust the number of evaluation episodes
    obs = eval_env.reset()
    done = False
    episode_reward = 0.0
    while not done:
        action, _ = best_agent.predict(obs, deterministic=True)
        obs, rewards, dones, _ = eval_env.step(action)
        # VecEnv returns one entry per environment; there is exactly one here.
        episode_reward += float(rewards[0])
        done = bool(dones[0])
    total_rewards.append(episode_reward)

avg_reward = np.mean(total_rewards)
print(f"Average reward for the best model: {avg_reward}")


NameError: name 'model' is not defined

In [None]:
from stable_baselines3.common.callbacks import BaseCallback

class SaveBestModelCallback(BaseCallback):
    """Periodically evaluate the agent and save it whenever the mean reward improves.

    Args:
        save_path: path handed to ``model.save`` for the best model so far.
        eval_env: environment (plain or vectorised) used for evaluation.
        check_freq: evaluate every `check_freq` callback calls.
        verbose: when truthy, announce each newly saved best model.
        n_eval_episodes: number of episodes averaged per evaluation (default 5).
    """

    def __init__(self, save_path, eval_env, check_freq, verbose=0, n_eval_episodes=5):
        super(SaveBestModelCallback, self).__init__(verbose)
        self.save_path = save_path
        self.eval_env = eval_env
        self.check_freq = check_freq
        self.n_eval_episodes = n_eval_episodes
        self.best_reward = -float("inf")

    def _run_episode(self):
        """Play one deterministic episode and return its total reward as a float."""
        obs = self.eval_env.reset()
        done = False
        episode_reward = 0.0
        while not done:
            action, _ = self.model.predict(obs, deterministic=True)
            obs, reward, done, _ = self.eval_env.step(action)
            # A vectorised env returns arrays; reduce them to plain scalars.
            episode_reward += float(np.sum(reward))
            done = bool(np.all(done))
        return episode_reward

    def _on_step(self) -> bool:
        """Evaluate every `check_freq` calls; always returns True so training continues."""
        if self.n_calls % self.check_freq == 0:
            # Evaluate the current model
            total_rewards = [self._run_episode() for _ in range(self.n_eval_episodes)]

            avg_reward = np.mean(total_rewards)
            print(f"Average reward after {self.n_calls} steps: {avg_reward}")
            if avg_reward > self.best_reward:
                self.best_reward = avg_reward
                self.model.save(self.save_path)
                if self.verbose:
                    print(f"New best model with reward {avg_reward}, model saved.")

        return True


save_path = "best_model"
# TextGenerationEnv takes six arguments: model, tokenizer, title length, body
# length, episode length and the preprocessing function.
eval_env = DummyVecEnv([
    lambda: TextGenerationEnv(model, tokenizer, max_title_len, max_body_len, max_episode_length, preprocess_for_ppo)
])
# NOTE(review): `callback` is created but not passed to any learn() call in
# the visible notebook — pass it as `callback=` to use it.
callback = SaveBestModelCallback(save_path, eval_env, check_freq=5000)


# NOTE(review): `generated_samples` is not defined anywhere in the visible
# notebook; it must be produced before this loop can run.
for i, sample in enumerate(generated_samples):
    print(f"Sample {i + 1}:")
    print(sample)
    print("-" * 40)
