In [None]:
# Cell 1: Setup
!pip install transformers tensorflow

In [None]:
# Cell 2: Import Libraries
import os
import random

import numpy as np
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from transformers import GPT2LMHeadModel, GPT2Tokenizer

In [None]:
# Cell 3: Load GPT-2
# The tokenizer and the language-model head are loaded from the same
# pretrained checkpoint id, so it is named once and reused.
GPT2_CHECKPOINT = "gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(GPT2_CHECKPOINT)
model = GPT2LMHeadModel.from_pretrained(GPT2_CHECKPOINT)

In [None]:
# Cell 4: Generate Text with GPT-2
# Keep dedicated references to the GPT-2 objects. Cell 5 rebinds the global
# name `tokenizer` to a Keras Tokenizer, which would break this function if it
# looked the name up again on a later call.
gpt2_tokenizer = tokenizer
gpt2_model = model


def generate_gpt2(prompt, max_length=150):
    """Sample text from GPT-2 conditioned on `prompt`.

    Args:
        prompt: Text used to condition generation.
        max_length: Forwarded to `generate` as `max_length`, the cap on the
            length of the generated token sequence.

    Returns:
        The first generated sequence decoded to a string, with special
        tokens skipped.
    """
    inputs = gpt2_tokenizer(prompt, return_tensors="pt")
    outputs = gpt2_model.generate(
        inputs["input_ids"],
        # Pass the mask explicitly instead of letting `generate` infer it.
        attention_mask=inputs["attention_mask"],
        max_length=max_length,
        do_sample=True,
        # GPT-2 ships without a pad token; reuse EOS so `generate` does not
        # have to fall back to it with a warning.
        pad_token_id=gpt2_tokenizer.eos_token_id,
    )
    return gpt2_tokenizer.decode(outputs[0], skip_special_tokens=True)


print(generate_gpt2("Explain the impact of climate change on biodiversity."))

In [None]:
# Cell 5: LSTM Model Data Preprocessing (optional if you use LSTM)
# Load and clean text
# The context manager closes the file handle; the explicit encoding keeps
# decoding independent of the platform default.
with open('../data/sample_texts.txt', encoding='utf-8') as corpus_file:
    text = corpus_file.read().lower()

# NOTE: this rebinds the global `tokenizer` (previously the GPT-2 tokenizer
# from Cell 3) to a Keras word-level Tokenizer; Cell 7 relies on this binding.
tokenizer = Tokenizer()
tokenizer.fit_on_texts([text])
# +1 because Keras word indices start at 1; index 0 is left for padding.
total_words = len(tokenizer.word_index) + 1

# Build training samples sentence by sentence (sentences are split on '.').
input_sequences = []
for line in text.split('.'):
    token_list = tokenizer.texts_to_sequences([line])[0]
    # Every prefix of length >= 2 becomes one sample: the final id is the
    # prediction target and the ids before it are the context.
    input_sequences.extend(
        token_list[:end] for end in range(2, len(token_list) + 1)
    )

if not input_sequences:
    # Without this guard the max() below fails with an opaque message.
    raise ValueError(
        "No training sequences could be built from "
        "'../data/sample_texts.txt': the corpus needs at least one "
        "sentence with two or more words."
    )

max_seq_len = max(len(seq) for seq in input_sequences)
# Left-pad so the target id always sits in the last column.
input_sequences = pad_sequences(input_sequences, maxlen=max_seq_len, padding='pre')
X, y = input_sequences[:, :-1], input_sequences[:, -1]

In [None]:
# Cell 6: LSTM Model Training
LSTM_MODEL_PATH = '../models/lstm_model.h5'
# Create the output directory before training so the save at the end cannot
# fail on a missing folder after all epochs have already run.
os.makedirs(os.path.dirname(LSTM_MODEL_PATH), exist_ok=True)

# Next-word classifier: embedding -> single LSTM layer -> softmax over the
# vocabulary (total_words classes).
model_lstm = Sequential()
model_lstm.add(Embedding(total_words, 64, input_length=max_seq_len - 1))
model_lstm.add(LSTM(128))
model_lstm.add(Dense(total_words, activation='softmax'))

# Sparse loss because `y` holds integer word ids rather than one-hot vectors.
model_lstm.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model_lstm.fit(X, y, epochs=30, verbose=1)
model_lstm.save(LSTM_MODEL_PATH)

In [None]:
# Cell 7: Generate Text with LSTM
def generate_text_lstm(seed_text, next_words=50):
    """Greedily extend `seed_text` with words predicted by `model_lstm`.

    Each step tokenizes the current text with the Keras `tokenizer`, left-pads
    it to `max_seq_len - 1` ids, and appends the word whose id has the highest
    predicted score.

    Args:
        seed_text: Starting text; it is kept as the prefix of the result.
        next_words: Maximum number of words to append.

    Returns:
        `seed_text` followed by the generated words, separated by spaces.
        Fewer than `next_words` words are appended if the model predicts an
        id that has no word in the vocabulary.
    """
    for _ in range(next_words):
        token_ids = tokenizer.texts_to_sequences([seed_text])[0]
        padded = pad_sequences([token_ids], maxlen=max_seq_len - 1, padding='pre')
        scores = model_lstm.predict(padded, verbose=0)
        # argmax returns a 1-element array for the single-row batch; take the
        # scalar so it can be used as a dictionary key.
        predicted_id = int(np.argmax(scores, axis=-1)[0])
        # index_word is the inverse of word_index, so no linear scan is needed.
        next_word = tokenizer.index_word.get(predicted_id)
        if next_word is None:
            # No word for this id (e.g. the padding id 0). The text would not
            # change, so every further step would repeat the same prediction.
            break
        seed_text += " " + next_word
    return seed_text


print(generate_text_lstm("The future of artificial intelligence"))