In [None]:
# Install the notebook's dependencies. %pip (rather than !pip) installs into the
# environment of the running kernel, and the quotes stop the shell from treating
# the [torch] extra as a glob pattern.
%pip install -q -U "transformers[torch]" accelerate
%pip install -q kaggle pandas Flask pyngrok



In [None]:
# Download the Enron email dataset from Kaggle and unpack it into the working directory.
!kaggle datasets download -d wcukierski/enron-email-dataset
# -o overwrites an existing emails.csv without asking; a plain `unzip` stops at
# an interactive "replace emails.csv?" prompt when this cell is run again.
!unzip -o enron-email-dataset.zip

Dataset URL: https://www.kaggle.com/datasets/wcukierski/enron-email-dataset
License(s): copyright-authors
enron-email-dataset.zip: Skipping, found more recently modified local copy (use --force to force download)
Archive:  enron-email-dataset.zip
replace emails.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: 

In [None]:
import pandas as pd
import re
from transformers import GPT2Tokenizer

In [None]:
# Load the full email table and keep a 100-row random sample for the rest of
# the notebook. random_state pins the draw so a re-run selects the same rows.
df = pd.read_csv('emails.csv')
df_subset = df.sample(n=100, random_state=42)

In [None]:
def clean_text(text):
    """Normalise one email message before tokenisation.

    Removes every character that is neither a word character (letter, digit or
    underscore) nor whitespace, then lower-cases what is left.

    Args:
        text: The raw message. A value that is not a ``str`` -- for example
            ``None`` or the float NaN pandas stores for a missing cell -- is
            treated as an empty message instead of raising ``TypeError``.

    Returns:
        The cleaned, lower-cased string ('' for non-string input).
    """
    if not isinstance(text, str):
        return ''
    # \w matches underscores, so "a_b" is kept intact; all other punctuation goes.
    return re.sub(r'[^\w\s]', '', text).lower()

# Store the cleaned form of every sampled message in its own column.
df_subset['cleaned_message'] = df_subset['message'].map(clean_text)

In [None]:
# Load the GPT-2 tokenizer and use its end-of-sequence token for padding, so
# the batch below can be padded to a common length.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token

# Tokenise all cleaned messages in one call.
cleaned_messages = df_subset['cleaned_message'].tolist()
tokenized_text = tokenizer(cleaned_messages, padding=True, truncation=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
import torch
from transformers import GPT2LMHeadModel, DataCollatorForLanguageModeling, Trainer, TrainingArguments

class CustomDataset(torch.utils.data.Dataset):
    """Dataset that serves tokenizer output one example at a time.

    Args:
        encodings: A mapping of field name -> per-example sequences that
            contains an ``'input_ids'`` entry. Both the tokenizer's return
            value and a plain ``dict`` are accepted.
    """

    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of field name -> ``torch.Tensor``."""
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}

    def __len__(self):
        """Return the number of examples, measured on the ``'input_ids'`` field."""
        # Key lookup instead of attribute access: consistent with __getitem__,
        # which already treats `encodings` as a mapping, and works for plain dicts.
        return len(self.encodings['input_ids'])

# Batch collator for language modelling; mlm=False switches off the
# masked-language-modelling mode of the collator.
data_collator = DataCollatorForLanguageModeling(mlm=False, tokenizer=tokenizer)

# Wrap the tokenised training messages so they can be indexed per example.
train_dataset = CustomDataset(tokenized_text)

In [None]:
# Fine-tuning settings, collected in one mapping and then handed to
# TrainingArguments in a single call.
training_config = {
    'output_dir': './results',           # where the Trainer writes its files
    'overwrite_output_dir': True,
    'num_train_epochs': 1,
    'per_device_train_batch_size': 2,
    'save_steps': 10_000,
    'save_total_limit': 2,
}
training_args = TrainingArguments(**training_config)


In [None]:
# Load the pretrained 'gpt2' checkpoint as a GPT2LMHeadModel.
# NOTE(review): the training cell that follows calls from_pretrained('gpt2')
# again before building the Trainer, so this load looks redundant.
model = GPT2LMHeadModel.from_pretrained('gpt2')

In [None]:
# Reload the pretrained weights inside the training cell, so that running this
# cell again fine-tunes from the 'gpt2' checkpoint rather than from a model
# that has already been trained.
model = GPT2LMHeadModel.from_pretrained('gpt2')

trainer = Trainer(
    args=training_args,
    model=model,
    train_dataset=train_dataset,
    data_collator=data_collator,
)

# Last expression of the cell, so the returned TrainOutput is displayed.
trainer.train()


model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Step,Training Loss


TrainOutput(global_step=50, training_loss=4.056005859375, metrics={'train_runtime': 2489.0289, 'train_samples_per_second': 0.04, 'train_steps_per_second': 0.02, 'total_flos': 52258406400000.0, 'train_loss': 4.056005859375, 'epoch': 1.0})

In [None]:
# Write the fine-tuned weights and the tokenizer files to the same directory so
# both can be reloaded from one path.
save_dir = './results'
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)


('./results/tokenizer_config.json',
 './results/special_tokens_map.json',
 './results/vocab.json',
 './results/merges.txt',
 './results/added_tokens.json')

In [None]:
# Evaluate on emails the model was not fine-tuned on: the training rows
# (df_subset) are removed before the evaluation sample is drawn, so the
# reported loss is not measured on the training data itself.
held_out = df.drop(index=df_subset.index)
eval_subset = held_out.sample(n=min(100, len(held_out)), random_state=0)
# Same cleaning as the training messages.
eval_messages = eval_subset['message'].apply(clean_text).tolist()

tokenized_eval_text = tokenizer(eval_messages, truncation=True, padding=True)
eval_dataset = CustomDataset(tokenized_eval_text)
evaluation_results = trainer.evaluate(eval_dataset=eval_dataset)
print("Evaluation Results:", evaluation_results)


In [None]:
# Register the ngrok authtoken without storing it in the notebook: take it from
# the NGROK_AUTHTOKEN environment variable, or ask for it with a hidden prompt.
# NOTE: any token that has ever been saved inside a shared notebook should be
# revoked in the ngrok dashboard and replaced.
import os
from getpass import getpass

from pyngrok import ngrok

ngrok.set_auth_token(os.environ.get('NGROK_AUTHTOKEN') or getpass('ngrok authtoken: '))

Authtoken saved to configuration file: /root/.config/ngrok/ngrok.yml


In [None]:
import torch
from flask import Flask, request, jsonify
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from pyngrok import ngrok

# Load the tokenizer and model files stored under ./results for serving.
model_dir = './results'
tokenizer = GPT2Tokenizer.from_pretrained(model_dir)
model = GPT2LMHeadModel.from_pretrained(model_dir)

# Flask application that hosts the generation endpoint.
app = Flask(__name__)

@app.route('/generate', methods=['POST'])
def generate_text():
    """Complete a prompt with the loaded model.

    Expects a JSON object body with a non-empty string ``prompt``.

    Returns:
        ``{'generated_text': ...}`` on success, or ``{'error': 'Prompt is
        required'}`` with HTTP 400 when the body is missing, is not a JSON
        object, or carries no usable prompt.
    """
    # silent=True gives None for an absent or malformed JSON body instead of
    # aborting the request; the isinstance check covers bodies such as `[]` or
    # `null`, which parse fine but have no .get().
    data = request.get_json(silent=True)
    prompt = data.get('prompt', '') if isinstance(data, dict) else ''

    if not isinstance(prompt, str) or not prompt:
        return jsonify({'error': 'Prompt is required'}), 400

    input_ids = tokenizer.encode(prompt, return_tensors='pt')

    # The end-of-sequence id is passed as the padding id for generation.
    generated_text = model.generate(
        input_ids,
        max_length=50,
        num_return_sequences=1,
        pad_token_id=tokenizer.eos_token_id,
    )

    decoded_text = tokenizer.decode(generated_text[0], skip_special_tokens=True)

    # Keep only the text up to and including the first full stop, if there is one.
    sentence_end = decoded_text.find('.')
    if sentence_end != -1:
        decoded_text = decoded_text[:sentence_end + 1]
    # Collapse every run of whitespace (newlines included) into a single space.
    decoded_text = ' '.join(decoded_text.split())

    return jsonify({'generated_text': decoded_text})

# Open an ngrok tunnel to the Flask server and start serving. PORT is passed
# explicitly to both calls so the tunnel always targets the port Flask
# actually listens on.
PORT = 5000
public_url = ngrok.connect(PORT)
print('Public URL:', public_url)
app.run(port=PORT)


Public URL: NgrokTunnel: "https://93b1-35-232-95-38.ngrok-free.app" -> "http://localhost:5000"
 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on http://127.0.0.1:5000
INFO:werkzeug:[33mPress CTRL+C to quit[0m
INFO:werkzeug:127.0.0.1 - - [08/Jul/2024 10:30:57] "POST /generate HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [08/Jul/2024 10:31:27] "[31m[1mPOST /generate HTTP/1.1[0m" 400 -
INFO:werkzeug:127.0.0.1 - - [08/Jul/2024 10:34:16] "POST /generate HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [08/Jul/2024 10:35:33] "POST /generate HTTP/1.1" 200 -
